external/boringssl: Sync to c4796c92e0aced2342ed5687201aea07189c3bc1.

This includes the following changes:

https://boringssl.googlesource.com/boringssl/+log/040bc4944be97f5d4b44da176f6e801fc804a176..c4796c92e0aced2342ed5687201aea07189c3bc1

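Notable themes in the generated-assembly diff below (a best-effort
summary, not an exhaustive list):

- The Montgomery multiplication code (x86-mont.S, x86_64-mont.S,
  x86_64-mont5.S) now probes each 4096-byte page while carving out
  large stack frames, and the final conditional copy selects the copy
  source pointer branch-free instead of masking every word with
  xor/and/xor.
- The AES, bsaes, ChaCha and GHASH routines restore callee-saved
  registers relative to the saved original stack pointer rather than
  through positive %rsp offsets.
- aesni-x86_64.S gains OCB-mode routines (aesni_ocb_encrypt,
  aesni_ocb_decrypt and their __ocb_* helpers).
- x86_64-mont.S and x86_64-mont5.S gain .cfi_* unwind annotations.
- chacha-armv4.S fixes a comment typo ("ouput" -> "output").

The recurring page-walk pattern looks like this (annotated excerpt from
bn_mul_mont in the diff below; register choices vary per function):

	subq	%r10,%r11		# bytes from new stack bottom up to old %rsp
	andq	$-4096,%r11		# round the distance down to whole pages
	leaq	(%r10,%r11,1),%rsp	# start within one page of the old %rsp
	movq	(%rsp),%r11		# probe the first page
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp	# drop exactly one page at a time...
	movq	(%rsp),%r11		# ...and touch it before going further
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

Dropping %rsp by more than a page in a single step could jump straight
over the guard page into unmapped memory; probing one page at a time
keeps stack growth well-defined on targets that rely on a guard page to
grow the stack.
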
Test: CtsLibcoreTestCases, presubmits
Change-Id: If6d911660fbd9c60896527addb277c8225c3d401
diff --git a/BORINGSSL_REVISION b/BORINGSSL_REVISION
index f26983b..af7f21c 100644
--- a/BORINGSSL_REVISION
+++ b/BORINGSSL_REVISION
@@ -1 +1 @@
-040bc4944be97f5d4b44da176f6e801fc804a176
+c4796c92e0aced2342ed5687201aea07189c3bc1
diff --git a/linux-arm/crypto/chacha/chacha-armv4.S b/linux-arm/crypto/chacha/chacha-armv4.S
index 19a4d2c..0784fc7 100644
--- a/linux-arm/crypto/chacha/chacha-armv4.S
+++ b/linux-arm/crypto/chacha/chacha-armv4.S
@@ -1457,7 +1457,7 @@
 	ldrb	r9,[r12],#1		@ read input
 	subs	r11,r11,#1
 	eor	r8,r8,r9
-	strb	r8,[r14],#1		@ store ouput
+	strb	r8,[r14],#1		@ store output
 	bne	.Loop_tail_neon
 
 .Ldone_neon:
diff --git a/linux-x86/crypto/bn/x86-mont.S b/linux-x86/crypto/bn/x86-mont.S
index 1569b2c..e291a88 100644
--- a/linux-x86/crypto/bn/x86-mont.S
+++ b/linux-x86/crypto/bn/x86-mont.S
@@ -17,39 +17,54 @@
 	jl	.L000just_leave
 	leal	20(%esp),%esi
 	leal	24(%esp),%edx
-	movl	%esp,%ebp
 	addl	$2,%edi
 	negl	%edi
-	leal	-32(%esp,%edi,4),%esp
+	leal	-32(%esp,%edi,4),%ebp
 	negl	%edi
-	movl	%esp,%eax
+	movl	%ebp,%eax
 	subl	%edx,%eax
 	andl	$2047,%eax
-	subl	%eax,%esp
-	xorl	%esp,%edx
+	subl	%eax,%ebp
+	xorl	%ebp,%edx
 	andl	$2048,%edx
 	xorl	$2048,%edx
-	subl	%edx,%esp
-	andl	$-64,%esp
+	subl	%edx,%ebp
+	andl	$-64,%ebp
+	movl	%esp,%eax
+	subl	%ebp,%eax
+	andl	$-4096,%eax
+	movl	%esp,%edx
+	leal	(%ebp,%eax,1),%esp
+	movl	(%esp),%eax
+	cmpl	%ebp,%esp
+	ja	.L001page_walk
+	jmp	.L002page_walk_done
+.align	16
+.L001page_walk:
+	leal	-4096(%esp),%esp
+	movl	(%esp),%eax
+	cmpl	%ebp,%esp
+	ja	.L001page_walk
+.L002page_walk_done:
 	movl	(%esi),%eax
 	movl	4(%esi),%ebx
 	movl	8(%esi),%ecx
-	movl	12(%esi),%edx
+	movl	12(%esi),%ebp
 	movl	16(%esi),%esi
 	movl	(%esi),%esi
 	movl	%eax,4(%esp)
 	movl	%ebx,8(%esp)
 	movl	%ecx,12(%esp)
-	movl	%edx,16(%esp)
+	movl	%ebp,16(%esp)
 	movl	%esi,20(%esp)
 	leal	-3(%edi),%ebx
-	movl	%ebp,24(%esp)
-	call	.L001PIC_me_up
-.L001PIC_me_up:
+	movl	%edx,24(%esp)
+	call	.L003PIC_me_up
+.L003PIC_me_up:
 	popl	%eax
-	leal	OPENSSL_ia32cap_P-.L001PIC_me_up(%eax),%eax
+	leal	OPENSSL_ia32cap_P-.L003PIC_me_up(%eax),%eax
 	btl	$26,(%eax)
-	jnc	.L002non_sse2
+	jnc	.L004non_sse2
 	movl	$-1,%eax
 	movd	%eax,%mm7
 	movl	8(%esp),%esi
@@ -73,7 +88,7 @@
 	psrlq	$32,%mm3
 	incl	%ecx
 .align	16
-.L0031st:
+.L0051st:
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -88,7 +103,7 @@
 	psrlq	$32,%mm3
 	leal	1(%ecx),%ecx
 	cmpl	%ebx,%ecx
-	jl	.L0031st
+	jl	.L0051st
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -102,7 +117,7 @@
 	paddq	%mm2,%mm3
 	movq	%mm3,32(%esp,%ebx,4)
 	incl	%edx
-.L004outer:
+.L006outer:
 	xorl	%ecx,%ecx
 	movd	(%edi,%edx,4),%mm4
 	movd	(%esi),%mm5
@@ -124,7 +139,7 @@
 	paddq	%mm6,%mm2
 	incl	%ecx
 	decl	%ebx
-.L005inner:
+.L007inner:
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -141,7 +156,7 @@
 	paddq	%mm6,%mm2
 	decl	%ebx
 	leal	1(%ecx),%ecx
-	jnz	.L005inner
+	jnz	.L007inner
 	movl	%ecx,%ebx
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
@@ -159,11 +174,11 @@
 	movq	%mm3,32(%esp,%ebx,4)
 	leal	1(%edx),%edx
 	cmpl	%ebx,%edx
-	jle	.L004outer
+	jle	.L006outer
 	emms
-	jmp	.L006common_tail
+	jmp	.L008common_tail
 .align	16
-.L002non_sse2:
+.L004non_sse2:
 	movl	8(%esp),%esi
 	leal	1(%ebx),%ebp
 	movl	12(%esp),%edi
@@ -174,12 +189,12 @@
 	leal	4(%edi,%ebx,4),%eax
 	orl	%edx,%ebp
 	movl	(%edi),%edi
-	jz	.L007bn_sqr_mont
+	jz	.L009bn_sqr_mont
 	movl	%eax,28(%esp)
 	movl	(%esi),%eax
 	xorl	%edx,%edx
 .align	16
-.L008mull:
+.L010mull:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	%eax,%ebp
@@ -188,7 +203,7 @@
 	movl	(%esi,%ecx,4),%eax
 	cmpl	%ebx,%ecx
 	movl	%ebp,28(%esp,%ecx,4)
-	jl	.L008mull
+	jl	.L010mull
 	movl	%edx,%ebp
 	mull	%edi
 	movl	20(%esp),%edi
@@ -206,9 +221,9 @@
 	movl	4(%esi),%eax
 	adcl	$0,%edx
 	incl	%ecx
-	jmp	.L0092ndmadd
+	jmp	.L0112ndmadd
 .align	16
-.L0101stmadd:
+.L0121stmadd:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ecx,4),%ebp
@@ -219,7 +234,7 @@
 	adcl	$0,%edx
 	cmpl	%ebx,%ecx
 	movl	%ebp,28(%esp,%ecx,4)
-	jl	.L0101stmadd
+	jl	.L0121stmadd
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ebx,4),%eax
@@ -242,7 +257,7 @@
 	adcl	$0,%edx
 	movl	$1,%ecx
 .align	16
-.L0092ndmadd:
+.L0112ndmadd:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ecx,4),%ebp
@@ -253,7 +268,7 @@
 	adcl	$0,%edx
 	cmpl	%ebx,%ecx
 	movl	%ebp,24(%esp,%ecx,4)
-	jl	.L0092ndmadd
+	jl	.L0112ndmadd
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ebx,4),%ebp
@@ -269,16 +284,16 @@
 	movl	%edx,32(%esp,%ebx,4)
 	cmpl	28(%esp),%ecx
 	movl	%eax,36(%esp,%ebx,4)
-	je	.L006common_tail
+	je	.L008common_tail
 	movl	(%ecx),%edi
 	movl	8(%esp),%esi
 	movl	%ecx,12(%esp)
 	xorl	%ecx,%ecx
 	xorl	%edx,%edx
 	movl	(%esi),%eax
-	jmp	.L0101stmadd
+	jmp	.L0121stmadd
 .align	16
-.L007bn_sqr_mont:
+.L009bn_sqr_mont:
 	movl	%ebx,(%esp)
 	movl	%ecx,12(%esp)
 	movl	%edi,%eax
@@ -289,7 +304,7 @@
 	andl	$1,%ebx
 	incl	%ecx
 .align	16
-.L011sqr:
+.L013sqr:
 	movl	(%esi,%ecx,4),%eax
 	movl	%edx,%ebp
 	mull	%edi
@@ -301,7 +316,7 @@
 	cmpl	(%esp),%ecx
 	movl	%eax,%ebx
 	movl	%ebp,28(%esp,%ecx,4)
-	jl	.L011sqr
+	jl	.L013sqr
 	movl	(%esi,%ecx,4),%eax
 	movl	%edx,%ebp
 	mull	%edi
@@ -325,7 +340,7 @@
 	movl	4(%esi),%eax
 	movl	$1,%ecx
 .align	16
-.L0123rdmadd:
+.L0143rdmadd:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ecx,4),%ebp
@@ -344,7 +359,7 @@
 	adcl	$0,%edx
 	cmpl	%ebx,%ecx
 	movl	%ebp,24(%esp,%ecx,4)
-	jl	.L0123rdmadd
+	jl	.L0143rdmadd
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ebx,4),%ebp
@@ -360,7 +375,7 @@
 	movl	%edx,32(%esp,%ebx,4)
 	cmpl	%ebx,%ecx
 	movl	%eax,36(%esp,%ebx,4)
-	je	.L006common_tail
+	je	.L008common_tail
 	movl	4(%esi,%ecx,4),%edi
 	leal	1(%ecx),%ecx
 	movl	%edi,%eax
@@ -372,12 +387,12 @@
 	xorl	%ebp,%ebp
 	cmpl	%ebx,%ecx
 	leal	1(%ecx),%ecx
-	je	.L013sqrlast
+	je	.L015sqrlast
 	movl	%edx,%ebx
 	shrl	$1,%edx
 	andl	$1,%ebx
 .align	16
-.L014sqradd:
+.L016sqradd:
 	movl	(%esi,%ecx,4),%eax
 	movl	%edx,%ebp
 	mull	%edi
@@ -393,13 +408,13 @@
 	cmpl	(%esp),%ecx
 	movl	%ebp,28(%esp,%ecx,4)
 	movl	%eax,%ebx
-	jle	.L014sqradd
+	jle	.L016sqradd
 	movl	%edx,%ebp
 	addl	%edx,%edx
 	shrl	$31,%ebp
 	addl	%ebx,%edx
 	adcl	$0,%ebp
-.L013sqrlast:
+.L015sqrlast:
 	movl	20(%esp),%edi
 	movl	16(%esp),%esi
 	imull	32(%esp),%edi
@@ -414,9 +429,9 @@
 	adcl	$0,%edx
 	movl	$1,%ecx
 	movl	4(%esi),%eax
-	jmp	.L0123rdmadd
+	jmp	.L0143rdmadd
 .align	16
-.L006common_tail:
+.L008common_tail:
 	movl	16(%esp),%ebp
 	movl	4(%esp),%edi
 	leal	32(%esp),%esi
@@ -424,25 +439,26 @@
 	movl	%ebx,%ecx
 	xorl	%edx,%edx
 .align	16
-.L015sub:
+.L017sub:
 	sbbl	(%ebp,%edx,4),%eax
 	movl	%eax,(%edi,%edx,4)
 	decl	%ecx
 	movl	4(%esi,%edx,4),%eax
 	leal	1(%edx),%edx
-	jge	.L015sub
+	jge	.L017sub
 	sbbl	$0,%eax
+	andl	%eax,%esi
+	notl	%eax
+	movl	%edi,%ebp
+	andl	%eax,%ebp
+	orl	%ebp,%esi
 .align	16
-.L016copy:
-	movl	(%esi,%ebx,4),%edx
-	movl	(%edi,%ebx,4),%ebp
-	xorl	%ebp,%edx
-	andl	%eax,%edx
-	xorl	%ebp,%edx
-	movl	%ecx,(%esi,%ebx,4)
-	movl	%edx,(%edi,%ebx,4)
+.L018copy:
+	movl	(%esi,%ebx,4),%eax
+	movl	%eax,(%edi,%ebx,4)
+	movl	%ecx,32(%esp,%ebx,4)
 	decl	%ebx
-	jge	.L016copy
+	jge	.L018copy
 	movl	24(%esp),%esp
 	movl	$1,%eax
 .L000just_leave:
diff --git a/linux-x86_64/crypto/aes/aes-x86_64.S b/linux-x86_64/crypto/aes/aes-x86_64.S
index 361e84c..ab1168e 100644
--- a/linux-x86_64/crypto/aes/aes-x86_64.S
+++ b/linux-x86_64/crypto/aes/aes-x86_64.S
@@ -332,6 +332,7 @@
 .type	asm_AES_encrypt,@function
 .hidden	asm_AES_encrypt
 asm_AES_encrypt:
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
@@ -340,7 +341,6 @@
 	pushq	%r15
 
 
-	movq	%rsp,%r10
 	leaq	-63(%rdx),%rcx
 	andq	$-64,%rsp
 	subq	%rsp,%rcx
@@ -350,7 +350,7 @@
 	subq	$32,%rsp
 
 	movq	%rsi,16(%rsp)
-	movq	%r10,24(%rsp)
+	movq	%rax,24(%rsp)
 .Lenc_prologue:
 
 	movq	%rdx,%r15
@@ -382,13 +382,13 @@
 	movl	%ecx,8(%r9)
 	movl	%edx,12(%r9)
 
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 .Lenc_epilogue:
 	.byte	0xf3,0xc3
 .size	asm_AES_encrypt,.-asm_AES_encrypt
@@ -778,6 +778,7 @@
 .type	asm_AES_decrypt,@function
 .hidden	asm_AES_decrypt
 asm_AES_decrypt:
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
@@ -786,7 +787,6 @@
 	pushq	%r15
 
 
-	movq	%rsp,%r10
 	leaq	-63(%rdx),%rcx
 	andq	$-64,%rsp
 	subq	%rsp,%rcx
@@ -796,7 +796,7 @@
 	subq	$32,%rsp
 
 	movq	%rsi,16(%rsp)
-	movq	%r10,24(%rsp)
+	movq	%rax,24(%rsp)
 .Ldec_prologue:
 
 	movq	%rdx,%r15
@@ -830,13 +830,13 @@
 	movl	%ecx,8(%r9)
 	movl	%edx,12(%r9)
 
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 .Ldec_epilogue:
 	.byte	0xf3,0xc3
 .size	asm_AES_decrypt,.-asm_AES_decrypt
@@ -1313,10 +1313,9 @@
 	movl	%r9d,%r9d
 
 	leaq	.LAES_Te(%rip),%r14
+	leaq	.LAES_Td(%rip),%r10
 	cmpq	$0,%r9
-	jne	.Lcbc_picked_te
-	leaq	.LAES_Td(%rip),%r14
-.Lcbc_picked_te:
+	cmoveq	%r10,%r14
 
 	movl	OPENSSL_ia32cap_P(%rip),%r10d
 	cmpq	$512,%rdx
diff --git a/linux-x86_64/crypto/aes/aesni-x86_64.S b/linux-x86_64/crypto/aes/aesni-x86_64.S
index 5709a2d..a90e935 100644
--- a/linux-x86_64/crypto/aes/aesni-x86_64.S
+++ b/linux-x86_64/crypto/aes/aesni-x86_64.S
@@ -1032,11 +1032,10 @@
 
 .align	16
 .Lctr32_bulk:
-	leaq	(%rsp),%rax
+	leaq	(%rsp),%r11
 	pushq	%rbp
 	subq	$128,%rsp
 	andq	$-16,%rsp
-	leaq	-8(%rax),%rbp
 
 
 
@@ -1045,7 +1044,7 @@
 	movdqu	(%rcx),%xmm0
 	movl	12(%r8),%r8d
 	pxor	%xmm0,%xmm2
-	movl	12(%rcx),%r11d
+	movl	12(%rcx),%ebp
 	movdqa	%xmm2,0(%rsp)
 	bswapl	%r8d
 	movdqa	%xmm2,%xmm3
@@ -1061,8 +1060,8 @@
 	leaq	2(%r8),%rdx
 	bswapl	%eax
 	bswapl	%edx
-	xorl	%r11d,%eax
-	xorl	%r11d,%edx
+	xorl	%ebp,%eax
+	xorl	%ebp,%edx
 .byte	102,15,58,34,216,3
 	leaq	3(%r8),%rax
 	movdqa	%xmm3,16(%rsp)
@@ -1071,25 +1070,25 @@
 	movq	%r10,%rdx
 	leaq	4(%r8),%r10
 	movdqa	%xmm4,32(%rsp)
-	xorl	%r11d,%eax
+	xorl	%ebp,%eax
 	bswapl	%r10d
 .byte	102,15,58,34,232,3
-	xorl	%r11d,%r10d
+	xorl	%ebp,%r10d
 	movdqa	%xmm5,48(%rsp)
 	leaq	5(%r8),%r9
 	movl	%r10d,64+12(%rsp)
 	bswapl	%r9d
 	leaq	6(%r8),%r10
 	movl	240(%rcx),%eax
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 	bswapl	%r10d
 	movl	%r9d,80+12(%rsp)
-	xorl	%r11d,%r10d
+	xorl	%ebp,%r10d
 	leaq	7(%r8),%r9
 	movl	%r10d,96+12(%rsp)
 	bswapl	%r9d
 	movl	OPENSSL_ia32cap_P+4(%rip),%r10d
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 	andl	$71303168,%r10d
 	movl	%r9d,112+12(%rsp)
 
@@ -1113,7 +1112,7 @@
 .Lctr32_6x:
 	shll	$4,%eax
 	movl	$48,%r10d
-	bswapl	%r11d
+	bswapl	%ebp
 	leaq	32(%rcx,%rax,1),%rcx
 	subq	%rax,%r10
 	jmp	.Lctr32_loop6
@@ -1124,32 +1123,32 @@
 	movups	-48(%rcx,%r10,1),%xmm0
 .byte	102,15,56,220,209
 	movl	%r8d,%eax
-	xorl	%r11d,%eax
+	xorl	%ebp,%eax
 .byte	102,15,56,220,217
 .byte	0x0f,0x38,0xf1,0x44,0x24,12
 	leal	1(%r8),%eax
 .byte	102,15,56,220,225
-	xorl	%r11d,%eax
+	xorl	%ebp,%eax
 .byte	0x0f,0x38,0xf1,0x44,0x24,28
 .byte	102,15,56,220,233
 	leal	2(%r8),%eax
-	xorl	%r11d,%eax
+	xorl	%ebp,%eax
 .byte	102,15,56,220,241
 .byte	0x0f,0x38,0xf1,0x44,0x24,44
 	leal	3(%r8),%eax
 .byte	102,15,56,220,249
 	movups	-32(%rcx,%r10,1),%xmm1
-	xorl	%r11d,%eax
+	xorl	%ebp,%eax
 
 .byte	102,15,56,220,208
 .byte	0x0f,0x38,0xf1,0x44,0x24,60
 	leal	4(%r8),%eax
 .byte	102,15,56,220,216
-	xorl	%r11d,%eax
+	xorl	%ebp,%eax
 .byte	0x0f,0x38,0xf1,0x44,0x24,76
 .byte	102,15,56,220,224
 	leal	5(%r8),%eax
-	xorl	%r11d,%eax
+	xorl	%ebp,%eax
 .byte	102,15,56,220,232
 .byte	0x0f,0x38,0xf1,0x44,0x24,92
 	movq	%r10,%rax
@@ -1210,7 +1209,7 @@
 	bswapl	%r9d
 	movups	32-128(%rcx),%xmm0
 .byte	102,15,56,220,225
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 	nop
 .byte	102,15,56,220,233
 	movl	%r9d,0+12(%rsp)
@@ -1223,7 +1222,7 @@
 	bswapl	%r9d
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 .byte	0x66,0x90
 .byte	102,15,56,220,224
 .byte	102,15,56,220,232
@@ -1237,7 +1236,7 @@
 	bswapl	%r9d
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 .byte	0x66,0x90
 .byte	102,15,56,220,225
 .byte	102,15,56,220,233
@@ -1251,7 +1250,7 @@
 	bswapl	%r9d
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 .byte	0x66,0x90
 .byte	102,15,56,220,224
 .byte	102,15,56,220,232
@@ -1265,7 +1264,7 @@
 	bswapl	%r9d
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 .byte	0x66,0x90
 .byte	102,15,56,220,225
 .byte	102,15,56,220,233
@@ -1279,7 +1278,7 @@
 	bswapl	%r9d
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 .byte	0x66,0x90
 .byte	102,15,56,220,224
 .byte	102,15,56,220,232
@@ -1293,7 +1292,7 @@
 	bswapl	%r9d
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 .byte	0x66,0x90
 .byte	102,15,56,220,225
 .byte	102,15,56,220,233
@@ -1308,7 +1307,7 @@
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
 .byte	102,15,56,220,224
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 	movdqu	0(%rdi),%xmm10
 .byte	102,15,56,220,232
 	movl	%r9d,112+12(%rsp)
@@ -1543,7 +1542,7 @@
 
 .Lctr32_done:
 	xorps	%xmm0,%xmm0
-	xorl	%r11d,%r11d
+	xorl	%ebp,%ebp
 	pxor	%xmm1,%xmm1
 	pxor	%xmm2,%xmm2
 	pxor	%xmm3,%xmm3
@@ -1567,8 +1566,8 @@
 	pxor	%xmm14,%xmm14
 	movaps	%xmm0,112(%rsp)
 	pxor	%xmm15,%xmm15
-	leaq	(%rbp),%rsp
-	popq	%rbp
+	movq	-8(%r11),%rbp
+	leaq	(%r11),%rsp
 .Lctr32_epilogue:
 	.byte	0xf3,0xc3
 .size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
@@ -1577,11 +1576,10 @@
 .type	aesni_xts_encrypt,@function
 .align	16
 aesni_xts_encrypt:
-	leaq	(%rsp),%rax
+	leaq	(%rsp),%r11
 	pushq	%rbp
 	subq	$112,%rsp
 	andq	$-16,%rsp
-	leaq	-8(%rax),%rbp
 	movups	(%r9),%xmm2
 	movl	240(%r8),%eax
 	movl	240(%rcx),%r10d
@@ -1597,7 +1595,7 @@
 	jnz	.Loop_enc1_8
 .byte	102,15,56,221,209
 	movups	(%rcx),%xmm0
-	movq	%rcx,%r11
+	movq	%rcx,%rbp
 	movl	%r10d,%eax
 	shll	$4,%r10d
 	movq	%rdx,%r9
@@ -1653,9 +1651,9 @@
 	jc	.Lxts_enc_short
 
 	movl	$16+96,%eax
-	leaq	32(%r11,%r10,1),%rcx
+	leaq	32(%rbp,%r10,1),%rcx
 	subq	%r10,%rax
-	movups	16(%r11),%xmm1
+	movups	16(%rbp),%xmm1
 	movq	%rax,%r10
 	leaq	.Lxts_magic(%rip),%r8
 	jmp	.Lxts_enc_grandloop
@@ -1680,7 +1678,7 @@
 	movdqa	96(%rsp),%xmm9
 	pxor	%xmm14,%xmm6
 .byte	102,15,56,220,233
-	movups	32(%r11),%xmm0
+	movups	32(%rbp),%xmm0
 	leaq	96(%rdi),%rdi
 	pxor	%xmm8,%xmm7
 
@@ -1689,7 +1687,7 @@
 	pxor	%xmm9,%xmm11
 	movdqa	%xmm10,0(%rsp)
 .byte	102,15,56,220,249
-	movups	48(%r11),%xmm1
+	movups	48(%rbp),%xmm1
 	pxor	%xmm9,%xmm12
 
 .byte	102,15,56,220,208
@@ -1704,7 +1702,7 @@
 	movdqa	%xmm14,64(%rsp)
 .byte	102,15,56,220,240
 .byte	102,15,56,220,248
-	movups	64(%r11),%xmm0
+	movups	64(%rbp),%xmm0
 	movdqa	%xmm8,80(%rsp)
 	pshufd	$0x5f,%xmm15,%xmm9
 	jmp	.Lxts_enc_loop6
@@ -1736,7 +1734,7 @@
 	psrad	$31,%xmm14
 .byte	102,15,56,220,217
 	pand	%xmm8,%xmm14
-	movups	(%r11),%xmm10
+	movups	(%rbp),%xmm10
 .byte	102,15,56,220,225
 .byte	102,15,56,220,233
 .byte	102,15,56,220,241
@@ -1804,10 +1802,10 @@
 .byte	102,15,56,220,225
 .byte	102,15,56,220,233
 	pxor	%xmm0,%xmm15
-	movups	(%r11),%xmm0
+	movups	(%rbp),%xmm0
 .byte	102,15,56,220,241
 .byte	102,15,56,220,249
-	movups	16(%r11),%xmm1
+	movups	16(%rbp),%xmm1
 
 	pxor	%xmm15,%xmm14
 .byte	102,15,56,221,84,36,0
@@ -1834,7 +1832,7 @@
 
 	movl	$16+96,%eax
 	subl	%r10d,%eax
-	movq	%r11,%rcx
+	movq	%rbp,%rcx
 	shrl	$4,%eax
 
 .Lxts_enc_short:
@@ -1990,7 +1988,7 @@
 	jnz	.Lxts_enc_steal
 
 	subq	%r9,%rsi
-	movq	%r11,%rcx
+	movq	%rbp,%rcx
 	movl	%r10d,%eax
 
 	movups	-16(%rsi),%xmm2
@@ -2033,8 +2031,8 @@
 	movaps	%xmm0,96(%rsp)
 	pxor	%xmm14,%xmm14
 	pxor	%xmm15,%xmm15
-	leaq	(%rbp),%rsp
-	popq	%rbp
+	movq	-8(%r11),%rbp
+	leaq	(%r11),%rsp
 .Lxts_enc_epilogue:
 	.byte	0xf3,0xc3
 .size	aesni_xts_encrypt,.-aesni_xts_encrypt
@@ -2043,11 +2041,10 @@
 .type	aesni_xts_decrypt,@function
 .align	16
 aesni_xts_decrypt:
-	leaq	(%rsp),%rax
+	leaq	(%rsp),%r11
 	pushq	%rbp
 	subq	$112,%rsp
 	andq	$-16,%rsp
-	leaq	-8(%rax),%rbp
 	movups	(%r9),%xmm2
 	movl	240(%r8),%eax
 	movl	240(%rcx),%r10d
@@ -2069,7 +2066,7 @@
 	subq	%rax,%rdx
 
 	movups	(%rcx),%xmm0
-	movq	%rcx,%r11
+	movq	%rcx,%rbp
 	movl	%r10d,%eax
 	shll	$4,%r10d
 	movq	%rdx,%r9
@@ -2125,9 +2122,9 @@
 	jc	.Lxts_dec_short
 
 	movl	$16+96,%eax
-	leaq	32(%r11,%r10,1),%rcx
+	leaq	32(%rbp,%r10,1),%rcx
 	subq	%r10,%rax
-	movups	16(%r11),%xmm1
+	movups	16(%rbp),%xmm1
 	movq	%rax,%r10
 	leaq	.Lxts_magic(%rip),%r8
 	jmp	.Lxts_dec_grandloop
@@ -2152,7 +2149,7 @@
 	movdqa	96(%rsp),%xmm9
 	pxor	%xmm14,%xmm6
 .byte	102,15,56,222,233
-	movups	32(%r11),%xmm0
+	movups	32(%rbp),%xmm0
 	leaq	96(%rdi),%rdi
 	pxor	%xmm8,%xmm7
 
@@ -2161,7 +2158,7 @@
 	pxor	%xmm9,%xmm11
 	movdqa	%xmm10,0(%rsp)
 .byte	102,15,56,222,249
-	movups	48(%r11),%xmm1
+	movups	48(%rbp),%xmm1
 	pxor	%xmm9,%xmm12
 
 .byte	102,15,56,222,208
@@ -2176,7 +2173,7 @@
 	movdqa	%xmm14,64(%rsp)
 .byte	102,15,56,222,240
 .byte	102,15,56,222,248
-	movups	64(%r11),%xmm0
+	movups	64(%rbp),%xmm0
 	movdqa	%xmm8,80(%rsp)
 	pshufd	$0x5f,%xmm15,%xmm9
 	jmp	.Lxts_dec_loop6
@@ -2208,7 +2205,7 @@
 	psrad	$31,%xmm14
 .byte	102,15,56,222,217
 	pand	%xmm8,%xmm14
-	movups	(%r11),%xmm10
+	movups	(%rbp),%xmm10
 .byte	102,15,56,222,225
 .byte	102,15,56,222,233
 .byte	102,15,56,222,241
@@ -2276,10 +2273,10 @@
 .byte	102,15,56,222,225
 .byte	102,15,56,222,233
 	pxor	%xmm0,%xmm15
-	movups	(%r11),%xmm0
+	movups	(%rbp),%xmm0
 .byte	102,15,56,222,241
 .byte	102,15,56,222,249
-	movups	16(%r11),%xmm1
+	movups	16(%rbp),%xmm1
 
 	pxor	%xmm15,%xmm14
 .byte	102,15,56,223,84,36,0
@@ -2306,7 +2303,7 @@
 
 	movl	$16+96,%eax
 	subl	%r10d,%eax
-	movq	%r11,%rcx
+	movq	%rbp,%rcx
 	shrl	$4,%eax
 
 .Lxts_dec_short:
@@ -2463,7 +2460,7 @@
 	jz	.Lxts_dec_ret
 .Lxts_dec_done2:
 	movq	%r9,%rdx
-	movq	%r11,%rcx
+	movq	%rbp,%rcx
 	movl	%r10d,%eax
 
 	movups	(%rdi),%xmm2
@@ -2493,7 +2490,7 @@
 	jnz	.Lxts_dec_steal
 
 	subq	%r9,%rsi
-	movq	%r11,%rcx
+	movq	%rbp,%rcx
 	movl	%r10d,%eax
 
 	movups	(%rsi),%xmm2
@@ -2536,11 +2533,827 @@
 	movaps	%xmm0,96(%rsp)
 	pxor	%xmm14,%xmm14
 	pxor	%xmm15,%xmm15
-	leaq	(%rbp),%rsp
-	popq	%rbp
+	movq	-8(%r11),%rbp
+	leaq	(%r11),%rsp
 .Lxts_dec_epilogue:
 	.byte	0xf3,0xc3
 .size	aesni_xts_decrypt,.-aesni_xts_decrypt
+.globl	aesni_ocb_encrypt
+.hidden aesni_ocb_encrypt
+.type	aesni_ocb_encrypt,@function
+.align	32
+aesni_ocb_encrypt:
+	leaq	(%rsp),%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	movq	8(%rax),%rbx
+	movq	8+8(%rax),%rbp
+
+	movl	240(%rcx),%r10d
+	movq	%rcx,%r11
+	shll	$4,%r10d
+	movups	(%rcx),%xmm9
+	movups	16(%rcx,%r10,1),%xmm1
+
+	movdqu	(%r9),%xmm15
+	pxor	%xmm1,%xmm9
+	pxor	%xmm1,%xmm15
+
+	movl	$16+32,%eax
+	leaq	32(%r11,%r10,1),%rcx
+	movups	16(%r11),%xmm1
+	subq	%r10,%rax
+	movq	%rax,%r10
+
+	movdqu	(%rbx),%xmm10
+	movdqu	(%rbp),%xmm8
+
+	testq	$1,%r8
+	jnz	.Locb_enc_odd
+
+	bsfq	%r8,%r12
+	addq	$1,%r8
+	shlq	$4,%r12
+	movdqu	(%rbx,%r12,1),%xmm7
+	movdqu	(%rdi),%xmm2
+	leaq	16(%rdi),%rdi
+
+	call	__ocb_encrypt1
+
+	movdqa	%xmm7,%xmm15
+	movups	%xmm2,(%rsi)
+	leaq	16(%rsi),%rsi
+	subq	$1,%rdx
+	jz	.Locb_enc_done
+
+.Locb_enc_odd:
+	leaq	1(%r8),%r12
+	leaq	3(%r8),%r13
+	leaq	5(%r8),%r14
+	leaq	6(%r8),%r8
+	bsfq	%r12,%r12
+	bsfq	%r13,%r13
+	bsfq	%r14,%r14
+	shlq	$4,%r12
+	shlq	$4,%r13
+	shlq	$4,%r14
+
+	subq	$6,%rdx
+	jc	.Locb_enc_short
+	jmp	.Locb_enc_grandloop
+
+.align	32
+.Locb_enc_grandloop:
+	movdqu	0(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqu	32(%rdi),%xmm4
+	movdqu	48(%rdi),%xmm5
+	movdqu	64(%rdi),%xmm6
+	movdqu	80(%rdi),%xmm7
+	leaq	96(%rdi),%rdi
+
+	call	__ocb_encrypt6
+
+	movups	%xmm2,0(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	leaq	96(%rsi),%rsi
+	subq	$6,%rdx
+	jnc	.Locb_enc_grandloop
+
+.Locb_enc_short:
+	addq	$6,%rdx
+	jz	.Locb_enc_done
+
+	movdqu	0(%rdi),%xmm2
+	cmpq	$2,%rdx
+	jb	.Locb_enc_one
+	movdqu	16(%rdi),%xmm3
+	je	.Locb_enc_two
+
+	movdqu	32(%rdi),%xmm4
+	cmpq	$4,%rdx
+	jb	.Locb_enc_three
+	movdqu	48(%rdi),%xmm5
+	je	.Locb_enc_four
+
+	movdqu	64(%rdi),%xmm6
+	pxor	%xmm7,%xmm7
+
+	call	__ocb_encrypt6
+
+	movdqa	%xmm14,%xmm15
+	movups	%xmm2,0(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+
+	jmp	.Locb_enc_done
+
+.align	16
+.Locb_enc_one:
+	movdqa	%xmm10,%xmm7
+
+	call	__ocb_encrypt1
+
+	movdqa	%xmm7,%xmm15
+	movups	%xmm2,0(%rsi)
+	jmp	.Locb_enc_done
+
+.align	16
+.Locb_enc_two:
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+
+	call	__ocb_encrypt4
+
+	movdqa	%xmm11,%xmm15
+	movups	%xmm2,0(%rsi)
+	movups	%xmm3,16(%rsi)
+
+	jmp	.Locb_enc_done
+
+.align	16
+.Locb_enc_three:
+	pxor	%xmm5,%xmm5
+
+	call	__ocb_encrypt4
+
+	movdqa	%xmm12,%xmm15
+	movups	%xmm2,0(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+
+	jmp	.Locb_enc_done
+
+.align	16
+.Locb_enc_four:
+	call	__ocb_encrypt4
+
+	movdqa	%xmm13,%xmm15
+	movups	%xmm2,0(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+
+.Locb_enc_done:
+	pxor	%xmm0,%xmm15
+	movdqu	%xmm8,(%rbp)
+	movdqu	%xmm15,(%r9)
+
+	xorps	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
+	pxor	%xmm10,%xmm10
+	pxor	%xmm11,%xmm11
+	pxor	%xmm12,%xmm12
+	pxor	%xmm13,%xmm13
+	pxor	%xmm14,%xmm14
+	pxor	%xmm15,%xmm15
+	leaq	40(%rsp),%rax
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Locb_enc_epilogue:
+	.byte	0xf3,0xc3
+.size	aesni_ocb_encrypt,.-aesni_ocb_encrypt
+
+.type	__ocb_encrypt6,@function
+.align	32
+__ocb_encrypt6:
+	pxor	%xmm9,%xmm15
+	movdqu	(%rbx,%r12,1),%xmm11
+	movdqa	%xmm10,%xmm12
+	movdqu	(%rbx,%r13,1),%xmm13
+	movdqa	%xmm10,%xmm14
+	pxor	%xmm15,%xmm10
+	movdqu	(%rbx,%r14,1),%xmm15
+	pxor	%xmm10,%xmm11
+	pxor	%xmm2,%xmm8
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm12
+	pxor	%xmm3,%xmm8
+	pxor	%xmm11,%xmm3
+	pxor	%xmm12,%xmm13
+	pxor	%xmm4,%xmm8
+	pxor	%xmm12,%xmm4
+	pxor	%xmm13,%xmm14
+	pxor	%xmm5,%xmm8
+	pxor	%xmm13,%xmm5
+	pxor	%xmm14,%xmm15
+	pxor	%xmm6,%xmm8
+	pxor	%xmm14,%xmm6
+	pxor	%xmm7,%xmm8
+	pxor	%xmm15,%xmm7
+	movups	32(%r11),%xmm0
+
+	leaq	1(%r8),%r12
+	leaq	3(%r8),%r13
+	leaq	5(%r8),%r14
+	addq	$6,%r8
+	pxor	%xmm9,%xmm10
+	bsfq	%r12,%r12
+	bsfq	%r13,%r13
+	bsfq	%r14,%r14
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	pxor	%xmm9,%xmm11
+	pxor	%xmm9,%xmm12
+.byte	102,15,56,220,241
+	pxor	%xmm9,%xmm13
+	pxor	%xmm9,%xmm14
+.byte	102,15,56,220,249
+	movups	48(%r11),%xmm1
+	pxor	%xmm9,%xmm15
+
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+	movups	64(%r11),%xmm0
+	shlq	$4,%r12
+	shlq	$4,%r13
+	jmp	.Locb_enc_loop6
+
+.align	32
+.Locb_enc_loop6:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Locb_enc_loop6
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+	movups	16(%r11),%xmm1
+	shlq	$4,%r14
+
+.byte	102,65,15,56,221,210
+	movdqu	(%rbx),%xmm10
+	movq	%r10,%rax
+.byte	102,65,15,56,221,219
+.byte	102,65,15,56,221,228
+.byte	102,65,15,56,221,237
+.byte	102,65,15,56,221,246
+.byte	102,65,15,56,221,255
+	.byte	0xf3,0xc3
+.size	__ocb_encrypt6,.-__ocb_encrypt6
+
+.type	__ocb_encrypt4,@function
+.align	32
+__ocb_encrypt4:
+	pxor	%xmm9,%xmm15
+	movdqu	(%rbx,%r12,1),%xmm11
+	movdqa	%xmm10,%xmm12
+	movdqu	(%rbx,%r13,1),%xmm13
+	pxor	%xmm15,%xmm10
+	pxor	%xmm10,%xmm11
+	pxor	%xmm2,%xmm8
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm12
+	pxor	%xmm3,%xmm8
+	pxor	%xmm11,%xmm3
+	pxor	%xmm12,%xmm13
+	pxor	%xmm4,%xmm8
+	pxor	%xmm12,%xmm4
+	pxor	%xmm5,%xmm8
+	pxor	%xmm13,%xmm5
+	movups	32(%r11),%xmm0
+
+	pxor	%xmm9,%xmm10
+	pxor	%xmm9,%xmm11
+	pxor	%xmm9,%xmm12
+	pxor	%xmm9,%xmm13
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	48(%r11),%xmm1
+
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movups	64(%r11),%xmm0
+	jmp	.Locb_enc_loop4
+
+.align	32
+.Locb_enc_loop4:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Locb_enc_loop4
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	16(%r11),%xmm1
+	movq	%r10,%rax
+
+.byte	102,65,15,56,221,210
+.byte	102,65,15,56,221,219
+.byte	102,65,15,56,221,228
+.byte	102,65,15,56,221,237
+	.byte	0xf3,0xc3
+.size	__ocb_encrypt4,.-__ocb_encrypt4
+
+.type	__ocb_encrypt1,@function
+.align	32
+__ocb_encrypt1:
+	pxor	%xmm15,%xmm7
+	pxor	%xmm9,%xmm7
+	pxor	%xmm2,%xmm8
+	pxor	%xmm7,%xmm2
+	movups	32(%r11),%xmm0
+
+.byte	102,15,56,220,209
+	movups	48(%r11),%xmm1
+	pxor	%xmm9,%xmm7
+
+.byte	102,15,56,220,208
+	movups	64(%r11),%xmm0
+	jmp	.Locb_enc_loop1
+
+.align	32
+.Locb_enc_loop1:
+.byte	102,15,56,220,209
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+
+.byte	102,15,56,220,208
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Locb_enc_loop1
+
+.byte	102,15,56,220,209
+	movups	16(%r11),%xmm1
+	movq	%r10,%rax
+
+.byte	102,15,56,221,215
+	.byte	0xf3,0xc3
+.size	__ocb_encrypt1,.-__ocb_encrypt1
+
+.globl	aesni_ocb_decrypt
+.hidden aesni_ocb_decrypt
+.type	aesni_ocb_decrypt,@function
+.align	32
+aesni_ocb_decrypt:
+	leaq	(%rsp),%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	movq	8(%rax),%rbx
+	movq	8+8(%rax),%rbp
+
+	movl	240(%rcx),%r10d
+	movq	%rcx,%r11
+	shll	$4,%r10d
+	movups	(%rcx),%xmm9
+	movups	16(%rcx,%r10,1),%xmm1
+
+	movdqu	(%r9),%xmm15
+	pxor	%xmm1,%xmm9
+	pxor	%xmm1,%xmm15
+
+	movl	$16+32,%eax
+	leaq	32(%r11,%r10,1),%rcx
+	movups	16(%r11),%xmm1
+	subq	%r10,%rax
+	movq	%rax,%r10
+
+	movdqu	(%rbx),%xmm10
+	movdqu	(%rbp),%xmm8
+
+	testq	$1,%r8
+	jnz	.Locb_dec_odd
+
+	bsfq	%r8,%r12
+	addq	$1,%r8
+	shlq	$4,%r12
+	movdqu	(%rbx,%r12,1),%xmm7
+	movdqu	(%rdi),%xmm2
+	leaq	16(%rdi),%rdi
+
+	call	__ocb_decrypt1
+
+	movdqa	%xmm7,%xmm15
+	movups	%xmm2,(%rsi)
+	xorps	%xmm2,%xmm8
+	leaq	16(%rsi),%rsi
+	subq	$1,%rdx
+	jz	.Locb_dec_done
+
+.Locb_dec_odd:
+	leaq	1(%r8),%r12
+	leaq	3(%r8),%r13
+	leaq	5(%r8),%r14
+	leaq	6(%r8),%r8
+	bsfq	%r12,%r12
+	bsfq	%r13,%r13
+	bsfq	%r14,%r14
+	shlq	$4,%r12
+	shlq	$4,%r13
+	shlq	$4,%r14
+
+	subq	$6,%rdx
+	jc	.Locb_dec_short
+	jmp	.Locb_dec_grandloop
+
+.align	32
+.Locb_dec_grandloop:
+	movdqu	0(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqu	32(%rdi),%xmm4
+	movdqu	48(%rdi),%xmm5
+	movdqu	64(%rdi),%xmm6
+	movdqu	80(%rdi),%xmm7
+	leaq	96(%rdi),%rdi
+
+	call	__ocb_decrypt6
+
+	movups	%xmm2,0(%rsi)
+	pxor	%xmm2,%xmm8
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm8
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm8
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm8
+	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm8
+	movups	%xmm7,80(%rsi)
+	pxor	%xmm7,%xmm8
+	leaq	96(%rsi),%rsi
+	subq	$6,%rdx
+	jnc	.Locb_dec_grandloop
+
+.Locb_dec_short:
+	addq	$6,%rdx
+	jz	.Locb_dec_done
+
+	movdqu	0(%rdi),%xmm2
+	cmpq	$2,%rdx
+	jb	.Locb_dec_one
+	movdqu	16(%rdi),%xmm3
+	je	.Locb_dec_two
+
+	movdqu	32(%rdi),%xmm4
+	cmpq	$4,%rdx
+	jb	.Locb_dec_three
+	movdqu	48(%rdi),%xmm5
+	je	.Locb_dec_four
+
+	movdqu	64(%rdi),%xmm6
+	pxor	%xmm7,%xmm7
+
+	call	__ocb_decrypt6
+
+	movdqa	%xmm14,%xmm15
+	movups	%xmm2,0(%rsi)
+	pxor	%xmm2,%xmm8
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm8
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm8
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm8
+	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm8
+
+	jmp	.Locb_dec_done
+
+.align	16
+.Locb_dec_one:
+	movdqa	%xmm10,%xmm7
+
+	call	__ocb_decrypt1
+
+	movdqa	%xmm7,%xmm15
+	movups	%xmm2,0(%rsi)
+	xorps	%xmm2,%xmm8
+	jmp	.Locb_dec_done
+
+.align	16
+.Locb_dec_two:
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+
+	call	__ocb_decrypt4
+
+	movdqa	%xmm11,%xmm15
+	movups	%xmm2,0(%rsi)
+	xorps	%xmm2,%xmm8
+	movups	%xmm3,16(%rsi)
+	xorps	%xmm3,%xmm8
+
+	jmp	.Locb_dec_done
+
+.align	16
+.Locb_dec_three:
+	pxor	%xmm5,%xmm5
+
+	call	__ocb_decrypt4
+
+	movdqa	%xmm12,%xmm15
+	movups	%xmm2,0(%rsi)
+	xorps	%xmm2,%xmm8
+	movups	%xmm3,16(%rsi)
+	xorps	%xmm3,%xmm8
+	movups	%xmm4,32(%rsi)
+	xorps	%xmm4,%xmm8
+
+	jmp	.Locb_dec_done
+
+.align	16
+.Locb_dec_four:
+	call	__ocb_decrypt4
+
+	movdqa	%xmm13,%xmm15
+	movups	%xmm2,0(%rsi)
+	pxor	%xmm2,%xmm8
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm8
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm8
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm8
+
+.Locb_dec_done:
+	pxor	%xmm0,%xmm15
+	movdqu	%xmm8,(%rbp)
+	movdqu	%xmm15,(%r9)
+
+	xorps	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
+	pxor	%xmm10,%xmm10
+	pxor	%xmm11,%xmm11
+	pxor	%xmm12,%xmm12
+	pxor	%xmm13,%xmm13
+	pxor	%xmm14,%xmm14
+	pxor	%xmm15,%xmm15
+	leaq	40(%rsp),%rax
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+.Locb_dec_epilogue:
+	.byte	0xf3,0xc3
+.size	aesni_ocb_decrypt,.-aesni_ocb_decrypt
+
+.type	__ocb_decrypt6,@function
+.align	32
+__ocb_decrypt6:
+	pxor	%xmm9,%xmm15
+	movdqu	(%rbx,%r12,1),%xmm11
+	movdqa	%xmm10,%xmm12
+	movdqu	(%rbx,%r13,1),%xmm13
+	movdqa	%xmm10,%xmm14
+	pxor	%xmm15,%xmm10
+	movdqu	(%rbx,%r14,1),%xmm15
+	pxor	%xmm10,%xmm11
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm12
+	pxor	%xmm11,%xmm3
+	pxor	%xmm12,%xmm13
+	pxor	%xmm12,%xmm4
+	pxor	%xmm13,%xmm14
+	pxor	%xmm13,%xmm5
+	pxor	%xmm14,%xmm15
+	pxor	%xmm14,%xmm6
+	pxor	%xmm15,%xmm7
+	movups	32(%r11),%xmm0
+
+	leaq	1(%r8),%r12
+	leaq	3(%r8),%r13
+	leaq	5(%r8),%r14
+	addq	$6,%r8
+	pxor	%xmm9,%xmm10
+	bsfq	%r12,%r12
+	bsfq	%r13,%r13
+	bsfq	%r14,%r14
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	pxor	%xmm9,%xmm11
+	pxor	%xmm9,%xmm12
+.byte	102,15,56,222,241
+	pxor	%xmm9,%xmm13
+	pxor	%xmm9,%xmm14
+.byte	102,15,56,222,249
+	movups	48(%r11),%xmm1
+	pxor	%xmm9,%xmm15
+
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+	movups	64(%r11),%xmm0
+	shlq	$4,%r12
+	shlq	$4,%r13
+	jmp	.Locb_dec_loop6
+
+.align	32
+.Locb_dec_loop6:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Locb_dec_loop6
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+	movups	16(%r11),%xmm1
+	shlq	$4,%r14
+
+.byte	102,65,15,56,223,210
+	movdqu	(%rbx),%xmm10
+	movq	%r10,%rax
+.byte	102,65,15,56,223,219
+.byte	102,65,15,56,223,228
+.byte	102,65,15,56,223,237
+.byte	102,65,15,56,223,246
+.byte	102,65,15,56,223,255
+	.byte	0xf3,0xc3
+.size	__ocb_decrypt6,.-__ocb_decrypt6
+
+.type	__ocb_decrypt4,@function
+.align	32
+__ocb_decrypt4:
+	pxor	%xmm9,%xmm15
+	movdqu	(%rbx,%r12,1),%xmm11
+	movdqa	%xmm10,%xmm12
+	movdqu	(%rbx,%r13,1),%xmm13
+	pxor	%xmm15,%xmm10
+	pxor	%xmm10,%xmm11
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm12
+	pxor	%xmm11,%xmm3
+	pxor	%xmm12,%xmm13
+	pxor	%xmm12,%xmm4
+	pxor	%xmm13,%xmm5
+	movups	32(%r11),%xmm0
+
+	pxor	%xmm9,%xmm10
+	pxor	%xmm9,%xmm11
+	pxor	%xmm9,%xmm12
+	pxor	%xmm9,%xmm13
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	movups	48(%r11),%xmm1
+
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+	movups	64(%r11),%xmm0
+	jmp	.Locb_dec_loop4
+
+.align	32
+.Locb_dec_loop4:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Locb_dec_loop4
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	movups	16(%r11),%xmm1
+	movq	%r10,%rax
+
+.byte	102,65,15,56,223,210
+.byte	102,65,15,56,223,219
+.byte	102,65,15,56,223,228
+.byte	102,65,15,56,223,237
+	.byte	0xf3,0xc3
+.size	__ocb_decrypt4,.-__ocb_decrypt4
+
+.type	__ocb_decrypt1,@function
+.align	32
+__ocb_decrypt1:
+	pxor	%xmm15,%xmm7
+	pxor	%xmm9,%xmm7
+	pxor	%xmm7,%xmm2
+	movups	32(%r11),%xmm0
+
+.byte	102,15,56,222,209
+	movups	48(%r11),%xmm1
+	pxor	%xmm9,%xmm7
+
+.byte	102,15,56,222,208
+	movups	64(%r11),%xmm0
+	jmp	.Locb_dec_loop1
+
+.align	32
+.Locb_dec_loop1:
+.byte	102,15,56,222,209
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+
+.byte	102,15,56,222,208
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	.Locb_dec_loop1
+
+.byte	102,15,56,222,209
+	movups	16(%r11),%xmm1
+	movq	%r10,%rax
+
+.byte	102,15,56,223,215
+	.byte	0xf3,0xc3
+.size	__ocb_decrypt1,.-__ocb_decrypt1
 .globl	aesni_cbc_encrypt
 .hidden aesni_cbc_encrypt
 .type	aesni_cbc_encrypt,@function
@@ -2638,11 +3451,11 @@
 	jmp	.Lcbc_ret
 .align	16
 .Lcbc_decrypt_bulk:
-	leaq	(%rsp),%rax
+	leaq	(%rsp),%r11
 	pushq	%rbp
 	subq	$16,%rsp
 	andq	$-16,%rsp
-	leaq	-8(%rax),%rbp
+	movq	%rcx,%rbp
 	movups	(%r8),%xmm10
 	movl	%r10d,%eax
 	cmpq	$0x50,%rdx
@@ -2682,7 +3495,7 @@
 	pxor	%xmm0,%xmm3
 	movups	16-112(%rcx),%xmm1
 	pxor	%xmm0,%xmm4
-	xorq	%r11,%r11
+	movq	$-1,%rbp
 	cmpq	$0x70,%rdx
 	pxor	%xmm0,%xmm5
 	pxor	%xmm0,%xmm6
@@ -2698,10 +3511,10 @@
 .byte	102,15,56,222,241
 .byte	102,15,56,222,249
 .byte	102,68,15,56,222,193
-	setnc	%r11b
-	shlq	$7,%r11
+	adcq	$0,%rbp
+	andq	$128,%rbp
 .byte	102,68,15,56,222,201
-	addq	%rdi,%r11
+	addq	%rdi,%rbp
 	movups	48-112(%rcx),%xmm1
 .byte	102,15,56,222,208
 .byte	102,15,56,222,216
@@ -2839,18 +3652,18 @@
 	movdqu	112(%rdi),%xmm0
 .byte	102,65,15,56,223,228
 	leaq	128(%rdi),%rdi
-	movdqu	0(%r11),%xmm11
+	movdqu	0(%rbp),%xmm11
 .byte	102,65,15,56,223,237
 .byte	102,65,15,56,223,246
-	movdqu	16(%r11),%xmm12
-	movdqu	32(%r11),%xmm13
+	movdqu	16(%rbp),%xmm12
+	movdqu	32(%rbp),%xmm13
 .byte	102,65,15,56,223,255
 .byte	102,68,15,56,223,193
-	movdqu	48(%r11),%xmm14
-	movdqu	64(%r11),%xmm15
+	movdqu	48(%rbp),%xmm14
+	movdqu	64(%rbp),%xmm15
 .byte	102,69,15,56,223,202
 	movdqa	%xmm0,%xmm10
-	movdqu	80(%r11),%xmm1
+	movdqu	80(%rbp),%xmm1
 	movups	-112(%rcx),%xmm0
 
 	movups	%xmm2,(%rsi)
@@ -2969,7 +3782,7 @@
 	pxor	%xmm13,%xmm5
 	movdqu	%xmm4,32(%rsi)
 	pxor	%xmm14,%xmm6
-	movq	%r11,%rcx
+	movq	%rbp,%rcx
 	movdqu	%xmm5,48(%rsi)
 	pxor	%xmm15,%xmm7
 	movl	%r10d,%eax
@@ -3122,8 +3935,8 @@
 .Lcbc_dec_ret:
 	xorps	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
-	leaq	(%rbp),%rsp
-	popq	%rbp
+	movq	-8(%r11),%rbp
+	leaq	(%r11),%rsp
 .Lcbc_ret:
 	.byte	0xf3,0xc3
 .size	aesni_cbc_encrypt,.-aesni_cbc_encrypt
diff --git a/linux-x86_64/crypto/aes/bsaes-x86_64.S b/linux-x86_64/crypto/aes/bsaes-x86_64.S
index c5491ce..3f3c73b 100644
--- a/linux-x86_64/crypto/aes/bsaes-x86_64.S
+++ b/linux-x86_64/crypto/aes/bsaes-x86_64.S
@@ -1305,15 +1305,14 @@
 	cmpq	%rax,%rbp
 	ja	.Lcbc_dec_bzero
 
-	leaq	(%rbp),%rsp
-	movq	72(%rsp),%r15
-	movq	80(%rsp),%r14
-	movq	88(%rsp),%r13
-	movq	96(%rsp),%r12
-	movq	104(%rsp),%rbx
-	movq	112(%rsp),%rax
-	leaq	120(%rsp),%rsp
-	movq	%rax,%rbp
+	leaq	120(%rbp),%rax
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbx
+	movq	-8(%rax),%rbp
+	leaq	(%rax),%rsp
 .Lcbc_dec_epilogue:
 	.byte	0xf3,0xc3
 .size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
@@ -1506,15 +1505,14 @@
 	cmpq	%rax,%rbp
 	ja	.Lctr_enc_bzero
 
-	leaq	(%rbp),%rsp
-	movq	72(%rsp),%r15
-	movq	80(%rsp),%r14
-	movq	88(%rsp),%r13
-	movq	96(%rsp),%r12
-	movq	104(%rsp),%rbx
-	movq	112(%rsp),%rax
-	leaq	120(%rsp),%rsp
-	movq	%rax,%rbp
+	leaq	120(%rbp),%rax
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbx
+	movq	-8(%rax),%rbp
+	leaq	(%rax),%rsp
 .Lctr_enc_epilogue:
 	.byte	0xf3,0xc3
 .size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
@@ -1958,15 +1956,14 @@
 	cmpq	%rax,%rbp
 	ja	.Lxts_enc_bzero
 
-	leaq	(%rbp),%rsp
-	movq	72(%rsp),%r15
-	movq	80(%rsp),%r14
-	movq	88(%rsp),%r13
-	movq	96(%rsp),%r12
-	movq	104(%rsp),%rbx
-	movq	112(%rsp),%rax
-	leaq	120(%rsp),%rsp
-	movq	%rax,%rbp
+	leaq	120(%rbp),%rax
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbx
+	movq	-8(%rax),%rbp
+	leaq	(%rax),%rsp
 .Lxts_enc_epilogue:
 	.byte	0xf3,0xc3
 .size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
@@ -2437,15 +2434,14 @@
 	cmpq	%rax,%rbp
 	ja	.Lxts_dec_bzero
 
-	leaq	(%rbp),%rsp
-	movq	72(%rsp),%r15
-	movq	80(%rsp),%r14
-	movq	88(%rsp),%r13
-	movq	96(%rsp),%r12
-	movq	104(%rsp),%rbx
-	movq	112(%rsp),%rax
-	leaq	120(%rsp),%rsp
-	movq	%rax,%rbp
+	leaq	120(%rbp),%rax
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbx
+	movq	-8(%rax),%rbp
+	leaq	(%rax),%rsp
 .Lxts_dec_epilogue:
 	.byte	0xf3,0xc3
 .size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
diff --git a/linux-x86_64/crypto/bn/x86_64-mont.S b/linux-x86_64/crypto/bn/x86_64-mont.S
index 83926ad..0d2cea2 100644
--- a/linux-x86_64/crypto/bn/x86_64-mont.S
+++ b/linux-x86_64/crypto/bn/x86_64-mont.S
@@ -9,6 +9,10 @@
 .type	bn_mul_mont,@function
 .align	16
 bn_mul_mont:
+.cfi_startproc	
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
 	testl	$3,%r9d
 	jnz	.Lmul_enter
 	cmpl	$8,%r9d
@@ -22,20 +26,50 @@
 .align	16
 .Lmul_enter:
 	pushq	%rbx
+.cfi_offset	%rbx,-16
 	pushq	%rbp
+.cfi_offset	%rbp,-24
 	pushq	%r12
+.cfi_offset	%r12,-32
 	pushq	%r13
+.cfi_offset	%r13,-40
 	pushq	%r14
+.cfi_offset	%r14,-48
 	pushq	%r15
+.cfi_offset	%r15,-56
 
-	movl	%r9d,%r9d
-	leaq	2(%r9),%r10
+	negq	%r9
 	movq	%rsp,%r11
-	negq	%r10
-	leaq	(%rsp,%r10,8),%rsp
-	andq	$-1024,%rsp
+	leaq	-16(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
 
-	movq	%r11,8(%rsp,%r9,8)
+
+
+
+
+
+
+
+
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+	jmp	.Lmul_page_walk_done
+
+.align	16
+.Lmul_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+.Lmul_page_walk_done:
+
+	movq	%rax,8(%rsp,%r9,8)
+.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
 .Lmul_body:
 	movq	%rdx,%r12
 	movq	(%r8),%r8
@@ -187,51 +221,86 @@
 
 	sbbq	$0,%rax
 	xorq	%r14,%r14
+	andq	%rax,%rsi
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx
 	movq	%r9,%r15
+	orq	%rcx,%rsi
 .align	16
 .Lcopy:
-	movq	(%rsp,%r14,8),%rsi
-	movq	(%rdi,%r14,8),%rcx
-	xorq	%rcx,%rsi
-	andq	%rax,%rsi
-	xorq	%rcx,%rsi
+	movq	(%rsi,%r14,8),%rax
 	movq	%r14,(%rsp,%r14,8)
-	movq	%rsi,(%rdi,%r14,8)
+	movq	%rax,(%rdi,%r14,8)
 	leaq	1(%r14),%r14
 	subq	$1,%r15
 	jnz	.Lcopy
 
 	movq	8(%rsp,%r9,8),%rsi
+.cfi_def_cfa	%rsi,8
 	movq	$1,%rax
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lmul_epilogue:
 	.byte	0xf3,0xc3
+.cfi_endproc	
 .size	bn_mul_mont,.-bn_mul_mont
 .type	bn_mul4x_mont,@function
 .align	16
 bn_mul4x_mont:
+.cfi_startproc	
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
 .Lmul4x_enter:
 	pushq	%rbx
+.cfi_offset	%rbx,-16
 	pushq	%rbp
+.cfi_offset	%rbp,-24
 	pushq	%r12
+.cfi_offset	%r12,-32
 	pushq	%r13
+.cfi_offset	%r13,-40
 	pushq	%r14
+.cfi_offset	%r14,-48
 	pushq	%r15
+.cfi_offset	%r15,-56
 
-	movl	%r9d,%r9d
-	leaq	4(%r9),%r10
+	negq	%r9
 	movq	%rsp,%r11
-	negq	%r10
-	leaq	(%rsp,%r10,8),%rsp
-	andq	$-1024,%rsp
+	leaq	-32(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
 
-	movq	%r11,8(%rsp,%r9,8)
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul4x_page_walk
+	jmp	.Lmul4x_page_walk_done
+
+.Lmul4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+
+	movq	%rax,8(%rsp,%r9,8)
+.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
 .Lmul4x_body:
 	movq	%rdi,16(%rsp,%r9,8)
 	movq	%rdx,%r12
@@ -531,9 +600,11 @@
 	cmpq	%r9,%r14
 	jb	.Louter4x
 	movq	16(%rsp,%r9,8),%rdi
+	leaq	-4(%r9),%r15
 	movq	0(%rsp),%rax
+	pxor	%xmm0,%xmm0
 	movq	8(%rsp),%rdx
-	shrq	$2,%r9
+	shrq	$2,%r15
 	leaq	(%rsp),%rsi
 	xorq	%r14,%r14
 
@@ -541,7 +612,6 @@
 	movq	16(%rsi),%rbx
 	movq	24(%rsi),%rbp
 	sbbq	8(%rcx),%rdx
-	leaq	-1(%r9),%r15
 	jmp	.Lsub4x
 .align	16
 .Lsub4x:
@@ -569,47 +639,55 @@
 	movq	%rbx,16(%rdi,%r14,8)
 
 	sbbq	$0,%rax
-	movq	%rax,%xmm0
-	punpcklqdq	%xmm0,%xmm0
 	movq	%rbp,24(%rdi,%r14,8)
 	xorq	%r14,%r14
+	andq	%rax,%rsi
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx
+	leaq	-4(%r9),%r15
+	orq	%rcx,%rsi
+	shrq	$2,%r15
 
-	movq	%r9,%r15
-	pxor	%xmm5,%xmm5
+	movdqu	(%rsi),%xmm1
+	movdqa	%xmm0,(%rsp)
+	movdqu	%xmm1,(%rdi)
 	jmp	.Lcopy4x
 .align	16
 .Lcopy4x:
-	movdqu	(%rsp,%r14,1),%xmm2
-	movdqu	16(%rsp,%r14,1),%xmm4
-	movdqu	(%rdi,%r14,1),%xmm1
-	movdqu	16(%rdi,%r14,1),%xmm3
-	pxor	%xmm1,%xmm2
-	pxor	%xmm3,%xmm4
-	pand	%xmm0,%xmm2
-	pand	%xmm0,%xmm4
-	pxor	%xmm1,%xmm2
-	pxor	%xmm3,%xmm4
-	movdqu	%xmm2,(%rdi,%r14,1)
-	movdqu	%xmm4,16(%rdi,%r14,1)
-	movdqa	%xmm5,(%rsp,%r14,1)
-	movdqa	%xmm5,16(%rsp,%r14,1)
-
+	movdqu	16(%rsi,%r14,1),%xmm2
+	movdqu	32(%rsi,%r14,1),%xmm1
+	movdqa	%xmm0,16(%rsp,%r14,1)
+	movdqu	%xmm2,16(%rdi,%r14,1)
+	movdqa	%xmm0,32(%rsp,%r14,1)
+	movdqu	%xmm1,32(%rdi,%r14,1)
 	leaq	32(%r14),%r14
 	decq	%r15
 	jnz	.Lcopy4x
 
-	shlq	$2,%r9
+	movdqu	16(%rsi,%r14,1),%xmm2
+	movdqa	%xmm0,16(%rsp,%r14,1)
+	movdqu	%xmm2,16(%rdi,%r14,1)
 	movq	8(%rsp,%r9,8),%rsi
+.cfi_def_cfa	%rsi, 8
 	movq	$1,%rax
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+.cfi_restore	%r15
+	movq	-40(%rsi),%r14
+.cfi_restore	%r14
+	movq	-32(%rsi),%r13
+.cfi_restore	%r13
+	movq	-24(%rsi),%r12
+.cfi_restore	%r12
+	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lmul4x_epilogue:
 	.byte	0xf3,0xc3
+.cfi_endproc	
 .size	bn_mul4x_mont,.-bn_mul4x_mont
 .extern	bn_sqr8x_internal
 .hidden bn_sqr8x_internal
@@ -617,14 +695,23 @@
 .type	bn_sqr8x_mont,@function
 .align	32
 bn_sqr8x_mont:
-.Lsqr8x_enter:
+.cfi_startproc	
 	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+.Lsqr8x_enter:
 	pushq	%rbx
+.cfi_offset	%rbx,-16
 	pushq	%rbp
+.cfi_offset	%rbp,-24
 	pushq	%r12
+.cfi_offset	%r12,-32
 	pushq	%r13
+.cfi_offset	%r13,-40
 	pushq	%r14
+.cfi_offset	%r14,-48
 	pushq	%r15
+.cfi_offset	%r15,-56
+.Lsqr8x_prologue:
 
 	movl	%r9d,%r10d
 	shll	$3,%r9d
@@ -637,30 +724,49 @@
 
 
 	leaq	-64(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
 	movq	(%r8),%r8
 	subq	%rsi,%r11
 	andq	$4095,%r11
 	cmpq	%r11,%r10
 	jb	.Lsqr8x_sp_alt
-	subq	%r11,%rsp
-	leaq	-64(%rsp,%r9,2),%rsp
+	subq	%r11,%rbp
+	leaq	-64(%rbp,%r9,2),%rbp
 	jmp	.Lsqr8x_sp_done
 
 .align	32
 .Lsqr8x_sp_alt:
 	leaq	4096-64(,%r9,2),%r10
-	leaq	-64(%rsp,%r9,2),%rsp
+	leaq	-64(%rbp,%r9,2),%rbp
 	subq	%r10,%r11
 	movq	$0,%r10
 	cmovcq	%r10,%r11
-	subq	%r11,%rsp
+	subq	%r11,%rbp
 .Lsqr8x_sp_done:
-	andq	$-64,%rsp
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lsqr8x_page_walk
+	jmp	.Lsqr8x_page_walk_done
+
+.align	16
+.Lsqr8x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lsqr8x_page_walk
+.Lsqr8x_page_walk_done:
+
 	movq	%r9,%r10
 	negq	%r9
 
 	movq	%r8,32(%rsp)
 	movq	%rax,40(%rsp)
+.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
 .Lsqr8x_body:
 
 .byte	102,72,15,110,209
@@ -707,6 +813,7 @@
 	pxor	%xmm0,%xmm0
 	pshufd	$0,%xmm1,%xmm1
 	movq	40(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
 	jmp	.Lsqr8x_cond_copy
 
 .align	32
@@ -736,14 +843,22 @@
 
 	movq	$1,%rax
 	movq	-48(%rsi),%r15
+.cfi_restore	%r15
 	movq	-40(%rsi),%r14
+.cfi_restore	%r14
 	movq	-32(%rsi),%r13
+.cfi_restore	%r13
 	movq	-24(%rsi),%r12
+.cfi_restore	%r12
 	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lsqr8x_epilogue:
 	.byte	0xf3,0xc3
+.cfi_endproc	
 .size	bn_sqr8x_mont,.-bn_sqr8x_mont
 .byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align	16
diff --git a/linux-x86_64/crypto/bn/x86_64-mont5.S b/linux-x86_64/crypto/bn/x86_64-mont5.S
index 5d7502c..33ca3c4 100644
--- a/linux-x86_64/crypto/bn/x86_64-mont5.S
+++ b/linux-x86_64/crypto/bn/x86_64-mont5.S
@@ -9,30 +9,64 @@
 .type	bn_mul_mont_gather5,@function
 .align	64
 bn_mul_mont_gather5:
+.cfi_startproc	
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
 	testl	$7,%r9d
 	jnz	.Lmul_enter
 	jmp	.Lmul4x_enter
 
 .align	16
 .Lmul_enter:
-	movl	%r9d,%r9d
-	movq	%rsp,%rax
 	movd	8(%rsp),%xmm5
-	leaq	.Linc(%rip),%r10
 	pushq	%rbx
+.cfi_offset	%rbx,-16
 	pushq	%rbp
+.cfi_offset	%rbp,-24
 	pushq	%r12
+.cfi_offset	%r12,-32
 	pushq	%r13
+.cfi_offset	%r13,-40
 	pushq	%r14
+.cfi_offset	%r14,-48
 	pushq	%r15
+.cfi_offset	%r15,-56
 
-	leaq	2(%r9),%r11
-	negq	%r11
-	leaq	-264(%rsp,%r11,8),%rsp
-	andq	$-1024,%rsp
+	negq	%r9
+	movq	%rsp,%r11
+	leaq	-280(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
 
+
+
+
+
+
+
+
+
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+	jmp	.Lmul_page_walk_done
+
+.Lmul_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	.Lmul_page_walk
+.Lmul_page_walk_done:
+
+	leaq	.Linc(%rip),%r10
 	movq	%rax,8(%rsp,%r9,8)
+.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
 .Lmul_body:
+
 	leaq	128(%rdx),%r12
 	movdqa	0(%r10),%xmm0
 	movdqa	16(%r10),%xmm1
@@ -371,45 +405,64 @@
 
 	sbbq	$0,%rax
 	xorq	%r14,%r14
+	andq	%rax,%rsi
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx
 	movq	%r9,%r15
+	orq	%rcx,%rsi
 .align	16
 .Lcopy:
-	movq	(%rsp,%r14,8),%rsi
-	movq	(%rdi,%r14,8),%rcx
-	xorq	%rcx,%rsi
-	andq	%rax,%rsi
-	xorq	%rcx,%rsi
+	movq	(%rsi,%r14,8),%rax
 	movq	%r14,(%rsp,%r14,8)
-	movq	%rsi,(%rdi,%r14,8)
+	movq	%rax,(%rdi,%r14,8)
 	leaq	1(%r14),%r14
 	subq	$1,%r15
 	jnz	.Lcopy
 
 	movq	8(%rsp,%r9,8),%rsi
+.cfi_def_cfa	%rsi,8
 	movq	$1,%rax
 
 	movq	-48(%rsi),%r15
+.cfi_restore	%r15
 	movq	-40(%rsi),%r14
+.cfi_restore	%r14
 	movq	-32(%rsi),%r13
+.cfi_restore	%r13
 	movq	-24(%rsi),%r12
+.cfi_restore	%r12
 	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lmul_epilogue:
 	.byte	0xf3,0xc3
+.cfi_endproc	
 .size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
 .type	bn_mul4x_mont_gather5,@function
 .align	32
 bn_mul4x_mont_gather5:
-.Lmul4x_enter:
+.cfi_startproc	
 .byte	0x67
 	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
+.Lmul4x_enter:
 	pushq	%rbx
+.cfi_offset	%rbx,-16
 	pushq	%rbp
+.cfi_offset	%rbp,-24
 	pushq	%r12
+.cfi_offset	%r12,-32
 	pushq	%r13
+.cfi_offset	%r13,-40
 	pushq	%r14
+.cfi_offset	%r14,-48
 	pushq	%r15
+.cfi_offset	%r15,-56
+.Lmul4x_prologue:
 
 .byte	0x67
 	shll	$3,%r9d
@@ -426,43 +479,70 @@
 
 
 	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
 	subq	%rdi,%r11
 	andq	$4095,%r11
 	cmpq	%r11,%r10
 	jb	.Lmul4xsp_alt
-	subq	%r11,%rsp
-	leaq	-320(%rsp,%r9,2),%rsp
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
 	jmp	.Lmul4xsp_done
 
 .align	32
 .Lmul4xsp_alt:
 	leaq	4096-320(,%r9,2),%r10
-	leaq	-320(%rsp,%r9,2),%rsp
+	leaq	-320(%rbp,%r9,2),%rbp
 	subq	%r10,%r11
 	movq	$0,%r10
 	cmovcq	%r10,%r11
-	subq	%r11,%rsp
+	subq	%r11,%rbp
 .Lmul4xsp_done:
-	andq	$-64,%rsp
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmul4x_page_walk
+	jmp	.Lmul4x_page_walk_done
+
+.Lmul4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+
 	negq	%r9
 
 	movq	%rax,40(%rsp)
+.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
 .Lmul4x_body:
 
 	call	mul4x_internal
 
 	movq	40(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
 	movq	$1,%rax
 
 	movq	-48(%rsi),%r15
+.cfi_restore	%r15
 	movq	-40(%rsi),%r14
+.cfi_restore	%r14
 	movq	-32(%rsi),%r13
+.cfi_restore	%r13
 	movq	-24(%rsi),%r12
+.cfi_restore	%r12
 	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lmul4x_epilogue:
 	.byte	0xf3,0xc3
+.cfi_endproc	
 .size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
 
 .type	mul4x_internal,@function
@@ -995,13 +1075,22 @@
 .type	bn_power5,@function
 .align	32
 bn_power5:
+.cfi_startproc	
 	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
 	pushq	%rbx
+.cfi_offset	%rbx,-16
 	pushq	%rbp
+.cfi_offset	%rbp,-24
 	pushq	%r12
+.cfi_offset	%r12,-32
 	pushq	%r13
+.cfi_offset	%r13,-40
 	pushq	%r14
+.cfi_offset	%r14,-48
 	pushq	%r15
+.cfi_offset	%r15,-56
+.Lpower5_prologue:
 
 	shll	$3,%r9d
 	leal	(%r9,%r9,2),%r10d
@@ -1016,24 +1105,41 @@
 
 
 	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
 	subq	%rdi,%r11
 	andq	$4095,%r11
 	cmpq	%r11,%r10
 	jb	.Lpwr_sp_alt
-	subq	%r11,%rsp
-	leaq	-320(%rsp,%r9,2),%rsp
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
 	jmp	.Lpwr_sp_done
 
 .align	32
 .Lpwr_sp_alt:
 	leaq	4096-320(,%r9,2),%r10
-	leaq	-320(%rsp,%r9,2),%rsp
+	leaq	-320(%rbp,%r9,2),%rbp
 	subq	%r10,%r11
 	movq	$0,%r10
 	cmovcq	%r10,%r11
-	subq	%r11,%rsp
+	subq	%r11,%rbp
 .Lpwr_sp_done:
-	andq	$-64,%rsp
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lpwr_page_walk
+	jmp	.Lpwr_page_walk_done
+
+.Lpwr_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lpwr_page_walk
+.Lpwr_page_walk_done:
+
 	movq	%r9,%r10
 	negq	%r9
 
@@ -1048,6 +1154,7 @@
 
 	movq	%r8,32(%rsp)
 	movq	%rax,40(%rsp)
+.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
 .Lpower5_body:
 .byte	102,72,15,110,207
 .byte	102,72,15,110,209
@@ -1074,16 +1181,25 @@
 	call	mul4x_internal
 
 	movq	40(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
 	movq	$1,%rax
 	movq	-48(%rsi),%r15
+.cfi_restore	%r15
 	movq	-40(%rsi),%r14
+.cfi_restore	%r14
 	movq	-32(%rsi),%r13
+.cfi_restore	%r13
 	movq	-24(%rsi),%r12
+.cfi_restore	%r12
 	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lpower5_epilogue:
 	.byte	0xf3,0xc3
+.cfi_endproc	
 .size	bn_power5,.-bn_power5
 
 .globl	bn_sqr8x_internal
@@ -1936,14 +2052,23 @@
 .type	bn_from_mont8x,@function
 .align	32
 bn_from_mont8x:
+.cfi_startproc	
 .byte	0x67
 	movq	%rsp,%rax
+.cfi_def_cfa_register	%rax
 	pushq	%rbx
+.cfi_offset	%rbx,-16
 	pushq	%rbp
+.cfi_offset	%rbp,-24
 	pushq	%r12
+.cfi_offset	%r12,-32
 	pushq	%r13
+.cfi_offset	%r13,-40
 	pushq	%r14
+.cfi_offset	%r14,-48
 	pushq	%r15
+.cfi_offset	%r15,-56
+.Lfrom_prologue:
 
 	shll	$3,%r9d
 	leaq	(%r9,%r9,2),%r10
@@ -1958,24 +2083,41 @@
 
 
 	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
 	subq	%rdi,%r11
 	andq	$4095,%r11
 	cmpq	%r11,%r10
 	jb	.Lfrom_sp_alt
-	subq	%r11,%rsp
-	leaq	-320(%rsp,%r9,2),%rsp
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
 	jmp	.Lfrom_sp_done
 
 .align	32
 .Lfrom_sp_alt:
 	leaq	4096-320(,%r9,2),%r10
-	leaq	-320(%rsp,%r9,2),%rsp
+	leaq	-320(%rbp,%r9,2),%rbp
 	subq	%r10,%r11
 	movq	$0,%r10
 	cmovcq	%r10,%r11
-	subq	%r11,%rsp
+	subq	%r11,%rbp
 .Lfrom_sp_done:
-	andq	$-64,%rsp
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lfrom_page_walk
+	jmp	.Lfrom_page_walk_done
+
+.Lfrom_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	.Lfrom_page_walk
+.Lfrom_page_walk_done:
+
 	movq	%r9,%r10
 	negq	%r9
 
@@ -1990,6 +2132,7 @@
 
 	movq	%r8,32(%rsp)
 	movq	%rax,40(%rsp)
+.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
 .Lfrom_body:
 	movq	%r9,%r11
 	leaq	48(%rsp),%rax
@@ -2025,11 +2168,12 @@
 
 	pxor	%xmm0,%xmm0
 	leaq	48(%rsp),%rax
-	movq	40(%rsp),%rsi
 	jmp	.Lfrom_mont_zero
 
 .align	32
 .Lfrom_mont_zero:
+	movq	40(%rsp),%rsi
+.cfi_def_cfa	%rsi,8
 	movdqa	%xmm0,0(%rax)
 	movdqa	%xmm0,16(%rax)
 	movdqa	%xmm0,32(%rax)
@@ -2040,14 +2184,22 @@
 
 	movq	$1,%rax
 	movq	-48(%rsi),%r15
+.cfi_restore	%r15
 	movq	-40(%rsi),%r14
+.cfi_restore	%r14
 	movq	-32(%rsi),%r13
+.cfi_restore	%r13
 	movq	-24(%rsi),%r12
+.cfi_restore	%r12
 	movq	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	movq	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	leaq	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lfrom_epilogue:
 	.byte	0xf3,0xc3
+.cfi_endproc	
 .size	bn_from_mont8x,.-bn_from_mont8x
 .globl	bn_scatter5
 .hidden bn_scatter5
diff --git a/linux-x86_64/crypto/chacha/chacha-x86_64.S b/linux-x86_64/crypto/chacha/chacha-x86_64.S
index e994940..25ec715 100644
--- a/linux-x86_64/crypto/chacha/chacha-x86_64.S
+++ b/linux-x86_64/crypto/chacha/chacha-x86_64.S
@@ -23,6 +23,15 @@
 .byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
 .Lsigma:
 .byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.align	64
+.Lzeroz:
+.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+.Lfourz:
+.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+.Lincz:
+.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.Lsixteen:
+.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
 .byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .globl	ChaCha20_ctr32
 .hidden ChaCha20_ctr32
@@ -42,6 +51,7 @@
 	pushq	%r14
 	pushq	%r15
 	subq	$64+24,%rsp
+.Lctr32_body:
 
 
 	movdqu	(%rcx),%xmm1
@@ -279,13 +289,14 @@
 	jnz	.Loop_tail
 
 .Ldone:
-	addq	$64+24,%rsp
-	popq	%r15
-	popq	%r14
-	popq	%r13
-	popq	%r12
-	popq	%rbp
-	popq	%rbx
+	leaq	64+24+48(%rsp),%rsi
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 .Lno_data:
 	.byte	0xf3,0xc3
 .size	ChaCha20_ctr32,.-ChaCha20_ctr32
@@ -293,18 +304,12 @@
 .align	32
 ChaCha20_ssse3:
 .LChaCha20_ssse3:
+	movq	%rsp,%r9
 	cmpq	$128,%rdx
 	ja	.LChaCha20_4x
 
 .Ldo_sse3_after_all:
-	pushq	%rbx
-	pushq	%rbp
-	pushq	%r12
-	pushq	%r13
-	pushq	%r14
-	pushq	%r15
-
-	subq	$64+24,%rsp
+	subq	$64+8,%rsp
 	movdqa	.Lsigma(%rip),%xmm0
 	movdqu	(%rcx),%xmm1
 	movdqu	16(%rcx),%xmm2
@@ -316,7 +321,7 @@
 	movdqa	%xmm1,16(%rsp)
 	movdqa	%xmm2,32(%rsp)
 	movdqa	%xmm3,48(%rsp)
-	movl	$10,%ebp
+	movq	$10,%r8
 	jmp	.Loop_ssse3
 
 .align	32
@@ -326,7 +331,7 @@
 	movdqa	16(%rsp),%xmm1
 	movdqa	32(%rsp),%xmm2
 	paddd	48(%rsp),%xmm3
-	movl	$10,%ebp
+	movq	$10,%r8
 	movdqa	%xmm3,48(%rsp)
 	jmp	.Loop_ssse3
 
@@ -375,7 +380,7 @@
 	pshufd	$78,%xmm2,%xmm2
 	pshufd	$147,%xmm1,%xmm1
 	pshufd	$57,%xmm3,%xmm3
-	decl	%ebp
+	decq	%r8
 	jnz	.Loop_ssse3
 	paddd	0(%rsp),%xmm0
 	paddd	16(%rsp),%xmm1
@@ -412,31 +417,27 @@
 	movdqa	%xmm1,16(%rsp)
 	movdqa	%xmm2,32(%rsp)
 	movdqa	%xmm3,48(%rsp)
-	xorq	%rbx,%rbx
+	xorq	%r8,%r8
 
 .Loop_tail_ssse3:
-	movzbl	(%rsi,%rbx,1),%eax
-	movzbl	(%rsp,%rbx,1),%ecx
-	leaq	1(%rbx),%rbx
+	movzbl	(%rsi,%r8,1),%eax
+	movzbl	(%rsp,%r8,1),%ecx
+	leaq	1(%r8),%r8
 	xorl	%ecx,%eax
-	movb	%al,-1(%rdi,%rbx,1)
+	movb	%al,-1(%rdi,%r8,1)
 	decq	%rdx
 	jnz	.Loop_tail_ssse3
 
 .Ldone_ssse3:
-	addq	$64+24,%rsp
-	popq	%r15
-	popq	%r14
-	popq	%r13
-	popq	%r12
-	popq	%rbp
-	popq	%rbx
+	leaq	(%r9),%rsp
+.Lssse3_epilogue:
 	.byte	0xf3,0xc3
 .size	ChaCha20_ssse3,.-ChaCha20_ssse3
 .type	ChaCha20_4x,@function
 .align	32
 ChaCha20_4x:
 .LChaCha20_4x:
+	movq	%rsp,%r9
 	movq	%r10,%r11
 	shrq	$32,%r10
 	testq	$32,%r10
@@ -449,8 +450,7 @@
 	je	.Ldo_sse3_after_all
 
 .Lproceed4x:
-	leaq	-120(%rsp),%r11
-	subq	$0x148+0,%rsp
+	subq	$0x140+8,%rsp
 	movdqa	.Lsigma(%rip),%xmm11
 	movdqu	(%rcx),%xmm15
 	movdqu	16(%rcx),%xmm7
@@ -977,18 +977,18 @@
 	jnz	.Loop_tail4x
 
 .Ldone4x:
-	addq	$0x148+0,%rsp
+	leaq	(%r9),%rsp
+.L4x_epilogue:
 	.byte	0xf3,0xc3
 .size	ChaCha20_4x,.-ChaCha20_4x
 .type	ChaCha20_8x,@function
 .align	32
 ChaCha20_8x:
 .LChaCha20_8x:
-	movq	%rsp,%r10
+	movq	%rsp,%r9
 	subq	$0x280+8,%rsp
 	andq	$-32,%rsp
 	vzeroupper
-	movq	%r10,640(%rsp)
 
 
 
@@ -1579,7 +1579,8 @@
 
 .Ldone8x:
 	vzeroall
-	movq	640(%rsp),%rsp
+	leaq	(%r9),%rsp
+.L8x_epilogue:
 	.byte	0xf3,0xc3
 .size	ChaCha20_8x,.-ChaCha20_8x
 #endif
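
Every ChaCha20 entry point above now parks the caller's %rsp in a scratch register (%r9) on entry and restores it with a single `leaq (%r9),%rsp`, instead of unwinding a push/pop sequence. The payoff is that the epilogue no longer depends on how much the body allocated or how it realigned the stack, which ChaCha20_8x needs after its `andq $-32,%rsp`. The convention in isolation, as a hypothetical function (frame_demo and its frame size are illustrative, not from the diff):

.globl	frame_demo
.type	frame_demo,@function
.align	32
frame_demo:
	movq	%rsp,%r9		# remember the caller's stack pointer
	subq	$0x280+8,%rsp		# variable-size scratch area
	andq	$-32,%rsp		# realign for 32-byte vector stores
	# ... body addresses its scratch via (%rsp) ...
	leaq	(%r9),%rsp		# one instruction undoes both steps
	.byte	0xf3,0xc3
.size	frame_demo,.-frame_demo
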
diff --git a/linux-x86_64/crypto/modes/ghash-x86_64.S b/linux-x86_64/crypto/modes/ghash-x86_64.S
index b6ca45f..8842c27 100644
--- a/linux-x86_64/crypto/modes/ghash-x86_64.S
+++ b/linux-x86_64/crypto/modes/ghash-x86_64.S
@@ -11,6 +11,10 @@
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	subq	$280,%rsp
 .Lgmult_prologue:
 
 	movzbq	15(%rdi),%r8
@@ -87,8 +91,9 @@
 	movq	%r8,8(%rdi)
 	movq	%r9,(%rdi)
 
-	movq	16(%rsp),%rbx
-	leaq	24(%rsp),%rsp
+	leaq	280+48(%rsp),%rsi
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 .Lgmult_epilogue:
 	.byte	0xf3,0xc3
 .size	gcm_gmult_4bit,.-gcm_gmult_4bit
@@ -648,14 +653,14 @@
 	movq	%r8,8(%rdi)
 	movq	%r9,(%rdi)
 
-	leaq	280(%rsp),%rsi
-	movq	0(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	leaq	280+48(%rsp),%rsi
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	0(%rsi),%rsp
 .Lghash_epilogue:
 	.byte	0xf3,0xc3
 .size	gcm_ghash_4bit,.-gcm_ghash_4bit
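
gcm_gmult_4bit now sets up the same frame as gcm_ghash_4bit (six pushes plus 280 bytes of scratch), and both epilogues recover the entry-time stack pointer into %rsi and read the saved registers at fixed negative offsets from it. Since pushes grow the stack downward, the first push sits at -8 relative to the entry %rsp and the sixth at -48; `leaq (%rsi),%rsp` then releases the scratch area and all six slots at once. The layout, spelled out as a hypothetical stand-alone function:

.globl	restore_demo
.type	restore_demo,@function
restore_demo:
	pushq	%rbx			# entry %rsp - 8
	pushq	%rbp			# entry %rsp - 16
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15			# entry %rsp - 48
	subq	$280,%rsp		# local scratch below the saves
	# ... body ...
	leaq	280+48(%rsp),%rsi	# %rsi = entry-time %rsp
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp		# frees scratch and saves together
	.byte	0xf3,0xc3
.size	restore_demo,.-restore_demo
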
diff --git a/linux-x86_64/crypto/sha/sha1-x86_64.S b/linux-x86_64/crypto/sha/sha1-x86_64.S
index d830b53..567bdfd 100644
--- a/linux-x86_64/crypto/sha/sha1-x86_64.S
+++ b/linux-x86_64/crypto/sha/sha1-x86_64.S
@@ -1241,14 +1241,13 @@
 .align	16
 sha1_block_data_order_ssse3:
 _ssse3_shortcut:
-	movq	%rsp,%rax
+	movq	%rsp,%r11
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	leaq	-64(%rsp),%rsp
-	movq	%rax,%r14
 	andq	$-64,%rsp
 	movq	%rdi,%r8
 	movq	%rsi,%r9
@@ -1256,7 +1255,7 @@
 
 	shlq	$6,%r10
 	addq	%r9,%r10
-	leaq	K_XX_XX+64(%rip),%r11
+	leaq	K_XX_XX+64(%rip),%r14
 
 	movl	0(%r8),%eax
 	movl	4(%r8),%ebx
@@ -1268,8 +1267,8 @@
 	xorl	%edx,%edi
 	andl	%edi,%esi
 
-	movdqa	64(%r11),%xmm6
-	movdqa	-64(%r11),%xmm9
+	movdqa	64(%r14),%xmm6
+	movdqa	-64(%r14),%xmm9
 	movdqu	0(%r9),%xmm0
 	movdqu	16(%r9),%xmm1
 	movdqu	32(%r9),%xmm2
@@ -1345,7 +1344,7 @@
 	pslld	$2,%xmm9
 	pxor	%xmm10,%xmm4
 	xorl	%ebp,%edx
-	movdqa	-64(%r11),%xmm10
+	movdqa	-64(%r14),%xmm10
 	roll	$5,%ecx
 	addl	%edi,%ebx
 	andl	%edx,%esi
@@ -1406,7 +1405,7 @@
 	pslld	$2,%xmm10
 	pxor	%xmm8,%xmm5
 	xorl	%eax,%ebp
-	movdqa	-32(%r11),%xmm8
+	movdqa	-32(%r14),%xmm8
 	roll	$5,%edx
 	addl	%edi,%ecx
 	andl	%ebp,%esi
@@ -1467,7 +1466,7 @@
 	pslld	$2,%xmm8
 	pxor	%xmm9,%xmm6
 	xorl	%ebx,%eax
-	movdqa	-32(%r11),%xmm9
+	movdqa	-32(%r14),%xmm9
 	roll	$5,%ebp
 	addl	%edi,%edx
 	andl	%eax,%esi
@@ -1528,7 +1527,7 @@
 	pslld	$2,%xmm9
 	pxor	%xmm10,%xmm7
 	xorl	%ecx,%ebx
-	movdqa	-32(%r11),%xmm10
+	movdqa	-32(%r14),%xmm10
 	roll	$5,%eax
 	addl	%edi,%ebp
 	andl	%ebx,%esi
@@ -1639,7 +1638,7 @@
 	pxor	%xmm3,%xmm2
 	addl	%esi,%eax
 	xorl	%edx,%edi
-	movdqa	0(%r11),%xmm10
+	movdqa	0(%r14),%xmm10
 	rorl	$7,%ecx
 	paddd	%xmm1,%xmm9
 	addl	%ebx,%eax
@@ -1874,7 +1873,7 @@
 	pxor	%xmm0,%xmm7
 	roll	$5,%ebx
 	addl	%esi,%eax
-	movdqa	32(%r11),%xmm9
+	movdqa	32(%r14),%xmm9
 	xorl	%ecx,%edi
 	paddd	%xmm6,%xmm8
 	xorl	%edx,%ecx
@@ -2165,8 +2164,8 @@
 	addl	%edx,%ecx
 	cmpq	%r10,%r9
 	je	.Ldone_ssse3
-	movdqa	64(%r11),%xmm6
-	movdqa	-64(%r11),%xmm9
+	movdqa	64(%r14),%xmm6
+	movdqa	-64(%r14),%xmm9
 	movdqu	0(%r9),%xmm0
 	movdqu	16(%r9),%xmm1
 	movdqu	32(%r9),%xmm2
@@ -2403,13 +2402,12 @@
 	movl	%ecx,8(%r8)
 	movl	%edx,12(%r8)
 	movl	%ebp,16(%r8)
-	leaq	(%r14),%rsi
-	movq	-40(%rsi),%r14
-	movq	-32(%rsi),%r13
-	movq	-24(%rsi),%r12
-	movq	-16(%rsi),%rbp
-	movq	-8(%rsi),%rbx
-	leaq	(%rsi),%rsp
+	movq	-40(%r11),%r14
+	movq	-32(%r11),%r13
+	movq	-24(%r11),%r12
+	movq	-16(%r11),%rbp
+	movq	-8(%r11),%rbx
+	leaq	(%r11),%rsp
 .Lepilogue_ssse3:
 	.byte	0xf3,0xc3
 .size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
@@ -2417,7 +2415,7 @@
 .align	16
 sha1_block_data_order_avx:
 _avx_shortcut:
-	movq	%rsp,%rax
+	movq	%rsp,%r11
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
@@ -2425,7 +2423,6 @@
 	pushq	%r14
 	leaq	-64(%rsp),%rsp
 	vzeroupper
-	movq	%rax,%r14
 	andq	$-64,%rsp
 	movq	%rdi,%r8
 	movq	%rsi,%r9
@@ -2433,7 +2430,7 @@
 
 	shlq	$6,%r10
 	addq	%r9,%r10
-	leaq	K_XX_XX+64(%rip),%r11
+	leaq	K_XX_XX+64(%rip),%r14
 
 	movl	0(%r8),%eax
 	movl	4(%r8),%ebx
@@ -2445,8 +2442,8 @@
 	xorl	%edx,%edi
 	andl	%edi,%esi
 
-	vmovdqa	64(%r11),%xmm6
-	vmovdqa	-64(%r11),%xmm11
+	vmovdqa	64(%r14),%xmm6
+	vmovdqa	-64(%r14),%xmm11
 	vmovdqu	0(%r9),%xmm0
 	vmovdqu	16(%r9),%xmm1
 	vmovdqu	32(%r9),%xmm2
@@ -2571,7 +2568,7 @@
 	vpxor	%xmm10,%xmm5,%xmm5
 	xorl	%eax,%ebp
 	shldl	$5,%edx,%edx
-	vmovdqa	-32(%r11),%xmm11
+	vmovdqa	-32(%r14),%xmm11
 	addl	%edi,%ecx
 	andl	%ebp,%esi
 	xorl	%eax,%ebp
@@ -2784,7 +2781,7 @@
 	addl	%esi,%eax
 	xorl	%edx,%edi
 	vpaddd	%xmm1,%xmm11,%xmm9
-	vmovdqa	0(%r11),%xmm11
+	vmovdqa	0(%r14),%xmm11
 	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
 	vpxor	%xmm8,%xmm2,%xmm2
@@ -3003,7 +3000,7 @@
 	movl	%ebx,%edi
 	xorl	%edx,%esi
 	vpaddd	%xmm6,%xmm11,%xmm9
-	vmovdqa	32(%r11),%xmm11
+	vmovdqa	32(%r14),%xmm11
 	shldl	$5,%ebx,%ebx
 	addl	%esi,%eax
 	vpxor	%xmm8,%xmm7,%xmm7
@@ -3282,8 +3279,8 @@
 	addl	%edx,%ecx
 	cmpq	%r10,%r9
 	je	.Ldone_avx
-	vmovdqa	64(%r11),%xmm6
-	vmovdqa	-64(%r11),%xmm11
+	vmovdqa	64(%r14),%xmm6
+	vmovdqa	-64(%r14),%xmm11
 	vmovdqu	0(%r9),%xmm0
 	vmovdqu	16(%r9),%xmm1
 	vmovdqu	32(%r9),%xmm2
@@ -3519,13 +3516,12 @@
 	movl	%ecx,8(%r8)
 	movl	%edx,12(%r8)
 	movl	%ebp,16(%r8)
-	leaq	(%r14),%rsi
-	movq	-40(%rsi),%r14
-	movq	-32(%rsi),%r13
-	movq	-24(%rsi),%r12
-	movq	-16(%rsi),%rbp
-	movq	-8(%rsi),%rbx
-	leaq	(%rsi),%rsp
+	movq	-40(%r11),%r14
+	movq	-32(%r11),%r13
+	movq	-24(%r11),%r12
+	movq	-16(%r11),%rbp
+	movq	-8(%r11),%rbx
+	leaq	(%r11),%rsp
 .Lepilogue_avx:
 	.byte	0xf3,0xc3
 .size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
diff --git a/linux-x86_64/crypto/sha/sha256-x86_64.S b/linux-x86_64/crypto/sha/sha256-x86_64.S
index 445b497..273b7a5 100644
--- a/linux-x86_64/crypto/sha/sha256-x86_64.S
+++ b/linux-x86_64/crypto/sha/sha256-x86_64.S
@@ -19,13 +19,13 @@
 	je	.Lavx_shortcut
 	testl	$512,%r10d
 	jnz	.Lssse3_shortcut
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
-	movq	%rsp,%r11
 	shlq	$4,%rdx
 	subq	$64+32,%rsp
 	leaq	(%rsi,%rdx,4),%rdx
@@ -33,7 +33,7 @@
 	movq	%rdi,64+0(%rsp)
 	movq	%rsi,64+8(%rsp)
 	movq	%rdx,64+16(%rsp)
-	movq	%r11,64+24(%rsp)
+	movq	%rax,64+24(%rsp)
 .Lprologue:
 
 	movl	0(%rdi),%eax
@@ -1698,13 +1698,13 @@
 	jb	.Lloop
 
 	movq	64+24(%rsp),%rsi
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 .Lepilogue:
 	.byte	0xf3,0xc3
 .size	sha256_block_data_order,.-sha256_block_data_order
@@ -1755,13 +1755,13 @@
 .align	64
 sha256_block_data_order_ssse3:
 .Lssse3_shortcut:
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
-	movq	%rsp,%r11
 	shlq	$4,%rdx
 	subq	$96,%rsp
 	leaq	(%rsi,%rdx,4),%rdx
@@ -1769,7 +1769,7 @@
 	movq	%rdi,64+0(%rsp)
 	movq	%rsi,64+8(%rsp)
 	movq	%rdx,64+16(%rsp)
-	movq	%r11,64+24(%rsp)
+	movq	%rax,64+24(%rsp)
 .Lprologue_ssse3:
 
 	movl	0(%rdi),%eax
@@ -2836,13 +2836,13 @@
 	jb	.Lloop_ssse3
 
 	movq	64+24(%rsp),%rsi
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 .Lepilogue_ssse3:
 	.byte	0xf3,0xc3
 .size	sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3
@@ -2850,13 +2850,13 @@
 .align	64
 sha256_block_data_order_avx:
 .Lavx_shortcut:
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
-	movq	%rsp,%r11
 	shlq	$4,%rdx
 	subq	$96,%rsp
 	leaq	(%rsi,%rdx,4),%rdx
@@ -2864,7 +2864,7 @@
 	movq	%rdi,64+0(%rsp)
 	movq	%rsi,64+8(%rsp)
 	movq	%rdx,64+16(%rsp)
-	movq	%r11,64+24(%rsp)
+	movq	%rax,64+24(%rsp)
 .Lprologue_avx:
 
 	vzeroupper
@@ -3893,13 +3893,13 @@
 
 	movq	64+24(%rsp),%rsi
 	vzeroupper
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 .Lepilogue_avx:
 	.byte	0xf3,0xc3
 .size	sha256_block_data_order_avx,.-sha256_block_data_order_avx
diff --git a/linux-x86_64/crypto/sha/sha512-x86_64.S b/linux-x86_64/crypto/sha/sha512-x86_64.S
index d65743f..f272b64 100644
--- a/linux-x86_64/crypto/sha/sha512-x86_64.S
+++ b/linux-x86_64/crypto/sha/sha512-x86_64.S
@@ -19,13 +19,13 @@
 	orl	%r9d,%r10d
 	cmpl	$1342177792,%r10d
 	je	.Lavx_shortcut
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
-	movq	%rsp,%r11
 	shlq	$4,%rdx
 	subq	$128+32,%rsp
 	leaq	(%rsi,%rdx,8),%rdx
@@ -33,7 +33,7 @@
 	movq	%rdi,128+0(%rsp)
 	movq	%rsi,128+8(%rsp)
 	movq	%rdx,128+16(%rsp)
-	movq	%r11,128+24(%rsp)
+	movq	%rax,128+24(%rsp)
 .Lprologue:
 
 	movq	0(%rdi),%rax
@@ -1698,13 +1698,13 @@
 	jb	.Lloop
 
 	movq	128+24(%rsp),%rsi
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 .Lepilogue:
 	.byte	0xf3,0xc3
 .size	sha512_block_data_order,.-sha512_block_data_order
@@ -1799,13 +1799,13 @@
 .align	64
 sha512_block_data_order_xop:
 .Lxop_shortcut:
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
-	movq	%rsp,%r11
 	shlq	$4,%rdx
 	subq	$160,%rsp
 	leaq	(%rsi,%rdx,8),%rdx
@@ -1813,7 +1813,7 @@
 	movq	%rdi,128+0(%rsp)
 	movq	%rsi,128+8(%rsp)
 	movq	%rdx,128+16(%rsp)
-	movq	%r11,128+24(%rsp)
+	movq	%rax,128+24(%rsp)
 .Lprologue_xop:
 
 	vzeroupper
@@ -2868,13 +2868,13 @@
 
 	movq	128+24(%rsp),%rsi
 	vzeroupper
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 .Lepilogue_xop:
 	.byte	0xf3,0xc3
 .size	sha512_block_data_order_xop,.-sha512_block_data_order_xop
@@ -2882,13 +2882,13 @@
 .align	64
 sha512_block_data_order_avx:
 .Lavx_shortcut:
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
-	movq	%rsp,%r11
 	shlq	$4,%rdx
 	subq	$160,%rsp
 	leaq	(%rsi,%rdx,8),%rdx
@@ -2896,7 +2896,7 @@
 	movq	%rdi,128+0(%rsp)
 	movq	%rsi,128+8(%rsp)
 	movq	%rdx,128+16(%rsp)
-	movq	%r11,128+24(%rsp)
+	movq	%rax,128+24(%rsp)
 .Lprologue_avx:
 
 	vzeroupper
@@ -4015,13 +4015,13 @@
 
 	movq	128+24(%rsp),%rsi
 	vzeroupper
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 .Lepilogue_avx:
 	.byte	0xf3,0xc3
 .size	sha512_block_data_order_avx,.-sha512_block_data_order_avx
diff --git a/mac-x86/crypto/bn/x86-mont.S b/mac-x86/crypto/bn/x86-mont.S
index 234034b..5c13ca4 100644
--- a/mac-x86/crypto/bn/x86-mont.S
+++ b/mac-x86/crypto/bn/x86-mont.S
@@ -16,39 +16,54 @@
 	jl	L000just_leave
 	leal	20(%esp),%esi
 	leal	24(%esp),%edx
-	movl	%esp,%ebp
 	addl	$2,%edi
 	negl	%edi
-	leal	-32(%esp,%edi,4),%esp
+	leal	-32(%esp,%edi,4),%ebp
 	negl	%edi
-	movl	%esp,%eax
+	movl	%ebp,%eax
 	subl	%edx,%eax
 	andl	$2047,%eax
-	subl	%eax,%esp
-	xorl	%esp,%edx
+	subl	%eax,%ebp
+	xorl	%ebp,%edx
 	andl	$2048,%edx
 	xorl	$2048,%edx
-	subl	%edx,%esp
-	andl	$-64,%esp
+	subl	%edx,%ebp
+	andl	$-64,%ebp
+	movl	%esp,%eax
+	subl	%ebp,%eax
+	andl	$-4096,%eax
+	movl	%esp,%edx
+	leal	(%ebp,%eax,1),%esp
+	movl	(%esp),%eax
+	cmpl	%ebp,%esp
+	ja	L001page_walk
+	jmp	L002page_walk_done
+.align	4,0x90
+L001page_walk:
+	leal	-4096(%esp),%esp
+	movl	(%esp),%eax
+	cmpl	%ebp,%esp
+	ja	L001page_walk
+L002page_walk_done:
 	movl	(%esi),%eax
 	movl	4(%esi),%ebx
 	movl	8(%esi),%ecx
-	movl	12(%esi),%edx
+	movl	12(%esi),%ebp
 	movl	16(%esi),%esi
 	movl	(%esi),%esi
 	movl	%eax,4(%esp)
 	movl	%ebx,8(%esp)
 	movl	%ecx,12(%esp)
-	movl	%edx,16(%esp)
+	movl	%ebp,16(%esp)
 	movl	%esi,20(%esp)
 	leal	-3(%edi),%ebx
-	movl	%ebp,24(%esp)
-	call	L001PIC_me_up
-L001PIC_me_up:
+	movl	%edx,24(%esp)
+	call	L003PIC_me_up
+L003PIC_me_up:
 	popl	%eax
-	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L001PIC_me_up(%eax),%eax
+	movl	L_OPENSSL_ia32cap_P$non_lazy_ptr-L003PIC_me_up(%eax),%eax
 	btl	$26,(%eax)
-	jnc	L002non_sse2
+	jnc	L004non_sse2
 	movl	$-1,%eax
 	movd	%eax,%mm7
 	movl	8(%esp),%esi
@@ -72,7 +87,7 @@
 	psrlq	$32,%mm3
 	incl	%ecx
 .align	4,0x90
-L0031st:
+L0051st:
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -87,7 +102,7 @@
 	psrlq	$32,%mm3
 	leal	1(%ecx),%ecx
 	cmpl	%ebx,%ecx
-	jl	L0031st
+	jl	L0051st
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -101,7 +116,7 @@
 	paddq	%mm2,%mm3
 	movq	%mm3,32(%esp,%ebx,4)
 	incl	%edx
-L004outer:
+L006outer:
 	xorl	%ecx,%ecx
 	movd	(%edi,%edx,4),%mm4
 	movd	(%esi),%mm5
@@ -123,7 +138,7 @@
 	paddq	%mm6,%mm2
 	incl	%ecx
 	decl	%ebx
-L005inner:
+L007inner:
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -140,7 +155,7 @@
 	paddq	%mm6,%mm2
 	decl	%ebx
 	leal	1(%ecx),%ecx
-	jnz	L005inner
+	jnz	L007inner
 	movl	%ecx,%ebx
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
@@ -158,11 +173,11 @@
 	movq	%mm3,32(%esp,%ebx,4)
 	leal	1(%edx),%edx
 	cmpl	%ebx,%edx
-	jle	L004outer
+	jle	L006outer
 	emms
-	jmp	L006common_tail
+	jmp	L008common_tail
 .align	4,0x90
-L002non_sse2:
+L004non_sse2:
 	movl	8(%esp),%esi
 	leal	1(%ebx),%ebp
 	movl	12(%esp),%edi
@@ -173,12 +188,12 @@
 	leal	4(%edi,%ebx,4),%eax
 	orl	%edx,%ebp
 	movl	(%edi),%edi
-	jz	L007bn_sqr_mont
+	jz	L009bn_sqr_mont
 	movl	%eax,28(%esp)
 	movl	(%esi),%eax
 	xorl	%edx,%edx
 .align	4,0x90
-L008mull:
+L010mull:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	%eax,%ebp
@@ -187,7 +202,7 @@
 	movl	(%esi,%ecx,4),%eax
 	cmpl	%ebx,%ecx
 	movl	%ebp,28(%esp,%ecx,4)
-	jl	L008mull
+	jl	L010mull
 	movl	%edx,%ebp
 	mull	%edi
 	movl	20(%esp),%edi
@@ -205,9 +220,9 @@
 	movl	4(%esi),%eax
 	adcl	$0,%edx
 	incl	%ecx
-	jmp	L0092ndmadd
+	jmp	L0112ndmadd
 .align	4,0x90
-L0101stmadd:
+L0121stmadd:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ecx,4),%ebp
@@ -218,7 +233,7 @@
 	adcl	$0,%edx
 	cmpl	%ebx,%ecx
 	movl	%ebp,28(%esp,%ecx,4)
-	jl	L0101stmadd
+	jl	L0121stmadd
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ebx,4),%eax
@@ -241,7 +256,7 @@
 	adcl	$0,%edx
 	movl	$1,%ecx
 .align	4,0x90
-L0092ndmadd:
+L0112ndmadd:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ecx,4),%ebp
@@ -252,7 +267,7 @@
 	adcl	$0,%edx
 	cmpl	%ebx,%ecx
 	movl	%ebp,24(%esp,%ecx,4)
-	jl	L0092ndmadd
+	jl	L0112ndmadd
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ebx,4),%ebp
@@ -268,16 +283,16 @@
 	movl	%edx,32(%esp,%ebx,4)
 	cmpl	28(%esp),%ecx
 	movl	%eax,36(%esp,%ebx,4)
-	je	L006common_tail
+	je	L008common_tail
 	movl	(%ecx),%edi
 	movl	8(%esp),%esi
 	movl	%ecx,12(%esp)
 	xorl	%ecx,%ecx
 	xorl	%edx,%edx
 	movl	(%esi),%eax
-	jmp	L0101stmadd
+	jmp	L0121stmadd
 .align	4,0x90
-L007bn_sqr_mont:
+L009bn_sqr_mont:
 	movl	%ebx,(%esp)
 	movl	%ecx,12(%esp)
 	movl	%edi,%eax
@@ -288,7 +303,7 @@
 	andl	$1,%ebx
 	incl	%ecx
 .align	4,0x90
-L011sqr:
+L013sqr:
 	movl	(%esi,%ecx,4),%eax
 	movl	%edx,%ebp
 	mull	%edi
@@ -300,7 +315,7 @@
 	cmpl	(%esp),%ecx
 	movl	%eax,%ebx
 	movl	%ebp,28(%esp,%ecx,4)
-	jl	L011sqr
+	jl	L013sqr
 	movl	(%esi,%ecx,4),%eax
 	movl	%edx,%ebp
 	mull	%edi
@@ -324,7 +339,7 @@
 	movl	4(%esi),%eax
 	movl	$1,%ecx
 .align	4,0x90
-L0123rdmadd:
+L0143rdmadd:
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ecx,4),%ebp
@@ -343,7 +358,7 @@
 	adcl	$0,%edx
 	cmpl	%ebx,%ecx
 	movl	%ebp,24(%esp,%ecx,4)
-	jl	L0123rdmadd
+	jl	L0143rdmadd
 	movl	%edx,%ebp
 	mull	%edi
 	addl	32(%esp,%ebx,4),%ebp
@@ -359,7 +374,7 @@
 	movl	%edx,32(%esp,%ebx,4)
 	cmpl	%ebx,%ecx
 	movl	%eax,36(%esp,%ebx,4)
-	je	L006common_tail
+	je	L008common_tail
 	movl	4(%esi,%ecx,4),%edi
 	leal	1(%ecx),%ecx
 	movl	%edi,%eax
@@ -371,12 +386,12 @@
 	xorl	%ebp,%ebp
 	cmpl	%ebx,%ecx
 	leal	1(%ecx),%ecx
-	je	L013sqrlast
+	je	L015sqrlast
 	movl	%edx,%ebx
 	shrl	$1,%edx
 	andl	$1,%ebx
 .align	4,0x90
-L014sqradd:
+L016sqradd:
 	movl	(%esi,%ecx,4),%eax
 	movl	%edx,%ebp
 	mull	%edi
@@ -392,13 +407,13 @@
 	cmpl	(%esp),%ecx
 	movl	%ebp,28(%esp,%ecx,4)
 	movl	%eax,%ebx
-	jle	L014sqradd
+	jle	L016sqradd
 	movl	%edx,%ebp
 	addl	%edx,%edx
 	shrl	$31,%ebp
 	addl	%ebx,%edx
 	adcl	$0,%ebp
-L013sqrlast:
+L015sqrlast:
 	movl	20(%esp),%edi
 	movl	16(%esp),%esi
 	imull	32(%esp),%edi
@@ -413,9 +428,9 @@
 	adcl	$0,%edx
 	movl	$1,%ecx
 	movl	4(%esi),%eax
-	jmp	L0123rdmadd
+	jmp	L0143rdmadd
 .align	4,0x90
-L006common_tail:
+L008common_tail:
 	movl	16(%esp),%ebp
 	movl	4(%esp),%edi
 	leal	32(%esp),%esi
@@ -423,25 +438,26 @@
 	movl	%ebx,%ecx
 	xorl	%edx,%edx
 .align	4,0x90
-L015sub:
+L017sub:
 	sbbl	(%ebp,%edx,4),%eax
 	movl	%eax,(%edi,%edx,4)
 	decl	%ecx
 	movl	4(%esi,%edx,4),%eax
 	leal	1(%edx),%edx
-	jge	L015sub
+	jge	L017sub
 	sbbl	$0,%eax
+	andl	%eax,%esi
+	notl	%eax
+	movl	%edi,%ebp
+	andl	%eax,%ebp
+	orl	%ebp,%esi
 .align	4,0x90
-L016copy:
-	movl	(%esi,%ebx,4),%edx
-	movl	(%edi,%ebx,4),%ebp
-	xorl	%ebp,%edx
-	andl	%eax,%edx
-	xorl	%ebp,%edx
-	movl	%ecx,(%esi,%ebx,4)
-	movl	%edx,(%edi,%ebx,4)
+L018copy:
+	movl	(%esi,%ebx,4),%eax
+	movl	%eax,(%edi,%ebx,4)
+	movl	%ecx,32(%esp,%ebx,4)
 	decl	%ebx
-	jge	L016copy
+	jge	L018copy
 	movl	24(%esp),%esp
 	movl	$1,%eax
 L000just_leave:
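
The L001page_walk loop added above (and its 64-bit siblings further down, e.g. L$mul_page_walk in x86_64-mont.S) is a stack probe. The Montgomery scratch area is sized by the modulus length, so it can drop the stack pointer far below the last page the OS has faulted in; the walk touches one word per 4096-byte page, top to bottom, so the guard page is always hit in order and the stack grows safely. The probe pattern in its 64-bit flavour, as a hypothetical self-contained function (the 64KiB frame size is arbitrary):

.globl	page_walk_demo
.type	page_walk_demo,@function
page_walk_demo:
	movq	%rsp,%rax		# keep the entry stack pointer
	leaq	-65536(%rsp),%r10	# hypothetical frame bottom, 64KiB down
	andq	$-1024,%r10
	movq	%rsp,%r11
	subq	%r10,%r11
	andq	$-4096,%r11		# distance, rounded down to whole pages
	leaq	(%r10,%r11,1),%rsp	# land on the highest unprobed page
	movq	(%rsp),%r11		# touch it
	cmpq	%r10,%rsp
	ja	.Lpw_walk
	jmp	.Lpw_done
.align	16
.Lpw_walk:
	leaq	-4096(%rsp),%rsp	# descend one page...
	movq	(%rsp),%r11		# ...and touch it before going lower
	cmpq	%r10,%rsp
	ja	.Lpw_walk
.Lpw_done:
	# %rsp == %r10 here; the frame is fully mapped and ready to use
	movq	%rax,%rsp		# unwind the demo frame
	.byte	0xf3,0xc3
.size	page_walk_demo,.-page_walk_demo
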
diff --git a/mac-x86_64/crypto/aes/aes-x86_64.S b/mac-x86_64/crypto/aes/aes-x86_64.S
index b5d188a..52df2ae 100644
--- a/mac-x86_64/crypto/aes/aes-x86_64.S
+++ b/mac-x86_64/crypto/aes/aes-x86_64.S
@@ -332,6 +332,7 @@
 
 .private_extern	_asm_AES_encrypt
 _asm_AES_encrypt:
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
@@ -340,7 +341,6 @@
 	pushq	%r15
 
 
-	movq	%rsp,%r10
 	leaq	-63(%rdx),%rcx
 	andq	$-64,%rsp
 	subq	%rsp,%rcx
@@ -350,7 +350,7 @@
 	subq	$32,%rsp
 
 	movq	%rsi,16(%rsp)
-	movq	%r10,24(%rsp)
+	movq	%rax,24(%rsp)
 L$enc_prologue:
 
 	movq	%rdx,%r15
@@ -382,13 +382,13 @@
 	movl	%ecx,8(%r9)
 	movl	%edx,12(%r9)
 
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 L$enc_epilogue:
 	.byte	0xf3,0xc3
 
@@ -778,6 +778,7 @@
 
 .private_extern	_asm_AES_decrypt
 _asm_AES_decrypt:
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
@@ -786,7 +787,6 @@
 	pushq	%r15
 
 
-	movq	%rsp,%r10
 	leaq	-63(%rdx),%rcx
 	andq	$-64,%rsp
 	subq	%rsp,%rcx
@@ -796,7 +796,7 @@
 	subq	$32,%rsp
 
 	movq	%rsi,16(%rsp)
-	movq	%r10,24(%rsp)
+	movq	%rax,24(%rsp)
 L$dec_prologue:
 
 	movq	%rdx,%r15
@@ -830,13 +830,13 @@
 	movl	%ecx,8(%r9)
 	movl	%edx,12(%r9)
 
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 L$dec_epilogue:
 	.byte	0xf3,0xc3
 
@@ -1312,10 +1312,9 @@
 	movl	%r9d,%r9d
 
 	leaq	L$AES_Te(%rip),%r14
+	leaq	L$AES_Td(%rip),%r10
 	cmpq	$0,%r9
-	jne	L$cbc_picked_te
-	leaq	L$AES_Td(%rip),%r14
-L$cbc_picked_te:
+	cmoveq	%r10,%r14
 
 	movl	_OPENSSL_ia32cap_P(%rip),%r10d
 	cmpq	$512,%rdx
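
One more branch goes away in the CBC entry path: instead of conditionally re-loading %r14 with the decryption table, both table addresses are formed up front and `cmoveq` picks L$AES_Td when the enc argument in %r9 is zero. Annotated, the new sequence reads:

	leaq	L$AES_Te(%rip),%r14	# assume encryption
	leaq	L$AES_Td(%rip),%r10	# decryption candidate
	cmpq	$0,%r9			# %r9 = enc flag
	cmoveq	%r10,%r14		# enc == 0: take the Td table instead
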
diff --git a/mac-x86_64/crypto/aes/aesni-x86_64.S b/mac-x86_64/crypto/aes/aesni-x86_64.S
index 3d98fa1..4e3b7d0 100644
--- a/mac-x86_64/crypto/aes/aesni-x86_64.S
+++ b/mac-x86_64/crypto/aes/aesni-x86_64.S
@@ -1031,11 +1031,10 @@
 
 .p2align	4
 L$ctr32_bulk:
-	leaq	(%rsp),%rax
+	leaq	(%rsp),%r11
 	pushq	%rbp
 	subq	$128,%rsp
 	andq	$-16,%rsp
-	leaq	-8(%rax),%rbp
 
 
 
@@ -1044,7 +1043,7 @@
 	movdqu	(%rcx),%xmm0
 	movl	12(%r8),%r8d
 	pxor	%xmm0,%xmm2
-	movl	12(%rcx),%r11d
+	movl	12(%rcx),%ebp
 	movdqa	%xmm2,0(%rsp)
 	bswapl	%r8d
 	movdqa	%xmm2,%xmm3
@@ -1060,8 +1059,8 @@
 	leaq	2(%r8),%rdx
 	bswapl	%eax
 	bswapl	%edx
-	xorl	%r11d,%eax
-	xorl	%r11d,%edx
+	xorl	%ebp,%eax
+	xorl	%ebp,%edx
 .byte	102,15,58,34,216,3
 	leaq	3(%r8),%rax
 	movdqa	%xmm3,16(%rsp)
@@ -1070,25 +1069,25 @@
 	movq	%r10,%rdx
 	leaq	4(%r8),%r10
 	movdqa	%xmm4,32(%rsp)
-	xorl	%r11d,%eax
+	xorl	%ebp,%eax
 	bswapl	%r10d
 .byte	102,15,58,34,232,3
-	xorl	%r11d,%r10d
+	xorl	%ebp,%r10d
 	movdqa	%xmm5,48(%rsp)
 	leaq	5(%r8),%r9
 	movl	%r10d,64+12(%rsp)
 	bswapl	%r9d
 	leaq	6(%r8),%r10
 	movl	240(%rcx),%eax
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 	bswapl	%r10d
 	movl	%r9d,80+12(%rsp)
-	xorl	%r11d,%r10d
+	xorl	%ebp,%r10d
 	leaq	7(%r8),%r9
 	movl	%r10d,96+12(%rsp)
 	bswapl	%r9d
 	movl	_OPENSSL_ia32cap_P+4(%rip),%r10d
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 	andl	$71303168,%r10d
 	movl	%r9d,112+12(%rsp)
 
@@ -1112,7 +1111,7 @@
 L$ctr32_6x:
 	shll	$4,%eax
 	movl	$48,%r10d
-	bswapl	%r11d
+	bswapl	%ebp
 	leaq	32(%rcx,%rax,1),%rcx
 	subq	%rax,%r10
 	jmp	L$ctr32_loop6
@@ -1123,32 +1122,32 @@
 	movups	-48(%rcx,%r10,1),%xmm0
 .byte	102,15,56,220,209
 	movl	%r8d,%eax
-	xorl	%r11d,%eax
+	xorl	%ebp,%eax
 .byte	102,15,56,220,217
 .byte	0x0f,0x38,0xf1,0x44,0x24,12
 	leal	1(%r8),%eax
 .byte	102,15,56,220,225
-	xorl	%r11d,%eax
+	xorl	%ebp,%eax
 .byte	0x0f,0x38,0xf1,0x44,0x24,28
 .byte	102,15,56,220,233
 	leal	2(%r8),%eax
-	xorl	%r11d,%eax
+	xorl	%ebp,%eax
 .byte	102,15,56,220,241
 .byte	0x0f,0x38,0xf1,0x44,0x24,44
 	leal	3(%r8),%eax
 .byte	102,15,56,220,249
 	movups	-32(%rcx,%r10,1),%xmm1
-	xorl	%r11d,%eax
+	xorl	%ebp,%eax
 
 .byte	102,15,56,220,208
 .byte	0x0f,0x38,0xf1,0x44,0x24,60
 	leal	4(%r8),%eax
 .byte	102,15,56,220,216
-	xorl	%r11d,%eax
+	xorl	%ebp,%eax
 .byte	0x0f,0x38,0xf1,0x44,0x24,76
 .byte	102,15,56,220,224
 	leal	5(%r8),%eax
-	xorl	%r11d,%eax
+	xorl	%ebp,%eax
 .byte	102,15,56,220,232
 .byte	0x0f,0x38,0xf1,0x44,0x24,92
 	movq	%r10,%rax
@@ -1209,7 +1208,7 @@
 	bswapl	%r9d
 	movups	32-128(%rcx),%xmm0
 .byte	102,15,56,220,225
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 	nop
 .byte	102,15,56,220,233
 	movl	%r9d,0+12(%rsp)
@@ -1222,7 +1221,7 @@
 	bswapl	%r9d
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 .byte	0x66,0x90
 .byte	102,15,56,220,224
 .byte	102,15,56,220,232
@@ -1236,7 +1235,7 @@
 	bswapl	%r9d
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 .byte	0x66,0x90
 .byte	102,15,56,220,225
 .byte	102,15,56,220,233
@@ -1250,7 +1249,7 @@
 	bswapl	%r9d
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 .byte	0x66,0x90
 .byte	102,15,56,220,224
 .byte	102,15,56,220,232
@@ -1264,7 +1263,7 @@
 	bswapl	%r9d
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 .byte	0x66,0x90
 .byte	102,15,56,220,225
 .byte	102,15,56,220,233
@@ -1278,7 +1277,7 @@
 	bswapl	%r9d
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 .byte	0x66,0x90
 .byte	102,15,56,220,224
 .byte	102,15,56,220,232
@@ -1292,7 +1291,7 @@
 	bswapl	%r9d
 .byte	102,15,56,220,209
 .byte	102,15,56,220,217
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 .byte	0x66,0x90
 .byte	102,15,56,220,225
 .byte	102,15,56,220,233
@@ -1307,7 +1306,7 @@
 .byte	102,15,56,220,208
 .byte	102,15,56,220,216
 .byte	102,15,56,220,224
-	xorl	%r11d,%r9d
+	xorl	%ebp,%r9d
 	movdqu	0(%rdi),%xmm10
 .byte	102,15,56,220,232
 	movl	%r9d,112+12(%rsp)
@@ -1542,7 +1541,7 @@
 
 L$ctr32_done:
 	xorps	%xmm0,%xmm0
-	xorl	%r11d,%r11d
+	xorl	%ebp,%ebp
 	pxor	%xmm1,%xmm1
 	pxor	%xmm2,%xmm2
 	pxor	%xmm3,%xmm3
@@ -1566,8 +1565,8 @@
 	pxor	%xmm14,%xmm14
 	movaps	%xmm0,112(%rsp)
 	pxor	%xmm15,%xmm15
-	leaq	(%rbp),%rsp
-	popq	%rbp
+	movq	-8(%r11),%rbp
+	leaq	(%r11),%rsp
 L$ctr32_epilogue:
 	.byte	0xf3,0xc3
 
@@ -1576,11 +1575,10 @@
 
 .p2align	4
 _aesni_xts_encrypt:
-	leaq	(%rsp),%rax
+	leaq	(%rsp),%r11
 	pushq	%rbp
 	subq	$112,%rsp
 	andq	$-16,%rsp
-	leaq	-8(%rax),%rbp
 	movups	(%r9),%xmm2
 	movl	240(%r8),%eax
 	movl	240(%rcx),%r10d
@@ -1596,7 +1594,7 @@
 	jnz	L$oop_enc1_8
 .byte	102,15,56,221,209
 	movups	(%rcx),%xmm0
-	movq	%rcx,%r11
+	movq	%rcx,%rbp
 	movl	%r10d,%eax
 	shll	$4,%r10d
 	movq	%rdx,%r9
@@ -1652,9 +1650,9 @@
 	jc	L$xts_enc_short
 
 	movl	$16+96,%eax
-	leaq	32(%r11,%r10,1),%rcx
+	leaq	32(%rbp,%r10,1),%rcx
 	subq	%r10,%rax
-	movups	16(%r11),%xmm1
+	movups	16(%rbp),%xmm1
 	movq	%rax,%r10
 	leaq	L$xts_magic(%rip),%r8
 	jmp	L$xts_enc_grandloop
@@ -1679,7 +1677,7 @@
 	movdqa	96(%rsp),%xmm9
 	pxor	%xmm14,%xmm6
 .byte	102,15,56,220,233
-	movups	32(%r11),%xmm0
+	movups	32(%rbp),%xmm0
 	leaq	96(%rdi),%rdi
 	pxor	%xmm8,%xmm7
 
@@ -1688,7 +1686,7 @@
 	pxor	%xmm9,%xmm11
 	movdqa	%xmm10,0(%rsp)
 .byte	102,15,56,220,249
-	movups	48(%r11),%xmm1
+	movups	48(%rbp),%xmm1
 	pxor	%xmm9,%xmm12
 
 .byte	102,15,56,220,208
@@ -1703,7 +1701,7 @@
 	movdqa	%xmm14,64(%rsp)
 .byte	102,15,56,220,240
 .byte	102,15,56,220,248
-	movups	64(%r11),%xmm0
+	movups	64(%rbp),%xmm0
 	movdqa	%xmm8,80(%rsp)
 	pshufd	$0x5f,%xmm15,%xmm9
 	jmp	L$xts_enc_loop6
@@ -1735,7 +1733,7 @@
 	psrad	$31,%xmm14
 .byte	102,15,56,220,217
 	pand	%xmm8,%xmm14
-	movups	(%r11),%xmm10
+	movups	(%rbp),%xmm10
 .byte	102,15,56,220,225
 .byte	102,15,56,220,233
 .byte	102,15,56,220,241
@@ -1803,10 +1801,10 @@
 .byte	102,15,56,220,225
 .byte	102,15,56,220,233
 	pxor	%xmm0,%xmm15
-	movups	(%r11),%xmm0
+	movups	(%rbp),%xmm0
 .byte	102,15,56,220,241
 .byte	102,15,56,220,249
-	movups	16(%r11),%xmm1
+	movups	16(%rbp),%xmm1
 
 	pxor	%xmm15,%xmm14
 .byte	102,15,56,221,84,36,0
@@ -1833,7 +1831,7 @@
 
 	movl	$16+96,%eax
 	subl	%r10d,%eax
-	movq	%r11,%rcx
+	movq	%rbp,%rcx
 	shrl	$4,%eax
 
 L$xts_enc_short:
@@ -1989,7 +1987,7 @@
 	jnz	L$xts_enc_steal
 
 	subq	%r9,%rsi
-	movq	%r11,%rcx
+	movq	%rbp,%rcx
 	movl	%r10d,%eax
 
 	movups	-16(%rsi),%xmm2
@@ -2032,8 +2030,8 @@
 	movaps	%xmm0,96(%rsp)
 	pxor	%xmm14,%xmm14
 	pxor	%xmm15,%xmm15
-	leaq	(%rbp),%rsp
-	popq	%rbp
+	movq	-8(%r11),%rbp
+	leaq	(%r11),%rsp
 L$xts_enc_epilogue:
 	.byte	0xf3,0xc3
 
@@ -2042,11 +2040,10 @@
 
 .p2align	4
 _aesni_xts_decrypt:
-	leaq	(%rsp),%rax
+	leaq	(%rsp),%r11
 	pushq	%rbp
 	subq	$112,%rsp
 	andq	$-16,%rsp
-	leaq	-8(%rax),%rbp
 	movups	(%r9),%xmm2
 	movl	240(%r8),%eax
 	movl	240(%rcx),%r10d
@@ -2068,7 +2065,7 @@
 	subq	%rax,%rdx
 
 	movups	(%rcx),%xmm0
-	movq	%rcx,%r11
+	movq	%rcx,%rbp
 	movl	%r10d,%eax
 	shll	$4,%r10d
 	movq	%rdx,%r9
@@ -2124,9 +2121,9 @@
 	jc	L$xts_dec_short
 
 	movl	$16+96,%eax
-	leaq	32(%r11,%r10,1),%rcx
+	leaq	32(%rbp,%r10,1),%rcx
 	subq	%r10,%rax
-	movups	16(%r11),%xmm1
+	movups	16(%rbp),%xmm1
 	movq	%rax,%r10
 	leaq	L$xts_magic(%rip),%r8
 	jmp	L$xts_dec_grandloop
@@ -2151,7 +2148,7 @@
 	movdqa	96(%rsp),%xmm9
 	pxor	%xmm14,%xmm6
 .byte	102,15,56,222,233
-	movups	32(%r11),%xmm0
+	movups	32(%rbp),%xmm0
 	leaq	96(%rdi),%rdi
 	pxor	%xmm8,%xmm7
 
@@ -2160,7 +2157,7 @@
 	pxor	%xmm9,%xmm11
 	movdqa	%xmm10,0(%rsp)
 .byte	102,15,56,222,249
-	movups	48(%r11),%xmm1
+	movups	48(%rbp),%xmm1
 	pxor	%xmm9,%xmm12
 
 .byte	102,15,56,222,208
@@ -2175,7 +2172,7 @@
 	movdqa	%xmm14,64(%rsp)
 .byte	102,15,56,222,240
 .byte	102,15,56,222,248
-	movups	64(%r11),%xmm0
+	movups	64(%rbp),%xmm0
 	movdqa	%xmm8,80(%rsp)
 	pshufd	$0x5f,%xmm15,%xmm9
 	jmp	L$xts_dec_loop6
@@ -2207,7 +2204,7 @@
 	psrad	$31,%xmm14
 .byte	102,15,56,222,217
 	pand	%xmm8,%xmm14
-	movups	(%r11),%xmm10
+	movups	(%rbp),%xmm10
 .byte	102,15,56,222,225
 .byte	102,15,56,222,233
 .byte	102,15,56,222,241
@@ -2275,10 +2272,10 @@
 .byte	102,15,56,222,225
 .byte	102,15,56,222,233
 	pxor	%xmm0,%xmm15
-	movups	(%r11),%xmm0
+	movups	(%rbp),%xmm0
 .byte	102,15,56,222,241
 .byte	102,15,56,222,249
-	movups	16(%r11),%xmm1
+	movups	16(%rbp),%xmm1
 
 	pxor	%xmm15,%xmm14
 .byte	102,15,56,223,84,36,0
@@ -2305,7 +2302,7 @@
 
 	movl	$16+96,%eax
 	subl	%r10d,%eax
-	movq	%r11,%rcx
+	movq	%rbp,%rcx
 	shrl	$4,%eax
 
 L$xts_dec_short:
@@ -2462,7 +2459,7 @@
 	jz	L$xts_dec_ret
 L$xts_dec_done2:
 	movq	%r9,%rdx
-	movq	%r11,%rcx
+	movq	%rbp,%rcx
 	movl	%r10d,%eax
 
 	movups	(%rdi),%xmm2
@@ -2492,7 +2489,7 @@
 	jnz	L$xts_dec_steal
 
 	subq	%r9,%rsi
-	movq	%r11,%rcx
+	movq	%rbp,%rcx
 	movl	%r10d,%eax
 
 	movups	(%rsi),%xmm2
@@ -2535,11 +2532,827 @@
 	movaps	%xmm0,96(%rsp)
 	pxor	%xmm14,%xmm14
 	pxor	%xmm15,%xmm15
-	leaq	(%rbp),%rsp
-	popq	%rbp
+	movq	-8(%r11),%rbp
+	leaq	(%r11),%rsp
 L$xts_dec_epilogue:
 	.byte	0xf3,0xc3
 
+.globl	_aesni_ocb_encrypt
+.private_extern _aesni_ocb_encrypt
+
+.p2align	5
+_aesni_ocb_encrypt:
+	leaq	(%rsp),%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	movq	8(%rax),%rbx
+	movq	8+8(%rax),%rbp
+
+	movl	240(%rcx),%r10d
+	movq	%rcx,%r11
+	shll	$4,%r10d
+	movups	(%rcx),%xmm9
+	movups	16(%rcx,%r10,1),%xmm1
+
+	movdqu	(%r9),%xmm15
+	pxor	%xmm1,%xmm9
+	pxor	%xmm1,%xmm15
+
+	movl	$16+32,%eax
+	leaq	32(%r11,%r10,1),%rcx
+	movups	16(%r11),%xmm1
+	subq	%r10,%rax
+	movq	%rax,%r10
+
+	movdqu	(%rbx),%xmm10
+	movdqu	(%rbp),%xmm8
+
+	testq	$1,%r8
+	jnz	L$ocb_enc_odd
+
+	bsfq	%r8,%r12
+	addq	$1,%r8
+	shlq	$4,%r12
+	movdqu	(%rbx,%r12,1),%xmm7
+	movdqu	(%rdi),%xmm2
+	leaq	16(%rdi),%rdi
+
+	call	__ocb_encrypt1
+
+	movdqa	%xmm7,%xmm15
+	movups	%xmm2,(%rsi)
+	leaq	16(%rsi),%rsi
+	subq	$1,%rdx
+	jz	L$ocb_enc_done
+
+L$ocb_enc_odd:
+	leaq	1(%r8),%r12
+	leaq	3(%r8),%r13
+	leaq	5(%r8),%r14
+	leaq	6(%r8),%r8
+	bsfq	%r12,%r12
+	bsfq	%r13,%r13
+	bsfq	%r14,%r14
+	shlq	$4,%r12
+	shlq	$4,%r13
+	shlq	$4,%r14
+
+	subq	$6,%rdx
+	jc	L$ocb_enc_short
+	jmp	L$ocb_enc_grandloop
+
+.p2align	5
+L$ocb_enc_grandloop:
+	movdqu	0(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqu	32(%rdi),%xmm4
+	movdqu	48(%rdi),%xmm5
+	movdqu	64(%rdi),%xmm6
+	movdqu	80(%rdi),%xmm7
+	leaq	96(%rdi),%rdi
+
+	call	__ocb_encrypt6
+
+	movups	%xmm2,0(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+	movups	%xmm7,80(%rsi)
+	leaq	96(%rsi),%rsi
+	subq	$6,%rdx
+	jnc	L$ocb_enc_grandloop
+
+L$ocb_enc_short:
+	addq	$6,%rdx
+	jz	L$ocb_enc_done
+
+	movdqu	0(%rdi),%xmm2
+	cmpq	$2,%rdx
+	jb	L$ocb_enc_one
+	movdqu	16(%rdi),%xmm3
+	je	L$ocb_enc_two
+
+	movdqu	32(%rdi),%xmm4
+	cmpq	$4,%rdx
+	jb	L$ocb_enc_three
+	movdqu	48(%rdi),%xmm5
+	je	L$ocb_enc_four
+
+	movdqu	64(%rdi),%xmm6
+	pxor	%xmm7,%xmm7
+
+	call	__ocb_encrypt6
+
+	movdqa	%xmm14,%xmm15
+	movups	%xmm2,0(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+	movups	%xmm6,64(%rsi)
+
+	jmp	L$ocb_enc_done
+
+.p2align	4
+L$ocb_enc_one:
+	movdqa	%xmm10,%xmm7
+
+	call	__ocb_encrypt1
+
+	movdqa	%xmm7,%xmm15
+	movups	%xmm2,0(%rsi)
+	jmp	L$ocb_enc_done
+
+.p2align	4
+L$ocb_enc_two:
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+
+	call	__ocb_encrypt4
+
+	movdqa	%xmm11,%xmm15
+	movups	%xmm2,0(%rsi)
+	movups	%xmm3,16(%rsi)
+
+	jmp	L$ocb_enc_done
+
+.p2align	4
+L$ocb_enc_three:
+	pxor	%xmm5,%xmm5
+
+	call	__ocb_encrypt4
+
+	movdqa	%xmm12,%xmm15
+	movups	%xmm2,0(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+
+	jmp	L$ocb_enc_done
+
+.p2align	4
+L$ocb_enc_four:
+	call	__ocb_encrypt4
+
+	movdqa	%xmm13,%xmm15
+	movups	%xmm2,0(%rsi)
+	movups	%xmm3,16(%rsi)
+	movups	%xmm4,32(%rsi)
+	movups	%xmm5,48(%rsi)
+
+L$ocb_enc_done:
+	pxor	%xmm0,%xmm15
+	movdqu	%xmm8,(%rbp)
+	movdqu	%xmm15,(%r9)
+
+	xorps	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
+	pxor	%xmm10,%xmm10
+	pxor	%xmm11,%xmm11
+	pxor	%xmm12,%xmm12
+	pxor	%xmm13,%xmm13
+	pxor	%xmm14,%xmm14
+	pxor	%xmm15,%xmm15
+	leaq	40(%rsp),%rax
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+L$ocb_enc_epilogue:
+	.byte	0xf3,0xc3
+
+
+
+.p2align	5
+__ocb_encrypt6:
+	pxor	%xmm9,%xmm15
+	movdqu	(%rbx,%r12,1),%xmm11
+	movdqa	%xmm10,%xmm12
+	movdqu	(%rbx,%r13,1),%xmm13
+	movdqa	%xmm10,%xmm14
+	pxor	%xmm15,%xmm10
+	movdqu	(%rbx,%r14,1),%xmm15
+	pxor	%xmm10,%xmm11
+	pxor	%xmm2,%xmm8
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm12
+	pxor	%xmm3,%xmm8
+	pxor	%xmm11,%xmm3
+	pxor	%xmm12,%xmm13
+	pxor	%xmm4,%xmm8
+	pxor	%xmm12,%xmm4
+	pxor	%xmm13,%xmm14
+	pxor	%xmm5,%xmm8
+	pxor	%xmm13,%xmm5
+	pxor	%xmm14,%xmm15
+	pxor	%xmm6,%xmm8
+	pxor	%xmm14,%xmm6
+	pxor	%xmm7,%xmm8
+	pxor	%xmm15,%xmm7
+	movups	32(%r11),%xmm0
+
+	leaq	1(%r8),%r12
+	leaq	3(%r8),%r13
+	leaq	5(%r8),%r14
+	addq	$6,%r8
+	pxor	%xmm9,%xmm10
+	bsfq	%r12,%r12
+	bsfq	%r13,%r13
+	bsfq	%r14,%r14
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	pxor	%xmm9,%xmm11
+	pxor	%xmm9,%xmm12
+.byte	102,15,56,220,241
+	pxor	%xmm9,%xmm13
+	pxor	%xmm9,%xmm14
+.byte	102,15,56,220,249
+	movups	48(%r11),%xmm1
+	pxor	%xmm9,%xmm15
+
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+	movups	64(%r11),%xmm0
+	shlq	$4,%r12
+	shlq	$4,%r13
+	jmp	L$ocb_enc_loop6
+
+.p2align	5
+L$ocb_enc_loop6:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+.byte	102,15,56,220,240
+.byte	102,15,56,220,248
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	L$ocb_enc_loop6
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+.byte	102,15,56,220,241
+.byte	102,15,56,220,249
+	movups	16(%r11),%xmm1
+	shlq	$4,%r14
+
+.byte	102,65,15,56,221,210
+	movdqu	(%rbx),%xmm10
+	movq	%r10,%rax
+.byte	102,65,15,56,221,219
+.byte	102,65,15,56,221,228
+.byte	102,65,15,56,221,237
+.byte	102,65,15,56,221,246
+.byte	102,65,15,56,221,255
+	.byte	0xf3,0xc3
+
+
+
+.p2align	5
+__ocb_encrypt4:
+	pxor	%xmm9,%xmm15
+	movdqu	(%rbx,%r12,1),%xmm11
+	movdqa	%xmm10,%xmm12
+	movdqu	(%rbx,%r13,1),%xmm13
+	pxor	%xmm15,%xmm10
+	pxor	%xmm10,%xmm11
+	pxor	%xmm2,%xmm8
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm12
+	pxor	%xmm3,%xmm8
+	pxor	%xmm11,%xmm3
+	pxor	%xmm12,%xmm13
+	pxor	%xmm4,%xmm8
+	pxor	%xmm12,%xmm4
+	pxor	%xmm5,%xmm8
+	pxor	%xmm13,%xmm5
+	movups	32(%r11),%xmm0
+
+	pxor	%xmm9,%xmm10
+	pxor	%xmm9,%xmm11
+	pxor	%xmm9,%xmm12
+	pxor	%xmm9,%xmm13
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	48(%r11),%xmm1
+
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movups	64(%r11),%xmm0
+	jmp	L$ocb_enc_loop4
+
+.p2align	5
+L$ocb_enc_loop4:
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+
+.byte	102,15,56,220,208
+.byte	102,15,56,220,216
+.byte	102,15,56,220,224
+.byte	102,15,56,220,232
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	L$ocb_enc_loop4
+
+.byte	102,15,56,220,209
+.byte	102,15,56,220,217
+.byte	102,15,56,220,225
+.byte	102,15,56,220,233
+	movups	16(%r11),%xmm1
+	movq	%r10,%rax
+
+.byte	102,65,15,56,221,210
+.byte	102,65,15,56,221,219
+.byte	102,65,15,56,221,228
+.byte	102,65,15,56,221,237
+	.byte	0xf3,0xc3
+
+
+
+.p2align	5
+__ocb_encrypt1:
+	pxor	%xmm15,%xmm7
+	pxor	%xmm9,%xmm7
+	pxor	%xmm2,%xmm8
+	pxor	%xmm7,%xmm2
+	movups	32(%r11),%xmm0
+
+.byte	102,15,56,220,209
+	movups	48(%r11),%xmm1
+	pxor	%xmm9,%xmm7
+
+.byte	102,15,56,220,208
+	movups	64(%r11),%xmm0
+	jmp	L$ocb_enc_loop1
+
+.p2align	5
+L$ocb_enc_loop1:
+.byte	102,15,56,220,209
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+
+.byte	102,15,56,220,208
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	L$ocb_enc_loop1
+
+.byte	102,15,56,220,209
+	movups	16(%r11),%xmm1
+	movq	%r10,%rax
+
+.byte	102,15,56,221,215
+	.byte	0xf3,0xc3
+
+
+.globl	_aesni_ocb_decrypt
+.private_extern _aesni_ocb_decrypt
+
+.p2align	5
+_aesni_ocb_decrypt:
+	leaq	(%rsp),%rax
+	pushq	%rbx
+	pushq	%rbp
+	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	movq	8(%rax),%rbx
+	movq	8+8(%rax),%rbp
+
+	movl	240(%rcx),%r10d
+	movq	%rcx,%r11
+	shll	$4,%r10d
+	movups	(%rcx),%xmm9
+	movups	16(%rcx,%r10,1),%xmm1
+
+	movdqu	(%r9),%xmm15
+	pxor	%xmm1,%xmm9
+	pxor	%xmm1,%xmm15
+
+	movl	$16+32,%eax
+	leaq	32(%r11,%r10,1),%rcx
+	movups	16(%r11),%xmm1
+	subq	%r10,%rax
+	movq	%rax,%r10
+
+	movdqu	(%rbx),%xmm10
+	movdqu	(%rbp),%xmm8
+
+	testq	$1,%r8
+	jnz	L$ocb_dec_odd
+
+	bsfq	%r8,%r12
+	addq	$1,%r8
+	shlq	$4,%r12
+	movdqu	(%rbx,%r12,1),%xmm7
+	movdqu	(%rdi),%xmm2
+	leaq	16(%rdi),%rdi
+
+	call	__ocb_decrypt1
+
+	movdqa	%xmm7,%xmm15
+	movups	%xmm2,(%rsi)
+	xorps	%xmm2,%xmm8
+	leaq	16(%rsi),%rsi
+	subq	$1,%rdx
+	jz	L$ocb_dec_done
+
+L$ocb_dec_odd:
+	leaq	1(%r8),%r12
+	leaq	3(%r8),%r13
+	leaq	5(%r8),%r14
+	leaq	6(%r8),%r8
+	bsfq	%r12,%r12
+	bsfq	%r13,%r13
+	bsfq	%r14,%r14
+	shlq	$4,%r12
+	shlq	$4,%r13
+	shlq	$4,%r14
+
+	subq	$6,%rdx
+	jc	L$ocb_dec_short
+	jmp	L$ocb_dec_grandloop
+
+.p2align	5
+L$ocb_dec_grandloop:
+	movdqu	0(%rdi),%xmm2
+	movdqu	16(%rdi),%xmm3
+	movdqu	32(%rdi),%xmm4
+	movdqu	48(%rdi),%xmm5
+	movdqu	64(%rdi),%xmm6
+	movdqu	80(%rdi),%xmm7
+	leaq	96(%rdi),%rdi
+
+	call	__ocb_decrypt6
+
+	movups	%xmm2,0(%rsi)
+	pxor	%xmm2,%xmm8
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm8
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm8
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm8
+	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm8
+	movups	%xmm7,80(%rsi)
+	pxor	%xmm7,%xmm8
+	leaq	96(%rsi),%rsi
+	subq	$6,%rdx
+	jnc	L$ocb_dec_grandloop
+
+L$ocb_dec_short:
+	addq	$6,%rdx
+	jz	L$ocb_dec_done
+
+	movdqu	0(%rdi),%xmm2
+	cmpq	$2,%rdx
+	jb	L$ocb_dec_one
+	movdqu	16(%rdi),%xmm3
+	je	L$ocb_dec_two
+
+	movdqu	32(%rdi),%xmm4
+	cmpq	$4,%rdx
+	jb	L$ocb_dec_three
+	movdqu	48(%rdi),%xmm5
+	je	L$ocb_dec_four
+
+	movdqu	64(%rdi),%xmm6
+	pxor	%xmm7,%xmm7
+
+	call	__ocb_decrypt6
+
+	movdqa	%xmm14,%xmm15
+	movups	%xmm2,0(%rsi)
+	pxor	%xmm2,%xmm8
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm8
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm8
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm8
+	movups	%xmm6,64(%rsi)
+	pxor	%xmm6,%xmm8
+
+	jmp	L$ocb_dec_done
+
+.p2align	4
+L$ocb_dec_one:
+	movdqa	%xmm10,%xmm7
+
+	call	__ocb_decrypt1
+
+	movdqa	%xmm7,%xmm15
+	movups	%xmm2,0(%rsi)
+	xorps	%xmm2,%xmm8
+	jmp	L$ocb_dec_done
+
+.p2align	4
+L$ocb_dec_two:
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+
+	call	__ocb_decrypt4
+
+	movdqa	%xmm11,%xmm15
+	movups	%xmm2,0(%rsi)
+	xorps	%xmm2,%xmm8
+	movups	%xmm3,16(%rsi)
+	xorps	%xmm3,%xmm8
+
+	jmp	L$ocb_dec_done
+
+.p2align	4
+L$ocb_dec_three:
+	pxor	%xmm5,%xmm5
+
+	call	__ocb_decrypt4
+
+	movdqa	%xmm12,%xmm15
+	movups	%xmm2,0(%rsi)
+	xorps	%xmm2,%xmm8
+	movups	%xmm3,16(%rsi)
+	xorps	%xmm3,%xmm8
+	movups	%xmm4,32(%rsi)
+	xorps	%xmm4,%xmm8
+
+	jmp	L$ocb_dec_done
+
+.p2align	4
+L$ocb_dec_four:
+	call	__ocb_decrypt4
+
+	movdqa	%xmm13,%xmm15
+	movups	%xmm2,0(%rsi)
+	pxor	%xmm2,%xmm8
+	movups	%xmm3,16(%rsi)
+	pxor	%xmm3,%xmm8
+	movups	%xmm4,32(%rsi)
+	pxor	%xmm4,%xmm8
+	movups	%xmm5,48(%rsi)
+	pxor	%xmm5,%xmm8
+
+L$ocb_dec_done:
+	pxor	%xmm0,%xmm15
+	movdqu	%xmm8,(%rbp)
+	movdqu	%xmm15,(%r9)
+
+	xorps	%xmm0,%xmm0
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
+	pxor	%xmm10,%xmm10
+	pxor	%xmm11,%xmm11
+	pxor	%xmm12,%xmm12
+	pxor	%xmm13,%xmm13
+	pxor	%xmm14,%xmm14
+	pxor	%xmm15,%xmm15
+	leaq	40(%rsp),%rax
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbp
+	movq	-8(%rax),%rbx
+	leaq	(%rax),%rsp
+L$ocb_dec_epilogue:
+	.byte	0xf3,0xc3
+
+
+
+.p2align	5
+__ocb_decrypt6:
+	pxor	%xmm9,%xmm15
+	movdqu	(%rbx,%r12,1),%xmm11
+	movdqa	%xmm10,%xmm12
+	movdqu	(%rbx,%r13,1),%xmm13
+	movdqa	%xmm10,%xmm14
+	pxor	%xmm15,%xmm10
+	movdqu	(%rbx,%r14,1),%xmm15
+	pxor	%xmm10,%xmm11
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm12
+	pxor	%xmm11,%xmm3
+	pxor	%xmm12,%xmm13
+	pxor	%xmm12,%xmm4
+	pxor	%xmm13,%xmm14
+	pxor	%xmm13,%xmm5
+	pxor	%xmm14,%xmm15
+	pxor	%xmm14,%xmm6
+	pxor	%xmm15,%xmm7
+	movups	32(%r11),%xmm0
+
+	leaq	1(%r8),%r12
+	leaq	3(%r8),%r13
+	leaq	5(%r8),%r14
+	addq	$6,%r8
+	pxor	%xmm9,%xmm10
+	bsfq	%r12,%r12
+	bsfq	%r13,%r13
+	bsfq	%r14,%r14
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	pxor	%xmm9,%xmm11
+	pxor	%xmm9,%xmm12
+.byte	102,15,56,222,241
+	pxor	%xmm9,%xmm13
+	pxor	%xmm9,%xmm14
+.byte	102,15,56,222,249
+	movups	48(%r11),%xmm1
+	pxor	%xmm9,%xmm15
+
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+	movups	64(%r11),%xmm0
+	shlq	$4,%r12
+	shlq	$4,%r13
+	jmp	L$ocb_dec_loop6
+
+.p2align	5
+L$ocb_dec_loop6:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+.byte	102,15,56,222,240
+.byte	102,15,56,222,248
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	L$ocb_dec_loop6
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+.byte	102,15,56,222,241
+.byte	102,15,56,222,249
+	movups	16(%r11),%xmm1
+	shlq	$4,%r14
+
+.byte	102,65,15,56,223,210
+	movdqu	(%rbx),%xmm10
+	movq	%r10,%rax
+.byte	102,65,15,56,223,219
+.byte	102,65,15,56,223,228
+.byte	102,65,15,56,223,237
+.byte	102,65,15,56,223,246
+.byte	102,65,15,56,223,255
+	.byte	0xf3,0xc3
+
+
+
+.p2align	5
+__ocb_decrypt4:
+	pxor	%xmm9,%xmm15
+	movdqu	(%rbx,%r12,1),%xmm11
+	movdqa	%xmm10,%xmm12
+	movdqu	(%rbx,%r13,1),%xmm13
+	pxor	%xmm15,%xmm10
+	pxor	%xmm10,%xmm11
+	pxor	%xmm10,%xmm2
+	pxor	%xmm11,%xmm12
+	pxor	%xmm11,%xmm3
+	pxor	%xmm12,%xmm13
+	pxor	%xmm12,%xmm4
+	pxor	%xmm13,%xmm5
+	movups	32(%r11),%xmm0
+
+	pxor	%xmm9,%xmm10
+	pxor	%xmm9,%xmm11
+	pxor	%xmm9,%xmm12
+	pxor	%xmm9,%xmm13
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	movups	48(%r11),%xmm1
+
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+	movups	64(%r11),%xmm0
+	jmp	L$ocb_dec_loop4
+
+.p2align	5
+L$ocb_dec_loop4:
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+
+.byte	102,15,56,222,208
+.byte	102,15,56,222,216
+.byte	102,15,56,222,224
+.byte	102,15,56,222,232
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	L$ocb_dec_loop4
+
+.byte	102,15,56,222,209
+.byte	102,15,56,222,217
+.byte	102,15,56,222,225
+.byte	102,15,56,222,233
+	movups	16(%r11),%xmm1
+	movq	%r10,%rax
+
+.byte	102,65,15,56,223,210
+.byte	102,65,15,56,223,219
+.byte	102,65,15,56,223,228
+.byte	102,65,15,56,223,237
+	.byte	0xf3,0xc3
+
+
+
+.p2align	5
+__ocb_decrypt1:
+	pxor	%xmm15,%xmm7
+	pxor	%xmm9,%xmm7
+	pxor	%xmm7,%xmm2
+	movups	32(%r11),%xmm0
+
+.byte	102,15,56,222,209
+	movups	48(%r11),%xmm1
+	pxor	%xmm9,%xmm7
+
+.byte	102,15,56,222,208
+	movups	64(%r11),%xmm0
+	jmp	L$ocb_dec_loop1
+
+.p2align	5
+L$ocb_dec_loop1:
+.byte	102,15,56,222,209
+	movups	(%rcx,%rax,1),%xmm1
+	addq	$32,%rax
+
+.byte	102,15,56,222,208
+	movups	-16(%rcx,%rax,1),%xmm0
+	jnz	L$ocb_dec_loop1
+
+.byte	102,15,56,222,209
+	movups	16(%r11),%xmm1
+	movq	%r10,%rax
+
+.byte	102,15,56,223,215
+	.byte	0xf3,0xc3
+
 .globl	_aesni_cbc_encrypt
 .private_extern _aesni_cbc_encrypt
 
@@ -2637,11 +3450,11 @@
 	jmp	L$cbc_ret
 .p2align	4
 L$cbc_decrypt_bulk:
-	leaq	(%rsp),%rax
+	leaq	(%rsp),%r11
 	pushq	%rbp
 	subq	$16,%rsp
 	andq	$-16,%rsp
-	leaq	-8(%rax),%rbp
+	movq	%rcx,%rbp
 	movups	(%r8),%xmm10
 	movl	%r10d,%eax
 	cmpq	$0x50,%rdx
@@ -2681,7 +3494,7 @@
 	pxor	%xmm0,%xmm3
 	movups	16-112(%rcx),%xmm1
 	pxor	%xmm0,%xmm4
-	xorq	%r11,%r11
+	movq	$-1,%rbp
 	cmpq	$0x70,%rdx
 	pxor	%xmm0,%xmm5
 	pxor	%xmm0,%xmm6
@@ -2697,10 +3510,10 @@
 .byte	102,15,56,222,241
 .byte	102,15,56,222,249
 .byte	102,68,15,56,222,193
-	setnc	%r11b
-	shlq	$7,%r11
+	adcq	$0,%rbp
+	andq	$128,%rbp
 .byte	102,68,15,56,222,201
-	addq	%rdi,%r11
+	addq	%rdi,%rbp
 	movups	48-112(%rcx),%xmm1
 .byte	102,15,56,222,208
 .byte	102,15,56,222,216
@@ -2838,18 +3651,18 @@
 	movdqu	112(%rdi),%xmm0
 .byte	102,65,15,56,223,228
 	leaq	128(%rdi),%rdi
-	movdqu	0(%r11),%xmm11
+	movdqu	0(%rbp),%xmm11
 .byte	102,65,15,56,223,237
 .byte	102,65,15,56,223,246
-	movdqu	16(%r11),%xmm12
-	movdqu	32(%r11),%xmm13
+	movdqu	16(%rbp),%xmm12
+	movdqu	32(%rbp),%xmm13
 .byte	102,65,15,56,223,255
 .byte	102,68,15,56,223,193
-	movdqu	48(%r11),%xmm14
-	movdqu	64(%r11),%xmm15
+	movdqu	48(%rbp),%xmm14
+	movdqu	64(%rbp),%xmm15
 .byte	102,69,15,56,223,202
 	movdqa	%xmm0,%xmm10
-	movdqu	80(%r11),%xmm1
+	movdqu	80(%rbp),%xmm1
 	movups	-112(%rcx),%xmm0
 
 	movups	%xmm2,(%rsi)
@@ -2968,7 +3781,7 @@
 	pxor	%xmm13,%xmm5
 	movdqu	%xmm4,32(%rsi)
 	pxor	%xmm14,%xmm6
-	movq	%r11,%rcx
+	movq	%rbp,%rcx
 	movdqu	%xmm5,48(%rsi)
 	pxor	%xmm15,%xmm7
 	movl	%r10d,%eax
@@ -3121,8 +3934,8 @@
 L$cbc_dec_ret:
 	xorps	%xmm0,%xmm0
 	pxor	%xmm1,%xmm1
-	leaq	(%rbp),%rsp
-	popq	%rbp
+	movq	-8(%r11),%rbp
+	leaq	(%r11),%rsp
 L$cbc_ret:
 	.byte	0xf3,0xc3
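
The new _aesni_ocb_encrypt/_aesni_ocb_decrypt routines above follow OCB's offset schedule: before processing block i, the running offset is XORed with L_{ntz(i)}, where ntz(i) is the number of trailing zero bits of i and the L_j values are the precomputed 16-byte entries of the table at (%rbx). `bsfq` computes ntz directly, and the shift by 4 scales it to a table byte offset. Condensed from the six-block setup above, with comments added (with %r8 odd at this point, i+1, i+3 and i+5 are the even indices of the coming run; the odd indices have ntz = 0 and reuse L_0, kept in %xmm10):

	leaq	1(%r8),%r12		# %r8 = blocks done so far
	leaq	3(%r8),%r13		# i+1, i+3, i+5: the even indices
	leaq	5(%r8),%r14		#   of the next six blocks
	addq	$6,%r8			# account for the whole run
	bsfq	%r12,%r12		# ntz(i+1): index of lowest set bit
	bsfq	%r13,%r13		# ntz(i+3)
	bsfq	%r14,%r14		# ntz(i+5)
	shlq	$4,%r12			# scale to bytes: L entries are
	shlq	$4,%r13			#   16 bytes apart
	shlq	$4,%r14
	movdqu	(%rbx,%r12,1),%xmm11	# L_{ntz(i+1)}, folded into the offset
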
 
diff --git a/mac-x86_64/crypto/aes/bsaes-x86_64.S b/mac-x86_64/crypto/aes/bsaes-x86_64.S
index ad802e3..6e679c1 100644
--- a/mac-x86_64/crypto/aes/bsaes-x86_64.S
+++ b/mac-x86_64/crypto/aes/bsaes-x86_64.S
@@ -1302,15 +1302,14 @@
 	cmpq	%rax,%rbp
 	ja	L$cbc_dec_bzero
 
-	leaq	(%rbp),%rsp
-	movq	72(%rsp),%r15
-	movq	80(%rsp),%r14
-	movq	88(%rsp),%r13
-	movq	96(%rsp),%r12
-	movq	104(%rsp),%rbx
-	movq	112(%rsp),%rax
-	leaq	120(%rsp),%rsp
-	movq	%rax,%rbp
+	leaq	120(%rbp),%rax
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbx
+	movq	-8(%rax),%rbp
+	leaq	(%rax),%rsp
 L$cbc_dec_epilogue:
 	.byte	0xf3,0xc3
 
@@ -1503,15 +1502,14 @@
 	cmpq	%rax,%rbp
 	ja	L$ctr_enc_bzero
 
-	leaq	(%rbp),%rsp
-	movq	72(%rsp),%r15
-	movq	80(%rsp),%r14
-	movq	88(%rsp),%r13
-	movq	96(%rsp),%r12
-	movq	104(%rsp),%rbx
-	movq	112(%rsp),%rax
-	leaq	120(%rsp),%rsp
-	movq	%rax,%rbp
+	leaq	120(%rbp),%rax
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbx
+	movq	-8(%rax),%rbp
+	leaq	(%rax),%rsp
 L$ctr_enc_epilogue:
 	.byte	0xf3,0xc3
 
@@ -1955,15 +1953,14 @@
 	cmpq	%rax,%rbp
 	ja	L$xts_enc_bzero
 
-	leaq	(%rbp),%rsp
-	movq	72(%rsp),%r15
-	movq	80(%rsp),%r14
-	movq	88(%rsp),%r13
-	movq	96(%rsp),%r12
-	movq	104(%rsp),%rbx
-	movq	112(%rsp),%rax
-	leaq	120(%rsp),%rsp
-	movq	%rax,%rbp
+	leaq	120(%rbp),%rax
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbx
+	movq	-8(%rax),%rbp
+	leaq	(%rax),%rsp
 L$xts_enc_epilogue:
 	.byte	0xf3,0xc3
 
@@ -2434,15 +2431,14 @@
 	cmpq	%rax,%rbp
 	ja	L$xts_dec_bzero
 
-	leaq	(%rbp),%rsp
-	movq	72(%rsp),%r15
-	movq	80(%rsp),%r14
-	movq	88(%rsp),%r13
-	movq	96(%rsp),%r12
-	movq	104(%rsp),%rbx
-	movq	112(%rsp),%rax
-	leaq	120(%rsp),%rsp
-	movq	%rax,%rbp
+	leaq	120(%rbp),%rax
+	movq	-48(%rax),%r15
+	movq	-40(%rax),%r14
+	movq	-32(%rax),%r13
+	movq	-24(%rax),%r12
+	movq	-16(%rax),%rbx
+	movq	-8(%rax),%rbp
+	leaq	(%rax),%rsp
 L$xts_dec_epilogue:
 	.byte	0xf3,0xc3
 
diff --git a/mac-x86_64/crypto/bn/x86_64-mont.S b/mac-x86_64/crypto/bn/x86_64-mont.S
index 51e5d19..41a0926 100644
--- a/mac-x86_64/crypto/bn/x86_64-mont.S
+++ b/mac-x86_64/crypto/bn/x86_64-mont.S
@@ -8,6 +8,10 @@
 
 .p2align	4
 _bn_mul_mont:
+
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+
 	testl	$3,%r9d
 	jnz	L$mul_enter
 	cmpl	$8,%r9d
@@ -21,20 +25,50 @@
 .p2align	4
 L$mul_enter:
 	pushq	%rbx
+
 	pushq	%rbp
+
 	pushq	%r12
+
 	pushq	%r13
+
 	pushq	%r14
+
 	pushq	%r15
 
-	movl	%r9d,%r9d
-	leaq	2(%r9),%r10
-	movq	%rsp,%r11
-	negq	%r10
-	leaq	(%rsp,%r10,8),%rsp
-	andq	$-1024,%rsp
 
-	movq	%r11,8(%rsp,%r9,8)
+	negq	%r9
+	movq	%rsp,%r11
+	leaq	-16(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
+
+
+
+
+
+
+
+
+
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	L$mul_page_walk
+	jmp	L$mul_page_walk_done
+
+.p2align	4
+L$mul_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	L$mul_page_walk
+L$mul_page_walk_done:
+
+	movq	%rax,8(%rsp,%r9,8)
+
 L$mul_body:
 	movq	%rdx,%r12
 	movq	(%r8),%r8
@@ -186,51 +220,86 @@
 
 	sbbq	$0,%rax
 	xorq	%r14,%r14
+	andq	%rax,%rsi
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx
 	movq	%r9,%r15
+	orq	%rcx,%rsi
 .p2align	4
 L$copy:
-	movq	(%rsp,%r14,8),%rsi
-	movq	(%rdi,%r14,8),%rcx
-	xorq	%rcx,%rsi
-	andq	%rax,%rsi
-	xorq	%rcx,%rsi
+	movq	(%rsi,%r14,8),%rax
 	movq	%r14,(%rsp,%r14,8)
-	movq	%rsi,(%rdi,%r14,8)
+	movq	%rax,(%rdi,%r14,8)
 	leaq	1(%r14),%r14
 	subq	$1,%r15
 	jnz	L$copy
 
 	movq	8(%rsp,%r9,8),%rsi
+
 	movq	$1,%rax
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
 L$mul_epilogue:
 	.byte	0xf3,0xc3
 
 
+
 .p2align	4
 bn_mul4x_mont:
-L$mul4x_enter:
-	pushq	%rbx
-	pushq	%rbp
-	pushq	%r12
-	pushq	%r13
-	pushq	%r14
-	pushq	%r15
 
 	movl	%r9d,%r9d
-	leaq	4(%r9),%r10
-	movq	%rsp,%r11
-	negq	%r10
-	leaq	(%rsp,%r10,8),%rsp
-	andq	$-1024,%rsp
+	movq	%rsp,%rax
 
-	movq	%r11,8(%rsp,%r9,8)
+L$mul4x_enter:
+	pushq	%rbx
+
+	pushq	%rbp
+
+	pushq	%r12
+
+	pushq	%r13
+
+	pushq	%r14
+
+	pushq	%r15
+
+
+	negq	%r9
+	movq	%rsp,%r11
+	leaq	-32(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
+
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	L$mul4x_page_walk
+	jmp	L$mul4x_page_walk_done
+
+L$mul4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	L$mul4x_page_walk
+L$mul4x_page_walk_done:
+
+	movq	%rax,8(%rsp,%r9,8)
+
 L$mul4x_body:
 	movq	%rdi,16(%rsp,%r9,8)
 	movq	%rdx,%r12
@@ -530,9 +599,11 @@
 	cmpq	%r9,%r14
 	jb	L$outer4x
 	movq	16(%rsp,%r9,8),%rdi
+	leaq	-4(%r9),%r15
 	movq	0(%rsp),%rax
+	pxor	%xmm0,%xmm0
 	movq	8(%rsp),%rdx
-	shrq	$2,%r9
+	shrq	$2,%r15
 	leaq	(%rsp),%rsi
 	xorq	%r14,%r14
 
@@ -540,7 +611,6 @@
 	movq	16(%rsi),%rbx
 	movq	24(%rsi),%rbp
 	sbbq	8(%rcx),%rdx
-	leaq	-1(%r9),%r15
 	jmp	L$sub4x
 .p2align	4
 L$sub4x:
@@ -568,62 +638,79 @@
 	movq	%rbx,16(%rdi,%r14,8)
 
 	sbbq	$0,%rax
-	movq	%rax,%xmm0
-	punpcklqdq	%xmm0,%xmm0
 	movq	%rbp,24(%rdi,%r14,8)
 	xorq	%r14,%r14
+	andq	%rax,%rsi
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx
+	leaq	-4(%r9),%r15
+	orq	%rcx,%rsi
+	shrq	$2,%r15
 
-	movq	%r9,%r15
-	pxor	%xmm5,%xmm5
+	movdqu	(%rsi),%xmm1
+	movdqa	%xmm0,(%rsp)
+	movdqu	%xmm1,(%rdi)
 	jmp	L$copy4x
 .p2align	4
 L$copy4x:
-	movdqu	(%rsp,%r14,1),%xmm2
-	movdqu	16(%rsp,%r14,1),%xmm4
-	movdqu	(%rdi,%r14,1),%xmm1
-	movdqu	16(%rdi,%r14,1),%xmm3
-	pxor	%xmm1,%xmm2
-	pxor	%xmm3,%xmm4
-	pand	%xmm0,%xmm2
-	pand	%xmm0,%xmm4
-	pxor	%xmm1,%xmm2
-	pxor	%xmm3,%xmm4
-	movdqu	%xmm2,(%rdi,%r14,1)
-	movdqu	%xmm4,16(%rdi,%r14,1)
-	movdqa	%xmm5,(%rsp,%r14,1)
-	movdqa	%xmm5,16(%rsp,%r14,1)
-
+	movdqu	16(%rsi,%r14,1),%xmm2
+	movdqu	32(%rsi,%r14,1),%xmm1
+	movdqa	%xmm0,16(%rsp,%r14,1)
+	movdqu	%xmm2,16(%rdi,%r14,1)
+	movdqa	%xmm0,32(%rsp,%r14,1)
+	movdqu	%xmm1,32(%rdi,%r14,1)
 	leaq	32(%r14),%r14
 	decq	%r15
 	jnz	L$copy4x
 
-	shlq	$2,%r9
+	movdqu	16(%rsi,%r14,1),%xmm2
+	movdqa	%xmm0,16(%rsp,%r14,1)
+	movdqu	%xmm2,16(%rdi,%r14,1)
 	movq	8(%rsp,%r9,8),%rsi
+
 	movq	$1,%rax
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+
+	movq	-40(%rsi),%r14
+
+	movq	-32(%rsi),%r13
+
+	movq	-24(%rsi),%r12
+
+	movq	-16(%rsi),%rbp
+
+	movq	-8(%rsi),%rbx
+
+	leaq	(%rsi),%rsp
+
 L$mul4x_epilogue:
 	.byte	0xf3,0xc3
 
 
 
 
+
 .p2align	5
 bn_sqr8x_mont:
-L$sqr8x_enter:
+
 	movq	%rsp,%rax
+
+L$sqr8x_enter:
 	pushq	%rbx
+
 	pushq	%rbp
+
 	pushq	%r12
+
 	pushq	%r13
+
 	pushq	%r14
+
 	pushq	%r15
 
+L$sqr8x_prologue:
+
 	movl	%r9d,%r10d
 	shll	$3,%r9d
 	shlq	$3+2,%r10
@@ -635,30 +722,49 @@
 
 
 	leaq	-64(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
 	movq	(%r8),%r8
 	subq	%rsi,%r11
 	andq	$4095,%r11
 	cmpq	%r11,%r10
 	jb	L$sqr8x_sp_alt
-	subq	%r11,%rsp
-	leaq	-64(%rsp,%r9,2),%rsp
+	subq	%r11,%rbp
+	leaq	-64(%rbp,%r9,2),%rbp
 	jmp	L$sqr8x_sp_done
 
 .p2align	5
 L$sqr8x_sp_alt:
 	leaq	4096-64(,%r9,2),%r10
-	leaq	-64(%rsp,%r9,2),%rsp
+	leaq	-64(%rbp,%r9,2),%rbp
 	subq	%r10,%r11
 	movq	$0,%r10
 	cmovcq	%r10,%r11
-	subq	%r11,%rsp
+	subq	%r11,%rbp
 L$sqr8x_sp_done:
-	andq	$-64,%rsp
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$sqr8x_page_walk
+	jmp	L$sqr8x_page_walk_done
+
+.p2align	4
+L$sqr8x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$sqr8x_page_walk
+L$sqr8x_page_walk_done:
+
 	movq	%r9,%r10
 	negq	%r9
 
 	movq	%r8,32(%rsp)
 	movq	%rax,40(%rsp)
+
 L$sqr8x_body:
 
 .byte	102,72,15,110,209
@@ -705,6 +811,7 @@
 	pxor	%xmm0,%xmm0
 	pshufd	$0,%xmm1,%xmm1
 	movq	40(%rsp),%rsi
+
 	jmp	L$sqr8x_cond_copy
 
 .p2align	5
@@ -734,15 +841,23 @@
 
 	movq	$1,%rax
 	movq	-48(%rsi),%r15
+
 	movq	-40(%rsi),%r14
+
 	movq	-32(%rsi),%r13
+
 	movq	-24(%rsi),%r12
+
 	movq	-16(%rsi),%rbp
+
 	movq	-8(%rsi),%rbx
+
 	leaq	(%rsi),%rsp
+
 L$sqr8x_epilogue:
 	.byte	0xf3,0xc3
 
+
 .byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .p2align	4
 #endif
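
bn_mul_mont's tail above also switches its conditional copy to a branch-free pointer select: the borrow from the final subtraction is widened into an all-zeros/all-ones mask, the mask chooses between the scratch copy (%rsi) and the just-subtracted result (%rdi), and the copy loop then simply streams from the winner while overwriting the on-stack scratch with the loop counter. Annotated:

	sbbq	$0,%rax			# %rax = -1 if the subtraction
					#   borrowed (tmp < mod), else 0
	andq	%rax,%rsi		# borrow:    keep the scratch pointer
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx		# no borrow: keep the result pointer
	orq	%rcx,%rsi		# %rsi = borrow ? tmp : rp, no branch

The same rewrite appears in bn_mul4x_mont (with a movdqu-based copy) and in the 32-bit L017sub/L018copy hunks of bn/x86-mont.S earlier in this patch.
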
diff --git a/mac-x86_64/crypto/bn/x86_64-mont5.S b/mac-x86_64/crypto/bn/x86_64-mont5.S
index a154cc8..24b56de 100644
--- a/mac-x86_64/crypto/bn/x86_64-mont5.S
+++ b/mac-x86_64/crypto/bn/x86_64-mont5.S
@@ -8,30 +8,64 @@
 
 .p2align	6
 _bn_mul_mont_gather5:
+
+	movl	%r9d,%r9d
+	movq	%rsp,%rax
+
 	testl	$7,%r9d
 	jnz	L$mul_enter
 	jmp	L$mul4x_enter
 
 .p2align	4
 L$mul_enter:
-	movl	%r9d,%r9d
-	movq	%rsp,%rax
 	movd	8(%rsp),%xmm5
-	leaq	L$inc(%rip),%r10
 	pushq	%rbx
+
 	pushq	%rbp
+
 	pushq	%r12
+
 	pushq	%r13
+
 	pushq	%r14
+
 	pushq	%r15
 
-	leaq	2(%r9),%r11
-	negq	%r11
-	leaq	-264(%rsp,%r11,8),%rsp
-	andq	$-1024,%rsp
 
+	negq	%r9
+	movq	%rsp,%r11
+	leaq	-280(%rsp,%r9,8),%r10
+	negq	%r9
+	andq	$-1024,%r10
+
+
+
+
+
+
+
+
+
+	subq	%r10,%r11
+	andq	$-4096,%r11
+	leaq	(%r10,%r11,1),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	L$mul_page_walk
+	jmp	L$mul_page_walk_done
+
+L$mul_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r11
+	cmpq	%r10,%rsp
+	ja	L$mul_page_walk
+L$mul_page_walk_done:
+
+	leaq	L$inc(%rip),%r10
 	movq	%rax,8(%rsp,%r9,8)
+
 L$mul_body:
+
 	leaq	128(%rdx),%r12
 	movdqa	0(%r10),%xmm0
 	movdqa	16(%r10),%xmm1
@@ -370,46 +404,65 @@
 
 	sbbq	$0,%rax
 	xorq	%r14,%r14
+	andq	%rax,%rsi
+	notq	%rax
+	movq	%rdi,%rcx
+	andq	%rax,%rcx
 	movq	%r9,%r15
+	orq	%rcx,%rsi
 .p2align	4
 L$copy:
-	movq	(%rsp,%r14,8),%rsi
-	movq	(%rdi,%r14,8),%rcx
-	xorq	%rcx,%rsi
-	andq	%rax,%rsi
-	xorq	%rcx,%rsi
+	movq	(%rsi,%r14,8),%rax
 	movq	%r14,(%rsp,%r14,8)
-	movq	%rsi,(%rdi,%r14,8)
+	movq	%rax,(%rdi,%r14,8)
 	leaq	1(%r14),%r14
 	subq	$1,%r15
 	jnz	L$copy
 
 	movq	8(%rsp,%r9,8),%rsi
+
 	movq	$1,%rax
 
 	movq	-48(%rsi),%r15
+
 	movq	-40(%rsi),%r14
+
 	movq	-32(%rsi),%r13
+
 	movq	-24(%rsi),%r12
+
 	movq	-16(%rsi),%rbp
+
 	movq	-8(%rsi),%rbx
+
 	leaq	(%rsi),%rsp
+
 L$mul_epilogue:
 	.byte	0xf3,0xc3
 
 
+
 .p2align	5
 bn_mul4x_mont_gather5:
-L$mul4x_enter:
+
 .byte	0x67
 	movq	%rsp,%rax
+
+L$mul4x_enter:
 	pushq	%rbx
+
 	pushq	%rbp
+
 	pushq	%r12
+
 	pushq	%r13
+
 	pushq	%r14
+
 	pushq	%r15
 
+L$mul4x_prologue:
+
 .byte	0x67
 	shll	$3,%r9d
 	leaq	(%r9,%r9,2),%r10
@@ -425,46 +478,73 @@
 
 
 	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
 	subq	%rdi,%r11
 	andq	$4095,%r11
 	cmpq	%r11,%r10
 	jb	L$mul4xsp_alt
-	subq	%r11,%rsp
-	leaq	-320(%rsp,%r9,2),%rsp
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
 	jmp	L$mul4xsp_done
 
 .p2align	5
 L$mul4xsp_alt:
 	leaq	4096-320(,%r9,2),%r10
-	leaq	-320(%rsp,%r9,2),%rsp
+	leaq	-320(%rbp,%r9,2),%rbp
 	subq	%r10,%r11
 	movq	$0,%r10
 	cmovcq	%r10,%r11
-	subq	%r11,%rsp
+	subq	%r11,%rbp
 L$mul4xsp_done:
-	andq	$-64,%rsp
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$mul4x_page_walk
+	jmp	L$mul4x_page_walk_done
+
+L$mul4x_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$mul4x_page_walk
+L$mul4x_page_walk_done:
+
 	negq	%r9
 
 	movq	%rax,40(%rsp)
+
 L$mul4x_body:
 
 	call	mul4x_internal
 
 	movq	40(%rsp),%rsi
+
 	movq	$1,%rax
 
 	movq	-48(%rsi),%r15
+
 	movq	-40(%rsi),%r14
+
 	movq	-32(%rsi),%r13
+
 	movq	-24(%rsi),%r12
+
 	movq	-16(%rsi),%rbp
+
 	movq	-8(%rsi),%rbx
+
 	leaq	(%rsi),%rsp
+
 L$mul4x_epilogue:
 	.byte	0xf3,0xc3
 
 
 
+
 .p2align	5
 mul4x_internal:
 	shlq	$5,%r9
@@ -994,14 +1074,23 @@
 
 .p2align	5
 _bn_power5:
+
 	movq	%rsp,%rax
+
 	pushq	%rbx
+
 	pushq	%rbp
+
 	pushq	%r12
+
 	pushq	%r13
+
 	pushq	%r14
+
 	pushq	%r15
 
+L$power5_prologue:
+
 	shll	$3,%r9d
 	leal	(%r9,%r9,2),%r10d
 	negq	%r9
@@ -1015,24 +1104,41 @@
 
 
 	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
 	subq	%rdi,%r11
 	andq	$4095,%r11
 	cmpq	%r11,%r10
 	jb	L$pwr_sp_alt
-	subq	%r11,%rsp
-	leaq	-320(%rsp,%r9,2),%rsp
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
 	jmp	L$pwr_sp_done
 
 .p2align	5
 L$pwr_sp_alt:
 	leaq	4096-320(,%r9,2),%r10
-	leaq	-320(%rsp,%r9,2),%rsp
+	leaq	-320(%rbp,%r9,2),%rbp
 	subq	%r10,%r11
 	movq	$0,%r10
 	cmovcq	%r10,%r11
-	subq	%r11,%rsp
+	subq	%r11,%rbp
 L$pwr_sp_done:
-	andq	$-64,%rsp
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$pwr_page_walk
+	jmp	L$pwr_page_walk_done
+
+L$pwr_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$pwr_page_walk
+L$pwr_page_walk_done:
+
 	movq	%r9,%r10
 	negq	%r9
 
@@ -1047,6 +1153,7 @@
 
 	movq	%r8,32(%rsp)
 	movq	%rax,40(%rsp)
+
 L$power5_body:
 .byte	102,72,15,110,207
 .byte	102,72,15,110,209
@@ -1073,18 +1180,27 @@
 	call	mul4x_internal
 
 	movq	40(%rsp),%rsi
+
 	movq	$1,%rax
 	movq	-48(%rsi),%r15
+
 	movq	-40(%rsi),%r14
+
 	movq	-32(%rsi),%r13
+
 	movq	-24(%rsi),%r12
+
 	movq	-16(%rsi),%rbp
+
 	movq	-8(%rsi),%rbx
+
 	leaq	(%rsi),%rsp
+
 L$power5_epilogue:
 	.byte	0xf3,0xc3
 
 
+
 .globl	_bn_sqr8x_internal
 .private_extern _bn_sqr8x_internal
 .private_extern	_bn_sqr8x_internal
@@ -1935,15 +2051,24 @@
 
 .p2align	5
 bn_from_mont8x:
+
 .byte	0x67
 	movq	%rsp,%rax
+
 	pushq	%rbx
+
 	pushq	%rbp
+
 	pushq	%r12
+
 	pushq	%r13
+
 	pushq	%r14
+
 	pushq	%r15
 
+L$from_prologue:
+
 	shll	$3,%r9d
 	leaq	(%r9,%r9,2),%r10
 	negq	%r9
@@ -1957,24 +2082,41 @@
 
 
 	leaq	-320(%rsp,%r9,2),%r11
+	movq	%rsp,%rbp
 	subq	%rdi,%r11
 	andq	$4095,%r11
 	cmpq	%r11,%r10
 	jb	L$from_sp_alt
-	subq	%r11,%rsp
-	leaq	-320(%rsp,%r9,2),%rsp
+	subq	%r11,%rbp
+	leaq	-320(%rbp,%r9,2),%rbp
 	jmp	L$from_sp_done
 
 .p2align	5
 L$from_sp_alt:
 	leaq	4096-320(,%r9,2),%r10
-	leaq	-320(%rsp,%r9,2),%rsp
+	leaq	-320(%rbp,%r9,2),%rbp
 	subq	%r10,%r11
 	movq	$0,%r10
 	cmovcq	%r10,%r11
-	subq	%r11,%rsp
+	subq	%r11,%rbp
 L$from_sp_done:
-	andq	$-64,%rsp
+	andq	$-64,%rbp
+	movq	%rsp,%r11
+	subq	%rbp,%r11
+	andq	$-4096,%r11
+	leaq	(%r11,%rbp,1),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$from_page_walk
+	jmp	L$from_page_walk_done
+
+L$from_page_walk:
+	leaq	-4096(%rsp),%rsp
+	movq	(%rsp),%r10
+	cmpq	%rbp,%rsp
+	ja	L$from_page_walk
+L$from_page_walk_done:
+
 	movq	%r9,%r10
 	negq	%r9
 
@@ -1989,6 +2131,7 @@
 
 	movq	%r8,32(%rsp)
 	movq	%rax,40(%rsp)
+
 L$from_body:
 	movq	%r9,%r11
 	leaq	48(%rsp),%rax
@@ -2024,11 +2167,12 @@
 
 	pxor	%xmm0,%xmm0
 	leaq	48(%rsp),%rax
-	movq	40(%rsp),%rsi
 	jmp	L$from_mont_zero
 
 .p2align	5
 L$from_mont_zero:
+	movq	40(%rsp),%rsi
+
 	movdqa	%xmm0,0(%rax)
 	movdqa	%xmm0,16(%rax)
 	movdqa	%xmm0,32(%rax)
@@ -2039,15 +2183,23 @@
 
 	movq	$1,%rax
 	movq	-48(%rsi),%r15
+
 	movq	-40(%rsi),%r14
+
 	movq	-32(%rsi),%r13
+
 	movq	-24(%rsi),%r12
+
 	movq	-16(%rsi),%rbp
+
 	movq	-8(%rsi),%rbx
+
 	leaq	(%rsi),%rsp
+
 L$from_epilogue:
 	.byte	0xf3,0xc3
 
+
 .globl	_bn_scatter5
 .private_extern _bn_scatter5
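
The non-mechanical change in this file is the tail of bn_mul_mont_gather5: the old L$copy loop picked between the subtracted and unsubtracted result one word at a time (xor/and/xor against the borrow mask), whereas the new code applies the mask from "sbbq $0,%rax" once, to select the source pointer, and then copies straight through while still overwriting the stack scratch. A rough C equivalent of the branch-free select, with illustrative names; mask is 0 or all ones:

    #include <stddef.h>
    #include <stdint.h>

    static void masked_copy(uint64_t *out, uint64_t *tmp, size_t n,
                            uint64_t mask) {
      uintptr_t m = (uintptr_t)mask;
      const uint64_t *src =               /* tmp if mask set, else out */
          (const uint64_t *)(((uintptr_t)tmp & m) | ((uintptr_t)out & ~m));
      for (size_t i = 0; i < n; i++) {
        uint64_t w = src[i];
        tmp[i] = (uint64_t)i;   /* the asm wipes the scratch with the counter */
        out[i] = w;
      }
    }

When the mask is zero the loop copies out over itself, so the memory traffic is identical either way, which is what keeps the copy constant-time.
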
 
diff --git a/mac-x86_64/crypto/chacha/chacha-x86_64.S b/mac-x86_64/crypto/chacha/chacha-x86_64.S
index c3554c8..51c0caa 100644
--- a/mac-x86_64/crypto/chacha/chacha-x86_64.S
+++ b/mac-x86_64/crypto/chacha/chacha-x86_64.S
@@ -22,6 +22,15 @@
 .byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
 L$sigma:
 .byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.p2align	6
+L$zeroz:
+.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+L$fourz:
+.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+L$incz:
+.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+L$sixteen:
+.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
 .byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .globl	_ChaCha20_ctr32
 .private_extern _ChaCha20_ctr32
@@ -41,6 +50,7 @@
 	pushq	%r14
 	pushq	%r15
 	subq	$64+24,%rsp
+L$ctr32_body:
 
 
 	movdqu	(%rcx),%xmm1
@@ -278,13 +288,14 @@
 	jnz	L$oop_tail
 
 L$done:
-	addq	$64+24,%rsp
-	popq	%r15
-	popq	%r14
-	popq	%r13
-	popq	%r12
-	popq	%rbp
-	popq	%rbx
+	leaq	64+24+48(%rsp),%rsi
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 L$no_data:
 	.byte	0xf3,0xc3
 
@@ -292,18 +303,12 @@
 .p2align	5
 ChaCha20_ssse3:
 L$ChaCha20_ssse3:
+	movq	%rsp,%r9
 	cmpq	$128,%rdx
 	ja	L$ChaCha20_4x
 
 L$do_sse3_after_all:
-	pushq	%rbx
-	pushq	%rbp
-	pushq	%r12
-	pushq	%r13
-	pushq	%r14
-	pushq	%r15
-
-	subq	$64+24,%rsp
+	subq	$64+8,%rsp
 	movdqa	L$sigma(%rip),%xmm0
 	movdqu	(%rcx),%xmm1
 	movdqu	16(%rcx),%xmm2
@@ -315,7 +320,7 @@
 	movdqa	%xmm1,16(%rsp)
 	movdqa	%xmm2,32(%rsp)
 	movdqa	%xmm3,48(%rsp)
-	movl	$10,%ebp
+	movq	$10,%r8
 	jmp	L$oop_ssse3
 
 .p2align	5
@@ -325,7 +330,7 @@
 	movdqa	16(%rsp),%xmm1
 	movdqa	32(%rsp),%xmm2
 	paddd	48(%rsp),%xmm3
-	movl	$10,%ebp
+	movq	$10,%r8
 	movdqa	%xmm3,48(%rsp)
 	jmp	L$oop_ssse3
 
@@ -374,7 +379,7 @@
 	pshufd	$78,%xmm2,%xmm2
 	pshufd	$147,%xmm1,%xmm1
 	pshufd	$57,%xmm3,%xmm3
-	decl	%ebp
+	decq	%r8
 	jnz	L$oop_ssse3
 	paddd	0(%rsp),%xmm0
 	paddd	16(%rsp),%xmm1
@@ -411,31 +416,27 @@
 	movdqa	%xmm1,16(%rsp)
 	movdqa	%xmm2,32(%rsp)
 	movdqa	%xmm3,48(%rsp)
-	xorq	%rbx,%rbx
+	xorq	%r8,%r8
 
 L$oop_tail_ssse3:
-	movzbl	(%rsi,%rbx,1),%eax
-	movzbl	(%rsp,%rbx,1),%ecx
-	leaq	1(%rbx),%rbx
+	movzbl	(%rsi,%r8,1),%eax
+	movzbl	(%rsp,%r8,1),%ecx
+	leaq	1(%r8),%r8
 	xorl	%ecx,%eax
-	movb	%al,-1(%rdi,%rbx,1)
+	movb	%al,-1(%rdi,%r8,1)
 	decq	%rdx
 	jnz	L$oop_tail_ssse3
 
 L$done_ssse3:
-	addq	$64+24,%rsp
-	popq	%r15
-	popq	%r14
-	popq	%r13
-	popq	%r12
-	popq	%rbp
-	popq	%rbx
+	leaq	(%r9),%rsp
+L$ssse3_epilogue:
 	.byte	0xf3,0xc3
 
 
 .p2align	5
 ChaCha20_4x:
 L$ChaCha20_4x:
+	movq	%rsp,%r9
 	movq	%r10,%r11
 	shrq	$32,%r10
 	testq	$32,%r10
@@ -448,8 +449,7 @@
 	je	L$do_sse3_after_all
 
 L$proceed4x:
-	leaq	-120(%rsp),%r11
-	subq	$0x148+0,%rsp
+	subq	$0x140+8,%rsp
 	movdqa	L$sigma(%rip),%xmm11
 	movdqu	(%rcx),%xmm15
 	movdqu	16(%rcx),%xmm7
@@ -976,18 +976,18 @@
 	jnz	L$oop_tail4x
 
 L$done4x:
-	addq	$0x148+0,%rsp
+	leaq	(%r9),%rsp
+L$4x_epilogue:
 	.byte	0xf3,0xc3
 
 
 .p2align	5
 ChaCha20_8x:
 L$ChaCha20_8x:
-	movq	%rsp,%r10
+	movq	%rsp,%r9
 	subq	$0x280+8,%rsp
 	andq	$-32,%rsp
 	vzeroupper
-	movq	%r10,640(%rsp)
 
 
 
@@ -1578,7 +1578,8 @@
 
 L$done8x:
 	vzeroall
-	movq	640(%rsp),%rsp
+	leaq	(%r9),%rsp
+L$8x_epilogue:
 	.byte	0xf3,0xc3
 
 #endif
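
The ChaCha20 changes are mostly frame bookkeeping: each path now snapshots the caller's %rsp in %r9 and ends with a single "leaq (%r9),%rsp", and the SSSE3 round counter moves from %ebp to %r8 so %rbp no longer has to be saved at all. The new L$zeroz/L$fourz/L$incz/L$sixteen tables are evidently block-counter increment constants for the vector paths. The "$10" those loops count is double rounds (20 rounds total); for orientation, one ChaCha quarter-round per RFC 7539 looks like this in C:

    #include <stdint.h>

    #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

    /* Standard ChaCha20 quarter-round; the assembly's L$oop_ssse3 body
     * runs four of these as a column round plus four as a diagonal
     * round, ten times. */
    static void quarter_round(uint32_t x[16], int a, int b, int c, int d) {
      x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d], 16);
      x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b], 12);
      x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d],  8);
      x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b],  7);
    }
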
diff --git a/mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S b/mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S
index 03cd872..62d114d 100644
--- a/mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S
+++ b/mac-x86_64/crypto/cipher/chacha20_poly1305_x86_64.S
@@ -44,7 +44,7 @@
 
 .p2align	6
 poly_hash_ad_internal:
-.cfi_startproc	
+
 	xorq	%r10,%r10
 	xorq	%r11,%r11
 	xorq	%r12,%r12
@@ -207,7 +207,7 @@
 
 1:
 	.byte	0xf3,0xc3
-.cfi_endproc	
+
 
 
 .globl	_chacha20_poly1305_open
@@ -215,31 +215,31 @@
 
 .p2align	6
 _chacha20_poly1305_open:
-.cfi_startproc	
+
 	pushq	%rbp
-.cfi_adjust_cfa_offset	8
+
 	pushq	%rbx
-.cfi_adjust_cfa_offset	8
+
 	pushq	%r12
-.cfi_adjust_cfa_offset	8
+
 	pushq	%r13
-.cfi_adjust_cfa_offset	8
+
 	pushq	%r14
-.cfi_adjust_cfa_offset	8
+
 	pushq	%r15
-.cfi_adjust_cfa_offset	8
+
 
 
 	pushq	%r9
-.cfi_adjust_cfa_offset	8
+
 	subq	$288 + 32,%rsp
-.cfi_adjust_cfa_offset	288 + 32
-.cfi_offset	rbp, -16
-.cfi_offset	rbx, -24
-.cfi_offset	r12, -32
-.cfi_offset	r13, -40
-.cfi_offset	r14, -48
-.cfi_offset	r15, -56
+
+
+
+
+
+
+
 	leaq	32(%rsp),%rbp
 	andq	$-32,%rbp
 	movq	%rdx,8+32(%rbp)
@@ -1834,26 +1834,26 @@
 	adcq	8+16(%rbp),%r11
 
 	addq	$288 + 32,%rsp
-.cfi_adjust_cfa_offset	-(288 + 32)
+
 	popq	%r9
-.cfi_adjust_cfa_offset	-8
+
 	movq	%r10,(%r9)
 	movq	%r11,8(%r9)
 
 	popq	%r15
-.cfi_adjust_cfa_offset	-8
+
 	popq	%r14
-.cfi_adjust_cfa_offset	-8
+
 	popq	%r13
-.cfi_adjust_cfa_offset	-8
+
 	popq	%r12
-.cfi_adjust_cfa_offset	-8
+
 	popq	%rbx
-.cfi_adjust_cfa_offset	-8
+
 	popq	%rbp
-.cfi_adjust_cfa_offset	-8
+
 	.byte	0xf3,0xc3
-.cfi_adjust_cfa_offset	(8 * 6) + 288 + 32
+
 
 open_sse_128:
 	movdqu	.chacha20_consts(%rip),%xmm0
@@ -2086,7 +2086,7 @@
 	jmp	1b
 	jmp	open_sse_tail_16
 
-.cfi_endproc	
+
 
 
 
@@ -2096,31 +2096,31 @@
 
 .p2align	6
 _chacha20_poly1305_seal:
-.cfi_startproc	
+
 	pushq	%rbp
-.cfi_adjust_cfa_offset	8
+
 	pushq	%rbx
-.cfi_adjust_cfa_offset	8
+
 	pushq	%r12
-.cfi_adjust_cfa_offset	8
+
 	pushq	%r13
-.cfi_adjust_cfa_offset	8
+
 	pushq	%r14
-.cfi_adjust_cfa_offset	8
+
 	pushq	%r15
-.cfi_adjust_cfa_offset	8
+
 
 
 	pushq	%r9
-.cfi_adjust_cfa_offset	8
+
 	subq	$288 + 32,%rsp
-.cfi_adjust_cfa_offset	288 + 32
-.cfi_offset	rbp, -16
-.cfi_offset	rbx, -24
-.cfi_offset	r12, -32
-.cfi_offset	r13, -40
-.cfi_offset	r14, -48
-.cfi_offset	r15, -56
+
+
+
+
+
+
+
 	leaq	32(%rsp),%rbp
 	andq	$-32,%rbp
 	movq	%rdx,8+32(%rbp)
@@ -3717,26 +3717,26 @@
 	adcq	8+16(%rbp),%r11
 
 	addq	$288 + 32,%rsp
-.cfi_adjust_cfa_offset	-(288 + 32)
+
 	popq	%r9
-.cfi_adjust_cfa_offset	-8
+
 	movq	%r10,0(%r9)
 	movq	%r11,8(%r9)
 
 	popq	%r15
-.cfi_adjust_cfa_offset	-8
+
 	popq	%r14
-.cfi_adjust_cfa_offset	-8
+
 	popq	%r13
-.cfi_adjust_cfa_offset	-8
+
 	popq	%r12
-.cfi_adjust_cfa_offset	-8
+
 	popq	%rbx
-.cfi_adjust_cfa_offset	-8
+
 	popq	%rbp
-.cfi_adjust_cfa_offset	-8
+
 	.byte	0xf3,0xc3
-.cfi_adjust_cfa_offset	(8 * 6) + 288 + 32
+
 
 seal_sse_128:
 	movdqu	.chacha20_consts(%rip),%xmm0
@@ -8783,5 +8783,5 @@
 1:
 	vzeroupper
 	jmp	seal_sse_tail_16
-.cfi_endproc	
+
 #endif
diff --git a/mac-x86_64/crypto/modes/ghash-x86_64.S b/mac-x86_64/crypto/modes/ghash-x86_64.S
index 334f83f..814d796 100644
--- a/mac-x86_64/crypto/modes/ghash-x86_64.S
+++ b/mac-x86_64/crypto/modes/ghash-x86_64.S
@@ -10,6 +10,10 @@
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
+	pushq	%r13
+	pushq	%r14
+	pushq	%r15
+	subq	$280,%rsp
 L$gmult_prologue:
 
 	movzbq	15(%rdi),%r8
@@ -86,8 +90,9 @@
 	movq	%r8,8(%rdi)
 	movq	%r9,(%rdi)
 
-	movq	16(%rsp),%rbx
-	leaq	24(%rsp),%rsp
+	leaq	280+48(%rsp),%rsi
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 L$gmult_epilogue:
 	.byte	0xf3,0xc3
 
@@ -647,14 +652,14 @@
 	movq	%r8,8(%rdi)
 	movq	%r9,(%rdi)
 
-	leaq	280(%rsp),%rsi
-	movq	0(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	leaq	280+48(%rsp),%rsi
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	0(%rsi),%rsp
 L$ghash_epilogue:
 	.byte	0xf3,0xc3
 
diff --git a/mac-x86_64/crypto/sha/sha1-x86_64.S b/mac-x86_64/crypto/sha/sha1-x86_64.S
index 0509d45..cf45d8a 100644
--- a/mac-x86_64/crypto/sha/sha1-x86_64.S
+++ b/mac-x86_64/crypto/sha/sha1-x86_64.S
@@ -1240,14 +1240,13 @@
 .p2align	4
 sha1_block_data_order_ssse3:
 _ssse3_shortcut:
-	movq	%rsp,%rax
+	movq	%rsp,%r11
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	leaq	-64(%rsp),%rsp
-	movq	%rax,%r14
 	andq	$-64,%rsp
 	movq	%rdi,%r8
 	movq	%rsi,%r9
@@ -1255,7 +1254,7 @@
 
 	shlq	$6,%r10
 	addq	%r9,%r10
-	leaq	K_XX_XX+64(%rip),%r11
+	leaq	K_XX_XX+64(%rip),%r14
 
 	movl	0(%r8),%eax
 	movl	4(%r8),%ebx
@@ -1267,8 +1266,8 @@
 	xorl	%edx,%edi
 	andl	%edi,%esi
 
-	movdqa	64(%r11),%xmm6
-	movdqa	-64(%r11),%xmm9
+	movdqa	64(%r14),%xmm6
+	movdqa	-64(%r14),%xmm9
 	movdqu	0(%r9),%xmm0
 	movdqu	16(%r9),%xmm1
 	movdqu	32(%r9),%xmm2
@@ -1344,7 +1343,7 @@
 	pslld	$2,%xmm9
 	pxor	%xmm10,%xmm4
 	xorl	%ebp,%edx
-	movdqa	-64(%r11),%xmm10
+	movdqa	-64(%r14),%xmm10
 	roll	$5,%ecx
 	addl	%edi,%ebx
 	andl	%edx,%esi
@@ -1405,7 +1404,7 @@
 	pslld	$2,%xmm10
 	pxor	%xmm8,%xmm5
 	xorl	%eax,%ebp
-	movdqa	-32(%r11),%xmm8
+	movdqa	-32(%r14),%xmm8
 	roll	$5,%edx
 	addl	%edi,%ecx
 	andl	%ebp,%esi
@@ -1466,7 +1465,7 @@
 	pslld	$2,%xmm8
 	pxor	%xmm9,%xmm6
 	xorl	%ebx,%eax
-	movdqa	-32(%r11),%xmm9
+	movdqa	-32(%r14),%xmm9
 	roll	$5,%ebp
 	addl	%edi,%edx
 	andl	%eax,%esi
@@ -1527,7 +1526,7 @@
 	pslld	$2,%xmm9
 	pxor	%xmm10,%xmm7
 	xorl	%ecx,%ebx
-	movdqa	-32(%r11),%xmm10
+	movdqa	-32(%r14),%xmm10
 	roll	$5,%eax
 	addl	%edi,%ebp
 	andl	%ebx,%esi
@@ -1638,7 +1637,7 @@
 	pxor	%xmm3,%xmm2
 	addl	%esi,%eax
 	xorl	%edx,%edi
-	movdqa	0(%r11),%xmm10
+	movdqa	0(%r14),%xmm10
 	rorl	$7,%ecx
 	paddd	%xmm1,%xmm9
 	addl	%ebx,%eax
@@ -1873,7 +1872,7 @@
 	pxor	%xmm0,%xmm7
 	roll	$5,%ebx
 	addl	%esi,%eax
-	movdqa	32(%r11),%xmm9
+	movdqa	32(%r14),%xmm9
 	xorl	%ecx,%edi
 	paddd	%xmm6,%xmm8
 	xorl	%edx,%ecx
@@ -2164,8 +2163,8 @@
 	addl	%edx,%ecx
 	cmpq	%r10,%r9
 	je	L$done_ssse3
-	movdqa	64(%r11),%xmm6
-	movdqa	-64(%r11),%xmm9
+	movdqa	64(%r14),%xmm6
+	movdqa	-64(%r14),%xmm9
 	movdqu	0(%r9),%xmm0
 	movdqu	16(%r9),%xmm1
 	movdqu	32(%r9),%xmm2
@@ -2402,13 +2401,12 @@
 	movl	%ecx,8(%r8)
 	movl	%edx,12(%r8)
 	movl	%ebp,16(%r8)
-	leaq	(%r14),%rsi
-	movq	-40(%rsi),%r14
-	movq	-32(%rsi),%r13
-	movq	-24(%rsi),%r12
-	movq	-16(%rsi),%rbp
-	movq	-8(%rsi),%rbx
-	leaq	(%rsi),%rsp
+	movq	-40(%r11),%r14
+	movq	-32(%r11),%r13
+	movq	-24(%r11),%r12
+	movq	-16(%r11),%rbp
+	movq	-8(%r11),%rbx
+	leaq	(%r11),%rsp
 L$epilogue_ssse3:
 	.byte	0xf3,0xc3
 
@@ -2416,7 +2414,7 @@
 .p2align	4
 sha1_block_data_order_avx:
 _avx_shortcut:
-	movq	%rsp,%rax
+	movq	%rsp,%r11
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
@@ -2424,7 +2422,6 @@
 	pushq	%r14
 	leaq	-64(%rsp),%rsp
 	vzeroupper
-	movq	%rax,%r14
 	andq	$-64,%rsp
 	movq	%rdi,%r8
 	movq	%rsi,%r9
@@ -2432,7 +2429,7 @@
 
 	shlq	$6,%r10
 	addq	%r9,%r10
-	leaq	K_XX_XX+64(%rip),%r11
+	leaq	K_XX_XX+64(%rip),%r14
 
 	movl	0(%r8),%eax
 	movl	4(%r8),%ebx
@@ -2444,8 +2441,8 @@
 	xorl	%edx,%edi
 	andl	%edi,%esi
 
-	vmovdqa	64(%r11),%xmm6
-	vmovdqa	-64(%r11),%xmm11
+	vmovdqa	64(%r14),%xmm6
+	vmovdqa	-64(%r14),%xmm11
 	vmovdqu	0(%r9),%xmm0
 	vmovdqu	16(%r9),%xmm1
 	vmovdqu	32(%r9),%xmm2
@@ -2570,7 +2567,7 @@
 	vpxor	%xmm10,%xmm5,%xmm5
 	xorl	%eax,%ebp
 	shldl	$5,%edx,%edx
-	vmovdqa	-32(%r11),%xmm11
+	vmovdqa	-32(%r14),%xmm11
 	addl	%edi,%ecx
 	andl	%ebp,%esi
 	xorl	%eax,%ebp
@@ -2783,7 +2780,7 @@
 	addl	%esi,%eax
 	xorl	%edx,%edi
 	vpaddd	%xmm1,%xmm11,%xmm9
-	vmovdqa	0(%r11),%xmm11
+	vmovdqa	0(%r14),%xmm11
 	shrdl	$7,%ecx,%ecx
 	addl	%ebx,%eax
 	vpxor	%xmm8,%xmm2,%xmm2
@@ -3002,7 +2999,7 @@
 	movl	%ebx,%edi
 	xorl	%edx,%esi
 	vpaddd	%xmm6,%xmm11,%xmm9
-	vmovdqa	32(%r11),%xmm11
+	vmovdqa	32(%r14),%xmm11
 	shldl	$5,%ebx,%ebx
 	addl	%esi,%eax
 	vpxor	%xmm8,%xmm7,%xmm7
@@ -3281,8 +3278,8 @@
 	addl	%edx,%ecx
 	cmpq	%r10,%r9
 	je	L$done_avx
-	vmovdqa	64(%r11),%xmm6
-	vmovdqa	-64(%r11),%xmm11
+	vmovdqa	64(%r14),%xmm6
+	vmovdqa	-64(%r14),%xmm11
 	vmovdqu	0(%r9),%xmm0
 	vmovdqu	16(%r9),%xmm1
 	vmovdqu	32(%r9),%xmm2
@@ -3518,13 +3515,12 @@
 	movl	%ecx,8(%r8)
 	movl	%edx,12(%r8)
 	movl	%ebp,16(%r8)
-	leaq	(%r14),%rsi
-	movq	-40(%rsi),%r14
-	movq	-32(%rsi),%r13
-	movq	-24(%rsi),%r12
-	movq	-16(%rsi),%rbp
-	movq	-8(%rsi),%rbx
-	leaq	(%rsi),%rsp
+	movq	-40(%r11),%r14
+	movq	-32(%r11),%r13
+	movq	-24(%r11),%r12
+	movq	-16(%r11),%rbp
+	movq	-8(%r11),%rbx
+	leaq	(%r11),%rsp
 L$epilogue_avx:
 	.byte	0xf3,0xc3
 
diff --git a/mac-x86_64/crypto/sha/sha256-x86_64.S b/mac-x86_64/crypto/sha/sha256-x86_64.S
index 0146ff5..f00ef6d 100644
--- a/mac-x86_64/crypto/sha/sha256-x86_64.S
+++ b/mac-x86_64/crypto/sha/sha256-x86_64.S
@@ -18,13 +18,13 @@
 	je	L$avx_shortcut
 	testl	$512,%r10d
 	jnz	L$ssse3_shortcut
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
-	movq	%rsp,%r11
 	shlq	$4,%rdx
 	subq	$64+32,%rsp
 	leaq	(%rsi,%rdx,4),%rdx
@@ -32,7 +32,7 @@
 	movq	%rdi,64+0(%rsp)
 	movq	%rsi,64+8(%rsp)
 	movq	%rdx,64+16(%rsp)
-	movq	%r11,64+24(%rsp)
+	movq	%rax,64+24(%rsp)
 L$prologue:
 
 	movl	0(%rdi),%eax
@@ -1697,13 +1697,13 @@
 	jb	L$loop
 
 	movq	64+24(%rsp),%rsi
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 L$epilogue:
 	.byte	0xf3,0xc3
 
@@ -1754,13 +1754,13 @@
 .p2align	6
 sha256_block_data_order_ssse3:
 L$ssse3_shortcut:
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
-	movq	%rsp,%r11
 	shlq	$4,%rdx
 	subq	$96,%rsp
 	leaq	(%rsi,%rdx,4),%rdx
@@ -1768,7 +1768,7 @@
 	movq	%rdi,64+0(%rsp)
 	movq	%rsi,64+8(%rsp)
 	movq	%rdx,64+16(%rsp)
-	movq	%r11,64+24(%rsp)
+	movq	%rax,64+24(%rsp)
 L$prologue_ssse3:
 
 	movl	0(%rdi),%eax
@@ -2835,13 +2835,13 @@
 	jb	L$loop_ssse3
 
 	movq	64+24(%rsp),%rsi
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 L$epilogue_ssse3:
 	.byte	0xf3,0xc3
 
@@ -2849,13 +2849,13 @@
 .p2align	6
 sha256_block_data_order_avx:
 L$avx_shortcut:
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
-	movq	%rsp,%r11
 	shlq	$4,%rdx
 	subq	$96,%rsp
 	leaq	(%rsi,%rdx,4),%rdx
@@ -2863,7 +2863,7 @@
 	movq	%rdi,64+0(%rsp)
 	movq	%rsi,64+8(%rsp)
 	movq	%rdx,64+16(%rsp)
-	movq	%r11,64+24(%rsp)
+	movq	%rax,64+24(%rsp)
 L$prologue_avx:
 
 	vzeroupper
@@ -3892,13 +3892,13 @@
 
 	movq	64+24(%rsp),%rsi
 	vzeroupper
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 L$epilogue_avx:
 	.byte	0xf3,0xc3
 
diff --git a/mac-x86_64/crypto/sha/sha512-x86_64.S b/mac-x86_64/crypto/sha/sha512-x86_64.S
index aeabd3f..eabcb3a 100644
--- a/mac-x86_64/crypto/sha/sha512-x86_64.S
+++ b/mac-x86_64/crypto/sha/sha512-x86_64.S
@@ -18,13 +18,13 @@
 	orl	%r9d,%r10d
 	cmpl	$1342177792,%r10d
 	je	L$avx_shortcut
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
-	movq	%rsp,%r11
 	shlq	$4,%rdx
 	subq	$128+32,%rsp
 	leaq	(%rsi,%rdx,8),%rdx
@@ -32,7 +32,7 @@
 	movq	%rdi,128+0(%rsp)
 	movq	%rsi,128+8(%rsp)
 	movq	%rdx,128+16(%rsp)
-	movq	%r11,128+24(%rsp)
+	movq	%rax,128+24(%rsp)
 L$prologue:
 
 	movq	0(%rdi),%rax
@@ -1697,13 +1697,13 @@
 	jb	L$loop
 
 	movq	128+24(%rsp),%rsi
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 L$epilogue:
 	.byte	0xf3,0xc3
 
@@ -1798,13 +1798,13 @@
 .p2align	6
 sha512_block_data_order_xop:
 L$xop_shortcut:
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
-	movq	%rsp,%r11
 	shlq	$4,%rdx
 	subq	$160,%rsp
 	leaq	(%rsi,%rdx,8),%rdx
@@ -1812,7 +1812,7 @@
 	movq	%rdi,128+0(%rsp)
 	movq	%rsi,128+8(%rsp)
 	movq	%rdx,128+16(%rsp)
-	movq	%r11,128+24(%rsp)
+	movq	%rax,128+24(%rsp)
 L$prologue_xop:
 
 	vzeroupper
@@ -2867,13 +2867,13 @@
 
 	movq	128+24(%rsp),%rsi
 	vzeroupper
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 L$epilogue_xop:
 	.byte	0xf3,0xc3
 
@@ -2881,13 +2881,13 @@
 .p2align	6
 sha512_block_data_order_avx:
 L$avx_shortcut:
+	movq	%rsp,%rax
 	pushq	%rbx
 	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
-	movq	%rsp,%r11
 	shlq	$4,%rdx
 	subq	$160,%rsp
 	leaq	(%rsi,%rdx,8),%rdx
@@ -2895,7 +2895,7 @@
 	movq	%rdi,128+0(%rsp)
 	movq	%rsi,128+8(%rsp)
 	movq	%rdx,128+16(%rsp)
-	movq	%r11,128+24(%rsp)
+	movq	%rax,128+24(%rsp)
 L$prologue_avx:
 
 	vzeroupper
@@ -4014,13 +4014,13 @@
 
 	movq	128+24(%rsp),%rsi
 	vzeroupper
-	movq	(%rsi),%r15
-	movq	8(%rsi),%r14
-	movq	16(%rsi),%r13
-	movq	24(%rsi),%r12
-	movq	32(%rsi),%rbp
-	movq	40(%rsi),%rbx
-	leaq	48(%rsi),%rsp
+	movq	-48(%rsi),%r15
+	movq	-40(%rsi),%r14
+	movq	-32(%rsi),%r13
+	movq	-24(%rsi),%r12
+	movq	-16(%rsi),%rbp
+	movq	-8(%rsi),%rbx
+	leaq	(%rsi),%rsp
 L$epilogue_avx:
 	.byte	0xf3,0xc3
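
The three SHA diffs above share one change: the caller's %rsp is snapshotted (into %rax, or %r11 in the SHA-1 SIMD paths) before the pushes, so every epilogue restores the callee-saved registers at fixed negative offsets from that snapshot rather than from a recomputed address. A small model that prints the layout for the six-push case (SHA-1's SSSE3/AVX paths push only five registers, stopping at %r14):

    #include <stdio.h>

    int main(void) {
      /* push order after "movq %rsp,%rax": each push lands 8 bytes
       * lower, so register i sits at -(8*(i+1)) from the snapshot */
      const char *pushed[] = {"rbx", "rbp", "r12", "r13", "r14", "r15"};
      for (int i = 0; i < 6; i++) {
        printf("%%%s at -%d(saved rsp)\n", pushed[i], 8 * (i + 1));
      }
      return 0;
    }

That is exactly the "movq -48(%rsi),%r15 ... movq -8(%rsi),%rbx" sequence the new epilogues use.
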
 
diff --git a/sources.bp b/sources.bp
index 8e13f1a..6da6257 100644
--- a/sources.bp
+++ b/sources.bp
@@ -51,6 +51,7 @@
         "src/crypto/asn1/tasn_new.c",
         "src/crypto/asn1/tasn_typ.c",
         "src/crypto/asn1/tasn_utl.c",
+        "src/crypto/asn1/time_support.c",
         "src/crypto/asn1/x_bignum.c",
         "src/crypto/asn1/x_long.c",
         "src/crypto/base64/base64.c",
@@ -202,7 +203,6 @@
         "src/crypto/thread_none.c",
         "src/crypto/thread_pthread.c",
         "src/crypto/thread_win.c",
-        "src/crypto/time_support.c",
         "src/crypto/x509/a_digest.c",
         "src/crypto/x509/a_sign.c",
         "src/crypto/x509/a_strex.c",
@@ -466,8 +466,12 @@
 cc_defaults {
     name: "boringssl_crypto_test_sources",
     srcs: [
+        "src/crypto/chacha/chacha_test.cc",
         "src/crypto/dh/dh_test.cc",
         "src/crypto/dsa/dsa_test.cc",
+        "src/crypto/ec/ec_test.cc",
+        "src/crypto/err/err_test.cc",
+        "src/crypto/rsa/rsa_test.cc",
         "src/crypto/test/gtest_main.cc",
     ],
 }
@@ -489,7 +493,6 @@
         "src/crypto/bio/bio_test.cc",
         "src/crypto/bn/bn_test.cc",
         "src/crypto/bytestring/bytestring_test.cc",
-        "src/crypto/chacha/chacha_test.cc",
         "src/crypto/cipher/aead_test.cc",
         "src/crypto/cipher/cipher_test.cc",
         "src/crypto/cmac/cmac_test.cc",
@@ -498,14 +501,12 @@
         "src/crypto/curve25519/spake25519_test.cc",
         "src/crypto/curve25519/x25519_test.cc",
         "src/crypto/digest/digest_test.cc",
-        "src/crypto/ec/ec_test.cc",
         "src/crypto/ec/example_mul.c",
         "src/crypto/ec/p256-x86_64_test.cc",
         "src/crypto/ecdh/ecdh_test.cc",
         "src/crypto/ecdsa/ecdsa_sign_test.cc",
         "src/crypto/ecdsa/ecdsa_test.cc",
         "src/crypto/ecdsa/ecdsa_verify_test.cc",
-        "src/crypto/err/err_test.cc",
         "src/crypto/evp/evp_extra_test.cc",
         "src/crypto/evp/evp_test.cc",
         "src/crypto/evp/pbkdf_test.cc",
@@ -519,7 +520,6 @@
         "src/crypto/poly1305/poly1305_test.cc",
         "src/crypto/pool/pool_test.cc",
         "src/crypto/refcount_test.cc",
-        "src/crypto/rsa/rsa_test.cc",
         "src/crypto/thread_test.c",
         "src/crypto/x509/pkcs7_test.c",
         "src/crypto/x509/x509_test.cc",
diff --git a/sources.mk b/sources.mk
index ebea6b9..2c9cfa2 100644
--- a/sources.mk
+++ b/sources.mk
@@ -49,6 +49,7 @@
   src/crypto/asn1/tasn_new.c\
   src/crypto/asn1/tasn_typ.c\
   src/crypto/asn1/tasn_utl.c\
+  src/crypto/asn1/time_support.c\
   src/crypto/asn1/x_bignum.c\
   src/crypto/asn1/x_long.c\
   src/crypto/base64/base64.c\
@@ -200,7 +201,6 @@
   src/crypto/thread_none.c\
   src/crypto/thread_pthread.c\
   src/crypto/thread_win.c\
-  src/crypto/time_support.c\
   src/crypto/x509/a_digest.c\
   src/crypto/x509/a_sign.c\
   src/crypto/x509/a_strex.c\
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e15df7a..2abf616 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -240,10 +240,6 @@
 # googletest has a very straightforward build.
 add_library(gtest third_party/googletest/src/gtest-all.cc)
 target_include_directories(gtest PRIVATE third_party/googletest)
-if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-  # TODO(davidben): Make googletest pass -Wmissing-declarations.
-  set_target_properties(gtest PROPERTIES COMPILE_FLAGS "-Wno-missing-declarations")
-endif()
 
 include_directories(third_party/googletest/include)
 
diff --git a/src/crypto/CMakeLists.txt b/src/crypto/CMakeLists.txt
index bbc68d0..fbfc4b2 100644
--- a/src/crypto/CMakeLists.txt
+++ b/src/crypto/CMakeLists.txt
@@ -129,7 +129,6 @@
   thread_none.c
   thread_pthread.c
   thread_win.c
-  time_support.c
 
   $<TARGET_OBJECTS:stack>
   $<TARGET_OBJECTS:lhash>
@@ -212,9 +211,12 @@
 add_executable(
   crypto_test
 
+  chacha/chacha_test.cc
   dh/dh_test.cc
   dsa/dsa_test.cc
+  ec/ec_test.cc
   err/err_test.cc
+  rsa/rsa_test.cc
 
   $<TARGET_OBJECTS:gtest_main>
   $<TARGET_OBJECTS:test_support>
diff --git a/src/crypto/aes/asm/aes-586.pl b/src/crypto/aes/asm/aes-586.pl
index 9e6e1cc..45c19fb 100755
--- a/src/crypto/aes/asm/aes-586.pl
+++ b/src/crypto/aes/asm/aes-586.pl
@@ -116,7 +116,7 @@
 # words every cache-line is *guaranteed* to be accessed within ~50
 # cycles window. Why just SSE? Because it's needed on hyper-threading
 # CPU! Which is also why it's prefetched with 64 byte stride. Best
-# part is that it has no negative effect on performance:-)  
+# part is that it has no negative effect on performance:-)
 #
 # Version 4.3 implements switch between compact and non-compact block
 # functions in AES_cbc_encrypt depending on how much data was asked
@@ -578,7 +578,7 @@
 # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
 # |          mm4          |          mm0          |
 # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-# |     s3    |     s2    |     s1    |     s0    |    
+# |     s3    |     s2    |     s1    |     s0    |
 # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
 # |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
 # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
@@ -798,7 +798,7 @@
 
 	if ($i==3)  {	$tmp=$s[3]; &mov ($s[2],$__s1);		}##%ecx
 	elsif($i==2){	&movz	($tmp,&HB($s[3]));		}#%ebx[2]
-	else        {	&mov	($tmp,$s[3]); 
+	else        {	&mov	($tmp,$s[3]);
 			&shr	($tmp,24)			}
 			&xor	($out,&DWP(1,$te,$tmp,8));
 	if ($i<2)   {	&mov	(&DWP(4+4*$i,"esp"),$out);	}
@@ -1551,7 +1551,7 @@
 		&pxor	("mm1","mm3");		&pxor	("mm5","mm7");	# tp4
 		&pshufw	("mm3","mm1",0xb1);	&pshufw	("mm7","mm5",0xb1);
 		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= tp4
-		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= ROTATE(tp4,16)	
+		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= ROTATE(tp4,16)
 
 		&pxor	("mm3","mm3");		&pxor	("mm7","mm7");
 		&pcmpgtb("mm3","mm1");		&pcmpgtb("mm7","mm5");
@@ -2021,7 +2021,7 @@
 {
 # stack frame layout
 #             -4(%esp)		# return address	 0(%esp)
-#              0(%esp)		# s0 backing store	 4(%esp)	
+#              0(%esp)		# s0 backing store	 4(%esp)
 #              4(%esp)		# s1 backing store	 8(%esp)
 #              8(%esp)		# s2 backing store	12(%esp)
 #             12(%esp)		# s3 backing store	16(%esp)
@@ -2731,7 +2731,7 @@
 	&mov	(&DWP(80,"edi"),10);		# setup number of rounds
 	&xor	("eax","eax");
 	&jmp	(&label("exit"));
-		
+
     &set_label("12rounds");
 	&mov	("eax",&DWP(0,"esi"));		# copy first 6 dwords
 	&mov	("ebx",&DWP(4,"esi"));
diff --git a/src/crypto/aes/asm/aes-x86_64.pl b/src/crypto/aes/asm/aes-x86_64.pl
old mode 100644
new mode 100755
index ed489af..abf957c
--- a/src/crypto/aes/asm/aes-x86_64.pl
+++ b/src/crypto/aes/asm/aes-x86_64.pl
@@ -590,6 +590,7 @@
 .type	asm_AES_encrypt,\@function,3
 .hidden	asm_AES_encrypt
 asm_AES_encrypt:
+	mov	%rsp,%rax
 	push	%rbx
 	push	%rbp
 	push	%r12
@@ -598,7 +599,6 @@
 	push	%r15
 
 	# allocate frame "above" key schedule
-	mov	%rsp,%r10
 	lea	-63(%rdx),%rcx	# %rdx is key argument
 	and	\$-64,%rsp
 	sub	%rsp,%rcx
@@ -608,7 +608,7 @@
 	sub	\$32,%rsp
 
 	mov	%rsi,16(%rsp)	# save out
-	mov	%r10,24(%rsp)	# save real stack pointer
+	mov	%rax,24(%rsp)	# save original stack pointer
 .Lenc_prologue:
 
 	mov	%rdx,$key
@@ -640,13 +640,13 @@
 	mov	$s2,8($out)
 	mov	$s3,12($out)
 
-	mov	(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
 .Lenc_epilogue:
 	ret
 .size	asm_AES_encrypt,.-asm_AES_encrypt
@@ -1186,6 +1186,7 @@
 .type	asm_AES_decrypt,\@function,3
 .hidden	asm_AES_decrypt
 asm_AES_decrypt:
+	mov	%rsp,%rax
 	push	%rbx
 	push	%rbp
 	push	%r12
@@ -1194,7 +1195,6 @@
 	push	%r15
 
 	# allocate frame "above" key schedule
-	mov	%rsp,%r10
 	lea	-63(%rdx),%rcx	# %rdx is key argument
 	and	\$-64,%rsp
 	sub	%rsp,%rcx
@@ -1204,7 +1204,7 @@
 	sub	\$32,%rsp
 
 	mov	%rsi,16(%rsp)	# save out
-	mov	%r10,24(%rsp)	# save real stack pointer
+	mov	%rax,24(%rsp)	# save original stack pointer
 .Ldec_prologue:
 
 	mov	%rdx,$key
@@ -1238,13 +1238,13 @@
 	mov	$s2,8($out)
 	mov	$s3,12($out)
 
-	mov	(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
 .Ldec_epilogue:
 	ret
 .size	asm_AES_decrypt,.-asm_AES_decrypt
@@ -1286,7 +1286,7 @@
 asm_AES_set_encrypt_key:
 	push	%rbx
 	push	%rbp
-	push	%r12			# redundant, but allows to share 
+	push	%r12			# redundant, but allows to share
 	push	%r13			# exception handler...
 	push	%r14
 	push	%r15
@@ -1412,7 +1412,7 @@
 	xor	%rax,%rax
 	jmp	.Lexit
 
-.L14rounds:		
+.L14rounds:
 	mov	0(%rsi),%rax			# copy first 8 dwords
 	mov	8(%rsi),%rbx
 	mov	16(%rsi),%rcx
@@ -1660,10 +1660,9 @@
 	mov	%r9d,%r9d	# clear upper half of enc
 
 	lea	.LAES_Te(%rip),$sbox
+	lea	.LAES_Td(%rip),%r10
 	cmp	\$0,%r9
-	jne	.Lcbc_picked_te
-	lea	.LAES_Td(%rip),$sbox
-.Lcbc_picked_te:
+	cmoveq	%r10,$sbox
 
 	mov	OPENSSL_ia32cap_P(%rip),%r10d
 	cmp	\$$speed_limit,%rdx
@@ -2565,7 +2564,6 @@
 	jae	.Lin_block_prologue
 
 	mov	24(%rax),%rax		# pull saved real stack pointer
-	lea	48(%rax),%rax		# adjust...
 
 	mov	-8(%rax),%rbx
 	mov	-16(%rax),%rbp
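
One non-mechanical change in this file: asm_AES_cbc_encrypt now materializes both the .LAES_Te and .LAES_Td addresses and picks one with cmoveq, instead of branching over a second lea. The C shape of that branchless select, with Te/Td standing in for the real tables:

    #include <stdint.h>

    /* Select the encrypt or decrypt table without a taken branch;
     * compilers typically lower this ternary to a cmov. */
    static const uint32_t *pick_table(const uint32_t *te,
                                      const uint32_t *td, int enc) {
      return enc ? te : td;
    }
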
diff --git a/src/crypto/aes/asm/aesni-x86.pl b/src/crypto/aes/asm/aesni-x86.pl
index 4ef84bc..e494dd1 100644
--- a/src/crypto/aes/asm/aesni-x86.pl
+++ b/src/crypto/aes/asm/aesni-x86.pl
@@ -51,7 +51,9 @@
 # Westmere	3.77/1.37	1.37	1.52	1.27
 # * Bridge	5.07/0.98	0.99	1.09	0.91
 # Haswell	4.44/0.80	0.97	1.03	0.72
+# Skylake	2.68/0.65	0.65	0.66	0.64
 # Silvermont	5.77/3.56	3.67	4.03	3.46
+# Goldmont	3.84/1.39	1.39	1.63	1.31
 # Bulldozer	5.80/0.98	1.05	1.24	0.93
 
 $PREFIX="aesni";	# if $PREFIX is set to "AES", the script
@@ -1040,7 +1042,7 @@
 &set_label("ctr32_one_shortcut",16);
 	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
 	&mov	($rounds,&DWP(240,$key));
-	
+
 &set_label("ctr32_one");
 	if ($inline)
 	{   &aesni_inline_generate1("enc");	}
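
The aesni-x86_64.pl diff below is the largest in this sync. Besides moving the CTR and XTS paths onto a dedicated frame pointer ($key_ in CTR, %r11 in XTS) so the Win64 xmm spill slots sit at fixed offsets, it adds aesni_ocb_encrypt and aesni_ocb_decrypt. Their offset schedule is the RFC 7253 one, Offset_i = Offset_{i-1} xor L_{ntz(i)}, with ntz computed by bsf and scaled by 16 to index the L_ table. A C sketch of that update, assuming a GCC/Clang builtin for ntz; the function name is illustrative:

    #include <stdint.h>

    /* Advance the OCB offset for 1-based block number i:
     * Offset_i = Offset_{i-1} ^ L[ntz(i)]  (RFC 7253). */
    static void ocb_next_offset(uint8_t offset[16],
                                const uint8_t L[][16], uint64_t i) {
      unsigned ntz = (unsigned)__builtin_ctzll(i);  /* i >= 1 */
      for (int j = 0; j < 16; j++) {
        offset[j] ^= L[ntz][j];
      }
    }
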
diff --git a/src/crypto/aes/asm/aesni-x86_64.pl b/src/crypto/aes/asm/aesni-x86_64.pl
index 55d5f30..8ae6dbf 100644
--- a/src/crypto/aes/asm/aesni-x86_64.pl
+++ b/src/crypto/aes/asm/aesni-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -27,7 +34,7 @@
 # ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
 # CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
 # CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
-# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07   
+# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
 # OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
 # CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
 #
@@ -111,7 +118,7 @@
 # performance is achieved by interleaving instructions working on
 # independent blocks. In which case asymptotic limit for such modes
 # can be obtained by dividing above mentioned numbers by AES
-# instructions' interleave factor. Westmere can execute at most 3 
+# instructions' interleave factor. Westmere can execute at most 3
 # instructions at a time, meaning that optimal interleave factor is 3,
 # and that's where the "magic" number of 1.25 come from. "Optimal
 # interleave factor" means that increase of interleave factor does
@@ -157,16 +164,23 @@
 # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
 # in CTR mode AES instruction interleave factor was chosen to be 6x.
 
+# November 2015
+#
+# Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
+# chosen to be 6x.
+
 ######################################################################
 # Current large-block performance in cycles per byte processed with
 # 128-bit key (less is better).
 #
-#		CBC en-/decrypt	CTR	XTS	ECB
+#		CBC en-/decrypt	CTR	XTS	ECB	OCB
 # Westmere	3.77/1.25	1.25	1.25	1.26
-# * Bridge	5.07/0.74	0.75	0.90	0.85
-# Haswell	4.44/0.63	0.63	0.73	0.63
-# Silvermont	5.75/3.54	3.56	4.12	3.87(*)
-# Bulldozer	5.77/0.70	0.72	0.90	0.70
+# * Bridge	5.07/0.74	0.75	0.90	0.85	0.98
+# Haswell	4.44/0.63	0.63	0.73	0.63	0.70
+# Skylake	2.62/0.63	0.63	0.63	0.63
+# Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
+# Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
+# Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
 #
 # (*)	Atom Silvermont ECB result is suboptimal because of penalties
 #	incurred by operations on %xmm8-15. As ECB is not considered
@@ -299,7 +313,7 @@
 # on 2x subroutine on Atom Silvermont account. For processors that
 # can schedule aes[enc|dec] every cycle optimal interleave factor
 # equals to corresponding instructions latency. 8x is optimal for
-# * Bridge and "super-optimal" for other Intel CPUs... 
+# * Bridge and "super-optimal" for other Intel CPUs...
 
 sub aesni_generate2 {
 my $dir=shift;
@@ -1158,7 +1172,7 @@
 # with zero-round key xor.
 {
 my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
-my ($key0,$ctr)=("${key_}d","${ivp}d");
+my ($key0,$ctr)=("%ebp","${ivp}d");
 my $frame_size = 0x80 + ($win64?160:0);
 
 $code.=<<___;
@@ -1187,26 +1201,25 @@
 
 .align	16
 .Lctr32_bulk:
-	lea	(%rsp),%rax
+	lea	(%rsp),$key_			# use $key_ as frame pointer
 	push	%rbp
 	sub	\$$frame_size,%rsp
 	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
 ___
 $code.=<<___ if ($win64);
-	movaps	%xmm6,-0xa8(%rax)		# offload everything
-	movaps	%xmm7,-0x98(%rax)
-	movaps	%xmm8,-0x88(%rax)
-	movaps	%xmm9,-0x78(%rax)
-	movaps	%xmm10,-0x68(%rax)
-	movaps	%xmm11,-0x58(%rax)
-	movaps	%xmm12,-0x48(%rax)
-	movaps	%xmm13,-0x38(%rax)
-	movaps	%xmm14,-0x28(%rax)
-	movaps	%xmm15,-0x18(%rax)
+	movaps	%xmm6,-0xa8($key_)		# offload everything
+	movaps	%xmm7,-0x98($key_)
+	movaps	%xmm8,-0x88($key_)
+	movaps	%xmm9,-0x78($key_)
+	movaps	%xmm10,-0x68($key_)
+	movaps	%xmm11,-0x58($key_)
+	movaps	%xmm12,-0x48($key_)
+	movaps	%xmm13,-0x38($key_)
+	movaps	%xmm14,-0x28($key_)
+	movaps	%xmm15,-0x18($key_)
 .Lctr32_body:
 ___
 $code.=<<___;
-	lea	-8(%rax),%rbp
 
 	# 8 16-byte words on top of stack are counter values
 	# xor-ed with zero-round key
@@ -1258,7 +1271,7 @@
 	lea	7($ctr),%r9
 	 mov	%r10d,0x60+12(%rsp)
 	bswap	%r9d
-	 mov	OPENSSL_ia32cap_P+4(%rip),%r10d 
+	 mov	OPENSSL_ia32cap_P+4(%rip),%r10d
 	xor	$key0,%r9d
 	 and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
 	mov	%r9d,0x70+12(%rsp)
@@ -1538,7 +1551,7 @@
 
 .Lctr32_tail:
 	# note that at this point $inout0..5 are populated with
-	# counter values xor-ed with 0-round key 
+	# counter values xor-ed with 0-round key
 	lea	16($key),$key
 	cmp	\$4,$len
 	jb	.Lctr32_loop3
@@ -1678,26 +1691,26 @@
 	pxor	%xmm15,%xmm15
 ___
 $code.=<<___ if ($win64);
-	movaps	-0xa0(%rbp),%xmm6
-	movaps	%xmm0,-0xa0(%rbp)		# clear stack
-	movaps	-0x90(%rbp),%xmm7
-	movaps	%xmm0,-0x90(%rbp)
-	movaps	-0x80(%rbp),%xmm8
-	movaps	%xmm0,-0x80(%rbp)
-	movaps	-0x70(%rbp),%xmm9
-	movaps	%xmm0,-0x70(%rbp)
-	movaps	-0x60(%rbp),%xmm10
-	movaps	%xmm0,-0x60(%rbp)
-	movaps	-0x50(%rbp),%xmm11
-	movaps	%xmm0,-0x50(%rbp)
-	movaps	-0x40(%rbp),%xmm12
-	movaps	%xmm0,-0x40(%rbp)
-	movaps	-0x30(%rbp),%xmm13
-	movaps	%xmm0,-0x30(%rbp)
-	movaps	-0x20(%rbp),%xmm14
-	movaps	%xmm0,-0x20(%rbp)
-	movaps	-0x10(%rbp),%xmm15
-	movaps	%xmm0,-0x10(%rbp)
+	movaps	-0xa8($key_),%xmm6
+	movaps	%xmm0,-0xa8($key_)		# clear stack
+	movaps	-0x98($key_),%xmm7
+	movaps	%xmm0,-0x98($key_)
+	movaps	-0x88($key_),%xmm8
+	movaps	%xmm0,-0x88($key_)
+	movaps	-0x78($key_),%xmm9
+	movaps	%xmm0,-0x78($key_)
+	movaps	-0x68($key_),%xmm10
+	movaps	%xmm0,-0x68($key_)
+	movaps	-0x58($key_),%xmm11
+	movaps	%xmm0,-0x58($key_)
+	movaps	-0x48($key_),%xmm12
+	movaps	%xmm0,-0x48($key_)
+	movaps	-0x38($key_),%xmm13
+	movaps	%xmm0,-0x38($key_)
+	movaps	-0x28($key_),%xmm14
+	movaps	%xmm0,-0x28($key_)
+	movaps	-0x18($key_),%xmm15
+	movaps	%xmm0,-0x18($key_)
 	movaps	%xmm0,0x00(%rsp)
 	movaps	%xmm0,0x10(%rsp)
 	movaps	%xmm0,0x20(%rsp)
@@ -1708,8 +1721,8 @@
 	movaps	%xmm0,0x70(%rsp)
 ___
 $code.=<<___;
-	lea	(%rbp),%rsp
-	pop	%rbp
+	mov	-8($key_),%rbp
+	lea	($key_),%rsp
 .Lctr32_epilogue:
 	ret
 .size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
@@ -1726,32 +1739,32 @@
 my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
 my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
 my $frame_size = 0x70 + ($win64?160:0);
+my $key_ = "%rbp";	# override so that we can use %r11 as FP
 
 $code.=<<___;
 .globl	aesni_xts_encrypt
 .type	aesni_xts_encrypt,\@function,6
 .align	16
 aesni_xts_encrypt:
-	lea	(%rsp),%rax
+	lea	(%rsp),%r11			# frame pointer
 	push	%rbp
 	sub	\$$frame_size,%rsp
 	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
 ___
 $code.=<<___ if ($win64);
-	movaps	%xmm6,-0xa8(%rax)		# offload everything
-	movaps	%xmm7,-0x98(%rax)
-	movaps	%xmm8,-0x88(%rax)
-	movaps	%xmm9,-0x78(%rax)
-	movaps	%xmm10,-0x68(%rax)
-	movaps	%xmm11,-0x58(%rax)
-	movaps	%xmm12,-0x48(%rax)
-	movaps	%xmm13,-0x38(%rax)
-	movaps	%xmm14,-0x28(%rax)
-	movaps	%xmm15,-0x18(%rax)
+	movaps	%xmm6,-0xa8(%r11)		# offload everything
+	movaps	%xmm7,-0x98(%r11)
+	movaps	%xmm8,-0x88(%r11)
+	movaps	%xmm9,-0x78(%r11)
+	movaps	%xmm10,-0x68(%r11)
+	movaps	%xmm11,-0x58(%r11)
+	movaps	%xmm12,-0x48(%r11)
+	movaps	%xmm13,-0x38(%r11)
+	movaps	%xmm14,-0x28(%r11)
+	movaps	%xmm15,-0x18(%r11)
 .Lxts_enc_body:
 ___
 $code.=<<___;
-	lea	-8(%rax),%rbp
 	movups	($ivp),$inout0			# load clear-text tweak
 	mov	240(%r8),$rounds		# key2->rounds
 	mov	240($key),$rnds_		# key1->rounds
@@ -2169,26 +2182,26 @@
 	pxor	%xmm15,%xmm15
 ___
 $code.=<<___ if ($win64);
-	movaps	-0xa0(%rbp),%xmm6
-	movaps	%xmm0,-0xa0(%rbp)		# clear stack
-	movaps	-0x90(%rbp),%xmm7
-	movaps	%xmm0,-0x90(%rbp)
-	movaps	-0x80(%rbp),%xmm8
-	movaps	%xmm0,-0x80(%rbp)
-	movaps	-0x70(%rbp),%xmm9
-	movaps	%xmm0,-0x70(%rbp)
-	movaps	-0x60(%rbp),%xmm10
-	movaps	%xmm0,-0x60(%rbp)
-	movaps	-0x50(%rbp),%xmm11
-	movaps	%xmm0,-0x50(%rbp)
-	movaps	-0x40(%rbp),%xmm12
-	movaps	%xmm0,-0x40(%rbp)
-	movaps	-0x30(%rbp),%xmm13
-	movaps	%xmm0,-0x30(%rbp)
-	movaps	-0x20(%rbp),%xmm14
-	movaps	%xmm0,-0x20(%rbp)
-	movaps	-0x10(%rbp),%xmm15
-	movaps	%xmm0,-0x10(%rbp)
+	movaps	-0xa8(%r11),%xmm6
+	movaps	%xmm0,-0xa8(%r11)		# clear stack
+	movaps	-0x98(%r11),%xmm7
+	movaps	%xmm0,-0x98(%r11)
+	movaps	-0x88(%r11),%xmm8
+	movaps	%xmm0,-0x88(%r11)
+	movaps	-0x78(%r11),%xmm9
+	movaps	%xmm0,-0x78(%r11)
+	movaps	-0x68(%r11),%xmm10
+	movaps	%xmm0,-0x68(%r11)
+	movaps	-0x58(%r11),%xmm11
+	movaps	%xmm0,-0x58(%r11)
+	movaps	-0x48(%r11),%xmm12
+	movaps	%xmm0,-0x48(%r11)
+	movaps	-0x38(%r11),%xmm13
+	movaps	%xmm0,-0x38(%r11)
+	movaps	-0x28(%r11),%xmm14
+	movaps	%xmm0,-0x28(%r11)
+	movaps	-0x18(%r11),%xmm15
+	movaps	%xmm0,-0x18(%r11)
 	movaps	%xmm0,0x00(%rsp)
 	movaps	%xmm0,0x10(%rsp)
 	movaps	%xmm0,0x20(%rsp)
@@ -2198,8 +2211,8 @@
 	movaps	%xmm0,0x60(%rsp)
 ___
 $code.=<<___;
-	lea	(%rbp),%rsp
-	pop	%rbp
+	mov	-8(%r11),%rbp
+	lea	(%r11),%rsp
 .Lxts_enc_epilogue:
 	ret
 .size	aesni_xts_encrypt,.-aesni_xts_encrypt
@@ -2210,26 +2223,25 @@
 .type	aesni_xts_decrypt,\@function,6
 .align	16
 aesni_xts_decrypt:
-	lea	(%rsp),%rax
+	lea	(%rsp),%r11			# frame pointer
 	push	%rbp
 	sub	\$$frame_size,%rsp
 	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
 ___
 $code.=<<___ if ($win64);
-	movaps	%xmm6,-0xa8(%rax)		# offload everything
-	movaps	%xmm7,-0x98(%rax)
-	movaps	%xmm8,-0x88(%rax)
-	movaps	%xmm9,-0x78(%rax)
-	movaps	%xmm10,-0x68(%rax)
-	movaps	%xmm11,-0x58(%rax)
-	movaps	%xmm12,-0x48(%rax)
-	movaps	%xmm13,-0x38(%rax)
-	movaps	%xmm14,-0x28(%rax)
-	movaps	%xmm15,-0x18(%rax)
+	movaps	%xmm6,-0xa8(%r11)		# offload everything
+	movaps	%xmm7,-0x98(%r11)
+	movaps	%xmm8,-0x88(%r11)
+	movaps	%xmm9,-0x78(%r11)
+	movaps	%xmm10,-0x68(%r11)
+	movaps	%xmm11,-0x58(%r11)
+	movaps	%xmm12,-0x48(%r11)
+	movaps	%xmm13,-0x38(%r11)
+	movaps	%xmm14,-0x28(%r11)
+	movaps	%xmm15,-0x18(%r11)
 .Lxts_dec_body:
 ___
 $code.=<<___;
-	lea	-8(%rax),%rbp
 	movups	($ivp),$inout0			# load clear-text tweak
 	mov	240($key2),$rounds		# key2->rounds
 	mov	240($key),$rnds_		# key1->rounds
@@ -2673,26 +2685,26 @@
 	pxor	%xmm15,%xmm15
 ___
 $code.=<<___ if ($win64);
-	movaps	-0xa0(%rbp),%xmm6
-	movaps	%xmm0,-0xa0(%rbp)		# clear stack
-	movaps	-0x90(%rbp),%xmm7
-	movaps	%xmm0,-0x90(%rbp)
-	movaps	-0x80(%rbp),%xmm8
-	movaps	%xmm0,-0x80(%rbp)
-	movaps	-0x70(%rbp),%xmm9
-	movaps	%xmm0,-0x70(%rbp)
-	movaps	-0x60(%rbp),%xmm10
-	movaps	%xmm0,-0x60(%rbp)
-	movaps	-0x50(%rbp),%xmm11
-	movaps	%xmm0,-0x50(%rbp)
-	movaps	-0x40(%rbp),%xmm12
-	movaps	%xmm0,-0x40(%rbp)
-	movaps	-0x30(%rbp),%xmm13
-	movaps	%xmm0,-0x30(%rbp)
-	movaps	-0x20(%rbp),%xmm14
-	movaps	%xmm0,-0x20(%rbp)
-	movaps	-0x10(%rbp),%xmm15
-	movaps	%xmm0,-0x10(%rbp)
+	movaps	-0xa8(%r11),%xmm6
+	movaps	%xmm0,-0xa8(%r11)		# clear stack
+	movaps	-0x98(%r11),%xmm7
+	movaps	%xmm0,-0x98(%r11)
+	movaps	-0x88(%r11),%xmm8
+	movaps	%xmm0,-0x88(%r11)
+	movaps	-0x78(%r11),%xmm9
+	movaps	%xmm0,-0x78(%r11)
+	movaps	-0x68(%r11),%xmm10
+	movaps	%xmm0,-0x68(%r11)
+	movaps	-0x58(%r11),%xmm11
+	movaps	%xmm0,-0x58(%r11)
+	movaps	-0x48(%r11),%xmm12
+	movaps	%xmm0,-0x48(%r11)
+	movaps	-0x38(%r11),%xmm13
+	movaps	%xmm0,-0x38(%r11)
+	movaps	-0x28(%r11),%xmm14
+	movaps	%xmm0,-0x28(%r11)
+	movaps	-0x18(%r11),%xmm15
+	movaps	%xmm0,-0x18(%r11)
 	movaps	%xmm0,0x00(%rsp)
 	movaps	%xmm0,0x10(%rsp)
 	movaps	%xmm0,0x20(%rsp)
@@ -2702,12 +2714,933 @@
 	movaps	%xmm0,0x60(%rsp)
 ___
 $code.=<<___;
-	lea	(%rbp),%rsp
-	pop	%rbp
+	mov	-8(%r11),%rbp
+	lea	(%r11),%rsp
 .Lxts_dec_epilogue:
 	ret
 .size	aesni_xts_decrypt,.-aesni_xts_decrypt
 ___
+}
+
+######################################################################
+# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
+#	const AES_KEY *key, unsigned int start_block_num,
+#	unsigned char offset_i[16], const unsigned char L_[][16],
+#	unsigned char checksum[16]);
+#
+{
+my @offset=map("%xmm$_",(10..15));
+my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
+my ($block_num,$offset_p)=("%r8","%r9");		# 5th and 6th arguments
+my ($L_p,$checksum_p) = ("%rbx","%rbp");
+my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
+my $seventh_arg = $win64 ? 56 : 8;
+my $blocks = $len;
+
+$code.=<<___;
+.globl	aesni_ocb_encrypt
+.type	aesni_ocb_encrypt,\@function,6
+.align	32
+aesni_ocb_encrypt:
+	lea	(%rsp),%rax
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+___
+$code.=<<___ if ($win64);
+	lea	-0xa0(%rsp),%rsp
+	movaps	%xmm6,0x00(%rsp)		# offload everything
+	movaps	%xmm7,0x10(%rsp)
+	movaps	%xmm8,0x20(%rsp)
+	movaps	%xmm9,0x30(%rsp)
+	movaps	%xmm10,0x40(%rsp)
+	movaps	%xmm11,0x50(%rsp)
+	movaps	%xmm12,0x60(%rsp)
+	movaps	%xmm13,0x70(%rsp)
+	movaps	%xmm14,0x80(%rsp)
+	movaps	%xmm15,0x90(%rsp)
+.Locb_enc_body:
+___
+$code.=<<___;
+	mov	$seventh_arg(%rax),$L_p		# 7th argument
+	mov	$seventh_arg+8(%rax),$checksum_p	# 8th argument
+
+	mov	240($key),$rnds_
+	mov	$key,$key_
+	shl	\$4,$rnds_
+	$movkey	($key),$rndkey0l		# round[0]
+	$movkey	16($key,$rnds_),$rndkey1	# round[last]
+
+	movdqu	($offset_p),@offset[5]		# load last offset_i
+	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
+	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
+
+	mov	\$16+32,$rounds
+	lea	32($key_,$rnds_),$key
+	$movkey	16($key_),$rndkey1		# round[1]
+	sub	%r10,%rax			# twisted $rounds
+	mov	%rax,%r10			# backup twisted $rounds
+
+	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
+	movdqu	($checksum_p),$checksum		# load checksum
+
+	test	\$1,$block_num			# is first block number odd?
+	jnz	.Locb_enc_odd
+
+	bsf	$block_num,$i1
+	add	\$1,$block_num
+	shl	\$4,$i1
+	movdqu	($L_p,$i1),$inout5		# borrow
+	movdqu	($inp),$inout0
+	lea	16($inp),$inp
+
+	call	__ocb_encrypt1
+
+	movdqa	$inout5,@offset[5]
+	movups	$inout0,($out)
+	lea	16($out),$out
+	sub	\$1,$blocks
+	jz	.Locb_enc_done
+
+.Locb_enc_odd:
+	lea	1($block_num),$i1		# even-numbered blocks
+	lea	3($block_num),$i3
+	lea	5($block_num),$i5
+	lea	6($block_num),$block_num
+	bsf	$i1,$i1				# ntz(block)
+	bsf	$i3,$i3
+	bsf	$i5,$i5
+	shl	\$4,$i1				# ntz(block) -> table offset
+	shl	\$4,$i3
+	shl	\$4,$i5
+
+	sub	\$6,$blocks
+	jc	.Locb_enc_short
+	jmp	.Locb_enc_grandloop
+
+.align	32
+.Locb_enc_grandloop:
+	movdqu	`16*0`($inp),$inout0		# load input
+	movdqu	`16*1`($inp),$inout1
+	movdqu	`16*2`($inp),$inout2
+	movdqu	`16*3`($inp),$inout3
+	movdqu	`16*4`($inp),$inout4
+	movdqu	`16*5`($inp),$inout5
+	lea	`16*6`($inp),$inp
+
+	call	__ocb_encrypt6
+
+	movups	$inout0,`16*0`($out)		# store output
+	movups	$inout1,`16*1`($out)
+	movups	$inout2,`16*2`($out)
+	movups	$inout3,`16*3`($out)
+	movups	$inout4,`16*4`($out)
+	movups	$inout5,`16*5`($out)
+	lea	`16*6`($out),$out
+	sub	\$6,$blocks
+	jnc	.Locb_enc_grandloop
+
+.Locb_enc_short:
+	add	\$6,$blocks
+	jz	.Locb_enc_done
+
+	movdqu	`16*0`($inp),$inout0
+	cmp	\$2,$blocks
+	jb	.Locb_enc_one
+	movdqu	`16*1`($inp),$inout1
+	je	.Locb_enc_two
+
+	movdqu	`16*2`($inp),$inout2
+	cmp	\$4,$blocks
+	jb	.Locb_enc_three
+	movdqu	`16*3`($inp),$inout3
+	je	.Locb_enc_four
+
+	movdqu	`16*4`($inp),$inout4
+	pxor	$inout5,$inout5
+
+	call	__ocb_encrypt6
+
+	movdqa	@offset[4],@offset[5]
+	movups	$inout0,`16*0`($out)
+	movups	$inout1,`16*1`($out)
+	movups	$inout2,`16*2`($out)
+	movups	$inout3,`16*3`($out)
+	movups	$inout4,`16*4`($out)
+
+	jmp	.Locb_enc_done
+
+.align	16
+.Locb_enc_one:
+	movdqa	@offset[0],$inout5		# borrow
+
+	call	__ocb_encrypt1
+
+	movdqa	$inout5,@offset[5]
+	movups	$inout0,`16*0`($out)
+	jmp	.Locb_enc_done
+
+.align	16
+.Locb_enc_two:
+	pxor	$inout2,$inout2
+	pxor	$inout3,$inout3
+
+	call	__ocb_encrypt4
+
+	movdqa	@offset[1],@offset[5]
+	movups	$inout0,`16*0`($out)
+	movups	$inout1,`16*1`($out)
+
+	jmp	.Locb_enc_done
+
+.align	16
+.Locb_enc_three:
+	pxor	$inout3,$inout3
+
+	call	__ocb_encrypt4
+
+	movdqa	@offset[2],@offset[5]
+	movups	$inout0,`16*0`($out)
+	movups	$inout1,`16*1`($out)
+	movups	$inout2,`16*2`($out)
+
+	jmp	.Locb_enc_done
+
+.align	16
+.Locb_enc_four:
+	call	__ocb_encrypt4
+
+	movdqa	@offset[3],@offset[5]
+	movups	$inout0,`16*0`($out)
+	movups	$inout1,`16*1`($out)
+	movups	$inout2,`16*2`($out)
+	movups	$inout3,`16*3`($out)
+
+.Locb_enc_done:
+	pxor	$rndkey0,@offset[5]		# "remove" round[last]
+	movdqu	$checksum,($checksum_p)		# store checksum
+	movdqu	@offset[5],($offset_p)		# store last offset_i
+
+	xorps	%xmm0,%xmm0			# clear register bank
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
+	pxor	%xmm10,%xmm10
+	pxor	%xmm11,%xmm11
+	pxor	%xmm12,%xmm12
+	pxor	%xmm13,%xmm13
+	pxor	%xmm14,%xmm14
+	pxor	%xmm15,%xmm15
+	lea	0x28(%rsp),%rax
+___
+$code.=<<___ if ($win64);
+	movaps	0x00(%rsp),%xmm6
+	movaps	%xmm0,0x00(%rsp)		# clear stack
+	movaps	0x10(%rsp),%xmm7
+	movaps	%xmm0,0x10(%rsp)
+	movaps	0x20(%rsp),%xmm8
+	movaps	%xmm0,0x20(%rsp)
+	movaps	0x30(%rsp),%xmm9
+	movaps	%xmm0,0x30(%rsp)
+	movaps	0x40(%rsp),%xmm10
+	movaps	%xmm0,0x40(%rsp)
+	movaps	0x50(%rsp),%xmm11
+	movaps	%xmm0,0x50(%rsp)
+	movaps	0x60(%rsp),%xmm12
+	movaps	%xmm0,0x60(%rsp)
+	movaps	0x70(%rsp),%xmm13
+	movaps	%xmm0,0x70(%rsp)
+	movaps	0x80(%rsp),%xmm14
+	movaps	%xmm0,0x80(%rsp)
+	movaps	0x90(%rsp),%xmm15
+	movaps	%xmm0,0x90(%rsp)
+	lea	0xa0+0x28(%rsp),%rax
+.Locb_enc_pop:
+___
+$code.=<<___;
+	mov	-40(%rax),%r14
+	mov	-32(%rax),%r13
+	mov	-24(%rax),%r12
+	mov	-16(%rax),%rbp
+	mov	-8(%rax),%rbx
+	lea	(%rax),%rsp
+.Locb_enc_epilogue:
+	ret
+.size	aesni_ocb_encrypt,.-aesni_ocb_encrypt
+
+.type	__ocb_encrypt6,\@abi-omnipotent
+.align	32
+__ocb_encrypt6:
+	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
+	 movdqu		($L_p,$i1),@offset[1]
+	 movdqa		@offset[0],@offset[2]
+	 movdqu		($L_p,$i3),@offset[3]
+	 movdqa		@offset[0],@offset[4]
+	 pxor		@offset[5],@offset[0]
+	 movdqu		($L_p,$i5),@offset[5]
+	 pxor		@offset[0],@offset[1]
+	pxor		$inout0,$checksum	# accumulate checksum
+	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
+	 pxor		@offset[1],@offset[2]
+	pxor		$inout1,$checksum
+	pxor		@offset[1],$inout1
+	 pxor		@offset[2],@offset[3]
+	pxor		$inout2,$checksum
+	pxor		@offset[2],$inout2
+	 pxor		@offset[3],@offset[4]
+	pxor		$inout3,$checksum
+	pxor		@offset[3],$inout3
+	 pxor		@offset[4],@offset[5]
+	pxor		$inout4,$checksum
+	pxor		@offset[4],$inout4
+	pxor		$inout5,$checksum
+	pxor		@offset[5],$inout5
+	$movkey		32($key_),$rndkey0
+
+	lea		1($block_num),$i1	# even-numbered blocks
+	lea		3($block_num),$i3
+	lea		5($block_num),$i5
+	add		\$6,$block_num
+	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
+	bsf		$i1,$i1			# ntz(block)
+	bsf		$i3,$i3
+	bsf		$i5,$i5
+
+	aesenc		$rndkey1,$inout0
+	aesenc		$rndkey1,$inout1
+	aesenc		$rndkey1,$inout2
+	aesenc		$rndkey1,$inout3
+	 pxor		$rndkey0l,@offset[1]
+	 pxor		$rndkey0l,@offset[2]
+	aesenc		$rndkey1,$inout4
+	 pxor		$rndkey0l,@offset[3]
+	 pxor		$rndkey0l,@offset[4]
+	aesenc		$rndkey1,$inout5
+	$movkey		48($key_),$rndkey1
+	 pxor		$rndkey0l,@offset[5]
+
+	aesenc		$rndkey0,$inout0
+	aesenc		$rndkey0,$inout1
+	aesenc		$rndkey0,$inout2
+	aesenc		$rndkey0,$inout3
+	aesenc		$rndkey0,$inout4
+	aesenc		$rndkey0,$inout5
+	$movkey		64($key_),$rndkey0
+	shl		\$4,$i1			# ntz(block) -> table offset
+	shl		\$4,$i3
+	jmp		.Locb_enc_loop6
+
+.align	32
+.Locb_enc_loop6:
+	aesenc		$rndkey1,$inout0
+	aesenc		$rndkey1,$inout1
+	aesenc		$rndkey1,$inout2
+	aesenc		$rndkey1,$inout3
+	aesenc		$rndkey1,$inout4
+	aesenc		$rndkey1,$inout5
+	$movkey		($key,%rax),$rndkey1
+	add		\$32,%rax
+
+	aesenc		$rndkey0,$inout0
+	aesenc		$rndkey0,$inout1
+	aesenc		$rndkey0,$inout2
+	aesenc		$rndkey0,$inout3
+	aesenc		$rndkey0,$inout4
+	aesenc		$rndkey0,$inout5
+	$movkey		-16($key,%rax),$rndkey0
+	jnz		.Locb_enc_loop6
+
+	aesenc		$rndkey1,$inout0
+	aesenc		$rndkey1,$inout1
+	aesenc		$rndkey1,$inout2
+	aesenc		$rndkey1,$inout3
+	aesenc		$rndkey1,$inout4
+	aesenc		$rndkey1,$inout5
+	$movkey		16($key_),$rndkey1
+	shl		\$4,$i5
+
+	aesenclast	@offset[0],$inout0
+	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
+	mov		%r10,%rax		# restore twisted rounds
+	aesenclast	@offset[1],$inout1
+	aesenclast	@offset[2],$inout2
+	aesenclast	@offset[3],$inout3
+	aesenclast	@offset[4],$inout4
+	aesenclast	@offset[5],$inout5
+	ret
+.size	__ocb_encrypt6,.-__ocb_encrypt6
+
+.type	__ocb_encrypt4,\@abi-omnipotent
+.align	32
+__ocb_encrypt4:
+	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
+	 movdqu		($L_p,$i1),@offset[1]
+	 movdqa		@offset[0],@offset[2]
+	 movdqu		($L_p,$i3),@offset[3]
+	 pxor		@offset[5],@offset[0]
+	 pxor		@offset[0],@offset[1]
+	pxor		$inout0,$checksum	# accumulate checksum
+	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
+	 pxor		@offset[1],@offset[2]
+	pxor		$inout1,$checksum
+	pxor		@offset[1],$inout1
+	 pxor		@offset[2],@offset[3]
+	pxor		$inout2,$checksum
+	pxor		@offset[2],$inout2
+	pxor		$inout3,$checksum
+	pxor		@offset[3],$inout3
+	$movkey		32($key_),$rndkey0
+
+	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
+	 pxor		$rndkey0l,@offset[1]
+	 pxor		$rndkey0l,@offset[2]
+	 pxor		$rndkey0l,@offset[3]
+
+	aesenc		$rndkey1,$inout0
+	aesenc		$rndkey1,$inout1
+	aesenc		$rndkey1,$inout2
+	aesenc		$rndkey1,$inout3
+	$movkey		48($key_),$rndkey1
+
+	aesenc		$rndkey0,$inout0
+	aesenc		$rndkey0,$inout1
+	aesenc		$rndkey0,$inout2
+	aesenc		$rndkey0,$inout3
+	$movkey		64($key_),$rndkey0
+	jmp		.Locb_enc_loop4
+
+.align	32
+.Locb_enc_loop4:
+	aesenc		$rndkey1,$inout0
+	aesenc		$rndkey1,$inout1
+	aesenc		$rndkey1,$inout2
+	aesenc		$rndkey1,$inout3
+	$movkey		($key,%rax),$rndkey1
+	add		\$32,%rax
+
+	aesenc		$rndkey0,$inout0
+	aesenc		$rndkey0,$inout1
+	aesenc		$rndkey0,$inout2
+	aesenc		$rndkey0,$inout3
+	$movkey		-16($key,%rax),$rndkey0
+	jnz		.Locb_enc_loop4
+
+	aesenc		$rndkey1,$inout0
+	aesenc		$rndkey1,$inout1
+	aesenc		$rndkey1,$inout2
+	aesenc		$rndkey1,$inout3
+	$movkey		16($key_),$rndkey1
+	mov		%r10,%rax		# restore twisted rounds
+
+	aesenclast	@offset[0],$inout0
+	aesenclast	@offset[1],$inout1
+	aesenclast	@offset[2],$inout2
+	aesenclast	@offset[3],$inout3
+	ret
+.size	__ocb_encrypt4,.-__ocb_encrypt4
+
+.type	__ocb_encrypt1,\@abi-omnipotent
+.align	32
+__ocb_encrypt1:
+	 pxor		@offset[5],$inout5	# offset_i
+	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
+	pxor		$inout0,$checksum	# accumulate checksum
+	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
+	$movkey		32($key_),$rndkey0
+
+	aesenc		$rndkey1,$inout0
+	$movkey		48($key_),$rndkey1
+	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]
+
+	aesenc		$rndkey0,$inout0
+	$movkey		64($key_),$rndkey0
+	jmp		.Locb_enc_loop1
+
+.align	32
+.Locb_enc_loop1:
+	aesenc		$rndkey1,$inout0
+	$movkey		($key,%rax),$rndkey1
+	add		\$32,%rax
+
+	aesenc		$rndkey0,$inout0
+	$movkey		-16($key,%rax),$rndkey0
+	jnz		.Locb_enc_loop1
+
+	aesenc		$rndkey1,$inout0
+	$movkey		16($key_),$rndkey1	# redundant in tail
+	mov		%r10,%rax		# restore twisted rounds
+
+	aesenclast	$inout5,$inout0
+	ret
+.size	__ocb_encrypt1,.-__ocb_encrypt1
+
+.globl	aesni_ocb_decrypt
+.type	aesni_ocb_decrypt,\@function,6
+.align	32
+aesni_ocb_decrypt:
+	lea	(%rsp),%rax
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+___
+$code.=<<___ if ($win64);
+	lea	-0xa0(%rsp),%rsp
+	movaps	%xmm6,0x00(%rsp)		# offload everything
+	movaps	%xmm7,0x10(%rsp)
+	movaps	%xmm8,0x20(%rsp)
+	movaps	%xmm9,0x30(%rsp)
+	movaps	%xmm10,0x40(%rsp)
+	movaps	%xmm11,0x50(%rsp)
+	movaps	%xmm12,0x60(%rsp)
+	movaps	%xmm13,0x70(%rsp)
+	movaps	%xmm14,0x80(%rsp)
+	movaps	%xmm15,0x90(%rsp)
+.Locb_dec_body:
+___
+$code.=<<___;
+	mov	$seventh_arg(%rax),$L_p		# 7th argument
+	mov	$seventh_arg+8(%rax),$checksum_p	# 8th argument
+
+	mov	240($key),$rnds_
+	mov	$key,$key_
+	shl	\$4,$rnds_
+	$movkey	($key),$rndkey0l		# round[0]
+	$movkey	16($key,$rnds_),$rndkey1	# round[last]
+
+	movdqu	($offset_p),@offset[5]		# load last offset_i
+	pxor	$rndkey1,$rndkey0l		# round[0] ^ round[last]
+	pxor	$rndkey1,@offset[5]		# offset_i ^ round[last]
+
+	mov	\$16+32,$rounds
+	lea	32($key_,$rnds_),$key
+	$movkey	16($key_),$rndkey1		# round[1]
+	sub	%r10,%rax			# twisted $rounds
+	mov	%rax,%r10			# backup twisted $rounds
+
+	movdqu	($L_p),@offset[0]		# L_0 for all odd-numbered blocks
+	movdqu	($checksum_p),$checksum		# load checksum
+
+	test	\$1,$block_num			# is first block number odd?
+	jnz	.Locb_dec_odd
+
+	bsf	$block_num,$i1
+	add	\$1,$block_num
+	shl	\$4,$i1
+	movdqu	($L_p,$i1),$inout5		# borrow
+	movdqu	($inp),$inout0
+	lea	16($inp),$inp
+
+	call	__ocb_decrypt1
+
+	movdqa	$inout5,@offset[5]
+	movups	$inout0,($out)
+	xorps	$inout0,$checksum		# accumulate checksum
+	lea	16($out),$out
+	sub	\$1,$blocks
+	jz	.Locb_dec_done
+
+.Locb_dec_odd:
+	lea	1($block_num),$i1		# even-numbered blocks
+	lea	3($block_num),$i3
+	lea	5($block_num),$i5
+	lea	6($block_num),$block_num
+	bsf	$i1,$i1				# ntz(block)
+	bsf	$i3,$i3
+	bsf	$i5,$i5
+	shl	\$4,$i1				# ntz(block) -> table offset
+	shl	\$4,$i3
+	shl	\$4,$i5
+
+	sub	\$6,$blocks
+	jc	.Locb_dec_short
+	jmp	.Locb_dec_grandloop
+
+.align	32
+.Locb_dec_grandloop:
+	movdqu	`16*0`($inp),$inout0		# load input
+	movdqu	`16*1`($inp),$inout1
+	movdqu	`16*2`($inp),$inout2
+	movdqu	`16*3`($inp),$inout3
+	movdqu	`16*4`($inp),$inout4
+	movdqu	`16*5`($inp),$inout5
+	lea	`16*6`($inp),$inp
+
+	call	__ocb_decrypt6
+
+	movups	$inout0,`16*0`($out)		# store output
+	pxor	$inout0,$checksum		# accumulate checksum
+	movups	$inout1,`16*1`($out)
+	pxor	$inout1,$checksum
+	movups	$inout2,`16*2`($out)
+	pxor	$inout2,$checksum
+	movups	$inout3,`16*3`($out)
+	pxor	$inout3,$checksum
+	movups	$inout4,`16*4`($out)
+	pxor	$inout4,$checksum
+	movups	$inout5,`16*5`($out)
+	pxor	$inout5,$checksum
+	lea	`16*6`($out),$out
+	sub	\$6,$blocks
+	jnc	.Locb_dec_grandloop
+
+.Locb_dec_short:
+	add	\$6,$blocks
+	jz	.Locb_dec_done
+
+	movdqu	`16*0`($inp),$inout0
+	cmp	\$2,$blocks
+	jb	.Locb_dec_one
+	movdqu	`16*1`($inp),$inout1
+	je	.Locb_dec_two
+
+	movdqu	`16*2`($inp),$inout2
+	cmp	\$4,$blocks
+	jb	.Locb_dec_three
+	movdqu	`16*3`($inp),$inout3
+	je	.Locb_dec_four
+
+	movdqu	`16*4`($inp),$inout4
+	pxor	$inout5,$inout5
+
+	call	__ocb_decrypt6
+
+	movdqa	@offset[4],@offset[5]
+	movups	$inout0,`16*0`($out)		# store output
+	pxor	$inout0,$checksum		# accumulate checksum
+	movups	$inout1,`16*1`($out)
+	pxor	$inout1,$checksum
+	movups	$inout2,`16*2`($out)
+	pxor	$inout2,$checksum
+	movups	$inout3,`16*3`($out)
+	pxor	$inout3,$checksum
+	movups	$inout4,`16*4`($out)
+	pxor	$inout4,$checksum
+
+	jmp	.Locb_dec_done
+
+.align	16
+.Locb_dec_one:
+	movdqa	@offset[0],$inout5		# borrow
+
+	call	__ocb_decrypt1
+
+	movdqa	$inout5,@offset[5]
+	movups	$inout0,`16*0`($out)		# store output
+	xorps	$inout0,$checksum		# accumulate checksum
+	jmp	.Locb_dec_done
+
+.align	16
+.Locb_dec_two:
+	pxor	$inout2,$inout2
+	pxor	$inout3,$inout3
+
+	call	__ocb_decrypt4
+
+	movdqa	@offset[1],@offset[5]
+	movups	$inout0,`16*0`($out)		# store output
+	xorps	$inout0,$checksum		# accumulate checksum
+	movups	$inout1,`16*1`($out)
+	xorps	$inout1,$checksum
+
+	jmp	.Locb_dec_done
+
+.align	16
+.Locb_dec_three:
+	pxor	$inout3,$inout3
+
+	call	__ocb_decrypt4
+
+	movdqa	@offset[2],@offset[5]
+	movups	$inout0,`16*0`($out)		# store output
+	xorps	$inout0,$checksum		# accumulate checksum
+	movups	$inout1,`16*1`($out)
+	xorps	$inout1,$checksum
+	movups	$inout2,`16*2`($out)
+	xorps	$inout2,$checksum
+
+	jmp	.Locb_dec_done
+
+.align	16
+.Locb_dec_four:
+	call	__ocb_decrypt4
+
+	movdqa	@offset[3],@offset[5]
+	movups	$inout0,`16*0`($out)		# store output
+	pxor	$inout0,$checksum		# accumulate checksum
+	movups	$inout1,`16*1`($out)
+	pxor	$inout1,$checksum
+	movups	$inout2,`16*2`($out)
+	pxor	$inout2,$checksum
+	movups	$inout3,`16*3`($out)
+	pxor	$inout3,$checksum
+
+.Locb_dec_done:
+	pxor	$rndkey0,@offset[5]		# "remove" round[last]
+	movdqu	$checksum,($checksum_p)		# store checksum
+	movdqu	@offset[5],($offset_p)		# store last offset_i
+
+	xorps	%xmm0,%xmm0			# clear register bank
+	pxor	%xmm1,%xmm1
+	pxor	%xmm2,%xmm2
+	pxor	%xmm3,%xmm3
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+___
+$code.=<<___ if (!$win64);
+	pxor	%xmm6,%xmm6
+	pxor	%xmm7,%xmm7
+	pxor	%xmm8,%xmm8
+	pxor	%xmm9,%xmm9
+	pxor	%xmm10,%xmm10
+	pxor	%xmm11,%xmm11
+	pxor	%xmm12,%xmm12
+	pxor	%xmm13,%xmm13
+	pxor	%xmm14,%xmm14
+	pxor	%xmm15,%xmm15
+	lea	0x28(%rsp),%rax
+___
+$code.=<<___ if ($win64);
+	movaps	0x00(%rsp),%xmm6
+	movaps	%xmm0,0x00(%rsp)		# clear stack
+	movaps	0x10(%rsp),%xmm7
+	movaps	%xmm0,0x10(%rsp)
+	movaps	0x20(%rsp),%xmm8
+	movaps	%xmm0,0x20(%rsp)
+	movaps	0x30(%rsp),%xmm9
+	movaps	%xmm0,0x30(%rsp)
+	movaps	0x40(%rsp),%xmm10
+	movaps	%xmm0,0x40(%rsp)
+	movaps	0x50(%rsp),%xmm11
+	movaps	%xmm0,0x50(%rsp)
+	movaps	0x60(%rsp),%xmm12
+	movaps	%xmm0,0x60(%rsp)
+	movaps	0x70(%rsp),%xmm13
+	movaps	%xmm0,0x70(%rsp)
+	movaps	0x80(%rsp),%xmm14
+	movaps	%xmm0,0x80(%rsp)
+	movaps	0x90(%rsp),%xmm15
+	movaps	%xmm0,0x90(%rsp)
+	lea	0xa0+0x28(%rsp),%rax
+.Locb_dec_pop:
+___
+$code.=<<___;
+	mov	-40(%rax),%r14
+	mov	-32(%rax),%r13
+	mov	-24(%rax),%r12
+	mov	-16(%rax),%rbp
+	mov	-8(%rax),%rbx
+	lea	(%rax),%rsp
+.Locb_dec_epilogue:
+	ret
+.size	aesni_ocb_decrypt,.-aesni_ocb_decrypt
+
+.type	__ocb_decrypt6,\@abi-omnipotent
+.align	32
+__ocb_decrypt6:
+	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
+	 movdqu		($L_p,$i1),@offset[1]
+	 movdqa		@offset[0],@offset[2]
+	 movdqu		($L_p,$i3),@offset[3]
+	 movdqa		@offset[0],@offset[4]
+	 pxor		@offset[5],@offset[0]
+	 movdqu		($L_p,$i5),@offset[5]
+	 pxor		@offset[0],@offset[1]
+	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
+	 pxor		@offset[1],@offset[2]
+	pxor		@offset[1],$inout1
+	 pxor		@offset[2],@offset[3]
+	pxor		@offset[2],$inout2
+	 pxor		@offset[3],@offset[4]
+	pxor		@offset[3],$inout3
+	 pxor		@offset[4],@offset[5]
+	pxor		@offset[4],$inout4
+	pxor		@offset[5],$inout5
+	$movkey		32($key_),$rndkey0
+
+	lea		1($block_num),$i1	# even-numbered blocks
+	lea		3($block_num),$i3
+	lea		5($block_num),$i5
+	add		\$6,$block_num
+	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
+	bsf		$i1,$i1			# ntz(block)
+	bsf		$i3,$i3
+	bsf		$i5,$i5
+
+	aesdec		$rndkey1,$inout0
+	aesdec		$rndkey1,$inout1
+	aesdec		$rndkey1,$inout2
+	aesdec		$rndkey1,$inout3
+	 pxor		$rndkey0l,@offset[1]
+	 pxor		$rndkey0l,@offset[2]
+	aesdec		$rndkey1,$inout4
+	 pxor		$rndkey0l,@offset[3]
+	 pxor		$rndkey0l,@offset[4]
+	aesdec		$rndkey1,$inout5
+	$movkey		48($key_),$rndkey1
+	 pxor		$rndkey0l,@offset[5]
+
+	aesdec		$rndkey0,$inout0
+	aesdec		$rndkey0,$inout1
+	aesdec		$rndkey0,$inout2
+	aesdec		$rndkey0,$inout3
+	aesdec		$rndkey0,$inout4
+	aesdec		$rndkey0,$inout5
+	$movkey		64($key_),$rndkey0
+	shl		\$4,$i1			# ntz(block) -> table offset
+	shl		\$4,$i3
+	jmp		.Locb_dec_loop6
+
+.align	32
+.Locb_dec_loop6:
+	aesdec		$rndkey1,$inout0
+	aesdec		$rndkey1,$inout1
+	aesdec		$rndkey1,$inout2
+	aesdec		$rndkey1,$inout3
+	aesdec		$rndkey1,$inout4
+	aesdec		$rndkey1,$inout5
+	$movkey		($key,%rax),$rndkey1
+	add		\$32,%rax
+
+	aesdec		$rndkey0,$inout0
+	aesdec		$rndkey0,$inout1
+	aesdec		$rndkey0,$inout2
+	aesdec		$rndkey0,$inout3
+	aesdec		$rndkey0,$inout4
+	aesdec		$rndkey0,$inout5
+	$movkey		-16($key,%rax),$rndkey0
+	jnz		.Locb_dec_loop6
+
+	aesdec		$rndkey1,$inout0
+	aesdec		$rndkey1,$inout1
+	aesdec		$rndkey1,$inout2
+	aesdec		$rndkey1,$inout3
+	aesdec		$rndkey1,$inout4
+	aesdec		$rndkey1,$inout5
+	$movkey		16($key_),$rndkey1
+	shl		\$4,$i5
+
+	aesdeclast	@offset[0],$inout0
+	movdqu		($L_p),@offset[0]	# L_0 for all odd-numbered blocks
+	mov		%r10,%rax		# restore twisted rounds
+	aesdeclast	@offset[1],$inout1
+	aesdeclast	@offset[2],$inout2
+	aesdeclast	@offset[3],$inout3
+	aesdeclast	@offset[4],$inout4
+	aesdeclast	@offset[5],$inout5
+	ret
+.size	__ocb_decrypt6,.-__ocb_decrypt6
+
+.type	__ocb_decrypt4,\@abi-omnipotent
+.align	32
+__ocb_decrypt4:
+	 pxor		$rndkey0l,@offset[5]	# offset_i ^ round[0]
+	 movdqu		($L_p,$i1),@offset[1]
+	 movdqa		@offset[0],@offset[2]
+	 movdqu		($L_p,$i3),@offset[3]
+	 pxor		@offset[5],@offset[0]
+	 pxor		@offset[0],@offset[1]
+	pxor		@offset[0],$inout0	# input ^ round[0] ^ offset_i
+	 pxor		@offset[1],@offset[2]
+	pxor		@offset[1],$inout1
+	 pxor		@offset[2],@offset[3]
+	pxor		@offset[2],$inout2
+	pxor		@offset[3],$inout3
+	$movkey		32($key_),$rndkey0
+
+	 pxor		$rndkey0l,@offset[0]	# offset_i ^ round[last]
+	 pxor		$rndkey0l,@offset[1]
+	 pxor		$rndkey0l,@offset[2]
+	 pxor		$rndkey0l,@offset[3]
+
+	aesdec		$rndkey1,$inout0
+	aesdec		$rndkey1,$inout1
+	aesdec		$rndkey1,$inout2
+	aesdec		$rndkey1,$inout3
+	$movkey		48($key_),$rndkey1
+
+	aesdec		$rndkey0,$inout0
+	aesdec		$rndkey0,$inout1
+	aesdec		$rndkey0,$inout2
+	aesdec		$rndkey0,$inout3
+	$movkey		64($key_),$rndkey0
+	jmp		.Locb_dec_loop4
+
+.align	32
+.Locb_dec_loop4:
+	aesdec		$rndkey1,$inout0
+	aesdec		$rndkey1,$inout1
+	aesdec		$rndkey1,$inout2
+	aesdec		$rndkey1,$inout3
+	$movkey		($key,%rax),$rndkey1
+	add		\$32,%rax
+
+	aesdec		$rndkey0,$inout0
+	aesdec		$rndkey0,$inout1
+	aesdec		$rndkey0,$inout2
+	aesdec		$rndkey0,$inout3
+	$movkey		-16($key,%rax),$rndkey0
+	jnz		.Locb_dec_loop4
+
+	aesdec		$rndkey1,$inout0
+	aesdec		$rndkey1,$inout1
+	aesdec		$rndkey1,$inout2
+	aesdec		$rndkey1,$inout3
+	$movkey		16($key_),$rndkey1
+	mov		%r10,%rax		# restore twisted rounds
+
+	aesdeclast	@offset[0],$inout0
+	aesdeclast	@offset[1],$inout1
+	aesdeclast	@offset[2],$inout2
+	aesdeclast	@offset[3],$inout3
+	ret
+.size	__ocb_decrypt4,.-__ocb_decrypt4
+
+.type	__ocb_decrypt1,\@abi-omnipotent
+.align	32
+__ocb_decrypt1:
+	 pxor		@offset[5],$inout5	# offset_i
+	 pxor		$rndkey0l,$inout5	# offset_i ^ round[0]
+	pxor		$inout5,$inout0		# input ^ round[0] ^ offset_i
+	$movkey		32($key_),$rndkey0
+
+	aesdec		$rndkey1,$inout0
+	$movkey		48($key_),$rndkey1
+	pxor		$rndkey0l,$inout5	# offset_i ^ round[last]
+
+	aesdec		$rndkey0,$inout0
+	$movkey		64($key_),$rndkey0
+	jmp		.Locb_dec_loop1
+
+.align	32
+.Locb_dec_loop1:
+	aesdec		$rndkey1,$inout0
+	$movkey		($key,%rax),$rndkey1
+	add		\$32,%rax
+
+	aesdec		$rndkey0,$inout0
+	$movkey		-16($key,%rax),$rndkey0
+	jnz		.Locb_dec_loop1
+
+	aesdec		$rndkey1,$inout0
+	$movkey		16($key_),$rndkey1	# redundant in tail
+	mov		%r10,%rax		# restore twisted rounds
+
+	aesdeclast	$inout5,$inout0
+	ret
+.size	__ocb_decrypt1,.-__ocb_decrypt1
+___
 } }}
 
 ########################################################################
@@ -2717,7 +3650,6 @@
 {
 my $frame_size = 0x10 + ($win64?0xa0:0);	# used in decrypt
 my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
-my $inp_=$key_;
 
 $code.=<<___;
 .globl	${PREFIX}_cbc_encrypt
@@ -2799,7 +3731,7 @@
 	jmp	.Lcbc_ret
 .align	16
 .Lcbc_decrypt_bulk:
-	lea	(%rsp),%rax
+	lea	(%rsp),%r11		# frame pointer
 	push	%rbp
 	sub	\$$frame_size,%rsp
 	and	\$-16,%rsp	# Linux kernel stack can be incorrectly seeded
@@ -2817,8 +3749,11 @@
 	movaps	%xmm15,0xa0(%rsp)
 .Lcbc_decrypt_body:
 ___
+
+my $inp_=$key_="%rbp";			# reassign $key_
+
 $code.=<<___;
-	lea	-8(%rax),%rbp
+	mov	$key,$key_		# [re-]backup $key [after reassignment]
 	movups	($ivp),$iv
 	mov	$rnds_,$rounds
 	cmp	\$0x50,$len
@@ -2858,7 +3793,7 @@
 	pxor		$rndkey0,$inout1
 	$movkey		0x10-0x70($key),$rndkey1
 	pxor		$rndkey0,$inout2
-	xor		$inp_,$inp_
+	mov		\$-1,$inp_
 	cmp		\$0x70,$len	# is there at least 0x60 bytes ahead?
 	pxor		$rndkey0,$inout3
 	pxor		$rndkey0,$inout4
@@ -2874,8 +3809,8 @@
 	aesdec		$rndkey1,$inout4
 	aesdec		$rndkey1,$inout5
 	aesdec		$rndkey1,$inout6
-	setnc		${inp_}b
-	shl		\$7,$inp_
+	adc		\$0,$inp_
+	and		\$128,$inp_
 	aesdec		$rndkey1,$inout7
 	add		$inp,$inp_
 	$movkey		0x30-0x70($key),$rndkey1
@@ -3239,8 +4174,8 @@
 	movaps	%xmm0,0xa0(%rsp)
 ___
 $code.=<<___;
-	lea	(%rbp),%rsp
-	pop	%rbp
+	mov	-8(%r11),%rbp
+	lea	(%r11),%rsp
 .Lcbc_ret:
 	ret
 .size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
@@ -3307,7 +4242,7 @@
 #	Vinodh Gopal <vinodh.gopal@intel.com>
 #	Kahraman Akdemir
 #
-# Agressively optimized in respect to aeskeygenassist's critical path
+# Aggressively optimized with respect to aeskeygenassist's critical path
 # and is contained in %xmm0-5 to meet Win64 ABI requirement.
 #
 # int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
@@ -3811,14 +4746,76 @@
 	cmp	%r10,%rbx		# context->Rip>=epilogue label
 	jae	.Lcommon_seh_tail
 
-	mov	160($context),%rax	# pull context->Rbp
-	lea	-0xa0(%rax),%rsi	# %xmm save area
+	mov	208($context),%rax	# pull context->R11
+
+	lea	-0xa8(%rax),%rsi	# %xmm save area
 	lea	512($context),%rdi	# & context.Xmm6
 	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
 	.long	0xa548f3fc		# cld; rep movsq
 
-	jmp	.Lcommon_rbp_tail
+	mov	-8(%rax),%rbp		# restore saved %rbp
+	mov	%rbp,160($context)	# restore context->Rbp
+	jmp	.Lcommon_seh_tail
 .size	ctr_xts_se_handler,.-ctr_xts_se_handler
+
+.type	ocb_se_handler,\@abi-omnipotent
+.align	16
+ocb_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	mov	8(%r11),%r10d		# HandlerData[2]
+	lea	(%rsi,%r10),%r10
+	cmp	%r10,%rbx		# context->Rip>=pop label
+	jae	.Locb_no_xmm
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	(%rax),%rsi		# %xmm save area
+	lea	512($context),%rdi	# & context.Xmm6
+	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	0xa0+0x28(%rax),%rax
+
+.Locb_no_xmm:
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+
+	jmp	.Lcommon_seh_tail
+.size	ocb_se_handler,.-ocb_se_handler
 ___
 $code.=<<___;
 .type	cbc_se_handler,\@abi-omnipotent
@@ -3842,9 +4839,13 @@
 	cmp	%r10,%rbx		# context->Rip<"prologue" label
 	jb	.Lcommon_seh_tail
 
+	mov	120($context),%rax	# pull context->Rax
+
 	lea	.Lcbc_decrypt_body(%rip),%r10
 	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
-	jb	.Lrestore_cbc_rax
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
 
 	lea	.Lcbc_ret(%rip),%r10
 	cmp	%r10,%rbx		# context->Rip>="epilogue" label
@@ -3855,15 +4856,10 @@
 	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
 	.long	0xa548f3fc		# cld; rep movsq
 
-.Lcommon_rbp_tail:
-	mov	160($context),%rax	# pull context->Rbp
-	mov	(%rax),%rbp		# restore saved %rbp
-	lea	8(%rax),%rax		# adjust stack pointer
-	mov	%rbp,160($context)	# restore context->Rbp
-	jmp	.Lcommon_seh_tail
+	mov	208($context),%rax	# pull context->R11
 
-.Lrestore_cbc_rax:
-	mov	120($context),%rax
+	mov	-8(%rax),%rbp		# restore saved %rbp
+	mov	%rbp,160($context)	# restore context->Rbp
 
 .Lcommon_seh_tail:
 	mov	8(%rax),%rdi
@@ -3932,6 +4928,14 @@
 	.rva	.LSEH_begin_aesni_xts_decrypt
 	.rva	.LSEH_end_aesni_xts_decrypt
 	.rva	.LSEH_info_xts_dec
+
+	.rva	.LSEH_begin_aesni_ocb_encrypt
+	.rva	.LSEH_end_aesni_ocb_encrypt
+	.rva	.LSEH_info_ocb_enc
+
+	.rva	.LSEH_begin_aesni_ocb_decrypt
+	.rva	.LSEH_end_aesni_ocb_decrypt
+	.rva	.LSEH_info_ocb_dec
 ___
 $code.=<<___;
 	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
@@ -3973,6 +4977,18 @@
 	.byte	9,0,0,0
 	.rva	ctr_xts_se_handler
 	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
+.LSEH_info_ocb_enc:
+	.byte	9,0,0,0
+	.rva	ocb_se_handler
+	.rva	.Locb_enc_body,.Locb_enc_epilogue	# HandlerData[]
+	.rva	.Locb_enc_pop
+	.long	0
+.LSEH_info_ocb_dec:
+	.byte	9,0,0,0
+	.rva	ocb_se_handler
+	.rva	.Locb_dec_body,.Locb_dec_epilogue	# HandlerData[]
+	.rva	.Locb_dec_pop
+	.long	0
 ___
 $code.=<<___;
 .LSEH_info_cbc:
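
The __ocb_{en,de}crypt{1,4,6} subroutines above fold OCB's offset schedule
into the AES round loop: each block's offset is the previous offset XORed
with the precomputed L_{ntz(i)} table entry (the bsf/shl pairs compute the
table index), and the checksum accumulates the XOR of all plaintext blocks.
A minimal C sketch of that bookkeeping, with a hypothetical block type and
L table, not the production data path:

    #include <stdint.h>

    typedef struct { uint8_t b[16]; } block128;   /* hypothetical */

    static void xor_block(block128 *r, const block128 *a) {
        for (int i = 0; i < 16; i++)
            r->b[i] ^= a->b[i];
    }

    /* One step of the OCB schedule for 1-based block number |i|:
     * Offset_i = Offset_{i-1} ^ L_{ntz(i)}, Checksum ^= P_i. */
    static void ocb_step(block128 *offset, block128 *checksum,
                         const block128 *L, const block128 *plaintext,
                         uint64_t i) {
        unsigned ntz = (unsigned)__builtin_ctzll(i);  /* bsf analogue */
        xor_block(offset, &L[ntz]);
        xor_block(checksum, plaintext);
    }
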
diff --git a/src/crypto/aes/asm/aesv8-armx.pl b/src/crypto/aes/asm/aesv8-armx.pl
index f6d0dab..23ed77c 100644
--- a/src/crypto/aes/asm/aesv8-armx.pl
+++ b/src/crypto/aes/asm/aesv8-armx.pl
@@ -957,21 +957,21 @@
 
 	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
 	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
-		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;	
+		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
     }
 
     sub unvdup32 {
 	my $arg=shift;
 
 	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
-	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;	
+	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
     }
 
     sub unvmov32 {
 	my $arg=shift;
 
 	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
-	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;	
+	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
     }
 
     foreach(split("\n",$code)) {
diff --git a/src/crypto/aes/asm/bsaes-armv7.pl b/src/crypto/aes/asm/bsaes-armv7.pl
index 37613e2..d645de4 100644
--- a/src/crypto/aes/asm/bsaes-armv7.pl
+++ b/src/crypto/aes/asm/bsaes-armv7.pl
@@ -84,7 +84,7 @@
 
 sub InBasisChange {
 # input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
 my @b=@_[0..7];
 $code.=<<___;
 	veor	@b[2], @b[2], @b[1]
diff --git a/src/crypto/aes/asm/bsaes-x86_64.pl b/src/crypto/aes/asm/bsaes-x86_64.pl
index 8258f2f..9a8055e 100644
--- a/src/crypto/aes/asm/bsaes-x86_64.pl
+++ b/src/crypto/aes/asm/bsaes-x86_64.pl
@@ -41,6 +41,7 @@
 # Nehalem(**) 	7.63		6.88		+11%
 # Atom	    	17.1		16.4		+4%
 # Silvermont	-		12.9
+# Goldmont	-		8.85
 #
 # (*)	Comparison is not completely fair, because "this" is ECB,
 #	i.e. no extra processing such as counter values calculation
@@ -80,6 +81,7 @@
 # Nehalem	7.80
 # Atom		17.9
 # Silvermont	14.0
+# Goldmont	10.2
 #
 # November 2011.
 #
@@ -122,7 +124,7 @@
 
 sub InBasisChange {
 # input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
-# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
 my @b=@_[0..7];
 $code.=<<___;
 	pxor	@b[6], @b[5]
@@ -372,7 +374,7 @@
 	pxor	@s[0], @t[3]
 	pxor	@s[1], @t[2]
 	pxor	@s[2], @t[1]
-	pxor	@s[3], @t[0] 
+	pxor	@s[3], @t[0]
 
 	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
 
@@ -1325,7 +1327,7 @@
 	cmp	%rax, %rbp
 	jb	.Lecb_enc_bzero
 
-	lea	(%rbp),%rsp		# restore %rsp
+	lea	0x78(%rbp),%rax
 ___
 $code.=<<___ if ($win64);
 	movaps	0x40(%rbp), %xmm6
@@ -1338,17 +1340,17 @@
 	movaps	0xb0(%rbp), %xmm13
 	movaps	0xc0(%rbp), %xmm14
 	movaps	0xd0(%rbp), %xmm15
-	lea	0xa0(%rbp), %rsp
+	lea	0xa0(%rax), %rax
+.Lecb_enc_tail:
 ___
 $code.=<<___;
-	mov	0x48(%rsp), %r15
-	mov	0x50(%rsp), %r14
-	mov	0x58(%rsp), %r13
-	mov	0x60(%rsp), %r12
-	mov	0x68(%rsp), %rbx
-	mov	0x70(%rsp), %rax
-	lea	0x78(%rsp), %rsp
-	mov	%rax, %rbp
+	mov	-48(%rax), %r15
+	mov	-40(%rax), %r14
+	mov	-32(%rax), %r13
+	mov	-24(%rax), %r12
+	mov	-16(%rax), %rbx
+	mov	-8(%rax), %rbp
+	lea	(%rax), %rsp		# restore %rsp
 .Lecb_enc_epilogue:
 	ret
 .size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
@@ -1527,7 +1529,7 @@
 	cmp	%rax, %rbp
 	jb	.Lecb_dec_bzero
 
-	lea	(%rbp),%rsp		# restore %rsp
+	lea	0x78(%rbp),%rax
 ___
 $code.=<<___ if ($win64);
 	movaps	0x40(%rbp), %xmm6
@@ -1540,17 +1542,17 @@
 	movaps	0xb0(%rbp), %xmm13
 	movaps	0xc0(%rbp), %xmm14
 	movaps	0xd0(%rbp), %xmm15
-	lea	0xa0(%rbp), %rsp
+	lea	0xa0(%rax), %rax
+.Lecb_dec_tail:
 ___
 $code.=<<___;
-	mov	0x48(%rsp), %r15
-	mov	0x50(%rsp), %r14
-	mov	0x58(%rsp), %r13
-	mov	0x60(%rsp), %r12
-	mov	0x68(%rsp), %rbx
-	mov	0x70(%rsp), %rax
-	lea	0x78(%rsp), %rsp
-	mov	%rax, %rbp
+	mov	-48(%rax), %r15
+	mov	-40(%rax), %r14
+	mov	-32(%rax), %r13
+	mov	-24(%rax), %r12
+	mov	-16(%rax), %rbx
+	mov	-8(%rax), %rbp
+	lea	(%rax), %rsp		# restore %rsp
 .Lecb_dec_epilogue:
 	ret
 .size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
@@ -1817,7 +1819,7 @@
 	cmp	%rax, %rbp
 	ja	.Lcbc_dec_bzero
 
-	lea	(%rbp),%rsp		# restore %rsp
+	lea	0x78(%rbp),%rax
 ___
 $code.=<<___ if ($win64);
 	movaps	0x40(%rbp), %xmm6
@@ -1830,17 +1832,17 @@
 	movaps	0xb0(%rbp), %xmm13
 	movaps	0xc0(%rbp), %xmm14
 	movaps	0xd0(%rbp), %xmm15
-	lea	0xa0(%rbp), %rsp
+	lea	0xa0(%rax), %rax
+.Lcbc_dec_tail:
 ___
 $code.=<<___;
-	mov	0x48(%rsp), %r15
-	mov	0x50(%rsp), %r14
-	mov	0x58(%rsp), %r13
-	mov	0x60(%rsp), %r12
-	mov	0x68(%rsp), %rbx
-	mov	0x70(%rsp), %rax
-	lea	0x78(%rsp), %rsp
-	mov	%rax, %rbp
+	mov	-48(%rax), %r15
+	mov	-40(%rax), %r14
+	mov	-32(%rax), %r13
+	mov	-24(%rax), %r12
+	mov	-16(%rax), %rbx
+	mov	-8(%rax), %rbp
+	lea	(%rax), %rsp		# restore %rsp
 .Lcbc_dec_epilogue:
 	ret
 .size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
@@ -2049,7 +2051,7 @@
 	cmp	%rax, %rbp
 	ja	.Lctr_enc_bzero
 
-	lea	(%rbp),%rsp		# restore %rsp
+	lea	0x78(%rbp),%rax
 ___
 $code.=<<___ if ($win64);
 	movaps	0x40(%rbp), %xmm6
@@ -2062,17 +2064,17 @@
 	movaps	0xb0(%rbp), %xmm13
 	movaps	0xc0(%rbp), %xmm14
 	movaps	0xd0(%rbp), %xmm15
-	lea	0xa0(%rbp), %rsp
+	lea	0xa0(%rax), %rax
+.Lctr_enc_tail:
 ___
 $code.=<<___;
-	mov	0x48(%rsp), %r15
-	mov	0x50(%rsp), %r14
-	mov	0x58(%rsp), %r13
-	mov	0x60(%rsp), %r12
-	mov	0x68(%rsp), %rbx
-	mov	0x70(%rsp), %rax
-	lea	0x78(%rsp), %rsp
-	mov	%rax, %rbp
+	mov	-48(%rax), %r15
+	mov	-40(%rax), %r14
+	mov	-32(%rax), %r13
+	mov	-24(%rax), %r12
+	mov	-16(%rax), %rbx
+	mov	-8(%rax), %rbp
+	lea	(%rax), %rsp		# restore %rsp
 .Lctr_enc_epilogue:
 	ret
 .size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
@@ -2439,7 +2441,7 @@
 	cmp	%rax, %rbp
 	ja	.Lxts_enc_bzero
 
-	lea	(%rbp),%rsp		# restore %rsp
+	lea	0x78(%rbp),%rax
 ___
 $code.=<<___ if ($win64);
 	movaps	0x40(%rbp), %xmm6
@@ -2452,17 +2454,17 @@
 	movaps	0xb0(%rbp), %xmm13
 	movaps	0xc0(%rbp), %xmm14
 	movaps	0xd0(%rbp), %xmm15
-	lea	0xa0(%rbp), %rsp
+	lea	0xa0(%rax), %rax
+.Lxts_enc_tail:
 ___
 $code.=<<___;
-	mov	0x48(%rsp), %r15
-	mov	0x50(%rsp), %r14
-	mov	0x58(%rsp), %r13
-	mov	0x60(%rsp), %r12
-	mov	0x68(%rsp), %rbx
-	mov	0x70(%rsp), %rax
-	lea	0x78(%rsp), %rsp
-	mov	%rax, %rbp
+	mov	-48(%rax), %r15
+	mov	-40(%rax), %r14
+	mov	-32(%rax), %r13
+	mov	-24(%rax), %r12
+	mov	-16(%rax), %rbx
+	mov	-8(%rax), %rbp
+	lea	(%rax), %rsp		# restore %rsp
 .Lxts_enc_epilogue:
 	ret
 .size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
@@ -2846,7 +2848,7 @@
 	cmp	%rax, %rbp
 	ja	.Lxts_dec_bzero
 
-	lea	(%rbp),%rsp		# restore %rsp
+	lea	0x78(%rbp),%rax
 ___
 $code.=<<___ if ($win64);
 	movaps	0x40(%rbp), %xmm6
@@ -2859,17 +2861,17 @@
 	movaps	0xb0(%rbp), %xmm13
 	movaps	0xc0(%rbp), %xmm14
 	movaps	0xd0(%rbp), %xmm15
-	lea	0xa0(%rbp), %rsp
+	lea	0xa0(%rax), %rax
+.Lxts_dec_tail:
 ___
 $code.=<<___;
-	mov	0x48(%rsp), %r15
-	mov	0x50(%rsp), %r14
-	mov	0x58(%rsp), %r13
-	mov	0x60(%rsp), %r12
-	mov	0x68(%rsp), %rbx
-	mov	0x70(%rsp), %rax
-	lea	0x78(%rsp), %rsp
-	mov	%rax, %rbp
+	mov	-48(%rax), %r15
+	mov	-40(%rax), %r14
+	mov	-32(%rax), %r13
+	mov	-24(%rax), %r12
+	mov	-16(%rax), %rbx
+	mov	-8(%rax), %rbp
+	lea	(%rax), %rsp		# restore %rsp
 .Lxts_dec_epilogue:
 	ret
 .size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
@@ -2965,31 +2967,34 @@
 
 	mov	0(%r11),%r10d		# HandlerData[0]
 	lea	(%rsi,%r10),%r10	# prologue label
-	cmp	%r10,%rbx		# context->Rip<prologue label
-	jb	.Lin_prologue
-
-	mov	152($context),%rax	# pull context->Rsp
+	cmp	%r10,%rbx		# context->Rip<=prologue label
+	jbe	.Lin_prologue
 
 	mov	4(%r11),%r10d		# HandlerData[1]
 	lea	(%rsi,%r10),%r10	# epilogue label
 	cmp	%r10,%rbx		# context->Rip>=epilogue label
 	jae	.Lin_prologue
 
+	mov	8(%r11),%r10d		# HandlerData[2]
+	lea	(%rsi,%r10),%r10	# tail label
+	cmp	%r10,%rbx		# context->Rip>=tail label
+	jae	.Lin_tail
+
 	mov	160($context),%rax	# pull context->Rbp
 
 	lea	0x40(%rax),%rsi		# %xmm save area
 	lea	512($context),%rdi	# &context.Xmm6
 	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
 	.long	0xa548f3fc		# cld; rep movsq
-	lea	0xa0(%rax),%rax		# adjust stack pointer
+	lea	0xa0+0x78(%rax),%rax	# adjust stack pointer
 
-	mov	0x70(%rax),%rbp
-	mov	0x68(%rax),%rbx
-	mov	0x60(%rax),%r12
-	mov	0x58(%rax),%r13
-	mov	0x50(%rax),%r14
-	mov	0x48(%rax),%r15
-	lea	0x78(%rax),%rax		# adjust stack pointer
+.Lin_tail:
+	mov	-48(%rax),%rbp
+	mov	-40(%rax),%rbx
+	mov	-32(%rax),%r12
+	mov	-24(%rax),%r13
+	mov	-16(%rax),%r14
+	mov	-8(%rax),%r15
 	mov	%rbx,144($context)	# restore context->Rbx
 	mov	%rbp,160($context)	# restore context->Rbp
 	mov	%r12,216($context)	# restore context->R12
@@ -3070,28 +3075,40 @@
 	.byte	9,0,0,0
 	.rva	se_handler
 	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
+	.rva	.Lecb_enc_tail
+	.long	0
 .Lecb_dec_info:
 	.byte	9,0,0,0
 	.rva	se_handler
 	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
+	.rva	.Lecb_dec_tail
+	.long	0
 ___
 $code.=<<___;
 .Lcbc_dec_info:
 	.byte	9,0,0,0
 	.rva	se_handler
 	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
+	.rva	.Lcbc_dec_tail
+	.long	0
 .Lctr_enc_info:
 	.byte	9,0,0,0
 	.rva	se_handler
 	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
+	.rva	.Lctr_enc_tail
+	.long	0
 .Lxts_enc_info:
 	.byte	9,0,0,0
 	.rva	se_handler
 	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
+	.rva	.Lxts_enc_tail
+	.long	0
 .Lxts_dec_info:
 	.byte	9,0,0,0
 	.rva	se_handler
 	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
+	.rva	.Lxts_dec_tail
+	.long	0
 ___
 }
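
The se_handler rework above (ocb_se_handler in aesni-x86_64.pl earlier
applies the same pattern) splits each function's unwind information into
three regions delimited by the body, tail, and epilogue labels now listed
in HandlerData[]: before the body nothing is saved yet, between body and
the new .L*_tail labels the full frame (including the %xmm save area at
%rbp) is live, and from tail to epilogue only the general-purpose register
pops remain. A schematic C rendering of that classification, with
illustrative names only:

    #include <stdint.h>

    typedef enum { IN_PROLOGUE, IN_FRAME, IN_TAIL } unwind_region;

    /* Mirror of the rip comparisons in se_handler: outside [body, epilogue)
     * the frame is not (or no longer) set up; past tail only GPR pops are
     * outstanding; otherwise the whole frame must be recovered. */
    static unwind_region classify(uintptr_t rip, uintptr_t body,
                                  uintptr_t tail, uintptr_t epilogue) {
        if (rip <= body || rip >= epilogue)
            return IN_PROLOGUE;
        if (rip >= tail)
            return IN_TAIL;
        return IN_FRAME;
    }
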
 
diff --git a/src/crypto/aes/asm/vpaes-x86.pl b/src/crypto/aes/asm/vpaes-x86.pl
index 4fcd561..ebf90e7 100644
--- a/src/crypto/aes/asm/vpaes-x86.pl
+++ b/src/crypto/aes/asm/vpaes-x86.pl
@@ -438,7 +438,7 @@
 ##
 &set_label("schedule_192",16);
 	&movdqu	("xmm0",&QWP(8,$inp));		# load key part 2 (very unaligned)
-	&call	("_vpaes_schedule_transform");	# input transform	
+	&call	("_vpaes_schedule_transform");	# input transform
 	&movdqa	("xmm6","xmm0");		# save short part
 	&pxor	("xmm4","xmm4");		# clear 4
 	&movhlps("xmm6","xmm4");		# clobber low side with zeros
@@ -469,7 +469,7 @@
 ##
 &set_label("schedule_256",16);
 	&movdqu	("xmm0",&QWP(16,$inp));		# load key part 2 (unaligned)
-	&call	("_vpaes_schedule_transform");	# input transform	
+	&call	("_vpaes_schedule_transform");	# input transform
 	&mov	($round,7);
 
 &set_label("loop_schedule_256");
@@ -480,7 +480,7 @@
 	&call	("_vpaes_schedule_round");
 	&dec	($round);
 	&jz	(&label("schedule_mangle_last"));
-	&call	("_vpaes_schedule_mangle");	
+	&call	("_vpaes_schedule_mangle");
 
 	# low round. swap xmm7 and xmm6
 	&pshufd	("xmm0","xmm0",0xFF);
@@ -603,7 +603,7 @@
 	# subbyte
 	&movdqa	("xmm4",&QWP($k_s0F,$const));
 	&movdqa	("xmm5",&QWP($k_inv,$const));	# 4 : 1/j
-	&movdqa	("xmm1","xmm4");	
+	&movdqa	("xmm1","xmm4");
 	&pandn	("xmm1","xmm0");
 	&psrld	("xmm1",4);			# 1 = i
 	&pand	("xmm0","xmm4");		# 0 = k
diff --git a/src/crypto/aes/asm/vpaes-x86_64.pl b/src/crypto/aes/asm/vpaes-x86_64.pl
index 3f99e36..7a24e0d 100644
--- a/src/crypto/aes/asm/vpaes-x86_64.pl
+++ b/src/crypto/aes/asm/vpaes-x86_64.pl
@@ -31,6 +31,7 @@
 # Nehalem	29.6/40.3/14.6		10.0/11.8
 # Atom		57.3/74.2/32.1		60.9/77.2(***)
 # Silvermont	52.7/64.0/19.5		48.8/60.8(***)
+# Goldmont	38.9/49.0/17.8		10.6/12.6
 #
 # (*)	"Hyper-threading" in the context refers rather to cache shared
 #	among multiple cores, than to specifically Intel HTT. As vast
@@ -164,7 +165,7 @@
 	pshufb	%xmm1,	%xmm0
 	ret
 .size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
-	
+
 ##
 ##  Decryption core
 ##
@@ -325,7 +326,7 @@
 ##
 .Lschedule_128:
 	mov	\$10, %esi
-	
+
 .Loop_schedule_128:
 	call 	_vpaes_schedule_round
 	dec	%rsi
@@ -359,7 +360,7 @@
 
 .Loop_schedule_192:
 	call	_vpaes_schedule_round
-	palignr	\$8,%xmm6,%xmm0	
+	palignr	\$8,%xmm6,%xmm0
 	call	_vpaes_schedule_mangle	# save key n
 	call	_vpaes_schedule_192_smear
 	call	_vpaes_schedule_mangle	# save key n+1
@@ -385,7 +386,7 @@
 	movdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
 	call	_vpaes_schedule_transform	# input transform
 	mov	\$7, %esi
-	
+
 .Loop_schedule_256:
 	call	_vpaes_schedule_mangle	# output low result
 	movdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
@@ -394,7 +395,7 @@
 	call	_vpaes_schedule_round
 	dec	%rsi
 	jz 	.Lschedule_mangle_last
-	call	_vpaes_schedule_mangle	
+	call	_vpaes_schedule_mangle
 
 	# low round. swap xmm7 and xmm6
 	pshufd	\$0xFF,	%xmm0,	%xmm0
@@ -402,10 +403,10 @@
 	movdqa	%xmm6,	%xmm7
 	call	_vpaes_schedule_low_round
 	movdqa	%xmm5,	%xmm7
-	
+
 	jmp	.Loop_schedule_256
 
-	
+
 ##
 ##  .aes_schedule_mangle_last
 ##
@@ -504,9 +505,9 @@
 	# rotate
 	pshufd	\$0xFF,	%xmm0,	%xmm0
 	palignr	\$1,	%xmm0,	%xmm0
-	
+
 	# fall through...
-	
+
 	# low round: same as high round, but no rotation and no rcon.
 _vpaes_schedule_low_round:
 	# smear xmm7
@@ -545,7 +546,7 @@
 	pxor	%xmm4, 	%xmm0		# 0 = sbox output
 
 	# add in smeared stuff
-	pxor	%xmm7,	%xmm0	
+	pxor	%xmm7,	%xmm0
 	movdqa	%xmm0,	%xmm7
 	ret
 .size	_vpaes_schedule_round,.-_vpaes_schedule_round
diff --git a/src/crypto/asn1/CMakeLists.txt b/src/crypto/asn1/CMakeLists.txt
index 25d8ba2..cd1ee8c 100644
--- a/src/crypto/asn1/CMakeLists.txt
+++ b/src/crypto/asn1/CMakeLists.txt
@@ -35,6 +35,7 @@
   tasn_new.c
   tasn_typ.c
   tasn_utl.c
+  time_support.c
   x_bignum.c
   x_long.c
 )
diff --git a/src/crypto/asn1/a_gentm.c b/src/crypto/asn1/a_gentm.c
index 2f29868..d130cdf 100644
--- a/src/crypto/asn1/a_gentm.c
+++ b/src/crypto/asn1/a_gentm.c
@@ -61,7 +61,6 @@
 
 #include <openssl/err.h>
 #include <openssl/mem.h>
-#include <openssl/time_support.h>
 
 #include "asn1_locl.h"
 
diff --git a/src/crypto/asn1/a_time.c b/src/crypto/asn1/a_time.c
index a12b38f..4b58429 100644
--- a/src/crypto/asn1/a_time.c
+++ b/src/crypto/asn1/a_time.c
@@ -63,7 +63,6 @@
 #include <openssl/buf.h>
 #include <openssl/err.h>
 #include <openssl/mem.h>
-#include <openssl/time_support.h>
 
 #include "asn1_locl.h"
 
diff --git a/src/crypto/asn1/a_utctm.c b/src/crypto/asn1/a_utctm.c
index 3b9d257..193b83f 100644
--- a/src/crypto/asn1/a_utctm.c
+++ b/src/crypto/asn1/a_utctm.c
@@ -61,7 +61,6 @@
 
 #include <openssl/err.h>
 #include <openssl/mem.h>
-#include <openssl/time_support.h>
 
 #include "asn1_locl.h"
 
diff --git a/src/crypto/asn1/asn1_locl.h b/src/crypto/asn1/asn1_locl.h
index 982bfd6..ce8146b 100644
--- a/src/crypto/asn1/asn1_locl.h
+++ b/src/crypto/asn1/asn1_locl.h
@@ -57,7 +57,42 @@
  *
  */
 
+#ifndef OPENSSL_HEADER_ASN1_ASN1_LOCL_H
+#define OPENSSL_HEADER_ASN1_ASN1_LOCL_H
+
+#include <time.h>
+
+#include <openssl/asn1.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+/* Wrapper functions for time functions. */
+
+/* OPENSSL_gmtime wraps |gmtime_r|. See the manual page for that function. */
+struct tm *OPENSSL_gmtime(const time_t *timer, struct tm *result);
+
+/* OPENSSL_gmtime_adj updates |tm| by adding |offset_day| days and |offset_sec|
+ * seconds. */
+int OPENSSL_gmtime_adj(struct tm *tm, int offset_day, long offset_sec);
+
+/* OPENSSL_gmtime_diff calculates the difference between |from| and |to| and
+ * outputs the difference as a number of days and seconds in |*out_days| and
+ * |*out_secs|. */
+int OPENSSL_gmtime_diff(int *out_days, int *out_secs, const struct tm *from,
+                        const struct tm *to);
+
+
 /* Internal ASN1 structures and functions: not for application use */
 
 int asn1_utctime_to_tm(struct tm *tm, const ASN1_UTCTIME *d);
 int asn1_generalizedtime_to_tm(struct tm *tm, const ASN1_GENERALIZEDTIME *d);
+
+
+#if defined(__cplusplus)
+}  /* extern C */
+#endif
+
+#endif  /* OPENSSL_HEADER_ASN1_ASN1_LOCL_H */
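
These wrappers move into crypto/asn1 (time_support.c below) and are now
declared in this internal header instead of a public one. An illustrative
caller, assuming the usual 1-on-success return convention:

    #include <time.h>
    /* "asn1_locl.h" provides the OPENSSL_gmtime* declarations above. */

    /* Advance a broken-down UTC time by 30 days and 5 seconds, then check
     * the round trip with OPENSSL_gmtime_diff. Returns 1 on success. */
    static int gmtime_roundtrip(time_t now) {
        struct tm from, to;
        int days, secs;
        if (!OPENSSL_gmtime(&now, &from))
            return 0;
        to = from;
        if (!OPENSSL_gmtime_adj(&to, 30, 5))
            return 0;
        if (!OPENSSL_gmtime_diff(&days, &secs, &from, &to))
            return 0;
        return days == 30 && secs == 5;
    }
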
diff --git a/src/crypto/asn1/tasn_dec.c b/src/crypto/asn1/tasn_dec.c
index 40778a8..bf008af 100644
--- a/src/crypto/asn1/tasn_dec.c
+++ b/src/crypto/asn1/tasn_dec.c
@@ -180,6 +180,7 @@
     int ret = 0;
     ASN1_VALUE **pchptr, *ptmpval;
     int combine = aclass & ASN1_TFLG_COMBINE;
+    aclass &= ~ASN1_TFLG_COMBINE;
     if (!pval)
         return 0;
     if (aux && aux->asn1_cb)
@@ -667,6 +668,7 @@
             }
             len -= p - q;
             if (!sk_ASN1_VALUE_push((STACK_OF(ASN1_VALUE) *)*val, skfield)) {
+                ASN1_item_ex_free(&skfield, ASN1_ITEM_ptr(tt->item));
                 OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE);
                 goto err;
             }
diff --git a/src/crypto/asn1/tasn_new.c b/src/crypto/asn1/tasn_new.c
index 053b732..10cf954 100644
--- a/src/crypto/asn1/tasn_new.c
+++ b/src/crypto/asn1/tasn_new.c
@@ -160,7 +160,7 @@
         }
         asn1_set_choice_selector(pval, -1, it);
         if (asn1_cb && !asn1_cb(ASN1_OP_NEW_POST, pval, it, NULL))
-            goto auxerr;
+            goto auxerr2;
         break;
 
     case ASN1_ITYPE_NDEF_SEQUENCE:
@@ -188,10 +188,10 @@
         for (i = 0, tt = it->templates; i < it->tcount; tt++, i++) {
             pseqval = asn1_get_field_ptr(pval, tt);
             if (!ASN1_template_new(pseqval, tt))
-                goto memerr;
+                goto memerr2;
         }
         if (asn1_cb && !asn1_cb(ASN1_OP_NEW_POST, pval, it, NULL))
-            goto auxerr;
+            goto auxerr2;
         break;
     }
 #ifdef CRYPTO_MDEBUG
@@ -200,18 +200,20 @@
 #endif
     return 1;
 
+ memerr2:
+    ASN1_item_ex_free(pval, it);
  memerr:
     OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE);
-    ASN1_item_ex_free(pval, it);
 #ifdef CRYPTO_MDEBUG
     if (it->sname)
         CRYPTO_pop_info();
 #endif
     return 0;
 
+ auxerr2:
+    ASN1_item_ex_free(pval, it);
  auxerr:
     OPENSSL_PUT_ERROR(ASN1, ASN1_R_AUX_ERROR);
-    ASN1_item_ex_free(pval, it);
 #ifdef CRYPTO_MDEBUG
     if (it->sname)
         CRYPTO_pop_info();
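
Both ASN.1 hunks above enforce the same ownership rule on error paths:
tasn_dec.c frees |skfield| when sk_ASN1_VALUE_push fails to take ownership
of it, and tasn_new.c splits its error labels so the partially constructed
value is freed only on paths where it actually exists. The shape of that
label split, as a standalone sketch with hypothetical names:

    #include <stdlib.h>

    typedef struct { int field; } VALUE;                 /* hypothetical */
    static VALUE *VALUE_new(void) { return calloc(1, sizeof(VALUE)); }
    static void VALUE_free(VALUE *v) { free(v); }
    static int fill_fields(VALUE *v) { v->field = 1; return 1; }
    static void report_error(void) { /* OPENSSL_PUT_ERROR analogue */ }

    static int build_value(VALUE **pval) {
        if ((*pval = VALUE_new()) == NULL)
            goto err;       /* nothing constructed yet: report only */
        if (!fill_fields(*pval))
            goto err2;      /* partially constructed: free, then report */
        return 1;

     err2:
        VALUE_free(*pval);
        *pval = NULL;
     err:
        report_error();
        return 0;
    }
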
diff --git a/src/crypto/time_support.c b/src/crypto/asn1/time_support.c
similarity index 99%
rename from src/crypto/time_support.c
rename to src/crypto/asn1/time_support.c
index ae0f496..194dc3a 100644
--- a/src/crypto/time_support.c
+++ b/src/crypto/asn1/time_support.c
@@ -59,7 +59,7 @@
 #define _POSIX_C_SOURCE 201410L  /* for gmtime_r */
 #endif
 
-#include <openssl/time_support.h>
+#include "asn1_locl.h"
 
 #include <time.h>
 
diff --git a/src/crypto/bn/asm/armv4-mont.pl b/src/crypto/bn/asm/armv4-mont.pl
index cad5955..d7298d2 100644
--- a/src/crypto/bn/asm/armv4-mont.pl
+++ b/src/crypto/bn/asm/armv4-mont.pl
@@ -16,7 +16,7 @@
 # [depending on key length, less for longer keys] on ARM920T, and
 # +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
 # base and compiler generated code with in-lined umull and even umlal
-# instructions. The latter means that this code didn't really have an 
+# instructions. The latter means that this code didn't really have an
 # "advantage" of utilizing some "secret" instruction.
 #
 # The code is interoperable with Thumb ISA and is rather compact, less
diff --git a/src/crypto/bn/asm/bn-586.pl b/src/crypto/bn/asm/bn-586.pl
index 096bb9c..ccc9451 100644
--- a/src/crypto/bn/asm/bn-586.pl
+++ b/src/crypto/bn/asm/bn-586.pl
@@ -47,7 +47,7 @@
 		&movd("mm0",&wparam(3));	# mm0 = w
 		&pxor("mm1","mm1");		# mm1 = carry_in
 		&jmp(&label("maw_sse2_entry"));
-		
+
 	&set_label("maw_sse2_unrolled",16);
 		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
 		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
@@ -668,20 +668,20 @@
 	    &adc($c,0);
 	    &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
 	}
-	    
+
 	&comment("");
 	&add($b,32);
 	&add($r,32);
 	&sub($num,8);
 	&jnz(&label("pw_neg_loop"));
-	    
+
 	&set_label("pw_neg_finish",0);
 	&mov($tmp2,&wparam(4));	# get dl
 	&mov($num,0);
 	&sub($num,$tmp2);
 	&and($num,7);
 	&jz(&label("pw_end"));
-	    
+
 	for ($i=0; $i<7; $i++)
 	{
 	    &comment("dl<0 Tail Round $i");
@@ -698,9 +698,9 @@
 	}
 
 	&jmp(&label("pw_end"));
-	
+
 	&set_label("pw_pos",0);
-	
+
 	&and($num,0xfffffff8);	# num / 8
 	&jz(&label("pw_pos_finish"));
 
@@ -715,18 +715,18 @@
 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
 	    &jnc(&label("pw_nc".$i));
 	}
-	    
+
 	&comment("");
 	&add($a,32);
 	&add($r,32);
 	&sub($num,8);
 	&jnz(&label("pw_pos_loop"));
-	    
+
 	&set_label("pw_pos_finish",0);
 	&mov($num,&wparam(4));	# get dl
 	&and($num,7);
 	&jz(&label("pw_end"));
-	    
+
 	for ($i=0; $i<7; $i++)
 	{
 	    &comment("dl>0 Tail Round $i");
@@ -747,17 +747,17 @@
 	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
 	    &set_label("pw_nc".$i,0);
 	}
-	    
+
 	&comment("");
 	&add($a,32);
 	&add($r,32);
 	&sub($num,8);
 	&jnz(&label("pw_nc_loop"));
-	    
+
 	&mov($num,&wparam(4));	# get dl
 	&and($num,7);
 	&jz(&label("pw_nc_end"));
-	    
+
 	for ($i=0; $i<7; $i++)
 	{
 	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
diff --git a/src/crypto/bn/asm/co-586.pl b/src/crypto/bn/asm/co-586.pl
index ec3ea34..c63e562 100644
--- a/src/crypto/bn/asm/co-586.pl
+++ b/src/crypto/bn/asm/co-586.pl
@@ -41,7 +41,7 @@
 	 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1;	# laod next b
 	 ###
 	&adc($c2,0);
-	 # is pos > 1, it means it is the last loop 
+	 # if pos > 1, it means it is the last loop
 	 &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0;		# save r[];
 	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;		# laod next a
 	}
@@ -70,7 +70,7 @@
 	 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
 	 ###
 	&adc($c2,0);
-	 # is pos > 1, it means it is the last loop 
+	 # if pos > 1, it means it is the last loop
 	 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0;		# save r[];
 	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;		# load next b
 	}
@@ -121,7 +121,7 @@
 	$c2="ebp";
 	$a="esi";
 	$b="edi";
-	
+
 	$as=0;
 	$ae=0;
 	$bs=0;
@@ -136,9 +136,9 @@
 	 &push("ebx");
 
 	&xor($c0,$c0);
-	 &mov("eax",&DWP(0,$a,"",0));	# load the first word 
+	 &mov("eax",&DWP(0,$a,"",0));	# load the first word
 	&xor($c1,$c1);
-	 &mov("edx",&DWP(0,$b,"",0));	# load the first second 
+	 &mov("edx",&DWP(0,$b,"",0));	# load the first second
 
 	for ($i=0; $i<$tot; $i++)
 		{
@@ -146,7 +146,7 @@
 		$bi=$bs;
 		$end=$be+1;
 
-		&comment("################## Calculate word $i"); 
+		&comment("################## Calculate word $i");
 
 		for ($j=$bs; $j<$end; $j++)
 			{
diff --git a/src/crypto/bn/asm/rsaz-avx2.pl b/src/crypto/bn/asm/rsaz-avx2.pl
index b8e830e..5562d69 100755
--- a/src/crypto/bn/asm/rsaz-avx2.pl
+++ b/src/crypto/bn/asm/rsaz-avx2.pl
@@ -145,13 +145,21 @@
 .type	rsaz_1024_sqr_avx2,\@function,5
 .align	64
 rsaz_1024_sqr_avx2:		# 702 cycles, 14% faster than rsaz_1024_mul_avx2
+.cfi_startproc
 	lea	(%rsp), %rax
+.cfi_def_cfa_register	%rax
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 	vzeroupper
 ___
 $code.=<<___ if ($win64);
@@ -170,6 +178,7 @@
 ___
 $code.=<<___;
 	mov	%rax,%rbp
+.cfi_def_cfa_register	%rbp
 	mov	%rdx, $np			# reassigned argument
 	sub	\$$FrameSize, %rsp
 	mov	$np, $tmp
@@ -359,7 +368,7 @@
 	vpaddq		$TEMP1, $ACC1, $ACC1
 	vpmuludq	32*7-128($aap), $B2, $ACC2
 	 vpbroadcastq	32*5-128($tpa), $B2
-	vpaddq		32*11-448($tp1), $ACC2, $ACC2	
+	vpaddq		32*11-448($tp1), $ACC2, $ACC2
 
 	vmovdqu		$ACC6, 32*6-192($tp0)
 	vmovdqu		$ACC7, 32*7-192($tp0)
@@ -418,7 +427,7 @@
 	vmovdqu		$ACC7, 32*16-448($tp1)
 	lea		8($tp1), $tp1
 
-	dec	$i        
+	dec	$i
 	jnz	.LOOP_SQR_1024
 ___
 $ZERO = $ACC9;
@@ -763,7 +772,7 @@
 	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
 	vpaddq		$TEMP3, $ACC7, $ACC7
 	vpaddq		$TEMP4, $ACC8, $ACC8
-     
+
 	vpsrlq		\$29, $ACC4, $TEMP1
 	vpand		$AND_MASK, $ACC4, $ACC4
 	vpsrlq		\$29, $ACC5, $TEMP2
@@ -802,6 +811,7 @@
 
 	vzeroall
 	mov	%rbp, %rax
+.cfi_def_cfa_register	%rax
 ___
 $code.=<<___ if ($win64);
 .Lsqr_1024_in_tail:
@@ -818,14 +828,22 @@
 ___
 $code.=<<___;
 	mov	-48(%rax),%r15
+.cfi_restore	%r15
 	mov	-40(%rax),%r14
+.cfi_restore	%r14
 	mov	-32(%rax),%r13
+.cfi_restore	%r13
 	mov	-24(%rax),%r12
+.cfi_restore	%r12
 	mov	-16(%rax),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rax),%rbx
+.cfi_restore	%rbx
 	lea	(%rax),%rsp		# restore %rsp
+.cfi_def_cfa_register	%rsp
 .Lsqr_1024_epilogue:
 	ret
+.cfi_endproc
 .size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
 ___
 }
@@ -878,13 +896,21 @@
 .type	rsaz_1024_mul_avx2,\@function,5
 .align	64
 rsaz_1024_mul_avx2:
+.cfi_startproc
 	lea	(%rsp), %rax
+.cfi_def_cfa_register	%rax
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 ___
 $code.=<<___ if ($win64);
 	vzeroupper
@@ -903,6 +929,7 @@
 ___
 $code.=<<___;
 	mov	%rax,%rbp
+.cfi_def_cfa_register	%rbp
 	vzeroall
 	mov	%rdx, $bp	# reassigned argument
 	sub	\$64,%rsp
@@ -1429,13 +1456,14 @@
 	vpaddq		$TEMP4, $ACC8, $ACC8
 
 	vmovdqu		$ACC4, 128-128($rp)
-	vmovdqu		$ACC5, 160-128($rp)    
+	vmovdqu		$ACC5, 160-128($rp)
 	vmovdqu		$ACC6, 192-128($rp)
 	vmovdqu		$ACC7, 224-128($rp)
 	vmovdqu		$ACC8, 256-128($rp)
 	vzeroupper
 
 	mov	%rbp, %rax
+.cfi_def_cfa_register	%rax
 ___
 $code.=<<___ if ($win64);
 .Lmul_1024_in_tail:
@@ -1452,14 +1480,22 @@
 ___
 $code.=<<___;
 	mov	-48(%rax),%r15
+.cfi_restore	%r15
 	mov	-40(%rax),%r14
+.cfi_restore	%r14
 	mov	-32(%rax),%r13
+.cfi_restore	%r13
 	mov	-24(%rax),%r12
+.cfi_restore	%r12
 	mov	-16(%rax),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rax),%rbx
+.cfi_restore	%rbx
 	lea	(%rax),%rsp		# restore %rsp
+.cfi_def_cfa_register	%rsp
 .Lmul_1024_epilogue:
 	ret
+.cfi_endproc
 .size	rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
 ___
 }
@@ -1578,8 +1614,10 @@
 .type	rsaz_1024_gather5_avx2,\@abi-omnipotent
 .align	32
 rsaz_1024_gather5_avx2:
+.cfi_startproc
 	vzeroupper
 	mov	%rsp,%r11
+.cfi_def_cfa_register	%r11
 ___
 $code.=<<___ if ($win64);
 	lea	-0x88(%rsp),%rax
@@ -1717,11 +1755,13 @@
 	movaps	-0x38(%r11),%xmm13
 	movaps	-0x28(%r11),%xmm14
 	movaps	-0x18(%r11),%xmm15
-.LSEH_end_rsaz_1024_gather5:
 ___
 $code.=<<___;
 	lea	(%r11),%rsp
+.cfi_def_cfa_register	%rsp
 	ret
+.cfi_endproc
+.LSEH_end_rsaz_1024_gather5:
 .size	rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
 ___
 }
diff --git a/src/crypto/bn/asm/x86-mont.pl b/src/crypto/bn/asm/x86-mont.pl
old mode 100644
new mode 100755
index 4b5d05d..57fbf10
--- a/src/crypto/bn/asm/x86-mont.pl
+++ b/src/crypto/bn/asm/x86-mont.pl
@@ -32,7 +32,7 @@
 
 $output = pop;
 open STDOUT,">$output";
- 
+
 &asm_init($ARGV[0],$0);
 
 $sse2=0;
@@ -66,33 +66,57 @@
 
 	&lea	("esi",&wparam(0));	# put aside pointer to argument block
 	&lea	("edx",&wparam(1));	# load ap
-	&mov	("ebp","esp");		# saved stack pointer!
 	&add	("edi",2);		# extra two words on top of tp
 	&neg	("edi");
-	&lea	("esp",&DWP(-$frame,"esp","edi",4));	# alloca($frame+4*(num+2))
+	&lea	("ebp",&DWP(-$frame,"esp","edi",4));	# future alloca($frame+4*(num+2))
 	&neg	("edi");
 
 	# minimize cache contention by arranging a 2K window between the stack
 	# pointer and the ap argument [np is also a position-sensitive vector,
 	# but it's assumed to be near ap, as it's allocated at ~same
 	# time].
-	&mov	("eax","esp");
+	&mov	("eax","ebp");
 	&sub	("eax","edx");
 	&and	("eax",2047);
-	&sub	("esp","eax");		# this aligns sp and ap modulo 2048
+	&sub	("ebp","eax");		# this aligns sp and ap modulo 2048
 
-	&xor	("edx","esp");
+	&xor	("edx","ebp");
 	&and	("edx",2048);
 	&xor	("edx",2048);
-	&sub	("esp","edx");		# this splits them apart modulo 4096
+	&sub	("ebp","edx");		# this splits them apart modulo 4096
 
-	&and	("esp",-64);		# align to cache line
+	&and	("ebp",-64);		# align to cache line
+
+	# An OS-agnostic version of __chkstk.
+	#
+	# Some OSes (Windows) insist on the stack being "wired" to
+	# physical memory in a strictly sequential manner, i.e. if a
+	# stack allocation spans two pages, a reference to the farthest
+	# one can be punished by SEGV. But page walking does good even
+	# on other OSes, because it guarantees that a villain thread
+	# hits the guard page before it can damage an innocent one...
+	&mov	("eax","esp");
+	&sub	("eax","ebp");
+	&and	("eax",-4096);
+	&mov	("edx","esp");		# saved stack pointer!
+	&lea	("esp",&DWP(0,"ebp","eax"));
+	&mov	("eax",&DWP(0,"esp"));
+	&cmp	("esp","ebp");
+	&ja	(&label("page_walk"));
+	&jmp	(&label("page_walk_done"));
+
+&set_label("page_walk",16);
+	&lea	("esp",&DWP(-4096,"esp"));
+	&mov	("eax",&DWP(0,"esp"));
+	&cmp	("esp","ebp");
+	&ja	(&label("page_walk"));
+&set_label("page_walk_done");
 
 	################################# load argument block...
 	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
 	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
 	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
-	&mov	("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+	&mov	("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
 	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
 	#&mov	("edi",&DWP(5*4,"esi"));# int num
 
@@ -100,11 +124,11 @@
 	&mov	($_rp,"eax");		# ... save a copy of argument block
 	&mov	($_ap,"ebx");
 	&mov	($_bp,"ecx");
-	&mov	($_np,"edx");
+	&mov	($_np,"ebp");
 	&mov	($_n0,"esi");
 	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
 	#&mov	($_num,$num);		# redundant as $num is not reused
-	&mov	($_sp,"ebp");		# saved stack pointer!
+	&mov	($_sp,"edx");		# saved stack pointer!
 
 if($sse2) {
 $acc0="mm0";	# mmx register bank layout
@@ -270,7 +294,7 @@
 	&xor	("eax","eax");	# signal "not fast enough [yet]"
 	&jmp	(&label("just_leave"));
 	# While the below code provides competitive performance for
-	# all key lengthes on modern Intel cores, it's still more
+	# all key lengths on modern Intel cores, it's still more
 	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
 	# means compared to the original integer-only assembler.
 	# 512-bit RSA sign is better by ~40%, but that's about all
@@ -573,15 +597,16 @@
 	&jge	(&label("sub"));
 
 	&sbb	("eax",0);			# handle upmost overflow bit
+	&and	($tp,"eax");
+	&not	("eax");
+	&mov	($np,$rp);
+	&and	($np,"eax");
+	&or	($tp,$np);			# tp=carry?tp:rp
 
 &set_label("copy",16);				# copy or in-place refresh
-	&mov	("edx",&DWP(0,$tp,$num,4));
-	&mov	($np,&DWP(0,$rp,$num,4));
-	&xor	("edx",$np);			# conditional select
-	&and	("edx","eax");
-	&xor	("edx",$np);
-	&mov	(&DWP(0,$tp,$num,4),$j)		# zap temporary vector
-	&mov	(&DWP(0,$rp,$num,4),"edx");	# rp[i]=tp[i]
+	&mov	("eax",&DWP(0,$tp,$num,4));
+	&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
+	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
 	&dec	($num);
 	&jge	(&label("copy"));
 
diff --git a/src/crypto/bn/asm/x86_64-mont.pl b/src/crypto/bn/asm/x86_64-mont.pl
index 60e0111..5775f65 100755
--- a/src/crypto/bn/asm/x86_64-mont.pl
+++ b/src/crypto/bn/asm/x86_64-mont.pl
@@ -84,6 +84,10 @@
 .type	bn_mul_mont,\@function,6
 .align	16
 bn_mul_mont:
+.cfi_startproc
+	mov	${num}d,${num}d
+	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
 	test	\$3,${num}d
 	jnz	.Lmul_enter
 	cmp	\$8,${num}d
@@ -102,20 +106,50 @@
 .align	16
 .Lmul_enter:
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 
-	mov	${num}d,${num}d
-	lea	2($num),%r10
+	neg	$num
 	mov	%rsp,%r11
-	neg	%r10
-	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
-	and	\$-1024,%rsp		# minimize TLB usage
+	lea	-16(%rsp,$num,8),%r10	# future alloca(8*(num+2))
+	neg	$num			# restore $num
+	and	\$-1024,%r10		# minimize TLB usage
 
-	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
+	# An OS-agnostic version of __chkstk.
+	#
+	# Some OSes (Windows) insist on the stack being "wired" to
+	# physical memory in a strictly sequential manner, i.e. if a
+	# stack allocation spans two pages, a reference to the farthest
+	# one can be punished by SEGV. But page walking does good even
+	# on other OSes, because it guarantees that a villain thread
+	# hits the guard page before it can damage an innocent one...
+	sub	%r10,%r11
+	and	\$-4096,%r11
+	lea	(%r10,%r11),%rsp
+	mov	(%rsp),%r11
+	cmp	%r10,%rsp
+	ja	.Lmul_page_walk
+	jmp	.Lmul_page_walk_done
+
+.align	16
+.Lmul_page_walk:
+	lea	-4096(%rsp),%rsp
+	mov	(%rsp),%r11
+	cmp	%r10,%rsp
+	ja	.Lmul_page_walk
+.Lmul_page_walk_done:
+
+	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
+.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
 .Lmul_body:
 	mov	$bp,%r12		# reassign $bp
 ___
@@ -265,36 +299,46 @@
 	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
 	mov	8($ap,$i,8),%rax	# tp[i+1]
 	lea	1($i),$i		# i++
 	dec	$j			# doesn't affect CF!
 	jnz	.Lsub
 
 	sbb	\$0,%rax		# handle upmost overflow bit
 	xor	$i,$i
+	and	%rax,$ap
+	not	%rax
+	mov	$rp,$np
+	and	%rax,$np
 	mov	$num,$j			# j=num
+	or	$np,$ap			# ap=borrow?tp:rp
 .align	16
 .Lcopy:					# copy or in-place refresh
-	mov	(%rsp,$i,8),$ap
-	mov	($rp,$i,8),$np
-	xor	$np,$ap			# conditional select:
-	and	%rax,$ap		# ((ap ^ np) & %rax) ^ np
-	xor	$np,$ap			# ap = borrow?tp:rp
+	mov	($ap,$i,8),%rax
 	mov	$i,(%rsp,$i,8)		# zap temporary vector
-	mov	$ap,($rp,$i,8)		# rp[i]=tp[i]
+	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
 	lea	1($i),$i
 	sub	\$1,$j
 	jnz	.Lcopy
 
 	mov	8(%rsp,$num,8),%rsi	# restore %rsp
+.cfi_def_cfa	%rsi,8
 	mov	\$1,%rax
-	mov	(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
+	mov	-48(%rsi),%r15
+.cfi_restore	%r15
+	mov	-40(%rsi),%r14
+.cfi_restore	%r14
+	mov	-32(%rsi),%r13
+.cfi_restore	%r13
+	mov	-24(%rsi),%r12
+.cfi_restore	%r12
+	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lmul_epilogue:
 	ret
+.cfi_endproc
 .size	bn_mul_mont,.-bn_mul_mont
 ___
 {{{
@@ -304,6 +348,10 @@
 .type	bn_mul4x_mont,\@function,6
 .align	16
 bn_mul4x_mont:
+.cfi_startproc
+	mov	${num}d,${num}d
+	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
 .Lmul4x_enter:
 ___
 $code.=<<___ if ($addx);
@@ -313,20 +361,41 @@
 ___
 $code.=<<___;
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 
-	mov	${num}d,${num}d
-	lea	4($num),%r10
+	neg	$num
 	mov	%rsp,%r11
-	neg	%r10
-	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+4))
-	and	\$-1024,%rsp		# minimize TLB usage
+	lea	-32(%rsp,$num,8),%r10	# future alloca(8*(num+4))
+	neg	$num			# restore
+	and	\$-1024,%r10		# minimize TLB usage
 
-	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
+	sub	%r10,%r11
+	and	\$-4096,%r11
+	lea	(%r10,%r11),%rsp
+	mov	(%rsp),%r11
+	cmp	%r10,%rsp
+	ja	.Lmul4x_page_walk
+	jmp	.Lmul4x_page_walk_done
+
+.Lmul4x_page_walk:
+	lea	-4096(%rsp),%rsp
+	mov	(%rsp),%r11
+	cmp	%r10,%rsp
+	ja	.Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+
+	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
+.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
 .Lmul4x_body:
 	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
 	mov	%rdx,%r12		# reassign $bp
@@ -633,9 +702,11 @@
 my @ri=("%rax","%rdx",$m0,$m1);
 $code.=<<___;
 	mov	16(%rsp,$num,8),$rp	# restore $rp
+	lea	-4($num),$j
 	mov	0(%rsp),@ri[0]		# tp[0]
+	pxor	%xmm0,%xmm0
 	mov	8(%rsp),@ri[1]		# tp[1]
-	shr	\$2,$num		# num/=4
+	shr	\$2,$j			# j=num/4-1
 	lea	(%rsp),$ap		# borrow ap for tp
 	xor	$i,$i			# i=0 and clear CF!
 
@@ -643,7 +714,6 @@
 	mov	16($ap),@ri[2]		# tp[2]
 	mov	24($ap),@ri[3]		# tp[3]
 	sbb	8($np),@ri[1]
-	lea	-1($num),$j		# j=num/4-1
 	jmp	.Lsub4x
 .align	16
 .Lsub4x:
@@ -671,50 +741,58 @@
 	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
 
 	sbb	\$0,@ri[0]		# handle upmost overflow bit
-	mov	@ri[0],%xmm0
-	punpcklqdq %xmm0,%xmm0		# extend mask to 128 bits
 	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
 	xor	$i,$i			# i=0
+	and	@ri[0],$ap
+	not	@ri[0]
+	mov	$rp,$np
+	and	@ri[0],$np
+	lea	-4($num),$j
+	or	$np,$ap			# ap=borrow?tp:rp
+	shr	\$2,$j			# j=num/4-1
 
-	mov	$num,$j
-	pxor	%xmm5,%xmm5
+	movdqu	($ap),%xmm1
+	movdqa	%xmm0,(%rsp)
+	movdqu	%xmm1,($rp)
 	jmp	.Lcopy4x
 .align	16
-.Lcopy4x:				# copy or in-place refresh
-	movdqu	(%rsp,$i),%xmm2
-	movdqu  16(%rsp,$i),%xmm4
-	movdqu	($rp,$i),%xmm1
-	movdqu	16($rp,$i),%xmm3
-	pxor	%xmm1,%xmm2		# conditional select
-	pxor	%xmm3,%xmm4
-	pand	%xmm0,%xmm2
-	pand	%xmm0,%xmm4
-	pxor	%xmm1,%xmm2
-	pxor	%xmm3,%xmm4
-	movdqu	%xmm2,($rp,$i)
-	movdqu  %xmm4,16($rp,$i)
-	movdqa	%xmm5,(%rsp,$i)		# zap temporary vectors
-	movdqa	%xmm5,16(%rsp,$i)
-
+.Lcopy4x:					# copy or in-place refresh
+	movdqu	16($ap,$i),%xmm2
+	movdqu	32($ap,$i),%xmm1
+	movdqa	%xmm0,16(%rsp,$i)
+	movdqu	%xmm2,16($rp,$i)
+	movdqa	%xmm0,32(%rsp,$i)
+	movdqu	%xmm1,32($rp,$i)
 	lea	32($i),$i
 	dec	$j
 	jnz	.Lcopy4x
 
-	shl	\$2,$num
+	movdqu	16($ap,$i),%xmm2
+	movdqa	%xmm0,16(%rsp,$i)
+	movdqu	%xmm2,16($rp,$i)
 ___
 }
 $code.=<<___;
 	mov	8(%rsp,$num,8),%rsi	# restore %rsp
+.cfi_def_cfa	%rsi, 8
 	mov	\$1,%rax
-	mov	(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
+	mov	-48(%rsi),%r15
+.cfi_restore	%r15
+	mov	-40(%rsi),%r14
+.cfi_restore	%r14
+	mov	-32(%rsi),%r13
+.cfi_restore	%r13
+	mov	-24(%rsi),%r12
+.cfi_restore	%r12
+	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
+	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
+	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lmul4x_epilogue:
 	ret
+.cfi_endproc
 .size	bn_mul4x_mont,.-bn_mul4x_mont
 ___
 }}}
@@ -742,14 +820,23 @@
 .type	bn_sqr8x_mont,\@function,6
 .align	32
 bn_sqr8x_mont:
-.Lsqr8x_enter:
+.cfi_startproc
 	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
+.Lsqr8x_enter:
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
+.Lsqr8x_prologue:
 
 	mov	${num}d,%r10d
 	shl	\$3,${num}d		# convert $num to bytes
@@ -762,30 +849,49 @@
 	# do its job.
 	#
 	lea	-64(%rsp,$num,2),%r11
+	mov	%rsp,%rbp
 	mov	($n0),$n0		# *n0
 	sub	$aptr,%r11
 	and	\$4095,%r11
 	cmp	%r11,%r10
 	jb	.Lsqr8x_sp_alt
-	sub	%r11,%rsp		# align with $aptr
-	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
+	sub	%r11,%rbp		# align with $aptr
+	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
 	jmp	.Lsqr8x_sp_done
 
 .align	32
 .Lsqr8x_sp_alt:
 	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
-	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
+	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
 	sub	%r10,%r11
 	mov	\$0,%r10
 	cmovc	%r10,%r11
-	sub	%r11,%rsp
+	sub	%r11,%rbp
 .Lsqr8x_sp_done:
-	and	\$-64,%rsp
+	and	\$-64,%rbp
+	mov	%rsp,%r11
+	sub	%rbp,%r11
+	and	\$-4096,%r11
+	lea	(%rbp,%r11),%rsp
+	mov	(%rsp),%r10
+	cmp	%rbp,%rsp
+	ja	.Lsqr8x_page_walk
+	jmp	.Lsqr8x_page_walk_done
+
+.align	16
+.Lsqr8x_page_walk:
+	lea	-4096(%rsp),%rsp
+	mov	(%rsp),%r10
+	cmp	%rbp,%rsp
+	ja	.Lsqr8x_page_walk
+.Lsqr8x_page_walk_done:
+
 	mov	$num,%r10
 	neg	$num
 
 	mov	$n0,  32(%rsp)
 	mov	%rax, 40(%rsp)		# save original %rsp
+.cfi_cfa_expression	%rsp+40,deref,+8
 .Lsqr8x_body:
 
 	movq	$nptr, %xmm2		# save pointer to modulus
@@ -855,6 +961,7 @@
 	pxor	%xmm0,%xmm0
 	pshufd	\$0,%xmm1,%xmm1
 	mov	40(%rsp),%rsi		# restore %rsp
+.cfi_def_cfa	%rsi,8
 	jmp	.Lsqr8x_cond_copy
 
 .align	32
@@ -884,14 +991,22 @@
 
 	mov	\$1,%rax
 	mov	-48(%rsi),%r15
+.cfi_restore	%r15
 	mov	-40(%rsi),%r14
+.cfi_restore	%r14
 	mov	-32(%rsi),%r13
+.cfi_restore	%r13
 	mov	-24(%rsi),%r12
+.cfi_restore	%r12
 	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lsqr8x_epilogue:
 	ret
+.cfi_endproc
 .size	bn_sqr8x_mont,.-bn_sqr8x_mont
 ___
 }}}
@@ -903,23 +1018,48 @@
 .type	bn_mulx4x_mont,\@function,6
 .align	32
 bn_mulx4x_mont:
-.Lmulx4x_enter:
+.cfi_startproc
 	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
+.Lmulx4x_enter:
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
+.Lmulx4x_prologue:
 
 	shl	\$3,${num}d		# convert $num to bytes
-	.byte	0x67
 	xor	%r10,%r10
 	sub	$num,%r10		# -$num
 	mov	($n0),$n0		# *n0
-	lea	-72(%rsp,%r10),%rsp	# alloca(frame+$num+8)
+	lea	-72(%rsp,%r10),%rbp	# future alloca(frame+$num+8)
+	and	\$-128,%rbp
+	mov	%rsp,%r11
+	sub	%rbp,%r11
+	and	\$-4096,%r11
+	lea	(%rbp,%r11),%rsp
+	mov	(%rsp),%r10
+	cmp	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+	jmp	.Lmulx4x_page_walk_done
+
+.align	16
+.Lmulx4x_page_walk:
+	lea	-4096(%rsp),%rsp
+	mov	(%rsp),%r10
+	cmp	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
+
 	lea	($bp,$num),%r10
-	and	\$-128,%rsp
 	##############################################################
 	# Stack layout
 	# +0	num
@@ -939,6 +1079,7 @@
 	mov	$n0, 24(%rsp)		# save *n0
 	mov	$rp, 32(%rsp)		# save $rp
 	mov	%rax,40(%rsp)		# save original %rsp
+.cfi_cfa_expression	%rsp+40,deref,+8
 	mov	$num,48(%rsp)		# inner counter
 	jmp	.Lmulx4x_body
 
@@ -1188,6 +1329,7 @@
 	pxor	%xmm0,%xmm0
 	pshufd	\$0,%xmm1,%xmm1
 	mov	40(%rsp),%rsi		# restore %rsp
+.cfi_def_cfa	%rsi,8
 	jmp	.Lmulx4x_cond_copy
 
 .align	32
@@ -1217,14 +1359,22 @@
 
 	mov	\$1,%rax
 	mov	-48(%rsi),%r15
+.cfi_restore	%r15
 	mov	-40(%rsi),%r14
+.cfi_restore	%r14
 	mov	-32(%rsi),%r13
+.cfi_restore	%r13
 	mov	-24(%rsi),%r12
+.cfi_restore	%r12
 	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lmulx4x_epilogue:
 	ret
+.cfi_endproc
 .size	bn_mulx4x_mont,.-bn_mulx4x_mont
 ___
 }}}
@@ -1277,22 +1427,8 @@
 
 	mov	192($context),%r10	# pull $num
 	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
-	lea	48(%rax),%rax
 
-	mov	-8(%rax),%rbx
-	mov	-16(%rax),%rbp
-	mov	-24(%rax),%r12
-	mov	-32(%rax),%r13
-	mov	-40(%rax),%r14
-	mov	-48(%rax),%r15
-	mov	%rbx,144($context)	# restore context->Rbx
-	mov	%rbp,160($context)	# restore context->Rbp
-	mov	%r12,216($context)	# restore context->R12
-	mov	%r13,224($context)	# restore context->R13
-	mov	%r14,232($context)	# restore context->R14
-	mov	%r15,240($context)	# restore context->R15
-
-	jmp	.Lcommon_seh_tail
+	jmp	.Lcommon_pop_regs
 .size	mul_handler,.-mul_handler
 
 .type	sqr_handler,\@abi-omnipotent
@@ -1317,18 +1453,24 @@
 
 	mov	0(%r11),%r10d		# HandlerData[0]
 	lea	(%rsi,%r10),%r10	# end of prologue label
-	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
+	cmp	%r10,%rbx		# context->Rip<.Lsqr_prologue
 	jb	.Lcommon_seh_tail
 
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# body label
+	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
+	jb	.Lcommon_pop_regs
+
 	mov	152($context),%rax	# pull context->Rsp
 
-	mov	4(%r11),%r10d		# HandlerData[1]
+	mov	8(%r11),%r10d		# HandlerData[2]
 	lea	(%rsi,%r10),%r10	# epilogue label
 	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
 	jae	.Lcommon_seh_tail
 
 	mov	40(%rax),%rax		# pull saved stack pointer
 
+.Lcommon_pop_regs:
 	mov	-8(%rax),%rbx
 	mov	-16(%rax),%rbp
 	mov	-24(%rax),%r12
@@ -1415,13 +1557,15 @@
 .LSEH_info_bn_sqr8x_mont:
 	.byte	9,0,0,0
 	.rva	sqr_handler
-	.rva	.Lsqr8x_body,.Lsqr8x_epilogue	# HandlerData[]
+	.rva	.Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue		# HandlerData[]
+.align	8
 ___
 $code.=<<___ if ($addx);
 .LSEH_info_bn_mulx4x_mont:
 	.byte	9,0,0,0
 	.rva	sqr_handler
-	.rva	.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
+	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
+.align	8
 ___
 }
 
diff --git a/src/crypto/bn/asm/x86_64-mont5.pl b/src/crypto/bn/asm/x86_64-mont5.pl
index 61fde2d..bf68aad 100755
--- a/src/crypto/bn/asm/x86_64-mont5.pl
+++ b/src/crypto/bn/asm/x86_64-mont5.pl
@@ -73,6 +73,10 @@
 .type	bn_mul_mont_gather5,\@function,6
 .align	64
 bn_mul_mont_gather5:
+.cfi_startproc
+	mov	${num}d,${num}d
+	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
 	test	\$7,${num}d
 	jnz	.Lmul_enter
 ___
@@ -84,24 +88,54 @@
 
 .align	16
 .Lmul_enter:
-	mov	${num}d,${num}d
-	mov	%rsp,%rax
 	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
-	lea	.Linc(%rip),%r10
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
 
-	lea	2($num),%r11
-	neg	%r11
-	lea	-264(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2)+256+8)
-	and	\$-1024,%rsp		# minimize TLB usage
+	neg	$num
+	mov	%rsp,%r11
+	lea	-280(%rsp,$num,8),%r10	# future alloca(8*(num+2)+256+8)
+	neg	$num			# restore $num
+	and	\$-1024,%r10		# minimize TLB usage
 
+	# An OS-agnostic version of __chkstk.
+	#
+	# Some OSes (Windows) insist on stack being "wired" to
+	# physical memory in strictly sequential manner, i.e. if stack
+	# allocation spans two pages, then reference to farmost one can
+	# be punishable by SEGV. But page walking can do good even on
+	# other OSes, because it guarantees that villain thread hits
+	# the guard page before it can make damage to innocent one...
+	sub	%r10,%r11
+	and	\$-4096,%r11
+	lea	(%r10,%r11),%rsp
+	mov	(%rsp),%r11
+	cmp	%r10,%rsp
+	ja	.Lmul_page_walk
+	jmp	.Lmul_page_walk_done
+
+.Lmul_page_walk:
+	lea	-4096(%rsp),%rsp
+	mov	(%rsp),%r11
+	cmp	%r10,%rsp
+	ja	.Lmul_page_walk
+.Lmul_page_walk_done:
+
+	lea	.Linc(%rip),%r10
 	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
+.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
 .Lmul_body:
+
 	lea	128($bp),%r12		# reassign $bp (+size optimization)
 ___
 		$bp="%r12";
@@ -370,32 +404,42 @@
 
 	sbb	\$0,%rax		# handle upmost overflow bit
 	xor	$i,$i
+	and	%rax,$ap
+	not	%rax
+	mov	$rp,$np
+	and	%rax,$np
 	mov	$num,$j			# j=num
+	or	$np,$ap			# ap=borrow?tp:rp
 .align	16
 .Lcopy:					# copy or in-place refresh
-	mov	(%rsp,$i,8),$ap
-	mov	($rp,$i,8),$np
-	xor	$np,$ap			# conditional select:
-	and	%rax,$ap		# ((ap ^ np) & %rax) ^ np
-	xor	$np,$ap			# ap = borrow?tp:rp
+	mov	($ap,$i,8),%rax
 	mov	$i,(%rsp,$i,8)		# zap temporary vector
-	mov	$ap,($rp,$i,8)		# rp[i]=tp[i]
+	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
 	lea	1($i),$i
 	sub	\$1,$j
 	jnz	.Lcopy
 
 	mov	8(%rsp,$num,8),%rsi	# restore %rsp
+.cfi_def_cfa	%rsi,8
 	mov	\$1,%rax
 
 	mov	-48(%rsi),%r15
+.cfi_restore	%r15
 	mov	-40(%rsi),%r14
+.cfi_restore	%r14
 	mov	-32(%rsi),%r13
+.cfi_restore	%r13
 	mov	-24(%rsi),%r12
+.cfi_restore	%r12
 	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lmul_epilogue:
 	ret
+.cfi_endproc
 .size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
 ___
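
Editor's note: the reworked .Lcopy prologue above replaces the old per-word
XOR/AND select with a single constant-time pointer select (ap = borrow ? tp : rp)
computed once before the loop. A minimal C++ sketch of the same mask trick, with
illustrative names (ct_select is not a BoringSSL function):

    #include <stdint.h>

    // Constant-time select between two pointer values. |mask| must be
    // either 0 or all-ones, e.g. 0 - (uintptr_t)borrow; no branch or
    // memory access depends on the secret borrow bit.
    static inline uintptr_t ct_select(uintptr_t mask, uintptr_t on_ones,
                                      uintptr_t on_zero) {
      return (on_ones & mask) | (on_zero & ~mask);
    }

Hoisting the select out of the loop is why the new body shrinks to one load,
one zapping store, and one store per word.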
 {{{
@@ -405,6 +449,10 @@
 .type	bn_mul4x_mont_gather5,\@function,6
 .align	32
 bn_mul4x_mont_gather5:
+.cfi_startproc
+	.byte	0x67
+	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
 .Lmul4x_enter:
 ___
 $code.=<<___ if ($addx);
@@ -413,14 +461,19 @@
 	je	.Lmulx4x_enter
 ___
 $code.=<<___;
-	.byte	0x67
-	mov	%rsp,%rax
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
+.Lmul4x_prologue:
 
 	.byte	0x67
 	shl	\$3,${num}d		# convert $num to bytes
@@ -437,43 +490,70 @@
 	# calculated from 7th argument, the index.]
 	#
 	lea	-320(%rsp,$num,2),%r11
+	mov	%rsp,%rbp
 	sub	$rp,%r11
 	and	\$4095,%r11
 	cmp	%r11,%r10
 	jb	.Lmul4xsp_alt
-	sub	%r11,%rsp		# align with $rp
-	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
+	sub	%r11,%rbp		# align with $rp
+	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
 	jmp	.Lmul4xsp_done
 
 .align	32
 .Lmul4xsp_alt:
 	lea	4096-320(,$num,2),%r10
-	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
+	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
 	sub	%r10,%r11
 	mov	\$0,%r10
 	cmovc	%r10,%r11
-	sub	%r11,%rsp
+	sub	%r11,%rbp
 .Lmul4xsp_done:
-	and	\$-64,%rsp
+	and	\$-64,%rbp
+	mov	%rsp,%r11
+	sub	%rbp,%r11
+	and	\$-4096,%r11
+	lea	(%rbp,%r11),%rsp
+	mov	(%rsp),%r10
+	cmp	%rbp,%rsp
+	ja	.Lmul4x_page_walk
+	jmp	.Lmul4x_page_walk_done
+
+.Lmul4x_page_walk:
+	lea	-4096(%rsp),%rsp
+	mov	(%rsp),%r10
+	cmp	%rbp,%rsp
+	ja	.Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+
 	neg	$num
 
 	mov	%rax,40(%rsp)
+.cfi_cfa_expression	%rsp+40,deref,+8
 .Lmul4x_body:
 
 	call	mul4x_internal
 
 	mov	40(%rsp),%rsi		# restore %rsp
+.cfi_def_cfa	%rsi,8
 	mov	\$1,%rax
 
 	mov	-48(%rsi),%r15
+.cfi_restore	%r15
 	mov	-40(%rsi),%r14
+.cfi_restore	%r14
 	mov	-32(%rsi),%r13
+.cfi_restore	%r13
 	mov	-24(%rsi),%r12
+.cfi_restore	%r12
 	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lmul4x_epilogue:
 	ret
+.cfi_endproc
 .size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
 
 .type	mul4x_internal,\@abi-omnipotent
@@ -985,7 +1065,7 @@
 my $nptr="%rcx";	# const BN_ULONG *nptr,
 my $n0  ="%r8";		# const BN_ULONG *n0);
 my $num ="%r9";		# int num, has to be divisible by 8
-			# int pwr 
+			# int pwr
 
 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
 my @A0=("%r10","%r11");
@@ -997,6 +1077,9 @@
 .type	bn_power5,\@function,6
 .align	32
 bn_power5:
+.cfi_startproc
+	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
 ___
 $code.=<<___ if ($addx);
 	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
@@ -1005,13 +1088,19 @@
 	je	.Lpowerx5_enter
 ___
 $code.=<<___;
-	mov	%rsp,%rax
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
+.Lpower5_prologue:
 
 	shl	\$3,${num}d		# convert $num to bytes
 	lea	($num,$num,2),%r10d	# 3*$num
@@ -1026,25 +1115,42 @@
 	# calculated from 7th argument, the index.]
 	#
 	lea	-320(%rsp,$num,2),%r11
+	mov	%rsp,%rbp
 	sub	$rptr,%r11
 	and	\$4095,%r11
 	cmp	%r11,%r10
 	jb	.Lpwr_sp_alt
-	sub	%r11,%rsp		# align with $aptr
-	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
+	sub	%r11,%rbp		# align with $aptr
+	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
 	jmp	.Lpwr_sp_done
 
 .align	32
 .Lpwr_sp_alt:
 	lea	4096-320(,$num,2),%r10
-	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
+	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
 	sub	%r10,%r11
 	mov	\$0,%r10
 	cmovc	%r10,%r11
-	sub	%r11,%rsp
+	sub	%r11,%rbp
 .Lpwr_sp_done:
-	and	\$-64,%rsp
-	mov	$num,%r10	
+	and	\$-64,%rbp
+	mov	%rsp,%r11
+	sub	%rbp,%r11
+	and	\$-4096,%r11
+	lea	(%rbp,%r11),%rsp
+	mov	(%rsp),%r10
+	cmp	%rbp,%rsp
+	ja	.Lpwr_page_walk
+	jmp	.Lpwr_page_walk_done
+
+.Lpwr_page_walk:
+	lea	-4096(%rsp),%rsp
+	mov	(%rsp),%r10
+	cmp	%rbp,%rsp
+	ja	.Lpwr_page_walk
+.Lpwr_page_walk_done:
+
+	mov	$num,%r10
 	neg	$num
 
 	##############################################################
@@ -1058,6 +1164,7 @@
 	#
 	mov	$n0,  32(%rsp)
 	mov	%rax, 40(%rsp)		# save original %rsp
+.cfi_cfa_expression	%rsp+40,deref,+8
 .Lpower5_body:
 	movq	$rptr,%xmm1		# save $rptr, used in sqr8x
 	movq	$nptr,%xmm2		# save $nptr
@@ -1084,16 +1191,25 @@
 	call	mul4x_internal
 
 	mov	40(%rsp),%rsi		# restore %rsp
+.cfi_def_cfa	%rsi,8
 	mov	\$1,%rax
 	mov	-48(%rsi),%r15
+.cfi_restore	%r15
 	mov	-40(%rsi),%r14
+.cfi_restore	%r14
 	mov	-32(%rsi),%r13
+.cfi_restore	%r13
 	mov	-24(%rsi),%r12
+.cfi_restore	%r12
 	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lpower5_epilogue:
 	ret
+.cfi_endproc
 .size	bn_power5,.-bn_power5
 
 .globl	bn_sqr8x_internal
@@ -1953,7 +2069,7 @@
 	jnz	.Lsqr4x_sub
 
 	mov	$num,%r10		# prepare for back-to-back call
-	neg	$num			# restore $num	
+	neg	$num			# restore $num
 	ret
 .size	__bn_post4x_internal,.-__bn_post4x_internal
 ___
@@ -1973,14 +2089,23 @@
 .type	bn_from_mont8x,\@function,6
 .align	32
 bn_from_mont8x:
+.cfi_startproc
 	.byte	0x67
 	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
+.Lfrom_prologue:
 
 	shl	\$3,${num}d		# convert $num to bytes
 	lea	($num,$num,2),%r10	# 3*$num in bytes
@@ -1995,25 +2120,42 @@
 	# last operation, we use the opportunity to cleanse it.
 	#
 	lea	-320(%rsp,$num,2),%r11
+	mov	%rsp,%rbp
 	sub	$rptr,%r11
 	and	\$4095,%r11
 	cmp	%r11,%r10
 	jb	.Lfrom_sp_alt
-	sub	%r11,%rsp		# align with $aptr
-	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
+	sub	%r11,%rbp		# align with $aptr
+	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
 	jmp	.Lfrom_sp_done
 
 .align	32
 .Lfrom_sp_alt:
 	lea	4096-320(,$num,2),%r10
-	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
+	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
 	sub	%r10,%r11
 	mov	\$0,%r10
 	cmovc	%r10,%r11
-	sub	%r11,%rsp
+	sub	%r11,%rbp
 .Lfrom_sp_done:
-	and	\$-64,%rsp
-	mov	$num,%r10	
+	and	\$-64,%rbp
+	mov	%rsp,%r11
+	sub	%rbp,%r11
+	and	\$-4096,%r11
+	lea	(%rbp,%r11),%rsp
+	mov	(%rsp),%r10
+	cmp	%rbp,%rsp
+	ja	.Lfrom_page_walk
+	jmp	.Lfrom_page_walk_done
+
+.Lfrom_page_walk:
+	lea	-4096(%rsp),%rsp
+	mov	(%rsp),%r10
+	cmp	%rbp,%rsp
+	ja	.Lfrom_page_walk
+.Lfrom_page_walk_done:
+
+	mov	$num,%r10
 	neg	$num
 
 	##############################################################
@@ -2027,6 +2169,7 @@
 	#
 	mov	$n0,  32(%rsp)
 	mov	%rax, 40(%rsp)		# save original %rsp
+.cfi_cfa_expression	%rsp+40,deref,+8
 .Lfrom_body:
 	mov	$num,%r11
 	lea	48(%rsp),%rax
@@ -2070,7 +2213,6 @@
 
 	pxor	%xmm0,%xmm0
 	lea	48(%rsp),%rax
-	mov	40(%rsp),%rsi		# restore %rsp
 	jmp	.Lfrom_mont_zero
 
 .align	32
@@ -2082,11 +2224,12 @@
 
 	pxor	%xmm0,%xmm0
 	lea	48(%rsp),%rax
-	mov	40(%rsp),%rsi		# restore %rsp
 	jmp	.Lfrom_mont_zero
 
 .align	32
 .Lfrom_mont_zero:
+	mov	40(%rsp),%rsi		# restore %rsp
+.cfi_def_cfa	%rsi,8
 	movdqa	%xmm0,16*0(%rax)
 	movdqa	%xmm0,16*1(%rax)
 	movdqa	%xmm0,16*2(%rax)
@@ -2097,14 +2240,22 @@
 
 	mov	\$1,%rax
 	mov	-48(%rsi),%r15
+.cfi_restore	%r15
 	mov	-40(%rsi),%r14
+.cfi_restore	%r14
 	mov	-32(%rsi),%r13
+.cfi_restore	%r13
 	mov	-24(%rsi),%r12
+.cfi_restore	%r12
 	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lfrom_epilogue:
 	ret
+.cfi_endproc
 .size	bn_from_mont8x,.-bn_from_mont8x
 ___
 }
@@ -2117,14 +2268,23 @@
 .type	bn_mulx4x_mont_gather5,\@function,6
 .align	32
 bn_mulx4x_mont_gather5:
-.Lmulx4x_enter:
+.cfi_startproc
 	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
+.Lmulx4x_enter:
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
+.Lmulx4x_prologue:
 
 	shl	\$3,${num}d		# convert $num to bytes
 	lea	($num,$num,2),%r10	# 3*$num in bytes
@@ -2141,23 +2301,40 @@
 	# calculated from 7th argument, the index.]
 	#
 	lea	-320(%rsp,$num,2),%r11
+	mov	%rsp,%rbp
 	sub	$rp,%r11
 	and	\$4095,%r11
 	cmp	%r11,%r10
 	jb	.Lmulx4xsp_alt
-	sub	%r11,%rsp		# align with $aptr
-	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
+	sub	%r11,%rbp		# align with $aptr
+	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
 	jmp	.Lmulx4xsp_done
 
 .Lmulx4xsp_alt:
 	lea	4096-320(,$num,2),%r10
-	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
+	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
 	sub	%r10,%r11
 	mov	\$0,%r10
 	cmovc	%r10,%r11
-	sub	%r11,%rsp
-.Lmulx4xsp_done:	
-	and	\$-64,%rsp		# ensure alignment
+	sub	%r11,%rbp
+.Lmulx4xsp_done:
+	and	\$-64,%rbp		# ensure alignment
+	mov	%rsp,%r11
+	sub	%rbp,%r11
+	and	\$-4096,%r11
+	lea	(%rbp,%r11),%rsp
+	mov	(%rsp),%r10
+	cmp	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+	jmp	.Lmulx4x_page_walk_done
+
+.Lmulx4x_page_walk:
+	lea	-4096(%rsp),%rsp
+	mov	(%rsp),%r10
+	cmp	%rbp,%rsp
+	ja	.Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
+
 	##############################################################
 	# Stack layout
 	# +0	-num
@@ -2172,21 +2349,31 @@
 	#
 	mov	$n0, 32(%rsp)		# save *n0
 	mov	%rax,40(%rsp)		# save original %rsp
+.cfi_cfa_expression	%rsp+40,deref,+8
 .Lmulx4x_body:
 	call	mulx4x_internal
 
 	mov	40(%rsp),%rsi		# restore %rsp
+.cfi_def_cfa	%rsi,8
 	mov	\$1,%rax
 
 	mov	-48(%rsi),%r15
+.cfi_restore	%r15
 	mov	-40(%rsi),%r14
+.cfi_restore	%r14
 	mov	-32(%rsi),%r13
+.cfi_restore	%r13
 	mov	-24(%rsi),%r12
+.cfi_restore	%r12
 	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lmulx4x_epilogue:
 	ret
+.cfi_endproc
 .size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
 
 .type	mulx4x_internal,\@abi-omnipotent
@@ -2564,14 +2751,23 @@
 .type	bn_powerx5,\@function,6
 .align	32
 bn_powerx5:
-.Lpowerx5_enter:
+.cfi_startproc
 	mov	%rsp,%rax
+.cfi_def_cfa_register	%rax
+.Lpowerx5_enter:
 	push	%rbx
+.cfi_push	%rbx
 	push	%rbp
+.cfi_push	%rbp
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 	push	%r14
+.cfi_push	%r14
 	push	%r15
+.cfi_push	%r15
+.Lpowerx5_prologue:
 
 	shl	\$3,${num}d		# convert $num to bytes
 	lea	($num,$num,2),%r10	# 3*$num in bytes
@@ -2586,25 +2782,42 @@
 	# calculated from 7th argument, the index.]
 	#
 	lea	-320(%rsp,$num,2),%r11
+	mov	%rsp,%rbp
 	sub	$rptr,%r11
 	and	\$4095,%r11
 	cmp	%r11,%r10
 	jb	.Lpwrx_sp_alt
-	sub	%r11,%rsp		# align with $aptr
-	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
+	sub	%r11,%rbp		# align with $aptr
+	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
 	jmp	.Lpwrx_sp_done
 
 .align	32
 .Lpwrx_sp_alt:
 	lea	4096-320(,$num,2),%r10
-	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
+	lea	-320(%rbp,$num,2),%rbp	# alloca(frame+2*$num*8+256)
 	sub	%r10,%r11
 	mov	\$0,%r10
 	cmovc	%r10,%r11
-	sub	%r11,%rsp
+	sub	%r11,%rbp
 .Lpwrx_sp_done:
-	and	\$-64,%rsp
-	mov	$num,%r10	
+	and	\$-64,%rbp
+	mov	%rsp,%r11
+	sub	%rbp,%r11
+	and	\$-4096,%r11
+	lea	(%rbp,%r11),%rsp
+	mov	(%rsp),%r10
+	cmp	%rbp,%rsp
+	ja	.Lpwrx_page_walk
+	jmp	.Lpwrx_page_walk_done
+
+.Lpwrx_page_walk:
+	lea	-4096(%rsp),%rsp
+	mov	(%rsp),%r10
+	cmp	%rbp,%rsp
+	ja	.Lpwrx_page_walk
+.Lpwrx_page_walk_done:
+
+	mov	$num,%r10
 	neg	$num
 
 	##############################################################
@@ -2625,6 +2838,7 @@
 	movq	$bptr,%xmm4
 	mov	$n0,  32(%rsp)
 	mov	%rax, 40(%rsp)		# save original %rsp
+.cfi_cfa_expression	%rsp+40,deref,+8
 .Lpowerx5_body:
 
 	call	__bn_sqrx8x_internal
@@ -2647,17 +2861,26 @@
 	call	mulx4x_internal
 
 	mov	40(%rsp),%rsi		# restore %rsp
+.cfi_def_cfa	%rsi,8
 	mov	\$1,%rax
 
 	mov	-48(%rsi),%r15
+.cfi_restore	%r15
 	mov	-40(%rsi),%r14
+.cfi_restore	%r14
 	mov	-32(%rsi),%r13
+.cfi_restore	%r13
 	mov	-24(%rsi),%r12
+.cfi_restore	%r12
 	mov	-16(%rsi),%rbp
+.cfi_restore	%rbp
 	mov	-8(%rsi),%rbx
+.cfi_restore	%rbx
 	lea	(%rsi),%rsp
+.cfi_def_cfa_register	%rsp
 .Lpowerx5_epilogue:
 	ret
+.cfi_endproc
 .size	bn_powerx5,.-bn_powerx5
 
 .globl	bn_sqrx8x_internal
@@ -3513,9 +3736,14 @@
 	cmp	%r10,%rbx		# context->Rip<end of prologue label
 	jb	.Lcommon_seh_tail
 
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# beginning of body label
+	cmp	%r10,%rbx		# context->Rip<body label
+	jb	.Lcommon_pop_regs
+
 	mov	152($context),%rax	# pull context->Rsp
 
-	mov	4(%r11),%r10d		# HandlerData[1]
+	mov	8(%r11),%r10d		# HandlerData[2]
 	lea	(%rsi,%r10),%r10	# epilogue label
 	cmp	%r10,%rbx		# context->Rip>=epilogue label
 	jae	.Lcommon_seh_tail
@@ -3527,11 +3755,11 @@
 	mov	192($context),%r10	# pull $num
 	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
 
-	jmp	.Lbody_proceed
+	jmp	.Lcommon_pop_regs
 
 .Lbody_40:
 	mov	40(%rax),%rax		# pull saved stack pointer
-.Lbody_proceed:
+.Lcommon_pop_regs:
 	mov	-8(%rax),%rbx
 	mov	-16(%rax),%rbp
 	mov	-24(%rax),%r12
@@ -3622,34 +3850,34 @@
 .LSEH_info_bn_mul_mont_gather5:
 	.byte	9,0,0,0
 	.rva	mul_handler
-	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
+	.rva	.Lmul_body,.Lmul_body,.Lmul_epilogue		# HandlerData[]
 .align	8
 .LSEH_info_bn_mul4x_mont_gather5:
 	.byte	9,0,0,0
 	.rva	mul_handler
-	.rva	.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
+	.rva	.Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
 .align	8
 .LSEH_info_bn_power5:
 	.byte	9,0,0,0
 	.rva	mul_handler
-	.rva	.Lpower5_body,.Lpower5_epilogue		# HandlerData[]
+	.rva	.Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue	# HandlerData[]
 .align	8
 .LSEH_info_bn_from_mont8x:
 	.byte	9,0,0,0
 	.rva	mul_handler
-	.rva	.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
+	.rva	.Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
 ___
 $code.=<<___ if ($addx);
 .align	8
 .LSEH_info_bn_mulx4x_mont_gather5:
 	.byte	9,0,0,0
 	.rva	mul_handler
-	.rva	.Lmulx4x_body,.Lmulx4x_epilogue		# HandlerData[]
+	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
 .align	8
 .LSEH_info_bn_powerx5:
 	.byte	9,0,0,0
 	.rva	mul_handler
-	.rva	.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
+	.rva	.Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
 ___
 $code.=<<___;
 .align	8
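
Editor's note: all of the .L*_page_walk loops introduced in this file follow the
"OS-agnostic __chkstk" pattern described in the bn_mul_mont_gather5 comment:
compute the new frame bottom in a scratch register, then probe one word per
4096-byte page walking down from the old stack pointer so guard pages are always
faulted in order. A rough C++ sketch of the probing idea, assuming 4096-byte
pages (the names are illustrative, not part of this diff):

    #include <stddef.h>
    #include <stdint.h>

    static const ptrdiff_t kPageSize = 4096;

    // Touch one byte per page from |top| down to |bottom| so the stack
    // grows through its guard page in strictly descending order.
    static void page_walk(volatile uint8_t *top, volatile uint8_t *bottom) {
      volatile uint8_t *p = top;
      (void)*p;                      // probe the starting page first
      while (p - kPageSize > bottom) {
        p -= kPageSize;
        (void)*p;                    // one read per page, descending
      }
      (void)*bottom;                 // final probe at the new frame bottom
    }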
diff --git a/src/crypto/chacha/CMakeLists.txt b/src/crypto/chacha/CMakeLists.txt
index 39d1def..63de061 100644
--- a/src/crypto/chacha/CMakeLists.txt
+++ b/src/crypto/chacha/CMakeLists.txt
@@ -42,17 +42,7 @@
   ${CHACHA_ARCH_SOURCES}
 )
 
-add_executable(
-  chacha_test
-
-  chacha_test.cc
-  $<TARGET_OBJECTS:test_support>
-)
-
-target_link_libraries(chacha_test crypto)
-add_dependencies(all_tests chacha_test)
-
 perlasm(chacha-armv4.${ASM_EXT} asm/chacha-armv4.pl)
 perlasm(chacha-armv8.${ASM_EXT} asm/chacha-armv8.pl)
 perlasm(chacha-x86.${ASM_EXT} asm/chacha-x86.pl)
-perlasm(chacha-x86_64.${ASM_EXT} asm/chacha-x86_64.pl)
\ No newline at end of file
+perlasm(chacha-x86_64.${ASM_EXT} asm/chacha-x86_64.pl)
diff --git a/src/crypto/chacha/asm/chacha-armv4.pl b/src/crypto/chacha/asm/chacha-armv4.pl
index 395b815..13698e3 100755
--- a/src/crypto/chacha/asm/chacha-armv4.pl
+++ b/src/crypto/chacha/asm/chacha-armv4.pl
@@ -8,7 +8,7 @@
 # ====================================================================
 #
 # December 2014
-# 
+#
 # ChaCha20 for ARMv4.
 #
 # Performance in cycles per byte out of large buffer.
@@ -713,7 +713,7 @@
 	vadd.i32	$d2,$d1,$t0		@ counter+2
 	str		@t[3], [sp,#4*(16+15)]
 	mov		@t[3],#10
-	add		@x[12],@x[12],#3	@ counter+3 
+	add		@x[12],@x[12],#3	@ counter+3
 	b		.Loop_neon
 
 .align	4
@@ -1127,7 +1127,7 @@
 	ldrb		@t[1],[r12],#1		@ read input
 	subs		@t[3],@t[3],#1
 	eor		@t[0],@t[0],@t[1]
-	strb		@t[0],[r14],#1		@ store ouput
+	strb		@t[0],[r14],#1		@ store output
 	bne		.Loop_tail_neon
 
 .Ldone_neon:
diff --git a/src/crypto/chacha/asm/chacha-armv8.pl b/src/crypto/chacha/asm/chacha-armv8.pl
index 215d965..c2d0429 100755
--- a/src/crypto/chacha/asm/chacha-armv8.pl
+++ b/src/crypto/chacha/asm/chacha-armv8.pl
@@ -8,7 +8,7 @@
 # ====================================================================
 #
 # June 2015
-# 
+#
 # ChaCha20 for ARMv8.
 #
 # Performance in cycles per byte out of large buffer.
@@ -193,7 +193,7 @@
 	mov	$ctr,#10
 	subs	$len,$len,#64
 .Loop:
-	sub	$ctr,$ctr,#1	
+	sub	$ctr,$ctr,#1
 ___
 	foreach (&ROUND(0, 4, 8,12)) { eval; }
 	foreach (&ROUND(0, 5,10,15)) { eval; }
diff --git a/src/crypto/chacha/asm/chacha-x86.pl b/src/crypto/chacha/asm/chacha-x86.pl
index 984ce11..f8bbb76 100755
--- a/src/crypto/chacha/asm/chacha-x86.pl
+++ b/src/crypto/chacha/asm/chacha-x86.pl
@@ -21,7 +21,9 @@
 # Westmere	9.50/+45%	3.35
 # Sandy Bridge	10.5/+47%	3.20
 # Haswell	8.15/+50%	2.83
+# Skylake	7.53/+22%	2.75
 # Silvermont	17.4/+36%	8.35
+# Goldmont	13.4/+40%	4.36
 # Sledgehammer	10.2/+54%
 # Bulldozer	13.4/+50%	4.38(*)
 #
@@ -38,10 +40,8 @@
 
 &asm_init($ARGV[0],"chacha-x86.pl",$ARGV[$#ARGV] eq "386");
 
-$xmm=$ymm=0;
-for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
-
-$ymm=$xmm;
+$xmm=$ymm=1;
+$gasver=999;  # enable everything
 
 $a="eax";
 ($b,$b_)=("ebx","ebp");
@@ -438,6 +438,12 @@
 				    &label("pic_point"),"eax"));
 	&movdqu		("xmm3",&QWP(0,"ebx"));		# counter and nonce
 
+if (defined($gasver) && $gasver>=2.17) {		# even though we encode
+							# pshufb manually, we
+							# handle only register
+							# operands, while this
+							# segment uses a
+							# memory operand...
 	&cmp		($len,64*4);
 	&jb		(&label("1x"));
 
@@ -619,6 +625,7 @@
 	&paddd		("xmm2",&QWP(16*6,"eax"));	# +four
 	&pand		("xmm3",&QWP(16*7,"eax"));
 	&por		("xmm3","xmm2");		# counter value
+}
 {
 my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));
 
diff --git a/src/crypto/chacha/asm/chacha-x86_64.pl b/src/crypto/chacha/asm/chacha-x86_64.pl
index 55b726d..5ab6f87 100755
--- a/src/crypto/chacha/asm/chacha-x86_64.pl
+++ b/src/crypto/chacha/asm/chacha-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -11,6 +18,10 @@
 #
 # ChaCha20 for x86_64.
 #
+# December 2016
+#
+# Add AVX512F code path.
+#
 # Performance in cycles per byte out of large buffer.
 #
 #		IALU/gcc 4.8(i)	1xSSSE3/SSE2	4xSSSE3	    8xAVX2
@@ -21,7 +32,9 @@
 # Sandy Bridge	8.31/+42%	5.45/6.76	2.72
 # Ivy Bridge	6.71/+46%	5.40/6.49	2.41
 # Haswell	5.92/+43%	5.20/6.45	2.42	    1.23
+# Skylake	5.87/+39%	4.70/-		2.31	    1.19
 # Silvermont	12.0/+33%	7.75/7.40	7.03(iii)
+# Goldmont	10.6/+17%	5.10/-		3.28
 # Sledgehammer	7.28/+52%	-/14.2(ii)	-
 # Bulldozer	9.66/+28%	9.85/11.1	3.06(iv)
 # VIA Nano	10.5/+46%	6.72/8.60	6.05
@@ -82,6 +95,15 @@
 .byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
 .Lsigma:
 .asciz	"expand 32-byte k"
+.align	64
+.Lzeroz:
+.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+.Lfourz:
+.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+.Lincz:
+.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.Lsixteen:
+.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
 .asciz	"ChaCha20 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
@@ -207,6 +229,12 @@
 	cmp	\$0,$len
 	je	.Lno_data
 	mov	OPENSSL_ia32cap_P+4(%rip),%r10
+___
+$code.=<<___	if ($avx>2);
+	bt	\$48,%r10		# check for AVX512F
+	jc	.LChaCha20_avx512
+___
+$code.=<<___;
 	test	\$`1<<(41-32)`,%r10d
 	jnz	.LChaCha20_ssse3
 
@@ -217,6 +245,7 @@
 	push	%r14
 	push	%r15
 	sub	\$64+24,%rsp
+.Lctr32_body:
 
 	#movdqa	.Lsigma(%rip),%xmm0
 	movdqu	($key),%xmm1
@@ -355,13 +384,14 @@
 	jnz	.Loop_tail
 
 .Ldone:
-	add	\$64+24,%rsp
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
+	lea	64+24+48(%rsp),%rsi
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
 .Lno_data:
 	ret
 .size	ChaCha20_ctr32,.-ChaCha20_ctr32
@@ -396,31 +426,26 @@
 	&por	($b,$t);
 }
 
-my $xframe = $win64 ? 32+32+8 : 24;
+my $xframe = $win64 ? 32+8 : 8;
 
 $code.=<<___;
 .type	ChaCha20_ssse3,\@function,5
 .align	32
 ChaCha20_ssse3:
 .LChaCha20_ssse3:
+	mov	%rsp,%r9		# frame pointer
 ___
 $code.=<<___;
 	cmp	\$128,$len		# we might throw away some data,
 	ja	.LChaCha20_4x		# but overall it won't be slower
 
 .Ldo_sse3_after_all:
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-
 	sub	\$64+$xframe,%rsp
 ___
 $code.=<<___	if ($win64);
-	movaps	%xmm6,64+32(%rsp)
-	movaps	%xmm7,64+48(%rsp)
+	movaps	%xmm6,-0x28(%r9)
+	movaps	%xmm7,-0x18(%r9)
+.Lssse3_body:
 ___
 $code.=<<___;
 	movdqa	.Lsigma(%rip),$a
@@ -434,7 +459,7 @@
 	movdqa	$b,0x10(%rsp)
 	movdqa	$c,0x20(%rsp)
 	movdqa	$d,0x30(%rsp)
-	mov	\$10,%ebp
+	mov	\$10,$counter		# reuse $counter
 	jmp	.Loop_ssse3
 
 .align	32
@@ -444,7 +469,7 @@
 	movdqa	0x10(%rsp),$b
 	movdqa	0x20(%rsp),$c
 	paddd	0x30(%rsp),$d
-	mov	\$10,%ebp
+	mov	\$10,$counter
 	movdqa	$d,0x30(%rsp)
 	jmp	.Loop_ssse3
 
@@ -462,7 +487,7 @@
 	&pshufd	($b,$b,0b10010011);
 	&pshufd	($d,$d,0b00111001);
 
-	&dec	("%ebp");
+	&dec	($counter);
 	&jnz	(".Loop_ssse3");
 
 $code.=<<___;
@@ -501,31 +526,26 @@
 	movdqa	$b,0x10(%rsp)
 	movdqa	$c,0x20(%rsp)
 	movdqa	$d,0x30(%rsp)
-	xor	%rbx,%rbx
+	xor	$counter,$counter
 
 .Loop_tail_ssse3:
-	movzb	($inp,%rbx),%eax
-	movzb	(%rsp,%rbx),%ecx
-	lea	1(%rbx),%rbx
+	movzb	($inp,$counter),%eax
+	movzb	(%rsp,$counter),%ecx
+	lea	1($counter),$counter
 	xor	%ecx,%eax
-	mov	%al,-1($out,%rbx)
+	mov	%al,-1($out,$counter)
 	dec	$len
 	jnz	.Loop_tail_ssse3
 
 .Ldone_ssse3:
 ___
 $code.=<<___	if ($win64);
-	movaps	64+32(%rsp),%xmm6
-	movaps	64+48(%rsp),%xmm7
+	movaps	-0x28(%r9),%xmm6
+	movaps	-0x18(%r9),%xmm7
 ___
 $code.=<<___;
-	add	\$64+$xframe,%rsp
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
+	lea	(%r9),%rsp
+.Lssse3_epilogue:
 	ret
 .size	ChaCha20_ssse3,.-ChaCha20_ssse3
 ___
@@ -662,13 +682,14 @@
 	);
 }
 
-my $xframe = $win64 ? 0xa0 : 0;
+my $xframe = $win64 ? 0xa8 : 8;
 
 $code.=<<___;
 .type	ChaCha20_4x,\@function,5
 .align	32
 ChaCha20_4x:
 .LChaCha20_4x:
+	mov		%rsp,%r9		# frame pointer
 	mov		%r10,%r11
 ___
 $code.=<<___	if ($avx>1);
@@ -685,8 +706,7 @@
 	je		.Ldo_sse3_after_all	# to detect Atom
 
 .Lproceed4x:
-	lea		-0x78(%rsp),%r11
-	sub		\$0x148+$xframe,%rsp
+	sub		\$0x140+$xframe,%rsp
 ___
 	################ stack layout
 	# +0x00		SIMD equivalent of @x[8-12]
@@ -697,16 +717,17 @@
 	# ...
 	# +0x140
 $code.=<<___	if ($win64);
-	movaps		%xmm6,-0x30(%r11)
-	movaps		%xmm7,-0x20(%r11)
-	movaps		%xmm8,-0x10(%r11)
-	movaps		%xmm9,0x00(%r11)
-	movaps		%xmm10,0x10(%r11)
-	movaps		%xmm11,0x20(%r11)
-	movaps		%xmm12,0x30(%r11)
-	movaps		%xmm13,0x40(%r11)
-	movaps		%xmm14,0x50(%r11)
-	movaps		%xmm15,0x60(%r11)
+	movaps		%xmm6,-0xa8(%r9)
+	movaps		%xmm7,-0x98(%r9)
+	movaps		%xmm8,-0x88(%r9)
+	movaps		%xmm9,-0x78(%r9)
+	movaps		%xmm10,-0x68(%r9)
+	movaps		%xmm11,-0x58(%r9)
+	movaps		%xmm12,-0x48(%r9)
+	movaps		%xmm13,-0x38(%r9)
+	movaps		%xmm14,-0x28(%r9)
+	movaps		%xmm15,-0x18(%r9)
+.L4x_body:
 ___
 $code.=<<___;
 	movdqa		.Lsigma(%rip),$xa3	# key[0]
@@ -1095,20 +1116,20 @@
 .Ldone4x:
 ___
 $code.=<<___	if ($win64);
-	lea		0x140+0x30(%rsp),%r11
-	movaps		-0x30(%r11),%xmm6
-	movaps		-0x20(%r11),%xmm7
-	movaps		-0x10(%r11),%xmm8
-	movaps		0x00(%r11),%xmm9
-	movaps		0x10(%r11),%xmm10
-	movaps		0x20(%r11),%xmm11
-	movaps		0x30(%r11),%xmm12
-	movaps		0x40(%r11),%xmm13
-	movaps		0x50(%r11),%xmm14
-	movaps		0x60(%r11),%xmm15
+	movaps		-0xa8(%r9),%xmm6
+	movaps		-0x98(%r9),%xmm7
+	movaps		-0x88(%r9),%xmm8
+	movaps		-0x78(%r9),%xmm9
+	movaps		-0x68(%r9),%xmm10
+	movaps		-0x58(%r9),%xmm11
+	movaps		-0x48(%r9),%xmm12
+	movaps		-0x38(%r9),%xmm13
+	movaps		-0x28(%r9),%xmm14
+	movaps		-0x18(%r9),%xmm15
 ___
 $code.=<<___;
-	add		\$0x148+$xframe,%rsp
+	lea		(%r9),%rsp
+.L4x_epilogue:
 	ret
 .size	ChaCha20_4x,.-ChaCha20_4x
 ___
@@ -1236,33 +1257,32 @@
 	);
 }
 
-my $xframe = $win64 ? 0xb0 : 8;
+my $xframe = $win64 ? 0xa8 : 8;
 
 $code.=<<___;
 .type	ChaCha20_8x,\@function,5
 .align	32
 ChaCha20_8x:
 .LChaCha20_8x:
-	mov		%rsp,%r10
+	mov		%rsp,%r9		# frame register
 	sub		\$0x280+$xframe,%rsp
 	and		\$-32,%rsp
 ___
 $code.=<<___	if ($win64);
-	lea		0x290+0x30(%rsp),%r11
-	movaps		%xmm6,-0x30(%r11)
-	movaps		%xmm7,-0x20(%r11)
-	movaps		%xmm8,-0x10(%r11)
-	movaps		%xmm9,0x00(%r11)
-	movaps		%xmm10,0x10(%r11)
-	movaps		%xmm11,0x20(%r11)
-	movaps		%xmm12,0x30(%r11)
-	movaps		%xmm13,0x40(%r11)
-	movaps		%xmm14,0x50(%r11)
-	movaps		%xmm15,0x60(%r11)
+	movaps		%xmm6,-0xa8(%r9)
+	movaps		%xmm7,-0x98(%r9)
+	movaps		%xmm8,-0x88(%r9)
+	movaps		%xmm9,-0x78(%r9)
+	movaps		%xmm10,-0x68(%r9)
+	movaps		%xmm11,-0x58(%r9)
+	movaps		%xmm12,-0x48(%r9)
+	movaps		%xmm13,-0x38(%r9)
+	movaps		%xmm14,-0x28(%r9)
+	movaps		%xmm15,-0x18(%r9)
+.L8x_body:
 ___
 $code.=<<___;
 	vzeroupper
-	mov		%r10,0x280(%rsp)
 
 	################ stack layout
 	# +0x00		SIMD equivalent of @x[8-12]
@@ -1271,7 +1291,7 @@
 	# ...
 	# +0x200	SIMD counters (with nonce smashed by lanes)
 	# ...
-	# +0x280	saved %rsp
+	# +0x280
 
 	vbroadcasti128	.Lsigma(%rip),$xa3	# key[0]
 	vbroadcasti128	($key),$xb3		# key[1]
@@ -1737,29 +1757,989 @@
 	vzeroall
 ___
 $code.=<<___	if ($win64);
-	lea		0x290+0x30(%rsp),%r11
-	movaps		-0x30(%r11),%xmm6
-	movaps		-0x20(%r11),%xmm7
-	movaps		-0x10(%r11),%xmm8
-	movaps		0x00(%r11),%xmm9
-	movaps		0x10(%r11),%xmm10
-	movaps		0x20(%r11),%xmm11
-	movaps		0x30(%r11),%xmm12
-	movaps		0x40(%r11),%xmm13
-	movaps		0x50(%r11),%xmm14
-	movaps		0x60(%r11),%xmm15
+	movaps		-0xa8(%r9),%xmm6
+	movaps		-0x98(%r9),%xmm7
+	movaps		-0x88(%r9),%xmm8
+	movaps		-0x78(%r9),%xmm9
+	movaps		-0x68(%r9),%xmm10
+	movaps		-0x58(%r9),%xmm11
+	movaps		-0x48(%r9),%xmm12
+	movaps		-0x38(%r9),%xmm13
+	movaps		-0x28(%r9),%xmm14
+	movaps		-0x18(%r9),%xmm15
 ___
 $code.=<<___;
-	mov		0x280(%rsp),%rsp
+	lea		(%r9),%rsp
+.L8x_epilogue:
 	ret
 .size	ChaCha20_8x,.-ChaCha20_8x
 ___
 }
 
-foreach (split("\n",$code)) {
-	s/\`([^\`]*)\`/eval $1/geo;
+########################################################################
+# AVX512 code paths
+if ($avx>2) {
+# This one handles shorter inputs...
 
-	s/%x#%y/%x/go;
+my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
+my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
+
+sub AVX512ROUND {	# critical path is 14 "SIMD ticks" per round
+	&vpaddd	($a,$a,$b);
+	&vpxord	($d,$d,$a);
+	&vprold	($d,$d,16);
+
+	&vpaddd	($c,$c,$d);
+	&vpxord	($b,$b,$c);
+	&vprold	($b,$b,12);
+
+	&vpaddd	($a,$a,$b);
+	&vpxord	($d,$d,$a);
+	&vprold	($d,$d,8);
+
+	&vpaddd	($c,$c,$d);
+	&vpxord	($b,$b,$c);
+	&vprold	($b,$b,7);
+}
+
+my $xframe = $win64 ? 32+8 : 8;
+
+$code.=<<___;
+.type	ChaCha20_avx512,\@function,5
+.align	32
+ChaCha20_avx512:
+.LChaCha20_avx512:
+	mov	%rsp,%r9		# frame pointer
+	cmp	\$512,$len
+	ja	.LChaCha20_16x
+
+	sub	\$64+$xframe,%rsp
+___
+$code.=<<___	if ($win64);
+	movaps	%xmm6,-0x28(%r9)
+	movaps	%xmm7,-0x18(%r9)
+.Lavx512_body:
+___
+$code.=<<___;
+	vbroadcasti32x4	.Lsigma(%rip),$a
+	vbroadcasti32x4	($key),$b
+	vbroadcasti32x4	16($key),$c
+	vbroadcasti32x4	($counter),$d
+
+	vmovdqa32	$a,$a_
+	vmovdqa32	$b,$b_
+	vmovdqa32	$c,$c_
+	vpaddd		.Lzeroz(%rip),$d,$d
+	vmovdqa32	.Lfourz(%rip),$fourz
+	mov		\$10,$counter	# reuse $counter
+	vmovdqa32	$d,$d_
+	jmp		.Loop_avx512
+
+.align	16
+.Loop_outer_avx512:
+	vmovdqa32	$a_,$a
+	vmovdqa32	$b_,$b
+	vmovdqa32	$c_,$c
+	vpaddd		$fourz,$d_,$d
+	mov		\$10,$counter
+	vmovdqa32	$d,$d_
+	jmp		.Loop_avx512
+
+.align	32
+.Loop_avx512:
+___
+	&AVX512ROUND();
+	&vpshufd	($c,$c,0b01001110);
+	&vpshufd	($b,$b,0b00111001);
+	&vpshufd	($d,$d,0b10010011);
+
+	&AVX512ROUND();
+	&vpshufd	($c,$c,0b01001110);
+	&vpshufd	($b,$b,0b10010011);
+	&vpshufd	($d,$d,0b00111001);
+
+	&dec		($counter);
+	&jnz		(".Loop_avx512");
+
+$code.=<<___;
+	vpaddd		$a_,$a,$a
+	vpaddd		$b_,$b,$b
+	vpaddd		$c_,$c,$c
+	vpaddd		$d_,$d,$d
+
+	sub		\$64,$len
+	jb		.Ltail64_avx512
+
+	vpxor		0x00($inp),%x#$a,$t0	# xor with input
+	vpxor		0x10($inp),%x#$b,$t1
+	vpxor		0x20($inp),%x#$c,$t2
+	vpxor		0x30($inp),%x#$d,$t3
+	lea		0x40($inp),$inp		# inp+=64
+
+	vmovdqu		$t0,0x00($out)		# write output
+	vmovdqu		$t1,0x10($out)
+	vmovdqu		$t2,0x20($out)
+	vmovdqu		$t3,0x30($out)
+	lea		0x40($out),$out		# out+=64
+
+	jz		.Ldone_avx512
+
+	vextracti32x4	\$1,$a,$t0
+	vextracti32x4	\$1,$b,$t1
+	vextracti32x4	\$1,$c,$t2
+	vextracti32x4	\$1,$d,$t3
+
+	sub		\$64,$len
+	jb		.Ltail_avx512
+
+	vpxor		0x00($inp),$t0,$t0	# xor with input
+	vpxor		0x10($inp),$t1,$t1
+	vpxor		0x20($inp),$t2,$t2
+	vpxor		0x30($inp),$t3,$t3
+	lea		0x40($inp),$inp		# inp+=64
+
+	vmovdqu		$t0,0x00($out)		# write output
+	vmovdqu		$t1,0x10($out)
+	vmovdqu		$t2,0x20($out)
+	vmovdqu		$t3,0x30($out)
+	lea		0x40($out),$out		# out+=64
+
+	jz		.Ldone_avx512
+
+	vextracti32x4	\$2,$a,$t0
+	vextracti32x4	\$2,$b,$t1
+	vextracti32x4	\$2,$c,$t2
+	vextracti32x4	\$2,$d,$t3
+
+	sub		\$64,$len
+	jb		.Ltail_avx512
+
+	vpxor		0x00($inp),$t0,$t0	# xor with input
+	vpxor		0x10($inp),$t1,$t1
+	vpxor		0x20($inp),$t2,$t2
+	vpxor		0x30($inp),$t3,$t3
+	lea		0x40($inp),$inp		# inp+=64
+
+	vmovdqu		$t0,0x00($out)		# write output
+	vmovdqu		$t1,0x10($out)
+	vmovdqu		$t2,0x20($out)
+	vmovdqu		$t3,0x30($out)
+	lea		0x40($out),$out		# out+=64
+
+	jz		.Ldone_avx512
+
+	vextracti32x4	\$3,$a,$t0
+	vextracti32x4	\$3,$b,$t1
+	vextracti32x4	\$3,$c,$t2
+	vextracti32x4	\$3,$d,$t3
+
+	sub		\$64,$len
+	jb		.Ltail_avx512
+
+	vpxor		0x00($inp),$t0,$t0	# xor with input
+	vpxor		0x10($inp),$t1,$t1
+	vpxor		0x20($inp),$t2,$t2
+	vpxor		0x30($inp),$t3,$t3
+	lea		0x40($inp),$inp		# inp+=64
+
+	vmovdqu		$t0,0x00($out)		# write output
+	vmovdqu		$t1,0x10($out)
+	vmovdqu		$t2,0x20($out)
+	vmovdqu		$t3,0x30($out)
+	lea		0x40($out),$out		# out+=64
+
+	jnz		.Loop_outer_avx512
+
+	jmp		.Ldone_avx512
+
+.align	16
+.Ltail64_avx512:
+	vmovdqa		%x#$a,0x00(%rsp)
+	vmovdqa		%x#$b,0x10(%rsp)
+	vmovdqa		%x#$c,0x20(%rsp)
+	vmovdqa		%x#$d,0x30(%rsp)
+	add		\$64,$len
+	jmp		.Loop_tail_avx512
+
+.align	16
+.Ltail_avx512:
+	vmovdqa		$t0,0x00(%rsp)
+	vmovdqa		$t1,0x10(%rsp)
+	vmovdqa		$t2,0x20(%rsp)
+	vmovdqa		$t3,0x30(%rsp)
+	add		\$64,$len
+
+.Loop_tail_avx512:
+	movzb		($inp,$counter),%eax
+	movzb		(%rsp,$counter),%ecx
+	lea		1($counter),$counter
+	xor		%ecx,%eax
+	mov		%al,-1($out,$counter)
+	dec		$len
+	jnz		.Loop_tail_avx512
+
+	vmovdqa32	$a_,0x00(%rsp)
+
+.Ldone_avx512:
+	vzeroall
+___
+$code.=<<___	if ($win64);
+	movaps	-0x28(%r9),%xmm6
+	movaps	-0x18(%r9),%xmm7
+___
+$code.=<<___;
+	lea	(%r9),%rsp
+.Lavx512_epilogue:
+	ret
+.size	ChaCha20_avx512,.-ChaCha20_avx512
+___
+}
+if ($avx>2) {
+# This one handles longer inputs...
+
+my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
+    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
+my  @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
+	 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
+my @key=map("%zmm$_",(16..31));
+my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];
+
+sub AVX512_lane_ROUND {
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+my @x=map("\"$_\"",@xx);
+
+	(
+	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
+	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
+	  "&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
+	   "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
+	"&vpxord	(@x[$d0],@x[$d0],@x[$a0])",
+	 "&vpxord	(@x[$d1],@x[$d1],@x[$a1])",
+	  "&vpxord	(@x[$d2],@x[$d2],@x[$a2])",
+	   "&vpxord	(@x[$d3],@x[$d3],@x[$a3])",
+	"&vprold	(@x[$d0],@x[$d0],16)",
+	 "&vprold	(@x[$d1],@x[$d1],16)",
+	  "&vprold	(@x[$d2],@x[$d2],16)",
+	   "&vprold	(@x[$d3],@x[$d3],16)",
+
+	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
+	 "&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
+	  "&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
+	   "&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
+	"&vpxord	(@x[$b0],@x[$b0],@x[$c0])",
+	 "&vpxord	(@x[$b1],@x[$b1],@x[$c1])",
+	  "&vpxord	(@x[$b2],@x[$b2],@x[$c2])",
+	   "&vpxord	(@x[$b3],@x[$b3],@x[$c3])",
+	"&vprold	(@x[$b0],@x[$b0],12)",
+	 "&vprold	(@x[$b1],@x[$b1],12)",
+	  "&vprold	(@x[$b2],@x[$b2],12)",
+	   "&vprold	(@x[$b3],@x[$b3],12)",
+
+	"&vpaddd	(@x[$a0],@x[$a0],@x[$b0])",
+	 "&vpaddd	(@x[$a1],@x[$a1],@x[$b1])",
+	  "&vpaddd	(@x[$a2],@x[$a2],@x[$b2])",
+	   "&vpaddd	(@x[$a3],@x[$a3],@x[$b3])",
+	"&vpxord	(@x[$d0],@x[$d0],@x[$a0])",
+	 "&vpxord	(@x[$d1],@x[$d1],@x[$a1])",
+	  "&vpxord	(@x[$d2],@x[$d2],@x[$a2])",
+	   "&vpxord	(@x[$d3],@x[$d3],@x[$a3])",
+	"&vprold	(@x[$d0],@x[$d0],8)",
+	 "&vprold	(@x[$d1],@x[$d1],8)",
+	  "&vprold	(@x[$d2],@x[$d2],8)",
+	   "&vprold	(@x[$d3],@x[$d3],8)",
+
+	"&vpaddd	(@x[$c0],@x[$c0],@x[$d0])",
+	 "&vpaddd	(@x[$c1],@x[$c1],@x[$d1])",
+	  "&vpaddd	(@x[$c2],@x[$c2],@x[$d2])",
+	   "&vpaddd	(@x[$c3],@x[$c3],@x[$d3])",
+	"&vpxord	(@x[$b0],@x[$b0],@x[$c0])",
+	 "&vpxord	(@x[$b1],@x[$b1],@x[$c1])",
+	  "&vpxord	(@x[$b2],@x[$b2],@x[$c2])",
+	   "&vpxord	(@x[$b3],@x[$b3],@x[$c3])",
+	"&vprold	(@x[$b0],@x[$b0],7)",
+	 "&vprold	(@x[$b1],@x[$b1],7)",
+	  "&vprold	(@x[$b2],@x[$b2],7)",
+	   "&vprold	(@x[$b3],@x[$b3],7)"
+	);
+}
+
+my $xframe = $win64 ? 0xa8 : 8;
+
+$code.=<<___;
+.type	ChaCha20_16x,\@function,5
+.align	32
+ChaCha20_16x:
+.LChaCha20_16x:
+	mov		%rsp,%r9		# frame register
+	sub		\$64+$xframe,%rsp
+	and		\$-64,%rsp
+___
+$code.=<<___	if ($win64);
+	movaps		%xmm6,-0xa8(%r9)
+	movaps		%xmm7,-0x98(%r9)
+	movaps		%xmm8,-0x88(%r9)
+	movaps		%xmm9,-0x78(%r9)
+	movaps		%xmm10,-0x68(%r9)
+	movaps		%xmm11,-0x58(%r9)
+	movaps		%xmm12,-0x48(%r9)
+	movaps		%xmm13,-0x38(%r9)
+	movaps		%xmm14,-0x28(%r9)
+	movaps		%xmm15,-0x18(%r9)
+.L16x_body:
+___
+$code.=<<___;
+	vzeroupper
+
+	lea		.Lsigma(%rip),%r10
+	vbroadcasti32x4	(%r10),$xa3		# key[0]
+	vbroadcasti32x4	($key),$xb3		# key[1]
+	vbroadcasti32x4	16($key),$xc3		# key[2]
+	vbroadcasti32x4	($counter),$xd3		# key[3]
+
+	vpshufd		\$0x00,$xa3,$xa0	# smash key by lanes...
+	vpshufd		\$0x55,$xa3,$xa1
+	vpshufd		\$0xaa,$xa3,$xa2
+	vpshufd		\$0xff,$xa3,$xa3
+	vmovdqa64	$xa0,@key[0]
+	vmovdqa64	$xa1,@key[1]
+	vmovdqa64	$xa2,@key[2]
+	vmovdqa64	$xa3,@key[3]
+
+	vpshufd		\$0x00,$xb3,$xb0
+	vpshufd		\$0x55,$xb3,$xb1
+	vpshufd		\$0xaa,$xb3,$xb2
+	vpshufd		\$0xff,$xb3,$xb3
+	vmovdqa64	$xb0,@key[4]
+	vmovdqa64	$xb1,@key[5]
+	vmovdqa64	$xb2,@key[6]
+	vmovdqa64	$xb3,@key[7]
+
+	vpshufd		\$0x00,$xc3,$xc0
+	vpshufd		\$0x55,$xc3,$xc1
+	vpshufd		\$0xaa,$xc3,$xc2
+	vpshufd		\$0xff,$xc3,$xc3
+	vmovdqa64	$xc0,@key[8]
+	vmovdqa64	$xc1,@key[9]
+	vmovdqa64	$xc2,@key[10]
+	vmovdqa64	$xc3,@key[11]
+
+	vpshufd		\$0x00,$xd3,$xd0
+	vpshufd		\$0x55,$xd3,$xd1
+	vpshufd		\$0xaa,$xd3,$xd2
+	vpshufd		\$0xff,$xd3,$xd3
+	vpaddd		.Lincz(%rip),$xd0,$xd0	# don't save counters yet
+	vmovdqa64	$xd0,@key[12]
+	vmovdqa64	$xd1,@key[13]
+	vmovdqa64	$xd2,@key[14]
+	vmovdqa64	$xd3,@key[15]
+
+	mov		\$10,%eax
+	jmp		.Loop16x
+
+.align	32
+.Loop_outer16x:
+	vpbroadcastd	0(%r10),$xa0		# reload key
+	vpbroadcastd	4(%r10),$xa1
+	vpbroadcastd	8(%r10),$xa2
+	vpbroadcastd	12(%r10),$xa3
+	vpaddd		.Lsixteen(%rip),@key[12],@key[12]	# next SIMD counters
+	vmovdqa64	@key[4],$xb0
+	vmovdqa64	@key[5],$xb1
+	vmovdqa64	@key[6],$xb2
+	vmovdqa64	@key[7],$xb3
+	vmovdqa64	@key[8],$xc0
+	vmovdqa64	@key[9],$xc1
+	vmovdqa64	@key[10],$xc2
+	vmovdqa64	@key[11],$xc3
+	vmovdqa64	@key[12],$xd0
+	vmovdqa64	@key[13],$xd1
+	vmovdqa64	@key[14],$xd2
+	vmovdqa64	@key[15],$xd3
+
+	vmovdqa64	$xa0,@key[0]
+	vmovdqa64	$xa1,@key[1]
+	vmovdqa64	$xa2,@key[2]
+	vmovdqa64	$xa3,@key[3]
+
+	mov		\$10,%eax
+	jmp		.Loop16x
+
+.align	32
+.Loop16x:
+___
+	foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; }
+	foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; }
+$code.=<<___;
+	dec		%eax
+	jnz		.Loop16x
+
+	vpaddd		@key[0],$xa0,$xa0	# accumulate key
+	vpaddd		@key[1],$xa1,$xa1
+	vpaddd		@key[2],$xa2,$xa2
+	vpaddd		@key[3],$xa3,$xa3
+
+	vpunpckldq	$xa1,$xa0,$xt2		# "de-interlace" data
+	vpunpckldq	$xa3,$xa2,$xt3
+	vpunpckhdq	$xa1,$xa0,$xa0
+	vpunpckhdq	$xa3,$xa2,$xa2
+	vpunpcklqdq	$xt3,$xt2,$xa1		# "a0"
+	vpunpckhqdq	$xt3,$xt2,$xt2		# "a1"
+	vpunpcklqdq	$xa2,$xa0,$xa3		# "a2"
+	vpunpckhqdq	$xa2,$xa0,$xa0		# "a3"
+___
+	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
+$code.=<<___;
+	vpaddd		@key[4],$xb0,$xb0
+	vpaddd		@key[5],$xb1,$xb1
+	vpaddd		@key[6],$xb2,$xb2
+	vpaddd		@key[7],$xb3,$xb3
+
+	vpunpckldq	$xb1,$xb0,$xt2
+	vpunpckldq	$xb3,$xb2,$xt3
+	vpunpckhdq	$xb1,$xb0,$xb0
+	vpunpckhdq	$xb3,$xb2,$xb2
+	vpunpcklqdq	$xt3,$xt2,$xb1		# "b0"
+	vpunpckhqdq	$xt3,$xt2,$xt2		# "b1"
+	vpunpcklqdq	$xb2,$xb0,$xb3		# "b2"
+	vpunpckhqdq	$xb2,$xb0,$xb0		# "b3"
+___
+	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
+$code.=<<___;
+	vshufi32x4	\$0x44,$xb0,$xa0,$xt3	# "de-interlace" further
+	vshufi32x4	\$0xee,$xb0,$xa0,$xb0
+	vshufi32x4	\$0x44,$xb1,$xa1,$xa0
+	vshufi32x4	\$0xee,$xb1,$xa1,$xb1
+	vshufi32x4	\$0x44,$xb2,$xa2,$xa1
+	vshufi32x4	\$0xee,$xb2,$xa2,$xb2
+	vshufi32x4	\$0x44,$xb3,$xa3,$xa2
+	vshufi32x4	\$0xee,$xb3,$xa3,$xb3
+___
+	($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3);
+$code.=<<___;
+	vpaddd		@key[8],$xc0,$xc0
+	vpaddd		@key[9],$xc1,$xc1
+	vpaddd		@key[10],$xc2,$xc2
+	vpaddd		@key[11],$xc3,$xc3
+
+	vpunpckldq	$xc1,$xc0,$xt2
+	vpunpckldq	$xc3,$xc2,$xt3
+	vpunpckhdq	$xc1,$xc0,$xc0
+	vpunpckhdq	$xc3,$xc2,$xc2
+	vpunpcklqdq	$xt3,$xt2,$xc1		# "c0"
+	vpunpckhqdq	$xt3,$xt2,$xt2		# "c1"
+	vpunpcklqdq	$xc2,$xc0,$xc3		# "c2"
+	vpunpckhqdq	$xc2,$xc0,$xc0		# "c3"
+___
+	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
+$code.=<<___;
+	vpaddd		@key[12],$xd0,$xd0
+	vpaddd		@key[13],$xd1,$xd1
+	vpaddd		@key[14],$xd2,$xd2
+	vpaddd		@key[15],$xd3,$xd3
+
+	vpunpckldq	$xd1,$xd0,$xt2
+	vpunpckldq	$xd3,$xd2,$xt3
+	vpunpckhdq	$xd1,$xd0,$xd0
+	vpunpckhdq	$xd3,$xd2,$xd2
+	vpunpcklqdq	$xt3,$xt2,$xd1		# "d0"
+	vpunpckhqdq	$xt3,$xt2,$xt2		# "d1"
+	vpunpcklqdq	$xd2,$xd0,$xd3		# "d2"
+	vpunpckhqdq	$xd2,$xd0,$xd0		# "d3"
+___
+	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
+$code.=<<___;
+	vshufi32x4	\$0x44,$xd0,$xc0,$xt3	# "de-interlace" further
+	vshufi32x4	\$0xee,$xd0,$xc0,$xd0
+	vshufi32x4	\$0x44,$xd1,$xc1,$xc0
+	vshufi32x4	\$0xee,$xd1,$xc1,$xd1
+	vshufi32x4	\$0x44,$xd2,$xc2,$xc1
+	vshufi32x4	\$0xee,$xd2,$xc2,$xd2
+	vshufi32x4	\$0x44,$xd3,$xc3,$xc2
+	vshufi32x4	\$0xee,$xd3,$xc3,$xd3
+___
+	($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3);
+$code.=<<___;
+	vshufi32x4	\$0x88,$xc0,$xa0,$xt0	# "de-interlace" further
+	vshufi32x4	\$0xdd,$xc0,$xa0,$xa0
+	 vshufi32x4	\$0x88,$xd0,$xb0,$xc0
+	 vshufi32x4	\$0xdd,$xd0,$xb0,$xd0
+	vshufi32x4	\$0x88,$xc1,$xa1,$xt1
+	vshufi32x4	\$0xdd,$xc1,$xa1,$xa1
+	 vshufi32x4	\$0x88,$xd1,$xb1,$xc1
+	 vshufi32x4	\$0xdd,$xd1,$xb1,$xd1
+	vshufi32x4	\$0x88,$xc2,$xa2,$xt2
+	vshufi32x4	\$0xdd,$xc2,$xa2,$xa2
+	 vshufi32x4	\$0x88,$xd2,$xb2,$xc2
+	 vshufi32x4	\$0xdd,$xd2,$xb2,$xd2
+	vshufi32x4	\$0x88,$xc3,$xa3,$xt3
+	vshufi32x4	\$0xdd,$xc3,$xa3,$xa3
+	 vshufi32x4	\$0x88,$xd3,$xb3,$xc3
+	 vshufi32x4	\$0xdd,$xd3,$xb3,$xd3
+___
+	($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)=
+	($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3);
+
+	($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1,
+	 $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) =
+	($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
+	 $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
+$code.=<<___;
+	cmp		\$64*16,$len
+	jb		.Ltail16x
+
+	vpxord		0x00($inp),$xa0,$xa0	# xor with input
+	vpxord		0x40($inp),$xb0,$xb0
+	vpxord		0x80($inp),$xc0,$xc0
+	vpxord		0xc0($inp),$xd0,$xd0
+	vmovdqu32	$xa0,0x00($out)
+	vmovdqu32	$xb0,0x40($out)
+	vmovdqu32	$xc0,0x80($out)
+	vmovdqu32	$xd0,0xc0($out)
+
+	vpxord		0x100($inp),$xa1,$xa1
+	vpxord		0x140($inp),$xb1,$xb1
+	vpxord		0x180($inp),$xc1,$xc1
+	vpxord		0x1c0($inp),$xd1,$xd1
+	vmovdqu32	$xa1,0x100($out)
+	vmovdqu32	$xb1,0x140($out)
+	vmovdqu32	$xc1,0x180($out)
+	vmovdqu32	$xd1,0x1c0($out)
+
+	vpxord		0x200($inp),$xa2,$xa2
+	vpxord		0x240($inp),$xb2,$xb2
+	vpxord		0x280($inp),$xc2,$xc2
+	vpxord		0x2c0($inp),$xd2,$xd2
+	vmovdqu32	$xa2,0x200($out)
+	vmovdqu32	$xb2,0x240($out)
+	vmovdqu32	$xc2,0x280($out)
+	vmovdqu32	$xd2,0x2c0($out)
+
+	vpxord		0x300($inp),$xa3,$xa3
+	vpxord		0x340($inp),$xb3,$xb3
+	vpxord		0x380($inp),$xc3,$xc3
+	vpxord		0x3c0($inp),$xd3,$xd3
+	lea		0x400($inp),$inp
+	vmovdqu32	$xa3,0x300($out)
+	vmovdqu32	$xb3,0x340($out)
+	vmovdqu32	$xc3,0x380($out)
+	vmovdqu32	$xd3,0x3c0($out)
+	lea		0x400($out),$out
+
+	sub		\$64*16,$len
+	jnz		.Loop_outer16x
+
+	jmp		.Ldone16x
+
+.align	32
+.Ltail16x:
+	xor		%r10,%r10
+	sub		$inp,$out
+	cmp		\$64*1,$len
+	jb		.Less_than_64_16x
+	vpxord		($inp),$xa0,$xa0	# xor with input
+	vmovdqu32	$xa0,($out,$inp)
+	je		.Ldone16x
+	vmovdqa32	$xb0,$xa0
+	lea		64($inp),$inp
+
+	cmp		\$64*2,$len
+	jb		.Less_than_64_16x
+	vpxord		($inp),$xb0,$xb0
+	vmovdqu32	$xb0,($out,$inp)
+	je		.Ldone16x
+	vmovdqa32	$xc0,$xa0
+	lea		64($inp),$inp
+
+	cmp		\$64*3,$len
+	jb		.Less_than_64_16x
+	vpxord		($inp),$xc0,$xc0
+	vmovdqu32	$xc0,($out,$inp)
+	je		.Ldone16x
+	vmovdqa32	$xd0,$xa0
+	lea		64($inp),$inp
+
+	cmp		\$64*4,$len
+	jb		.Less_than_64_16x
+	vpxord		($inp),$xd0,$xd0
+	vmovdqu32	$xd0,($out,$inp)
+	je		.Ldone16x
+	vmovdqa32	$xa1,$xa0
+	lea		64($inp),$inp
+
+	cmp		\$64*5,$len
+	jb		.Less_than_64_16x
+	vpxord		($inp),$xa1,$xa1
+	vmovdqu32	$xa1,($out,$inp)
+	je		.Ldone16x
+	vmovdqa32	$xb1,$xa0
+	lea		64($inp),$inp
+
+	cmp		\$64*6,$len
+	jb		.Less_than_64_16x
+	vpxord		($inp),$xb1,$xb1
+	vmovdqu32	$xb1,($out,$inp)
+	je		.Ldone16x
+	vmovdqa32	$xc1,$xa0
+	lea		64($inp),$inp
+
+	cmp		\$64*7,$len
+	jb		.Less_than_64_16x
+	vpxord		($inp),$xc1,$xc1
+	vmovdqu32	$xc1,($out,$inp)
+	je		.Ldone16x
+	vmovdqa32	$xd1,$xa0
+	lea		64($inp),$inp
+
+	cmp		\$64*8,$len
+	jb		.Less_than_64_16x
+	vpxord		($inp),$xd1,$xd1
+	vmovdqu32	$xd1,($out,$inp)
+	je		.Ldone16x
+	vmovdqa32	$xa2,$xa0
+	lea		64($inp),$inp
+
+	cmp		\$64*9,$len
+	jb		.Less_than_64_16x
+	vpxord		($inp),$xa2,$xa2
+	vmovdqu32	$xa2,($out,$inp)
+	je		.Ldone16x
+	vmovdqa32	$xb2,$xa0
+	lea		64($inp),$inp
+
+	cmp		\$64*10,$len
+	jb		.Less_than_64_16x
+	vpxord		($inp),$xb2,$xb2
+	vmovdqu32	$xb2,($out,$inp)
+	je		.Ldone16x
+	vmovdqa32	$xc2,$xa0
+	lea		64($inp),$inp
+
+	cmp		\$64*11,$len
+	jb		.Less_than_64_16x
+	vpxord		($inp),$xc2,$xc2
+	vmovdqu32	$xc2,($out,$inp)
+	je		.Ldone16x
+	vmovdqa32	$xd2,$xa0
+	lea		64($inp),$inp
+
+	cmp		\$64*12,$len
+	jb		.Less_than_64_16x
+	vpxord		($inp),$xd2,$xd2
+	vmovdqu32	$xd2,($out,$inp)
+	je		.Ldone16x
+	vmovdqa32	$xa3,$xa0
+	lea		64($inp),$inp
+
+	cmp		\$64*13,$len
+	jb		.Less_than_64_16x
+	vpxord		($inp),$xa3,$xa3
+	vmovdqu32	$xa3,($out,$inp)
+	je		.Ldone16x
+	vmovdqa32	$xb3,$xa0
+	lea		64($inp),$inp
+
+	cmp		\$64*14,$len
+	jb		.Less_than_64_16x
+	vpxord		($inp),$xb3,$xb3
+	vmovdqu32	$xb3,($out,$inp)
+	je		.Ldone16x
+	vmovdqa32	$xc3,$xa0
+	lea		64($inp),$inp
+
+	cmp		\$64*15,$len
+	jb		.Less_than_64_16x
+	vpxord		($inp),$xc3,$xc3
+	vmovdqu32	$xc3,($out,$inp)
+	je		.Ldone16x
+	vmovdqa32	$xd3,$xa0
+	lea		64($inp),$inp
+
+.Less_than_64_16x:
+	vmovdqa32	$xa0,0x00(%rsp)
+	lea		($out,$inp),$out
+	and		\$63,$len
+
+.Loop_tail16x:
+	movzb		($inp,%r10),%eax
+	movzb		(%rsp,%r10),%ecx
+	lea		1(%r10),%r10
+	xor		%ecx,%eax
+	mov		%al,-1($out,%r10)
+	dec		$len
+	jnz		.Loop_tail16x
+
+	vpxord		$xa0,$xa0,$xa0
+	vmovdqa32	$xa0,0(%rsp)
+
+.Ldone16x:
+	vzeroall
+___
+$code.=<<___	if ($win64);
+	movaps		-0xa8(%r9),%xmm6
+	movaps		-0x98(%r9),%xmm7
+	movaps		-0x88(%r9),%xmm8
+	movaps		-0x78(%r9),%xmm9
+	movaps		-0x68(%r9),%xmm10
+	movaps		-0x58(%r9),%xmm11
+	movaps		-0x48(%r9),%xmm12
+	movaps		-0x38(%r9),%xmm13
+	movaps		-0x28(%r9),%xmm14
+	movaps		-0x18(%r9),%xmm15
+___
+$code.=<<___;
+	lea		(%r9),%rsp
+.L16x_epilogue:
+	ret
+.size	ChaCha20_16x,.-ChaCha20_16x
+___
+}
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	lea	.Lctr32_body(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lctr32_body
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lno_data(%rip),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=.Lno_data
+	jae	.Lcommon_seh_tail
+
+	lea	64+24+48(%rax),%rax
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.type	ssse3_handler,\@abi-omnipotent
+.align	16
+ssse3_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	192($context),%rax	# pull context->R9
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	lea	-0x28(%rax),%rsi
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$4,%ecx
+	.long	0xa548f3fc		# cld; rep movsq
+
+	jmp	.Lcommon_seh_tail
+.size	ssse3_handler,.-ssse3_handler
+
+.type	full_handler,\@abi-omnipotent
+.align	16
+full_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	192($context),%rax	# pull context->R9
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	lea	-0xa8(%rax),%rsi
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$20,%ecx
+	.long	0xa548f3fc		# cld; rep movsq
+
+	jmp	.Lcommon_seh_tail
+.size	full_handler,.-full_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_ChaCha20_ctr32
+	.rva	.LSEH_end_ChaCha20_ctr32
+	.rva	.LSEH_info_ChaCha20_ctr32
+
+	.rva	.LSEH_begin_ChaCha20_ssse3
+	.rva	.LSEH_end_ChaCha20_ssse3
+	.rva	.LSEH_info_ChaCha20_ssse3
+
+	.rva	.LSEH_begin_ChaCha20_4x
+	.rva	.LSEH_end_ChaCha20_4x
+	.rva	.LSEH_info_ChaCha20_4x
+___
+$code.=<<___ if ($avx>1);
+	.rva	.LSEH_begin_ChaCha20_8x
+	.rva	.LSEH_end_ChaCha20_8x
+	.rva	.LSEH_info_ChaCha20_8x
+___
+$code.=<<___ if ($avx>2);
+	.rva	.LSEH_begin_ChaCha20_avx512
+	.rva	.LSEH_end_ChaCha20_avx512
+	.rva	.LSEH_info_ChaCha20_avx512
+
+	.rva	.LSEH_begin_ChaCha20_16x
+	.rva	.LSEH_end_ChaCha20_16x
+	.rva	.LSEH_info_ChaCha20_16x
+___
+$code.=<<___;
+.section	.xdata
+.align	8
+.LSEH_info_ChaCha20_ctr32:
+	.byte	9,0,0,0
+	.rva	se_handler
+
+.LSEH_info_ChaCha20_ssse3:
+	.byte	9,0,0,0
+	.rva	ssse3_handler
+	.rva	.Lssse3_body,.Lssse3_epilogue
+
+.LSEH_info_ChaCha20_4x:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.L4x_body,.L4x_epilogue
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_ChaCha20_8x:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.L8x_body,.L8x_epilogue			# HandlerData[]
+___
+$code.=<<___ if ($avx>2);
+.LSEH_info_ChaCha20_avx512:
+	.byte	9,0,0,0
+	.rva	ssse3_handler
+	.rva	.Lavx512_body,.Lavx512_epilogue		# HandlerData[]
+
+.LSEH_info_ChaCha20_16x:
+	.byte	9,0,0,0
+	.rva	full_handler
+	.rva	.L16x_body,.L16x_epilogue		# HandlerData[]
+___
+}
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+
+	s/%x#%[yz]/%x/g;	# "down-shift"
 
 	print $_,"\n";
 }
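
Editor's note: AVX512ROUND above is simply the scalar ChaCha quarter-round
(RFC 7539) replicated across 512-bit lanes, with vprold performing each rotate
in a single instruction. For reference, a plain C++ rendering of one
quarter-round (the standard public definition, not code from this diff):

    #include <stdint.h>

    #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

    // One ChaCha quarter-round; the AVX512 path runs 16 of these at
    // once, one per 32-bit lane of each zmm register.
    static void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c,
                              uint32_t *d) {
      *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
      *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
      *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
      *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
    }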
diff --git a/src/crypto/chacha/chacha_test.cc b/src/crypto/chacha/chacha_test.cc
index 6bfb03e..a40653f 100644
--- a/src/crypto/chacha/chacha_test.cc
+++ b/src/crypto/chacha/chacha_test.cc
@@ -18,10 +18,13 @@
 
 #include <memory>
 
+#include <gtest/gtest.h>
+
 #include <openssl/crypto.h>
 #include <openssl/chacha.h>
 
 #include "../internal.h"
+#include "../test/test_util.h"
 
 
 static const uint8_t kKey[32] = {
@@ -216,35 +219,18 @@
 static_assert(sizeof(kInput) == sizeof(kOutput),
               "Input and output lengths don't match.");
 
-static bool TestChaCha20(size_t len) {
-  std::unique_ptr<uint8_t[]> buf(new uint8_t[len]);
-  CRYPTO_chacha_20(buf.get(), kInput, len, kKey, kNonce, kCounter);
-  if (OPENSSL_memcmp(buf.get(), kOutput, len) != 0) {
-    fprintf(stderr, "Mismatch at length %zu.\n", len);
-    return false;
-  }
-
-  // Test in-place.
-  OPENSSL_memcpy(buf.get(), kInput, len);
-  CRYPTO_chacha_20(buf.get(), buf.get(), len, kKey, kNonce, kCounter);
-  if (OPENSSL_memcmp(buf.get(), kOutput, len) != 0) {
-    fprintf(stderr, "Mismatch at length %zu, in-place.\n", len);
-    return false;
-  }
-
-  return true;
-}
-
-int main(int argc, char **argv) {
-  CRYPTO_library_init();
-
+TEST(ChaChaTest, TestVector) {
   // Run the test with the test vector at all lengths.
   for (size_t len = 0; len <= sizeof(kInput); len++) {
-    if (!TestChaCha20(len)) {
-      return 1;
-    }
-  }
+    SCOPED_TRACE(len);
 
-  printf("PASS\n");
-  return 0;
+    std::unique_ptr<uint8_t[]> buf(new uint8_t[len]);
+    CRYPTO_chacha_20(buf.get(), kInput, len, kKey, kNonce, kCounter);
+    EXPECT_EQ(Bytes(kOutput, len), Bytes(buf.get(), len));
+
+    // Test the in-place version.
+    OPENSSL_memcpy(buf.get(), kInput, len);
+    CRYPTO_chacha_20(buf.get(), buf.get(), len, kKey, kNonce, kCounter);
+    EXPECT_EQ(Bytes(kOutput, len), Bytes(buf.get(), len));
+  }
 }
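
Editor's note: the converted test relies on the Bytes adapter from
../test/test_util.h so that EXPECT_EQ can compare and pretty-print raw buffers.
A minimal stand-in showing the shape of such an adapter (hypothetical ByteView
class; the real helper lives in test_util.h):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <ostream>

    // A (pointer, length) view with equality and hex printing, so gtest
    // can report a readable diff when two byte buffers differ.
    struct ByteView {
      const uint8_t *data;
      size_t len;
    };

    inline bool operator==(const ByteView &a, const ByteView &b) {
      return a.len == b.len && std::memcmp(a.data, b.data, a.len) == 0;
    }

    inline std::ostream &operator<<(std::ostream &os, const ByteView &b) {
      char hex[3];
      for (size_t i = 0; i < b.len; i++) {
        std::snprintf(hex, sizeof(hex), "%02x", b.data[i]);
        os << hex;
      }
      return os;
    }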
diff --git a/src/crypto/cipher/cipher.c b/src/crypto/cipher/cipher.c
index ae045ae..e46e43e 100644
--- a/src/crypto/cipher/cipher.c
+++ b/src/crypto/cipher/cipher.c
@@ -132,6 +132,7 @@
   if (in->cipher_data && in->cipher->ctx_size) {
     out->cipher_data = OPENSSL_malloc(in->cipher->ctx_size);
     if (!out->cipher_data) {
+      out->cipher = NULL;
       OPENSSL_PUT_ERROR(CIPHER, ERR_R_MALLOC_FAILURE);
       return 0;
     }
@@ -139,7 +140,10 @@
   }
 
   if (in->cipher->flags & EVP_CIPH_CUSTOM_COPY) {
-    return in->cipher->ctrl((EVP_CIPHER_CTX *)in, EVP_CTRL_COPY, 0, out);
+    if (!in->cipher->ctrl((EVP_CIPHER_CTX *)in, EVP_CTRL_COPY, 0, out)) {
+      out->cipher = NULL;
+      return 0;
+    }
   }
 
   return 1;
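
Editor's note: the cipher.c change ensures a failed EVP_CIPHER_CTX_copy cannot
leave |out| looking initialized: both the malloc failure and the EVP_CTRL_COPY
failure path now clear out->cipher before returning 0. A hedged caller-side
sketch of the pattern this enables (clone_ctx is illustrative, not a BoringSSL
function; the EVP_CIPHER_CTX_* calls are the real API):

    #include <openssl/cipher.h>

    // Returns 1 and leaves |out| usable on success; on failure |out| is
    // cleaned up, relying on the failed copy leaving out->cipher NULL.
    static int clone_ctx(EVP_CIPHER_CTX *out, const EVP_CIPHER_CTX *in) {
      EVP_CIPHER_CTX_init(out);
      if (!EVP_CIPHER_CTX_copy(out, in)) {
        EVP_CIPHER_CTX_cleanup(out);  // safe: no stale cipher pointer
        return 0;
      }
      return 1;
    }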
diff --git a/src/crypto/ec/CMakeLists.txt b/src/crypto/ec/CMakeLists.txt
index a54075c..75dccec 100644
--- a/src/crypto/ec/CMakeLists.txt
+++ b/src/crypto/ec/CMakeLists.txt
@@ -39,14 +39,6 @@
 )
 
 add_executable(
-  ec_test
-
-  ec_test.cc
-
-  $<TARGET_OBJECTS:test_support>
-)
-
-add_executable(
   p256-x86_64_test
 
   p256-x86_64_test.cc
@@ -55,6 +47,5 @@
 )
 
 target_link_libraries(example_mul crypto)
-target_link_libraries(ec_test crypto)
 target_link_libraries(p256-x86_64_test crypto)
-add_dependencies(all_tests example_mul ec_test p256-x86_64_test)
+add_dependencies(all_tests example_mul p256-x86_64_test)
diff --git a/src/crypto/ec/asm/p256-x86_64-asm.pl b/src/crypto/ec/asm/p256-x86_64-asm.pl
index 3cd7b01..517c506 100755
--- a/src/crypto/ec/asm/p256-x86_64-asm.pl
+++ b/src/crypto/ec/asm/p256-x86_64-asm.pl
@@ -289,7 +289,7 @@
 	adc	\$0, $acc0
 
 	########################################################################
-	# Second reduction step	
+	# Second reduction step
 	mov	$acc1, $t1
 	shl	\$32, $acc1
 	mulq	$poly3
@@ -336,7 +336,7 @@
 	adc	\$0, $acc1
 
 	########################################################################
-	# Third reduction step	
+	# Third reduction step
 	mov	$acc2, $t1
 	shl	\$32, $acc2
 	mulq	$poly3
@@ -383,7 +383,7 @@
 	adc	\$0, $acc2
 
 	########################################################################
-	# Final reduction step	
+	# Final reduction step
 	mov	$acc3, $t1
 	shl	\$32, $acc3
 	mulq	$poly3
@@ -396,7 +396,7 @@
 	 mov	$acc5, $t1
 	adc	\$0, $acc2
 
-	########################################################################	
+	########################################################################
 	# Branch-less conditional subtraction of P
 	sub	\$-1, $acc4		# .Lpoly[0]
 	 mov	$acc0, $t2
@@ -1649,7 +1649,7 @@
 	movq	%xmm1, $r_ptr
 	call	__ecp_nistz256_sqr_mont$x	# p256_sqr_mont(res_y, S);
 ___
-{	
+{
 ######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
 # operate in 4-5-6-7 "name space" that matches squaring output
 #
@@ -1738,7 +1738,7 @@
 	lea	$M(%rsp), $b_ptr
 	mov	$acc4, $acc6			# harmonize sub output and mul input
 	xor	%ecx, %ecx
-	mov	$acc4, $S+8*0(%rsp)		# have to save:-(	
+	mov	$acc4, $S+8*0(%rsp)		# have to save:-(
 	mov	$acc5, $acc2
 	mov	$acc5, $S+8*1(%rsp)
 	cmovz	$acc0, $acc3
diff --git a/src/crypto/ec/ec_test.cc b/src/crypto/ec/ec_test.cc
index 31619b1..02b9ef2 100644
--- a/src/crypto/ec/ec_test.cc
+++ b/src/crypto/ec/ec_test.cc
@@ -17,6 +17,8 @@
 
 #include <vector>
 
+#include <gtest/gtest.h>
+
 #include <openssl/bn.h>
 #include <openssl/bytestring.h>
 #include <openssl/crypto.h>
@@ -24,6 +26,9 @@
 #include <openssl/err.h>
 #include <openssl/mem.h>
 #include <openssl/nid.h>
+#include <openssl/obj.h>
+
+#include "../test/test_util.h"
 
 
 // kECKeyWithoutPublic is an ECPrivateKey with the optional publicKey field
@@ -123,201 +128,75 @@
   return true;
 }
 
-static bool Testd2i_ECPrivateKey() {
-  bssl::UniquePtr<EC_KEY> key = DecodeECPrivateKey(kECKeyWithoutPublic,
-                                        sizeof(kECKeyWithoutPublic));
-  if (!key) {
-    fprintf(stderr, "Failed to parse private key.\n");
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
+TEST(ECTest, Encoding) {
+  bssl::UniquePtr<EC_KEY> key =
+      DecodeECPrivateKey(kECKeyWithoutPublic, sizeof(kECKeyWithoutPublic));
+  ASSERT_TRUE(key);
 
+  // Test that the encoding round-trips.
   std::vector<uint8_t> out;
-  if (!EncodeECPrivateKey(&out, key.get())) {
-    fprintf(stderr, "Failed to serialize private key.\n");
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
-
-  if (std::vector<uint8_t>(kECKeyWithoutPublic,
-                           kECKeyWithoutPublic + sizeof(kECKeyWithoutPublic)) !=
-      out) {
-    fprintf(stderr, "Serialisation of key doesn't match original.\n");
-    return false;
-  }
+  ASSERT_TRUE(EncodeECPrivateKey(&out, key.get()));
+  EXPECT_EQ(Bytes(kECKeyWithoutPublic), Bytes(out.data(), out.size()));
 
   const EC_POINT *pub_key = EC_KEY_get0_public_key(key.get());
-  if (pub_key == NULL) {
-    fprintf(stderr, "Public key missing.\n");
-    return false;
-  }
+  ASSERT_TRUE(pub_key) << "Public key missing";
 
   bssl::UniquePtr<BIGNUM> x(BN_new());
   bssl::UniquePtr<BIGNUM> y(BN_new());
-  if (!x || !y) {
-    return false;
-  }
-  if (!EC_POINT_get_affine_coordinates_GFp(EC_KEY_get0_group(key.get()),
-                                           pub_key, x.get(), y.get(), NULL)) {
-    fprintf(stderr, "Failed to get public key in affine coordinates.\n");
-    return false;
-  }
+  ASSERT_TRUE(x);
+  ASSERT_TRUE(y);
+  ASSERT_TRUE(EC_POINT_get_affine_coordinates_GFp(
+      EC_KEY_get0_group(key.get()), pub_key, x.get(), y.get(), NULL));
   bssl::UniquePtr<char> x_hex(BN_bn2hex(x.get()));
   bssl::UniquePtr<char> y_hex(BN_bn2hex(y.get()));
-  if (!x_hex || !y_hex) {
-    return false;
-  }
-  if (0 != strcmp(
-          x_hex.get(),
-          "c81561ecf2e54edefe6617db1c7a34a70744ddb261f269b83dacfcd2ade5a681") ||
-      0 != strcmp(
-          y_hex.get(),
-          "e0e2afa3f9b6abe4c698ef6495f1be49a3196c5056acb3763fe4507eec596e88")) {
-    fprintf(stderr, "Incorrect public key: %s %s\n", x_hex.get(), y_hex.get());
-    return false;
-  }
+  ASSERT_TRUE(x_hex);
+  ASSERT_TRUE(y_hex);
 
-  return true;
+  EXPECT_STREQ(
+      "c81561ecf2e54edefe6617db1c7a34a70744ddb261f269b83dacfcd2ade5a681",
+      x_hex.get());
+  EXPECT_STREQ(
+      "e0e2afa3f9b6abe4c698ef6495f1be49a3196c5056acb3763fe4507eec596e88",
+      y_hex.get());
 }
 
-static bool TestZeroPadding() {
+TEST(ECTest, ZeroPadding) {
   // Check that the correct encoding round-trips.
-  bssl::UniquePtr<EC_KEY> key = DecodeECPrivateKey(kECKeyWithZeros,
-                                        sizeof(kECKeyWithZeros));
+  bssl::UniquePtr<EC_KEY> key =
+      DecodeECPrivateKey(kECKeyWithZeros, sizeof(kECKeyWithZeros));
+  ASSERT_TRUE(key);
   std::vector<uint8_t> out;
-  if (!key || !EncodeECPrivateKey(&out, key.get())) {
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
-
-  if (std::vector<uint8_t>(kECKeyWithZeros,
-                           kECKeyWithZeros + sizeof(kECKeyWithZeros)) != out) {
-    fprintf(stderr, "Serialisation of key was incorrect.\n");
-    return false;
-  }
+  EXPECT_TRUE(EncodeECPrivateKey(&out, key.get()));
+  EXPECT_EQ(Bytes(kECKeyWithZeros), Bytes(out.data(), out.size()));
 
   // Keys without leading zeros also parse, but they re-encode
   // correctly, with the leading zeros restored.
   key = DecodeECPrivateKey(kECKeyMissingZeros, sizeof(kECKeyMissingZeros));
-  if (!key || !EncodeECPrivateKey(&out, key.get())) {
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
-
-  if (std::vector<uint8_t>(kECKeyWithZeros,
-                           kECKeyWithZeros + sizeof(kECKeyWithZeros)) != out) {
-    fprintf(stderr, "Serialisation of key was incorrect.\n");
-    return false;
-  }
-
-  return true;
+  ASSERT_TRUE(key);
+  EXPECT_TRUE(EncodeECPrivateKey(&out, key.get()));
+  EXPECT_EQ(Bytes(kECKeyWithZeros), Bytes(out.data(), out.size()));
 }
 
-static bool TestSpecifiedCurve() {
+TEST(ECTest, SpecifiedCurve) {
   // Test keys with specified curves may be decoded.
   bssl::UniquePtr<EC_KEY> key =
       DecodeECPrivateKey(kECKeySpecifiedCurve, sizeof(kECKeySpecifiedCurve));
-  if (!key) {
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
+  ASSERT_TRUE(key);
 
   // The group should have been interpreted as P-256.
-  if (EC_GROUP_get_curve_name(EC_KEY_get0_group(key.get())) !=
-      NID_X9_62_prime256v1) {
-    fprintf(stderr, "Curve name incorrect.\n");
-    return false;
-  }
+  EXPECT_EQ(NID_X9_62_prime256v1,
+            EC_GROUP_get_curve_name(EC_KEY_get0_group(key.get())));
 
   // Encoding the key should still use named form.
   std::vector<uint8_t> out;
-  if (!EncodeECPrivateKey(&out, key.get())) {
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
-  if (std::vector<uint8_t>(kECKeyWithoutPublic,
-                           kECKeyWithoutPublic + sizeof(kECKeyWithoutPublic)) !=
-      out) {
-    fprintf(stderr, "Serialisation of key was incorrect.\n");
-    return false;
-  }
-
-  return true;
+  EXPECT_TRUE(EncodeECPrivateKey(&out, key.get()));
+  EXPECT_EQ(Bytes(kECKeyWithoutPublic), Bytes(out.data(), out.size()));
 }
 
-static bool TestSetAffine(const int nid) {
-  bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(nid));
-  if (!key) {
-    return false;
-  }
-
-  const EC_GROUP *const group = EC_KEY_get0_group(key.get());
-
-  if (!EC_KEY_generate_key(key.get())) {
-    fprintf(stderr, "EC_KEY_generate_key failed with nid %d\n", nid);
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
-
-  if (!EC_POINT_is_on_curve(group, EC_KEY_get0_public_key(key.get()),
-                            nullptr)) {
-    fprintf(stderr, "generated point is not on curve with nid %d", nid);
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
-
-  bssl::UniquePtr<BIGNUM> x(BN_new());
-  bssl::UniquePtr<BIGNUM> y(BN_new());
-  if (!EC_POINT_get_affine_coordinates_GFp(group,
-                                           EC_KEY_get0_public_key(key.get()),
-                                           x.get(), y.get(), nullptr)) {
-    fprintf(stderr, "EC_POINT_get_affine_coordinates_GFp failed with nid %d\n",
-            nid);
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
-
-  auto point = bssl::UniquePtr<EC_POINT>(EC_POINT_new(group));
-  if (!point) {
-    return false;
-  }
-
-  if (!EC_POINT_set_affine_coordinates_GFp(group, point.get(), x.get(), y.get(),
-                                           nullptr)) {
-    fprintf(stderr, "EC_POINT_set_affine_coordinates_GFp failed with nid %d\n",
-            nid);
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
-
-  // Subtract one from |y| to make the point no longer on the curve.
-  if (!BN_sub(y.get(), y.get(), BN_value_one())) {
-    return false;
-  }
-
-  bssl::UniquePtr<EC_POINT> invalid_point(EC_POINT_new(group));
-  if (!invalid_point) {
-    return false;
-  }
-
-  if (EC_POINT_set_affine_coordinates_GFp(group, invalid_point.get(), x.get(),
-                                          y.get(), nullptr)) {
-    fprintf(stderr,
-            "EC_POINT_set_affine_coordinates_GFp succeeded with invalid "
-            "coordinates with nid %d\n",
-            nid);
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
-
-  return true;
-}
-
-static bool TestArbitraryCurve() {
+TEST(ECTest, ArbitraryCurve) {
   // Make a P-256 key and extract the affine coordinates.
   bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(NID_X9_62_prime256v1));
-  if (!key || !EC_KEY_generate_key(key.get())) {
-    return false;
-  }
+  ASSERT_TRUE(key);
+  ASSERT_TRUE(EC_KEY_generate_key(key.get()));
 
   // Make an arbitrary curve which is identical to P-256.
   static const uint8_t kP[] = {
@@ -351,186 +230,161 @@
       0x9e, 0x84, 0xf3, 0xb9, 0xca, 0xc2, 0xfc, 0x63, 0x25, 0x51,
   };
   bssl::UniquePtr<BN_CTX> ctx(BN_CTX_new());
+  ASSERT_TRUE(ctx);
   bssl::UniquePtr<BIGNUM> p(BN_bin2bn(kP, sizeof(kP), nullptr));
+  ASSERT_TRUE(p);
   bssl::UniquePtr<BIGNUM> a(BN_bin2bn(kA, sizeof(kA), nullptr));
+  ASSERT_TRUE(a);
   bssl::UniquePtr<BIGNUM> b(BN_bin2bn(kB, sizeof(kB), nullptr));
+  ASSERT_TRUE(b);
   bssl::UniquePtr<BIGNUM> gx(BN_bin2bn(kX, sizeof(kX), nullptr));
+  ASSERT_TRUE(gx);
   bssl::UniquePtr<BIGNUM> gy(BN_bin2bn(kY, sizeof(kY), nullptr));
+  ASSERT_TRUE(gy);
   bssl::UniquePtr<BIGNUM> order(BN_bin2bn(kOrder, sizeof(kOrder), nullptr));
-  bssl::UniquePtr<BIGNUM> cofactor(BN_new());
-  if (!ctx || !p || !a || !b || !gx || !gy || !order || !cofactor ||
-      !BN_set_word(cofactor.get(), 1)) {
-    return false;
-  }
+  ASSERT_TRUE(order);
 
   bssl::UniquePtr<EC_GROUP> group(
       EC_GROUP_new_curve_GFp(p.get(), a.get(), b.get(), ctx.get()));
-  if (!group) {
-    return false;
-  }
+  ASSERT_TRUE(group);
   bssl::UniquePtr<EC_POINT> generator(EC_POINT_new(group.get()));
-  if (!generator ||
-      !EC_POINT_set_affine_coordinates_GFp(group.get(), generator.get(),
-                                           gx.get(), gy.get(), ctx.get()) ||
-      !EC_GROUP_set_generator(group.get(), generator.get(), order.get(),
-                              cofactor.get())) {
-    return false;
-  }
+  ASSERT_TRUE(generator);
+  ASSERT_TRUE(EC_POINT_set_affine_coordinates_GFp(
+      group.get(), generator.get(), gx.get(), gy.get(), ctx.get()));
+  ASSERT_TRUE(EC_GROUP_set_generator(group.get(), generator.get(), order.get(),
+                                     BN_value_one()));
 
   // |group| should not have a curve name.
-  if (EC_GROUP_get_curve_name(group.get()) != NID_undef) {
-    return false;
-  }
+  EXPECT_EQ(NID_undef, EC_GROUP_get_curve_name(group.get()));
 
   // Copy |key| to |key2| using |group|.
   bssl::UniquePtr<EC_KEY> key2(EC_KEY_new());
+  ASSERT_TRUE(key2);
   bssl::UniquePtr<EC_POINT> point(EC_POINT_new(group.get()));
+  ASSERT_TRUE(point);
   bssl::UniquePtr<BIGNUM> x(BN_new()), y(BN_new());
-  if (!key2 || !point || !x || !y ||
-      !EC_KEY_set_group(key2.get(), group.get()) ||
-      !EC_KEY_set_private_key(key2.get(), EC_KEY_get0_private_key(key.get())) ||
-      !EC_POINT_get_affine_coordinates_GFp(EC_KEY_get0_group(key.get()),
-                                           EC_KEY_get0_public_key(key.get()),
-                                           x.get(), y.get(), nullptr) ||
-      !EC_POINT_set_affine_coordinates_GFp(group.get(), point.get(), x.get(),
-                                           y.get(), nullptr) ||
-      !EC_KEY_set_public_key(key2.get(), point.get())) {
-    fprintf(stderr, "Could not copy key.\n");
-    return false;
-  }
+  ASSERT_TRUE(x);
+  ASSERT_TRUE(EC_KEY_set_group(key2.get(), group.get()));
+  ASSERT_TRUE(
+      EC_KEY_set_private_key(key2.get(), EC_KEY_get0_private_key(key.get())));
+  ASSERT_TRUE(EC_POINT_get_affine_coordinates_GFp(
+      EC_KEY_get0_group(key.get()), EC_KEY_get0_public_key(key.get()), x.get(),
+      y.get(), nullptr));
+  ASSERT_TRUE(EC_POINT_set_affine_coordinates_GFp(group.get(), point.get(),
+                                                  x.get(), y.get(), nullptr));
+  ASSERT_TRUE(EC_KEY_set_public_key(key2.get(), point.get()));
 
   // The key must be valid according to the new group too.
-  if (!EC_KEY_check_key(key2.get())) {
-    fprintf(stderr, "Copied key is not valid.\n");
-    return false;
-  }
-
-  return true;
+  EXPECT_TRUE(EC_KEY_check_key(key2.get()));
 }
 
-static bool TestAddingEqualPoints(int nid) {
-  bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(nid));
-  if (!key) {
-    return false;
-  }
+class ECCurveTest : public testing::TestWithParam<EC_builtin_curve> {};
+
+TEST_P(ECCurveTest, SetAffine) {
+  // Generate an EC_KEY.
+  bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(GetParam().nid));
+  ASSERT_TRUE(key);
+  ASSERT_TRUE(EC_KEY_generate_key(key.get()));
+
+  const EC_GROUP *const group = EC_KEY_get0_group(key.get());
+  EXPECT_TRUE(
+      EC_POINT_is_on_curve(group, EC_KEY_get0_public_key(key.get()), nullptr));
+
+  // Get the public key's coordinates.
+  bssl::UniquePtr<BIGNUM> x(BN_new());
+  ASSERT_TRUE(x);
+  bssl::UniquePtr<BIGNUM> y(BN_new());
+  ASSERT_TRUE(y);
+  EXPECT_TRUE(EC_POINT_get_affine_coordinates_GFp(
+      group, EC_KEY_get0_public_key(key.get()), x.get(), y.get(), nullptr));
+
+  // Points on the curve should be accepted.
+  auto point = bssl::UniquePtr<EC_POINT>(EC_POINT_new(group));
+  ASSERT_TRUE(point);
+  EXPECT_TRUE(EC_POINT_set_affine_coordinates_GFp(group, point.get(), x.get(),
+                                                  y.get(), nullptr));
+
+  // Subtract one from |y| to make the point no longer on the curve.
+  EXPECT_TRUE(BN_sub(y.get(), y.get(), BN_value_one()));
+
+  // Points not on the curve should be rejected.
+  bssl::UniquePtr<EC_POINT> invalid_point(EC_POINT_new(group));
+  ASSERT_TRUE(invalid_point);
+  EXPECT_FALSE(EC_POINT_set_affine_coordinates_GFp(group, invalid_point.get(),
+                                                   x.get(), y.get(), nullptr));
+}
+
+TEST_P(ECCurveTest, AddingEqualPoints) {
+  bssl::UniquePtr<EC_KEY> key(EC_KEY_new_by_curve_name(GetParam().nid));
+  ASSERT_TRUE(key);
+  ASSERT_TRUE(EC_KEY_generate_key(key.get()));
 
   const EC_GROUP *const group = EC_KEY_get0_group(key.get());
 
-  if (!EC_KEY_generate_key(key.get())) {
-    fprintf(stderr, "EC_KEY_generate_key failed with nid %d\n", nid);
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
-
   bssl::UniquePtr<EC_POINT> p1(EC_POINT_new(group));
+  ASSERT_TRUE(p1);
+  ASSERT_TRUE(EC_POINT_copy(p1.get(), EC_KEY_get0_public_key(key.get())));
+
   bssl::UniquePtr<EC_POINT> p2(EC_POINT_new(group));
+  ASSERT_TRUE(p2);
+  ASSERT_TRUE(EC_POINT_copy(p2.get(), EC_KEY_get0_public_key(key.get())));
+
   bssl::UniquePtr<EC_POINT> double_p1(EC_POINT_new(group));
-  bssl::UniquePtr<EC_POINT> p1_plus_p2(EC_POINT_new(group));
-  if (!p1 || !p2 || !double_p1 || !p1_plus_p2) {
-    return false;
-  }
-
-  if (!EC_POINT_copy(p1.get(), EC_KEY_get0_public_key(key.get())) ||
-      !EC_POINT_copy(p2.get(), EC_KEY_get0_public_key(key.get()))) {
-    fprintf(stderr, "EC_POINT_COPY failed with nid %d\n", nid);
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
-
+  ASSERT_TRUE(double_p1);
   bssl::UniquePtr<BN_CTX> ctx(BN_CTX_new());
-  if (!ctx) {
-    return false;
-  }
+  ASSERT_TRUE(ctx);
+  ASSERT_TRUE(EC_POINT_dbl(group, double_p1.get(), p1.get(), ctx.get()));
 
-  if (!EC_POINT_dbl(group, double_p1.get(), p1.get(), ctx.get()) ||
-      !EC_POINT_add(group, p1_plus_p2.get(), p1.get(), p2.get(), ctx.get())) {
-    fprintf(stderr, "Point operation failed with nid %d\n", nid);
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
+  bssl::UniquePtr<EC_POINT> p1_plus_p2(EC_POINT_new(group));
+  ASSERT_TRUE(p1_plus_p2);
+  ASSERT_TRUE(
+      EC_POINT_add(group, p1_plus_p2.get(), p1.get(), p2.get(), ctx.get()));
 
-  if (EC_POINT_cmp(group, double_p1.get(), p1_plus_p2.get(), ctx.get()) != 0) {
-    fprintf(stderr, "A+A != 2A for nid %d", nid);
-    return false;
-  }
-
-  return true;
+  EXPECT_EQ(0,
+            EC_POINT_cmp(group, double_p1.get(), p1_plus_p2.get(), ctx.get()))
+      << "A+A != 2A";
 }
 
-static bool TestMulZero(int nid) {
-  bssl::UniquePtr<EC_GROUP> group(EC_GROUP_new_by_curve_name(nid));
-  if (!group) {
-    return false;
-  }
+TEST_P(ECCurveTest, MulZero) {
+  bssl::UniquePtr<EC_GROUP> group(EC_GROUP_new_by_curve_name(GetParam().nid));
+  ASSERT_TRUE(group);
 
   bssl::UniquePtr<EC_POINT> point(EC_POINT_new(group.get()));
+  ASSERT_TRUE(point);
   bssl::UniquePtr<BIGNUM> zero(BN_new());
-  if (!point || !zero) {
-    return false;
-  }
-
+  ASSERT_TRUE(zero);
   BN_zero(zero.get());
-  if (!EC_POINT_mul(group.get(), point.get(), zero.get(), nullptr, nullptr,
-                    nullptr)) {
-    return false;
-  }
+  ASSERT_TRUE(EC_POINT_mul(group.get(), point.get(), zero.get(), nullptr,
+                           nullptr, nullptr));
 
-  if (!EC_POINT_is_at_infinity(group.get(), point.get())) {
-    fprintf(stderr, "g * 0 did not return point at infinity.\n");
-    return false;
-  }
+  EXPECT_TRUE(EC_POINT_is_at_infinity(group.get(), point.get()))
+      << "g * 0 did not return point at infinity.";
 
   // Test that zero times an arbitrary point is also infinity. The generator is
   // used as the arbitrary point.
   bssl::UniquePtr<EC_POINT> generator(EC_POINT_new(group.get()));
-  bssl::UniquePtr<BIGNUM> one(BN_new());
-  if (!generator ||
-      !one ||
-      !BN_one(one.get()) ||
-      !EC_POINT_mul(group.get(), generator.get(), one.get(), nullptr, nullptr,
-                    nullptr) ||
-      !EC_POINT_mul(group.get(), point.get(), nullptr, generator.get(),
-                    zero.get(), nullptr)) {
-    return false;
-  }
+  ASSERT_TRUE(generator);
+  ASSERT_TRUE(EC_POINT_mul(group.get(), generator.get(), BN_value_one(),
+                           nullptr, nullptr, nullptr));
+  ASSERT_TRUE(EC_POINT_mul(group.get(), point.get(), nullptr, generator.get(),
+                           zero.get(), nullptr));
 
-  if (!EC_POINT_is_at_infinity(group.get(), point.get())) {
-    fprintf(stderr, "p * 0 did not return point at infinity.\n");
-    return false;
-  }
-
-  return true;
+  EXPECT_TRUE(EC_POINT_is_at_infinity(group.get(), point.get()))
+      << "p * 0 did not return point at infinity.";
 }
 
-static bool ForEachCurve(bool (*test_func)(int nid)) {
+static std::vector<EC_builtin_curve> AllCurves() {
   const size_t num_curves = EC_get_builtin_curves(nullptr, 0);
   std::vector<EC_builtin_curve> curves(num_curves);
   EC_get_builtin_curves(curves.data(), num_curves);
-
-  for (const auto& curve : curves) {
-    if (!test_func(curve.nid)) {
-      fprintf(stderr, "Test failed for %s\n", curve.comment);
-      return false;
-    }
-  }
-
-  return true;
+  return curves;
 }
 
-int main() {
-  CRYPTO_library_init();
-
-  if (!Testd2i_ECPrivateKey() ||
-      !TestZeroPadding() ||
-      !TestSpecifiedCurve() ||
-      !ForEachCurve(TestSetAffine) ||
-      !ForEachCurve(TestAddingEqualPoints) ||
-      !ForEachCurve(TestMulZero) ||
-      !TestArbitraryCurve()) {
-    fprintf(stderr, "failed\n");
-    return 1;
-  }
-
-  printf("PASS\n");
-  return 0;
+static std::string CurveToString(
+    const testing::TestParamInfo<EC_builtin_curve> &params) {
+  // The comment field contains characters GTest rejects, so use the OBJ name.
+  return OBJ_nid2sn(params.param.nid);
 }
+
+INSTANTIATE_TEST_CASE_P(, ECCurveTest, testing::ValuesIn(AllCurves()),
+                        CurveToString);
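
The per-curve helpers become value-parameterized tests: the fixture derives from testing::TestWithParam, each TEST_P body reads its curve via GetParam(), and INSTANTIATE_TEST_CASE_P fans the cases out over AllCurves(), with CurveToString naming each instance. A minimal sketch of just the machinery (a hypothetical test, assuming only gtest):

  #include <string>

  #include <gtest/gtest.h>

  class ExampleParamTest : public testing::TestWithParam<int> {};

  // Each value given to INSTANTIATE_TEST_CASE_P below becomes one
  // instance of this test, visible through GetParam().
  TEST_P(ExampleParamTest, IsNonNegative) { EXPECT_GE(GetParam(), 0); }

  static std::string ParamName(const testing::TestParamInfo<int> &params) {
    return "Value" + std::to_string(params.param);
  }

  INSTANTIATE_TEST_CASE_P(, ExampleParamTest, testing::Values(0, 1, 2),
                          ParamName);
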
diff --git a/src/crypto/ecdsa/ecdsa.c b/src/crypto/ecdsa/ecdsa.c
index 3432081..e1a0525 100644
--- a/src/crypto/ecdsa/ecdsa.c
+++ b/src/crypto/ecdsa/ecdsa.c
@@ -66,9 +66,10 @@
 
 
 int ECDSA_sign(int type, const uint8_t *digest, size_t digest_len, uint8_t *sig,
-               unsigned int *sig_len, EC_KEY *eckey) {
+               unsigned int *sig_len, const EC_KEY *eckey) {
   if (eckey->ecdsa_meth && eckey->ecdsa_meth->sign) {
-    return eckey->ecdsa_meth->sign(digest, digest_len, sig, sig_len, eckey);
+    return eckey->ecdsa_meth->sign(digest, digest_len, sig, sig_len,
+                                   (EC_KEY*) eckey /* cast away const */);
   }
 
   return ECDSA_sign_ex(type, digest, digest_len, sig, sig_len, NULL, NULL,
@@ -76,7 +77,7 @@
 }
 
 int ECDSA_verify(int type, const uint8_t *digest, size_t digest_len,
-                 const uint8_t *sig, size_t sig_len, EC_KEY *eckey) {
+                 const uint8_t *sig, size_t sig_len, const EC_KEY *eckey) {
   ECDSA_SIG *s;
   int ret = 0;
   uint8_t *der = NULL;
@@ -133,12 +134,12 @@
 }
 
 ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest, size_t digest_len,
-                         EC_KEY *key) {
+                         const EC_KEY *key) {
   return ECDSA_do_sign_ex(digest, digest_len, NULL, NULL, key);
 }
 
 int ECDSA_do_verify(const uint8_t *digest, size_t digest_len,
-                    const ECDSA_SIG *sig, EC_KEY *eckey) {
+                    const ECDSA_SIG *sig, const EC_KEY *eckey) {
   int ret = 0;
   BN_CTX *ctx;
   BIGNUM *u1, *u2, *m, *X;
@@ -224,7 +225,7 @@
   return ret;
 }
 
-static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
+static int ecdsa_sign_setup(const EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
                             BIGNUM **rp, const uint8_t *digest,
                             size_t digest_len) {
   BN_CTX *ctx = NULL;
@@ -338,13 +339,14 @@
   return ret;
 }
 
-int ECDSA_sign_setup(EC_KEY *eckey, BN_CTX *ctx, BIGNUM **kinv, BIGNUM **rp) {
+int ECDSA_sign_setup(const EC_KEY *eckey, BN_CTX *ctx, BIGNUM **kinv,
+                     BIGNUM **rp) {
   return ecdsa_sign_setup(eckey, ctx, kinv, rp, NULL, 0);
 }
 
 ECDSA_SIG *ECDSA_do_sign_ex(const uint8_t *digest, size_t digest_len,
                             const BIGNUM *in_kinv, const BIGNUM *in_r,
-                            EC_KEY *eckey) {
+                            const EC_KEY *eckey) {
   int ok = 0;
   BIGNUM *kinv = NULL, *s, *m = NULL, *tmp = NULL;
   const BIGNUM *ckinv;
@@ -441,7 +443,7 @@
 
 int ECDSA_sign_ex(int type, const uint8_t *digest, size_t digest_len,
                   uint8_t *sig, unsigned int *sig_len, const BIGNUM *kinv,
-                  const BIGNUM *r, EC_KEY *eckey) {
+                  const BIGNUM *r, const EC_KEY *eckey) {
   int ret = 0;
   ECDSA_SIG *s = NULL;
 
diff --git a/src/crypto/evp/evp_ctx.c b/src/crypto/evp/evp_ctx.c
index 905aae9..a17a8cc 100644
--- a/src/crypto/evp/evp_ctx.c
+++ b/src/crypto/evp/evp_ctx.c
@@ -148,48 +148,40 @@
   OPENSSL_free(ctx);
 }
 
-EVP_PKEY_CTX *EVP_PKEY_CTX_dup(EVP_PKEY_CTX *pctx) {
-  EVP_PKEY_CTX *rctx;
-
-  if (!pctx->pmeth || !pctx->pmeth->copy) {
+EVP_PKEY_CTX *EVP_PKEY_CTX_dup(EVP_PKEY_CTX *ctx) {
+  if (!ctx->pmeth || !ctx->pmeth->copy) {
     return NULL;
   }
 
-  rctx = OPENSSL_malloc(sizeof(EVP_PKEY_CTX));
-  if (!rctx) {
+  EVP_PKEY_CTX *ret = OPENSSL_malloc(sizeof(EVP_PKEY_CTX));
+  if (!ret) {
     return NULL;
   }
 
-  OPENSSL_memset(rctx, 0, sizeof(EVP_PKEY_CTX));
+  OPENSSL_memset(ret, 0, sizeof(EVP_PKEY_CTX));
 
-  rctx->pmeth = pctx->pmeth;
-  rctx->engine = pctx->engine;
-  rctx->operation = pctx->operation;
+  ret->pmeth = ctx->pmeth;
+  ret->engine = ctx->engine;
+  ret->operation = ctx->operation;
 
-  if (pctx->pkey) {
-    EVP_PKEY_up_ref(pctx->pkey);
-    rctx->pkey = pctx->pkey;
-    if (rctx->pkey == NULL) {
-      goto err;
-    }
+  if (ctx->pkey != NULL) {
+    EVP_PKEY_up_ref(ctx->pkey);
+    ret->pkey = ctx->pkey;
   }
 
-  if (pctx->peerkey) {
-    EVP_PKEY_up_ref(pctx->peerkey);
-    rctx->peerkey = pctx->peerkey;
-    if (rctx->peerkey == NULL) {
-      goto err;
-    }
+  if (ctx->peerkey != NULL) {
+    EVP_PKEY_up_ref(ctx->peerkey);
+    ret->peerkey = ctx->peerkey;
   }
 
-  if (pctx->pmeth->copy(rctx, pctx) > 0) {
-    return rctx;
+  if (ctx->pmeth->copy(ret, ctx) <= 0) {
+    ret->pmeth = NULL;
+    EVP_PKEY_CTX_free(ret);
+    OPENSSL_PUT_ERROR(EVP, ERR_LIB_EVP);
+    return NULL;
   }
 
-err:
-  EVP_PKEY_CTX_free(rctx);
-  OPENSSL_PUT_ERROR(EVP, ERR_LIB_EVP);
-  return NULL;
+  return ret;
 }
 
 EVP_PKEY *EVP_PKEY_CTX_get0_pkey(EVP_PKEY_CTX *ctx) { return ctx->pkey; }
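
Besides renaming pctx/rctx to ctx/ret, the rewrite drops the unreachable NULL checks after EVP_PKEY_up_ref and, on a failed copy callback, clears ret->pmeth before EVP_PKEY_CTX_free so the method's cleanup hook never runs over state the callback did not finish setting up. The caller therefore sees an all-or-nothing result; a hedged sketch (SignWithCopy is a hypothetical helper, assuming |ctx| was already set up with EVP_PKEY_sign_init):

  #include <stddef.h>
  #include <stdint.h>

  #include <openssl/evp.h>

  static int SignWithCopy(EVP_PKEY_CTX *ctx, uint8_t *sig, size_t *sig_len,
                          const uint8_t *digest, size_t digest_len) {
    // EVP_PKEY_CTX_dup returns either a fully-copied context or NULL
    // (also NULL when the key method has no copy hook); there is never
    // partially-initialized state to clean up, and |ctx| is unmodified
    // on failure.
    EVP_PKEY_CTX *copy = EVP_PKEY_CTX_dup(ctx);
    if (copy == NULL) {
      return 0;
    }
    int ret = EVP_PKEY_sign(copy, sig, sig_len, digest, digest_len);
    EVP_PKEY_CTX_free(copy);
    return ret;
  }
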
diff --git a/src/crypto/md5/asm/md5-586.pl b/src/crypto/md5/asm/md5-586.pl
index a237b0c..a032d9b 100644
--- a/src/crypto/md5/asm/md5-586.pl
+++ b/src/crypto/md5/asm/md5-586.pl
@@ -50,7 +50,7 @@
 	local($pos,$a,$b,$c,$d,$K,$ki,$s,$t)=@_;
 
 	&mov($tmp1,$C)  if $pos < 0;
-	&mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one 
+	&mov($tmp2,&DWP($xo[$ki]*4,$K,"",0)) if $pos < 0; # very first one
 
 	# body proper
 
diff --git a/src/crypto/modes/asm/aesni-gcm-x86_64.pl b/src/crypto/modes/asm/aesni-gcm-x86_64.pl
index e329741..139014f 100644
--- a/src/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/src/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -22,10 +22,11 @@
 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
 # pressure with notable relative improvement, achieving 1.0 cycle per
-# byte processed with 128-bit key on Haswell processor, and 0.74 -
-# on Broadwell. [Mentioned results are raw profiled measurements for
-# favourable packet size, one divisible by 96. Applications using the
-# EVP interface will observe a few percent worse performance.]
+# byte processed with 128-bit key on Haswell processor, 0.74 - on
+# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
+# measurements for favourable packet size, one divisible by 96.
+# Applications using the EVP interface will observe a few percent
+# worse performance.]
 #
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
diff --git a/src/crypto/modes/asm/ghash-armv4.pl b/src/crypto/modes/asm/ghash-armv4.pl
index 299eedc..1a03251 100644
--- a/src/crypto/modes/asm/ghash-armv4.pl
+++ b/src/crypto/modes/asm/ghash-armv4.pl
@@ -47,7 +47,7 @@
 #
 # Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
 # Polynomial Multiplication on ARM Processors using the NEON Engine.
-# 
+#
 # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
 
 # ====================================================================
@@ -486,7 +486,7 @@
 #ifdef __ARMEL__
 	vrev64.8	$Xl,$Xl
 #endif
-	sub		$Xi,#16	
+	sub		$Xi,#16
 	vst1.64		$Xl#hi,[$Xi]!		@ write out Xi
 	vst1.64		$Xl#lo,[$Xi]
 
diff --git a/src/crypto/modes/asm/ghash-x86.pl b/src/crypto/modes/asm/ghash-x86.pl
index 182c29a..d3a79e1 100644
--- a/src/crypto/modes/asm/ghash-x86.pl
+++ b/src/crypto/modes/asm/ghash-x86.pl
@@ -88,7 +88,7 @@
 # where Tproc is time required for Karatsuba pre- and post-processing,
 # is more realistic estimate. In this case it gives ... 1.91 cycles.
 # Or in other words, depending on how well we can interleave reduction
-# and one of the two multiplications the performance should be betwen
+# and one of the two multiplications the performance should be between
 # 1.91 and 2.16. As already mentioned, this implementation processes
 # one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
 # - in 2.02. x86_64 performance is better, because larger register
@@ -487,7 +487,7 @@
     &pxor	($red[1],$red[1]);
     &pxor	($red[2],$red[2]);
 
-    # Just like in "May" verson modulo-schedule for critical path in
+    # Just like in "May" version modulo-schedule for critical path in
     # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
     # is scheduled so late that rem_8bit[] has to be shifted *right*
     # by 16, which is why last argument to pinsrw is 2, which
@@ -576,7 +576,7 @@
     &bswap	($dat);
     &pshufw	($Zhi,$Zhi,0b00011011);		# 76543210
     &bswap	("ebx");
-    
+
     &cmp	("ecx",&DWP(528+16+8,"esp"));	# are we done?
     &jne	(&label("outer"));
   }
@@ -680,7 +680,7 @@
 	&psllq		($Xi,57);		#
 	&movdqa		($T1,$Xi);		#
 	&pslldq		($Xi,8);
-	&psrldq		($T1,8);		#	
+	&psrldq		($T1,8);		#
 	&pxor		($Xi,$T2);
 	&pxor		($Xhi,$T1);		#
 
@@ -850,7 +850,7 @@
 	  &psllq	($Xi,57);		#
 	  &movdqa	($T1,$Xi);		#
 	  &pslldq	($Xi,8);
-	  &psrldq	($T1,8);		#	
+	  &psrldq	($T1,8);		#
 	  &pxor		($Xi,$T2);
 	  &pxor		($Xhi,$T1);		#
 	&pshufd		($T1,$Xhn,0b01001110);
@@ -913,7 +913,7 @@
 	&movdqu		(&QWP(0,$Xip),$Xi);
 &function_end("gcm_ghash_clmul");
 
-} else {		# Algorith 5. Kept for reference purposes.
+} else {		# Algorithm 5. Kept for reference purposes.
 
 sub reduction_alg5 {	# 19/16 times faster than Intel version
 my ($Xhi,$Xi)=@_;
diff --git a/src/crypto/modes/asm/ghash-x86_64.pl b/src/crypto/modes/asm/ghash-x86_64.pl
index d7471e2..0e6e348 100644
--- a/src/crypto/modes/asm/ghash-x86_64.pl
+++ b/src/crypto/modes/asm/ghash-x86_64.pl
@@ -64,8 +64,10 @@
 # Ivy Bridge	1.80(+7%)
 # Haswell	0.55(+93%) (if system doesn't support AVX)
 # Broadwell	0.45(+110%)(if system doesn't support AVX)
+# Skylake	0.44(+110%)(if system doesn't support AVX)
 # Bulldozer	1.49(+27%)
 # Silvermont	2.88(+13%)
+# Goldmont	1.08(+24%)
 
 # March 2013
 #
@@ -74,8 +76,8 @@
 # CPUs such as Sandy and Ivy Bridge can execute it, the code performs
 # sub-optimally in comparison to above mentioned version. But thanks
 # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
-# it performs in 0.41 cycles per byte on Haswell processor, and in
-# 0.29 on Broadwell.
+# it performs in 0.41 cycles per byte on Haswell processor, in
+# 0.29 on Broadwell, and in 0.36 on Skylake.
 #
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 
@@ -217,8 +219,12 @@
 .align	16
 gcm_gmult_4bit:
 	push	%rbx
-	push	%rbp		# %rbp and %r12 are pushed exclusively in
+	push	%rbp		# %rbp and others are pushed exclusively in
 	push	%r12		# order to reuse Win64 exception handler...
+	push	%r13
+	push	%r14
+	push	%r15
+	sub	\$280,%rsp
 .Lgmult_prologue:
 
 	movzb	15($Xi),$Zlo
@@ -229,8 +235,9 @@
 	mov	$Zlo,8($Xi)
 	mov	$Zhi,($Xi)
 
-	mov	16(%rsp),%rbx
-	lea	24(%rsp),%rsp
+	lea	280+48(%rsp),%rsi
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
 .Lgmult_epilogue:
 	ret
 .size	gcm_gmult_4bit,.-gcm_gmult_4bit
@@ -380,14 +387,14 @@
 	mov	$Zlo,8($Xi)
 	mov	$Zhi,($Xi)
 
-	lea	280(%rsp),%rsi
-	mov	0(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
+	lea	280+48(%rsp),%rsi
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	0(%rsi),%rsp
 .Lghash_epilogue:
 	ret
 .size	gcm_ghash_4bit,.-gcm_ghash_4bit
@@ -449,7 +456,7 @@
 	psllq		\$57,$Xi		#
 	movdqa		$Xi,$T1			#
 	pslldq		\$8,$Xi
-	psrldq		\$8,$T1			#	
+	psrldq		\$8,$T1			#
 	pxor		$T2,$Xi
 	pxor		$T1,$Xhi		#
 
@@ -563,7 +570,7 @@
 	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
 $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
 	# experimental alternative. special thing about is that there
-	# no dependency between the two multiplications... 
+	# no dependency between the two multiplications...
 	mov		\$`0xE1<<1`,%eax
 	mov		\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff
 	mov		\$0x07,%r11d
@@ -738,7 +745,7 @@
 	movdqa		$T2,$T1			#
 	pslldq		\$8,$T2
 	 pclmulqdq	\$0x00,$Hkey2,$Xln
-	psrldq		\$8,$T1			#	
+	psrldq		\$8,$T1			#
 	pxor		$T2,$Xi
 	pxor		$T1,$Xhi		#
 	movdqu		0($inp),$T1
@@ -874,7 +881,7 @@
 	  psllq		\$57,$Xi		#
 	  movdqa	$Xi,$T1			#
 	  pslldq	\$8,$Xi
-	  psrldq	\$8,$T1			#	
+	  psrldq	\$8,$T1			#
 	  pxor		$T2,$Xi
 	pshufd		\$0b01001110,$Xhn,$Xmn
 	  pxor		$T1,$Xhi		#
@@ -1628,14 +1635,20 @@
 	cmp	%r10,%rbx		# context->Rip>=epilogue label
 	jae	.Lin_prologue
 
-	lea	24(%rax),%rax		# adjust "rsp"
+	lea	48+280(%rax),%rax	# adjust "rsp"
 
 	mov	-8(%rax),%rbx
 	mov	-16(%rax),%rbp
 	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
 	mov	%rbx,144($context)	# restore context->Rbx
 	mov	%rbp,160($context)	# restore context->Rbp
 	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
 
 .Lin_prologue:
 	mov	8(%rax),%rdi
diff --git a/src/crypto/perlasm/ppc-xlate.pl b/src/crypto/perlasm/ppc-xlate.pl
index 55b02bc..de796d7 100644
--- a/src/crypto/perlasm/ppc-xlate.pl
+++ b/src/crypto/perlasm/ppc-xlate.pl
@@ -36,7 +36,7 @@
     my $ret;
 
     $name =~ s|^\.||;
- 
+
     SWITCH: for ($flavour) {
 	/aix/		&& do { if (!$$type) {
 				    $$type = "\@function";
diff --git a/src/crypto/perlasm/readme b/src/crypto/perlasm/readme
index 648537b..57d2083 100644
--- a/src/crypto/perlasm/readme
+++ b/src/crypto/perlasm/readme
@@ -7,7 +7,7 @@
 push(@INC,"perlasm","../../perlasm");
 require "x86asm.pl";
 
-The first thing we do is setup the file and type of assember
+The first thing we do is setup the file and type of assembler
 
 &asm_init($ARGV[0],$0);
 
@@ -18,7 +18,7 @@
 The reciprocal function is
 &asm_finish() which should be called at the end.
 
-There are 2 main 'packages'. x86ms.pl, which is the microsoft assembler,
+There are 2 main 'packages'. x86ms.pl, which is the Microsoft assembler,
 and x86unix.pl which is the unix (gas) version.
 
 Functions of interest are:
@@ -32,7 +32,7 @@
 &function_begin(name,extra)	Start a function with pushing of
 				edi, esi, ebx and ebp.  extra is extra win32
 				external info that may be required.
-&function_begin_B(name,extra)	Same as norma function_begin but no pushing.
+&function_begin_B(name,extra)	Same as normal function_begin but no pushing.
 &function_end(name)		Call at end of function.
 &function_end_A(name)		Standard pop and ret, for use inside functions
 &function_end_B(name)		Call at end but with popping or 'ret'.
diff --git a/src/crypto/perlasm/x86_64-xlate.pl b/src/crypto/perlasm/x86_64-xlate.pl
index 16553f2..6e487b8 100755
--- a/src/crypto/perlasm/x86_64-xlate.pl
+++ b/src/crypto/perlasm/x86_64-xlate.pl
@@ -141,7 +141,7 @@
 	if ($gas) {
 	    if ($self->{op} eq "movz") {	# movz is pain...
 		sprintf "%s%s%s",$self->{op},$self->{sz},shift;
-	    } elsif ($self->{op} =~ /^set/) { 
+	    } elsif ($self->{op} =~ /^set/) {
 		"$self->{op}";
 	    } elsif ($self->{op} eq "ret") {
 		my $epilogue = "";
@@ -168,7 +168,7 @@
 		$self->{op} .= $self->{sz};
 	    } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") {
 		$self->{op} = "\tDQ";
-	    } 
+	    }
 	    $self->{op};
 	}
     }
@@ -274,7 +274,7 @@
 	}
 
 	# if base register is %rbp or %r13, see if it's possible to
-	# flip base and ingex registers [for better performance]
+	# flip base and index registers [for better performance]
 	if (!$self->{label} && $self->{index} && $self->{scale}==1 &&
 	    $self->{base} =~ /(rbp|r13)/) {
 		$self->{base} = $self->{index}; $self->{index} = $1;
@@ -432,7 +432,7 @@
 	}
     }
 }
-{ package expr;		# pick up expressioins
+{ package expr;		# pick up expressions
     sub re {
 	my	($class, $line, $opcode) = @_;
 	my	$self = {};
@@ -460,6 +460,242 @@
 	}
     }
 }
+{ package cfi_directive;
+    # CFI directives annotate instructions that are significant for
+    # the stack unwinding procedure, compliant with the DWARF
+    # specification; see http://dwarfstd.org/. Besides the naturally
+    # expected platform-specific filtering function, this module adds
+    # three auxiliary synthetic directives not recognized by the [GNU]
+    # assembler:
+    #
+    # - .cfi_push to annotate push instructions in prologue, which
+    #   translates to .cfi_adjust_cfa_offset (if needed) and
+    #   .cfi_offset;
+    # - .cfi_pop to annotate pop instructions in epilogue, which
+    #   translates to .cfi_adjust_cfa_offset (if needed) and
+    #   .cfi_restore;
+    # - [and most notably] .cfi_cfa_expression, which encodes
+    #   DW_CFA_def_cfa_expression and passes it to .cfi_escape as a
+    #   byte vector;
+    #
+    # CFA expressions were introduced in DWARF specification version 3
+    # and describe how to deduce the CFA, the Canonical Frame Address.
+    # This becomes handy if your stack frame is variable and you can't
+    # spare a register for a [previous] frame pointer. The suggested
+    # directive syntax is a made-up mix of [a subset of] DWARF operator
+    # suffixes and references to registers with an optional bias. The
+    # following example describes the offloaded *original* stack
+    # pointer at a specific offset from the *current* stack pointer:
+    #
+    #   .cfi_cfa_expression     %rsp+40,deref,+8
+    #
+    # The final +8 has everything to do with the fact that the CFA is
+    # defined as a reference to the top of the caller's stack, and on
+    # x86_64 a call to a subroutine pushes an 8-byte return address. In
+    # other words, the original stack pointer upon entry to a
+    # subroutine is 8 bytes off from the CFA.
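+    #
+    # As a sketch of the resulting encoding (using the constants
+    # below): %rsp+40 maps to DW_OP_breg7 with sleb128(40), i.e.
+    # 0x77,0x28; "deref" is DW_OP_deref, 0x06; and "+8" is
+    # DW_OP_plus_uconst with uleb128(8), i.e. 0x23,0x08. Prefixed
+    # with DW_CFA_def_cfa_expression (15) and the 5-byte length, the
+    # example above emits
+    # ".cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08".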
+
+    # The constants below are taken from the "DWARF Expressions"
+    # section of the DWARF specification, numbered 7.7 in versions 3
+    # and 4.
+    my %DW_OP_simple = (	# no-arg operators, mapped directly
+	deref	=> 0x06,	dup	=> 0x12,
+	drop	=> 0x13,	over	=> 0x14,
+	pick	=> 0x15,	swap	=> 0x16,
+	rot	=> 0x17,	xderef	=> 0x18,
+
+	abs	=> 0x19,	and	=> 0x1a,
+	div	=> 0x1b,	minus	=> 0x1c,
+	mod	=> 0x1d,	mul	=> 0x1e,
+	neg	=> 0x1f,	not	=> 0x20,
+	or	=> 0x21,	plus	=> 0x22,
+	shl	=> 0x24,	shr	=> 0x25,
+	shra	=> 0x26,	xor	=> 0x27,
+	);
+
+    my %DW_OP_complex = (	# used in specific subroutines
+	constu		=> 0x10,	# uleb128
+	consts		=> 0x11,	# sleb128
+	plus_uconst	=> 0x23,	# uleb128
+	lit0 		=> 0x30,	# add 0-31 to opcode
+	reg0		=> 0x50,	# add 0-31 to opcode
+	breg0		=> 0x70,	# add 0-31 to opcode, sleb128
+	regx		=> 0x90,	# uleb128
+	fbreg		=> 0x91,	# sleb128
+	bregx		=> 0x92,	# uleb128, sleb128
+	piece		=> 0x93,	# uleb128
+	);
+
+    # The following constants are defined in the x86_64 ABI
+    # supplement, available for example at
+    # https://www.uclibc.org/docs/psABI-x86_64.pdf; see section 3.7
+    # "Stack Unwind Algorithm".
+    my %DW_reg_idx = (
+	"%rax"=>0,  "%rdx"=>1,  "%rcx"=>2,  "%rbx"=>3,
+	"%rsi"=>4,  "%rdi"=>5,  "%rbp"=>6,  "%rsp"=>7,
+	"%r8" =>8,  "%r9" =>9,  "%r10"=>10, "%r11"=>11,
+	"%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15
+	);
+
+    my ($cfa_reg, $cfa_rsp);
+
+    # The [us]leb128 format is a variable-length integer
+    # representation in base 128, with the most significant bit of
+    # each byte being 0 to denote the *last*, most significant digit.
+    # See "Variable Length Data" in the DWARF specification, numbered
+    # 7.6 at least in versions 3 and 4.
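+    #
+    # For example, uleb128(300): 300 is 0b100101100; the low seven
+    # bits give 0x2c, the remaining bits give 0x02, and every byte
+    # except the last has bit 7 set, so the encoding is 0xac,0x02.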
+    sub sleb128 {
+	use integer;	# get right shift extend sign
+
+	my $val = shift;
+	my $sign = ($val < 0) ? -1 : 0;
+	my @ret = ();
+
+	while(1) {
+	    push @ret, $val&0x7f;
+
+	    # see if the remaining bits all equal the most significant
+	    # bit of the current digit; if so, it's the last digit...
+	    last if (($val>>6) == $sign);
+
+	    @ret[-1] |= 0x80;
+	    $val >>= 7;
+	}
+
+	return @ret;
+    }
+    sub uleb128 {
+	my $val = shift;
+	my @ret = ();
+
+	while(1) {
+	    push @ret, $val&0x7f;
+
+	    # see if it's the last significant digit...
+	    last if (($val >>= 7) == 0);
+
+	    @ret[-1] |= 0x80;
+	}
+
+	return @ret;
+    }
+    sub const {
+	my $val = shift;
+
+	if ($val >= 0 && $val < 32) {
+	    return ($DW_OP_complex{lit0}+$val);
+	}
+	return ($DW_OP_complex{consts}, sleb128($val));
+    }
+    sub reg {
+	my $val = shift;
+
+	return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/);
+
+	my $reg = $DW_reg_idx{$1};
+	my $off = eval ("0 $2 $3");
+
+	return (($DW_OP_complex{breg0} + $reg), sleb128($off));
+	# Yes, we use DW_OP_bregX+0 to push a register value and not
+	# DW_OP_regX, because the latter would additionally require
+	# DW_OP_piece, which would be a waste under the circumstances.
+	# If you have to use DW_OP_regX, use "regx:N"...
+    }
+    sub cfa_expression {
+	my $line = shift;
+	my @ret;
+
+	foreach my $token (split(/,\s*/,$line)) {
+	    if ($token =~ /^%r/) {
+		push @ret,reg($token);
+	    } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) {
+		push @ret,reg("$2+$1");
+	    } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) {
+		my $i = 1*eval($2);
+		push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i));
+	    } elsif (my $i = 1*eval($token) or $token eq "0") {
+		if ($token =~ /^\+/) {
+		    push @ret,$DW_OP_complex{plus_uconst},uleb128($i);
+		} else {
+		    push @ret,const($i);
+		}
+	    } else {
+		push @ret,$DW_OP_simple{$token};
+	    }
+	}
+
+	# Finally we return DW_CFA_def_cfa_expression, 15, followed by
+	# the length of the expression and, of course, the expression
+	# itself.
+	return (15,scalar(@ret),@ret);
+    }
+    sub re {
+	my	($class, $line) = @_;
+	my	$self = {};
+	my	$ret;
+
+	if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) {
+	    bless $self,$class;
+	    $ret = $self;
+	    undef $self->{value};
+	    my $dir = $1;
+
+	    SWITCH: for ($dir) {
+	    # What is $cfa_rsp? Effectively it's the difference between
+	    # the %rsp value and the current CFA, the Canonical Frame
+	    # Address, which is why it starts at -8. Recall that the
+	    # CFA is the top of the caller's stack...
+	    /startproc/	&& do {	($cfa_reg, $cfa_rsp) = ("%rsp", -8); last; };
+	    /endproc/	&& do {	($cfa_reg, $cfa_rsp) = ("%rsp",  0); last; };
+	    /def_cfa_register/
+			&& do {	$cfa_reg = $$line; last; };
+	    /def_cfa_offset/
+			&& do {	$cfa_rsp = -1*eval($$line) if ($cfa_reg eq "%rsp");
+				last;
+			      };
+	    /adjust_cfa_offset/
+			&& do {	$cfa_rsp -= 1*eval($$line) if ($cfa_reg eq "%rsp");
+				last;
+			      };
+	    /def_cfa/	&& do {	if ($$line =~ /(%r\w+)\s*,\s*(.+)/) {
+				    $cfa_reg = $1;
+				    $cfa_rsp = -1*eval($2) if ($cfa_reg eq "%rsp");
+				}
+				last;
+			      };
+	    /push/	&& do {	$dir = undef;
+				$cfa_rsp -= 8;
+				if ($cfa_reg eq "%rsp") {
+				    $self->{value} = ".cfi_adjust_cfa_offset\t8\n";
+				}
+				$self->{value} .= ".cfi_offset\t$$line,$cfa_rsp";
+				last;
+			      };
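+	    # For example, ".cfi_push %rbx" right after .cfi_startproc,
+	    # where $cfa_rsp is -8, becomes ".cfi_adjust_cfa_offset 8"
+	    # followed by ".cfi_offset %rbx,-16".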
+	    /pop/	&& do {	$dir = undef;
+				$cfa_rsp += 8;
+				if ($cfa_reg eq "%rsp") {
+				    $self->{value} = ".cfi_adjust_cfa_offset\t-8\n";
+				}
+				$self->{value} .= ".cfi_restore\t$$line";
+				last;
+			      };
+	    /cfa_expression/
+			&& do {	$dir = undef;
+				$self->{value} = ".cfi_escape\t" .
+					join(",", map(sprintf("0x%02x", $_),
+						      cfa_expression($$line)));
+				last;
+			      };
+	    }
+
+	    $self->{value} = ".cfi_$dir\t$$line" if ($dir);
+
+	    $$line = "";
+	}
+
+	return $ret;
+    }
+    sub out {
+	my $self = shift;
+	return ($elf ? $self->{value} : undef);
+    }
+}
 { package directive;	# pick up directives, which start with .
     sub re {
 	my	($class, $line) = @_;
@@ -467,6 +703,9 @@
 	my	$ret;
 	my	$dir;
 
+	# chain-call to cfi_directive
+	$ret = cfi_directive->re($line) and return $ret;
+
 	if ($$line =~ /^\s*(\.\w+)/) {
 	    bless $self,$class;
 	    $dir = $1;
@@ -644,7 +883,7 @@
 							if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva"))
 							{ $var=~s/([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; }
 							$var;
-						    };  
+						    };
 
 				    $sz =~ tr/bvlrq/BWDDQ/;
 				    $self->{value} = "\tD$sz\t";
@@ -654,7 +893,7 @@
 				  };
 		/\.byte/    && do { my @str=split(/,\s*/,$$line);
 				    map(s/(0b[0-1]+)/oct($1)/eig,@str);
-				    map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm);	
+				    map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm);
 				    while ($#str>15) {
 					$self->{value}.="DB\t"
 						.join(",",@str[0..15])."\n";
@@ -810,7 +1049,7 @@
       my @opcode=();
       my $dst=$1;
 	if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
-	rex(\@opcode,0,$1,8);
+	rex(\@opcode,0,$dst,8);
 	push @opcode,0x0f,0xc7,0xf0|($dst&7);
 	@opcode;
     } else {
@@ -823,7 +1062,7 @@
       my @opcode=();
       my $dst=$1;
 	if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
-	rex(\@opcode,0,$1,8);
+	rex(\@opcode,0,$dst,8);
 	push @opcode,0x0f,0xc7,0xf8|($dst&7);
 	@opcode;
     } else {
@@ -912,7 +1151,7 @@
 	printf "%s",$directive->out();
     } elsif (my $opcode=opcode->re(\$line)) {
 	my $asm = eval("\$".$opcode->mnemonic());
-	
+
 	if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) {
 	    print $gas?".byte\t":"DB\t",join(',',@bytes),"\n";
 	    next;
@@ -998,7 +1237,7 @@
 # %r13		-		-
 # %r14		-		-
 # %r15		-		-
-# 
+#
 # (*)	volatile register
 # (-)	preserved by callee
 # (#)	Nth argument, volatile
@@ -1021,7 +1260,7 @@
 # the area above user stack pointer in true asynchronous manner...
 #
 # All the above means that if assembler programmer adheres to Unix
-# register and stack layout, but disregards the "red zone" existense,
+# register and stack layout, but disregards the "red zone" existence,
 # it's possible to use following prologue and epilogue to "gear" from
 # Unix to Win64 ABI in leaf functions with not more than 6 arguments.
 #
diff --git a/src/crypto/perlasm/x86nasm.pl b/src/crypto/perlasm/x86nasm.pl
index d159514..d3773b6 100644
--- a/src/crypto/perlasm/x86nasm.pl
+++ b/src/crypto/perlasm/x86nasm.pl
@@ -140,7 +140,7 @@
 	grep {s/(^extern\s+${nmdecor}OPENSSL_ia32cap_P)/\;$1/} @out;
 	push (@out,$comm)
     }
-    push (@out,$initseg) if ($initseg);		
+    push (@out,$initseg) if ($initseg);
 }
 
 sub ::comment {   foreach (@_) { push(@out,"\t; $_\n"); }   }
diff --git a/src/crypto/pkcs8/pkcs8.c b/src/crypto/pkcs8/pkcs8.c
index efad81d..64a2d02 100644
--- a/src/crypto/pkcs8/pkcs8.c
+++ b/src/crypto/pkcs8/pkcs8.c
@@ -426,26 +426,9 @@
   return ret;
 }
 
-PKCS8_PRIV_KEY_INFO *PKCS8_decrypt(X509_SIG *pkcs8, const char *pass,
-                                   int pass_len) {
-  uint8_t *pass_raw = NULL;
-  size_t pass_raw_len = 0;
-  if (!pass_to_pass_raw(OBJ_obj2nid(pkcs8->algor->algorithm), pass, pass_len,
-                        &pass_raw, &pass_raw_len)) {
-    return NULL;
-  }
-
-  PKCS8_PRIV_KEY_INFO *ret = PKCS8_decrypt_pbe(pkcs8, pass_raw, pass_raw_len);
-
-  if (pass_raw) {
-    OPENSSL_cleanse(pass_raw, pass_raw_len);
-    OPENSSL_free(pass_raw);
-  }
-  return ret;
-}
-
-PKCS8_PRIV_KEY_INFO *PKCS8_decrypt_pbe(X509_SIG *pkcs8, const uint8_t *pass_raw,
-                                       size_t pass_raw_len) {
+static PKCS8_PRIV_KEY_INFO *pkcs8_decrypt_raw(X509_SIG *pkcs8,
+                                              const uint8_t *pass_raw,
+                                              size_t pass_raw_len) {
   PKCS8_PRIV_KEY_INFO *ret = NULL;
   uint8_t *in = NULL, *out = NULL;
   size_t out_len = 0;
@@ -495,17 +478,16 @@
   return ret;
 }
 
-X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher, const char *pass,
-                        int pass_len, const uint8_t *salt, size_t salt_len,
-                        int iterations, PKCS8_PRIV_KEY_INFO *p8inf) {
+PKCS8_PRIV_KEY_INFO *PKCS8_decrypt(X509_SIG *pkcs8, const char *pass,
+                                   int pass_len) {
   uint8_t *pass_raw = NULL;
   size_t pass_raw_len = 0;
-  if (!pass_to_pass_raw(pbe_nid, pass, pass_len, &pass_raw, &pass_raw_len)) {
+  if (!pass_to_pass_raw(OBJ_obj2nid(pkcs8->algor->algorithm), pass, pass_len,
+                        &pass_raw, &pass_raw_len)) {
     return NULL;
   }
 
-  X509_SIG *ret = PKCS8_encrypt_pbe(pbe_nid, cipher, pass_raw, pass_raw_len,
-                                    salt, salt_len, iterations, p8inf);
+  PKCS8_PRIV_KEY_INFO *ret = pkcs8_decrypt_raw(pkcs8, pass_raw, pass_raw_len);
 
   if (pass_raw) {
     OPENSSL_cleanse(pass_raw, pass_raw_len);
@@ -514,10 +496,10 @@
   return ret;
 }
 
-X509_SIG *PKCS8_encrypt_pbe(int pbe_nid, const EVP_CIPHER *cipher,
-                            const uint8_t *pass_raw, size_t pass_raw_len,
-                            const uint8_t *salt, size_t salt_len,
-                            int iterations, PKCS8_PRIV_KEY_INFO *p8inf) {
+static X509_SIG *pkcs8_encrypt_raw(int pbe_nid, const EVP_CIPHER *cipher,
+                                   const uint8_t *pass_raw, size_t pass_raw_len,
+                                   const uint8_t *salt, size_t salt_len,
+                                   int iterations, PKCS8_PRIV_KEY_INFO *p8inf) {
   X509_SIG *ret = NULL;
   uint8_t *plaintext = NULL, *salt_buf = NULL, *der = NULL;
   int plaintext_len = -1;
@@ -609,6 +591,25 @@
   return ret;
 }
 
+X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher, const char *pass,
+                        int pass_len, const uint8_t *salt, size_t salt_len,
+                        int iterations, PKCS8_PRIV_KEY_INFO *p8inf) {
+  uint8_t *pass_raw = NULL;
+  size_t pass_raw_len = 0;
+  if (!pass_to_pass_raw(pbe_nid, pass, pass_len, &pass_raw, &pass_raw_len)) {
+    return NULL;
+  }
+
+  X509_SIG *ret = pkcs8_encrypt_raw(pbe_nid, cipher, pass_raw, pass_raw_len,
+                                    salt, salt_len, iterations, p8inf);
+
+  if (pass_raw) {
+    OPENSSL_cleanse(pass_raw, pass_raw_len);
+    OPENSSL_free(pass_raw);
+  }
+  return ret;
+}
+
 EVP_PKEY *EVP_PKCS82PKEY(PKCS8_PRIV_KEY_INFO *p8) {
   uint8_t *der = NULL;
   int der_len = i2d_PKCS8_PRIV_KEY_INFO(p8, &der);
@@ -758,7 +759,7 @@
     }
 
     PKCS8_PRIV_KEY_INFO *pki =
-        PKCS8_decrypt_pbe(encrypted, ctx->password, ctx->password_len);
+        pkcs8_decrypt_raw(encrypted, ctx->password, ctx->password_len);
     X509_SIG_free(encrypted);
     if (pki == NULL) {
       return 0;
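
With PKCS8_decrypt_pbe and PKCS8_encrypt_pbe folded into the static pkcs8_decrypt_raw/pkcs8_encrypt_raw helpers, the raw-password conversion stays internal and external callers go through the textual-password entry points. A minimal usage sketch (DecryptWithPassword is a hypothetical helper):

  #include <string.h>

  #include <openssl/pkcs8.h>
  #include <openssl/x509.h>

  // Decrypt an encrypted PKCS#8 structure with a NUL-terminated
  // password; the conversion to the raw PBE password now happens
  // inside the library.
  static PKCS8_PRIV_KEY_INFO *DecryptWithPassword(X509_SIG *encrypted,
                                                  const char *password) {
    return PKCS8_decrypt(encrypted, password, (int)strlen(password));
  }
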
diff --git a/src/crypto/rsa/CMakeLists.txt b/src/crypto/rsa/CMakeLists.txt
index 969b753..76937c1 100644
--- a/src/crypto/rsa/CMakeLists.txt
+++ b/src/crypto/rsa/CMakeLists.txt
@@ -11,14 +11,3 @@
   padding.c
   rsa_asn1.c
 )
-
-add_executable(
-  rsa_test
-
-  rsa_test.cc
-
-  $<TARGET_OBJECTS:test_support>
-)
-
-target_link_libraries(rsa_test crypto)
-add_dependencies(all_tests rsa_test)
\ No newline at end of file
diff --git a/src/crypto/rsa/rsa_test.cc b/src/crypto/rsa/rsa_test.cc
index 306df7e..401efdf 100644
--- a/src/crypto/rsa/rsa_test.cc
+++ b/src/crypto/rsa/rsa_test.cc
@@ -59,6 +59,8 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include <gtest/gtest.h>
+
 #include <openssl/bn.h>
 #include <openssl/bytestring.h>
 #include <openssl/crypto.h>
@@ -66,6 +68,7 @@
 #include <openssl/nid.h>
 
 #include "../internal.h"
+#include "../test/test_util.h"
 
 
 // kPlaintext is a sample plaintext.
@@ -523,191 +526,172 @@
     0xdd, 0x02, 0x01, 0x01,
 };
 
-static bool TestRSA(const uint8_t *der, size_t der_len,
-                    const uint8_t *oaep_ciphertext,
-                    size_t oaep_ciphertext_len) {
-  bssl::UniquePtr<RSA> key(RSA_private_key_from_bytes(der, der_len));
-  if (!key) {
-    return false;
-  }
+struct RSAEncryptParam {
+  const uint8_t *der;
+  size_t der_len;
+  const uint8_t *oaep_ciphertext;
+  size_t oaep_ciphertext_len;
+} kRSAEncryptParams[] = {
+    {kKey1, sizeof(kKey1) - 1, kOAEPCiphertext1, sizeof(kOAEPCiphertext1) - 1},
+    {kKey2, sizeof(kKey2) - 1, kOAEPCiphertext2, sizeof(kOAEPCiphertext2) - 1},
+    {kKey3, sizeof(kKey3) - 1, kOAEPCiphertext3, sizeof(kOAEPCiphertext3) - 1},
+};
 
-  if (!RSA_check_key(key.get())) {
-    fprintf(stderr, "RSA_check_key failed\n");
-    return false;
-  }
+class RSAEncryptTest : public testing::TestWithParam<RSAEncryptParam> {};
+
+TEST_P(RSAEncryptTest, TestKey) {
+  const auto &param = GetParam();
+  bssl::UniquePtr<RSA> key(
+      RSA_private_key_from_bytes(param.der, param.der_len));
+  ASSERT_TRUE(key);
+
+  EXPECT_TRUE(RSA_check_key(key.get()));
 
   uint8_t ciphertext[256];
 
+  // Test that PKCS#1 v1.5 encryption round-trips.
   size_t ciphertext_len = 0;
-  if (!RSA_encrypt(key.get(), &ciphertext_len, ciphertext, sizeof(ciphertext),
-                   kPlaintext, kPlaintextLen, RSA_PKCS1_PADDING) ||
-      ciphertext_len != RSA_size(key.get())) {
-    fprintf(stderr, "PKCS#1 v1.5 encryption failed!\n");
-    return false;
-  }
+  ASSERT_TRUE(RSA_encrypt(key.get(), &ciphertext_len, ciphertext,
+                          sizeof(ciphertext), kPlaintext, kPlaintextLen,
+                          RSA_PKCS1_PADDING));
+  EXPECT_EQ(RSA_size(key.get()), ciphertext_len);
 
   uint8_t plaintext[256];
   size_t plaintext_len = 0;
-  if (!RSA_decrypt(key.get(), &plaintext_len, plaintext, sizeof(plaintext),
-                   ciphertext, ciphertext_len, RSA_PKCS1_PADDING) ||
-      plaintext_len != kPlaintextLen ||
-      OPENSSL_memcmp(plaintext, kPlaintext, plaintext_len) != 0) {
-    fprintf(stderr, "PKCS#1 v1.5 decryption failed!\n");
-    return false;
-  }
+  ASSERT_TRUE(RSA_decrypt(key.get(), &plaintext_len, plaintext,
+                          sizeof(plaintext), ciphertext, ciphertext_len,
+                          RSA_PKCS1_PADDING));
+  EXPECT_EQ(Bytes(kPlaintext, kPlaintextLen), Bytes(plaintext, plaintext_len));
 
+  // Test that OAEP encryption round-trips.
   ciphertext_len = 0;
-  if (!RSA_encrypt(key.get(), &ciphertext_len, ciphertext, sizeof(ciphertext),
-                   kPlaintext, kPlaintextLen, RSA_PKCS1_OAEP_PADDING) ||
-      ciphertext_len != RSA_size(key.get())) {
-    fprintf(stderr, "OAEP encryption failed!\n");
-    return false;
-  }
+  ASSERT_TRUE(RSA_encrypt(key.get(), &ciphertext_len, ciphertext,
+                          sizeof(ciphertext), kPlaintext, kPlaintextLen,
+                          RSA_PKCS1_OAEP_PADDING));
+  EXPECT_EQ(RSA_size(key.get()), ciphertext_len);
 
   plaintext_len = 0;
-  if (!RSA_decrypt(key.get(), &plaintext_len, plaintext, sizeof(plaintext),
-                   ciphertext, ciphertext_len, RSA_PKCS1_OAEP_PADDING) ||
-      plaintext_len != kPlaintextLen ||
-      OPENSSL_memcmp(plaintext, kPlaintext, plaintext_len) != 0) {
-    fprintf(stderr, "OAEP decryption (encrypted data) failed!\n");
-    return false;
-  }
+  ASSERT_TRUE(RSA_decrypt(key.get(), &plaintext_len, plaintext,
+                          sizeof(plaintext), ciphertext, ciphertext_len,
+                          RSA_PKCS1_OAEP_PADDING));
+  EXPECT_EQ(Bytes(kPlaintext, kPlaintextLen), Bytes(plaintext, plaintext_len));
 
   // |oaep_ciphertext| should decrypt to |kPlaintext|.
   plaintext_len = 0;
-  if (!RSA_decrypt(key.get(), &plaintext_len, plaintext, sizeof(plaintext),
-                   oaep_ciphertext, oaep_ciphertext_len,
-                   RSA_PKCS1_OAEP_PADDING) ||
-      plaintext_len != kPlaintextLen ||
-      OPENSSL_memcmp(plaintext, kPlaintext, plaintext_len) != 0) {
-    fprintf(stderr, "OAEP decryption (test vector data) failed!\n");
-    return false;
-  }
+  ASSERT_TRUE(RSA_decrypt(key.get(), &plaintext_len, plaintext,
+                          sizeof(plaintext), param.oaep_ciphertext,
+                          param.oaep_ciphertext_len, RSA_PKCS1_OAEP_PADDING));
+  EXPECT_EQ(Bytes(kPlaintext, kPlaintextLen), Bytes(plaintext, plaintext_len));
 
   // Try decrypting corrupted ciphertexts.
-  OPENSSL_memcpy(ciphertext, oaep_ciphertext, oaep_ciphertext_len);
-  for (size_t i = 0; i < oaep_ciphertext_len; i++) {
+  OPENSSL_memcpy(ciphertext, param.oaep_ciphertext, param.oaep_ciphertext_len);
+  for (size_t i = 0; i < param.oaep_ciphertext_len; i++) {
+    SCOPED_TRACE(i);
     ciphertext[i] ^= 1;
-    if (RSA_decrypt(key.get(), &plaintext_len, plaintext, sizeof(plaintext),
-                    ciphertext, oaep_ciphertext_len, RSA_PKCS1_OAEP_PADDING)) {
-      fprintf(stderr, "Corrupt data decrypted!\n");
-      return false;
-    }
+    EXPECT_FALSE(RSA_decrypt(
+        key.get(), &plaintext_len, plaintext, sizeof(plaintext), ciphertext,
+        param.oaep_ciphertext_len, RSA_PKCS1_OAEP_PADDING));
     ERR_clear_error();
     ciphertext[i] ^= 1;
   }
 
   // Test truncated ciphertexts.
-  for (size_t len = 0; len < oaep_ciphertext_len; len++) {
-    if (RSA_decrypt(key.get(), &plaintext_len, plaintext, sizeof(plaintext),
-                    ciphertext, len, RSA_PKCS1_OAEP_PADDING)) {
-      fprintf(stderr, "Corrupt data decrypted!\n");
-      return false;
-    }
+  for (size_t len = 0; len < param.oaep_ciphertext_len; len++) {
+    SCOPED_TRACE(len);
+    EXPECT_FALSE(RSA_decrypt(key.get(), &plaintext_len, plaintext,
+                             sizeof(plaintext), ciphertext, len,
+                             RSA_PKCS1_OAEP_PADDING));
     ERR_clear_error();
   }
-
-  return true;
 }
 
-static bool TestMultiPrimeKey(int nprimes, const uint8_t *der, size_t der_size,
-                              const uint8_t *enc, size_t enc_size) {
-  bssl::UniquePtr<RSA> rsa(d2i_RSAPrivateKey(nullptr, &der, der_size));
-  if (!rsa) {
-    fprintf(stderr, "%d-prime key failed to parse.\n", nprimes);
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
+INSTANTIATE_TEST_CASE_P(, RSAEncryptTest, testing::ValuesIn(kRSAEncryptParams));
 
-  if (!RSA_check_key(rsa.get())) {
-    fprintf(stderr, "RSA_check_key failed for %d-prime key.\n", nprimes);
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
+struct RSAMultiPrimeParam {
+  const uint8_t *der;
+  size_t der_size;
+  const uint8_t *enc;
+  size_t enc_size;
+} kRSAMultiPrimeParams[] = {
+    {kTwoPrimeKey, sizeof(kTwoPrimeKey) - 1, kTwoPrimeEncryptedMessage,
+     sizeof(kTwoPrimeEncryptedMessage)},
+    {kThreePrimeKey, sizeof(kThreePrimeKey) - 1, kThreePrimeEncryptedMessage,
+     sizeof(kThreePrimeEncryptedMessage)},
+    {kSixPrimeKey, sizeof(kSixPrimeKey) - 1, kSixPrimeEncryptedMessage,
+     sizeof(kSixPrimeEncryptedMessage)},
+};
+
+class RSAMultiPrimeTest : public testing::TestWithParam<RSAMultiPrimeParam> {};
+
+TEST_P(RSAMultiPrimeTest, TestDecrypt) {
+  const auto &param = GetParam();
+  bssl::UniquePtr<RSA> rsa(
+      RSA_private_key_from_bytes(param.der, param.der_size));
+  ASSERT_TRUE(rsa);
+
+  EXPECT_TRUE(RSA_check_key(rsa.get()));
 
   uint8_t out[256];
   size_t out_len;
-  if (!RSA_decrypt(rsa.get(), &out_len, out, sizeof(out), enc, enc_size,
-                   RSA_PKCS1_PADDING) ||
-      out_len != 11 ||
-      OPENSSL_memcmp(out, "hello world", 11) != 0) {
-    fprintf(stderr, "%d-prime key failed to decrypt.\n", nprimes);
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
-
-  return true;
+  ASSERT_TRUE(RSA_decrypt(rsa.get(), &out_len, out, sizeof(out), param.enc,
+                          param.enc_size, RSA_PKCS1_PADDING));
+  EXPECT_EQ(Bytes("hello world"), Bytes(out, out_len));
 }
 
-static bool TestMultiPrimeKeygen() {
-  static const char kMessage[] = "Hello world.";
-  static const size_t kBits = 1024;
-  uint8_t encrypted[kBits / 8], decrypted[kBits / 8];
-  size_t encrypted_len, decrypted_len;
+INSTANTIATE_TEST_CASE_P(, RSAMultiPrimeTest,
+                        testing::ValuesIn(kRSAMultiPrimeParams));
 
+TEST(RSATest, MultiPrimeKeygen) {
   bssl::UniquePtr<RSA> rsa(RSA_new());
   bssl::UniquePtr<BIGNUM> e(BN_new());
-  if (!rsa || !e ||
-      !BN_set_word(e.get(), RSA_F4) ||
-      !RSA_generate_multi_prime_key(rsa.get(), kBits, 3, e.get(), nullptr) ||
-      !RSA_check_key(rsa.get()) ||
-      !RSA_encrypt(rsa.get(), &encrypted_len, encrypted, sizeof(encrypted),
-                   (const uint8_t *)kMessage, sizeof(kMessage),
-                   RSA_PKCS1_PADDING) ||
-      !RSA_decrypt(rsa.get(), &decrypted_len, decrypted, sizeof(decrypted),
-                   encrypted, encrypted_len, RSA_PKCS1_PADDING) ||
-      decrypted_len != sizeof(kMessage) ||
-      OPENSSL_memcmp(decrypted, kMessage, sizeof(kMessage)) != 0) {
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
+  ASSERT_TRUE(rsa);
+  ASSERT_TRUE(e);
+  ASSERT_TRUE(BN_set_word(e.get(), RSA_F4));
 
-  return true;
+  // Test key generation.
+  static const size_t kBits = 1024;
+  ASSERT_TRUE(
+      RSA_generate_multi_prime_key(rsa.get(), kBits, 3, e.get(), nullptr));
+  ASSERT_TRUE(RSA_check_key(rsa.get()));
+
+  // Test the key round-trips.
+  static const char kMessage[] = "Hello world.";
+  uint8_t encrypted[kBits / 8], decrypted[kBits / 8];
+  size_t encrypted_len, decrypted_len;
+  ASSERT_TRUE(RSA_encrypt(rsa.get(), &encrypted_len, encrypted,
+                          sizeof(encrypted), (const uint8_t *)kMessage,
+                          sizeof(kMessage), RSA_PKCS1_PADDING));
+  ASSERT_TRUE(RSA_decrypt(rsa.get(), &decrypted_len, decrypted,
+                          sizeof(decrypted), encrypted, encrypted_len,
+                          RSA_PKCS1_PADDING));
+  EXPECT_EQ(Bytes((const uint8_t *)kMessage, sizeof(kMessage)),
+            Bytes(decrypted, decrypted_len));
 }
 
-static bool TestBadKey() {
+TEST(RSATest, BadKey) {
   bssl::UniquePtr<RSA> key(RSA_new());
   bssl::UniquePtr<BIGNUM> e(BN_new());
+  ASSERT_TRUE(key);
+  ASSERT_TRUE(e);
+  ASSERT_TRUE(BN_set_word(e.get(), RSA_F4));
 
-  if (!key || !e || !BN_set_word(e.get(), RSA_F4)) {
-    return false;
-  }
+  // Generate a bad key.
+  ASSERT_TRUE(RSA_generate_key_ex(key.get(), 512, e.get(), nullptr));
+  ASSERT_TRUE(BN_add(key->p, key->p, BN_value_one()));
 
-  if (!RSA_generate_key_ex(key.get(), 512, e.get(), nullptr)) {
-    fprintf(stderr, "RSA_generate_key_ex failed.\n");
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
+  // Bad keys are detected.
+  EXPECT_FALSE(RSA_check_key(key.get()));
 
-  if (!BN_add(key->p, key->p, BN_value_one())) {
-    fprintf(stderr, "BN error.\n");
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
-
-  if (RSA_check_key(key.get())) {
-    fprintf(stderr, "RSA_check_key passed with invalid key!\n");
-    return false;
-  }
-
+  // Bad keys may not be parsed.
   uint8_t *der;
   size_t der_len;
-  if (!RSA_private_key_to_bytes(&der, &der_len, key.get())) {
-    fprintf(stderr, "RSA_private_key_to_bytes failed to serialize bad key\n.");
-    return false;
-  }
+  ASSERT_TRUE(RSA_private_key_to_bytes(&der, &der_len, key.get()));
   bssl::UniquePtr<uint8_t> delete_der(der);
-
   key.reset(RSA_private_key_from_bytes(der, der_len));
-  if (key) {
-    fprintf(stderr, "RSA_private_key_from_bytes accepted bad key\n.");
-  }
-
-  ERR_clear_error();
-  return true;
+  EXPECT_FALSE(key);
 }
 
-static bool TestOnlyDGiven() {
+TEST(RSATest, OnlyDGiven) {
   static const char kN[] =
       "00e77bbf3889d4ef36a9a25d4d69f3f632eb4362214c74517da6d6aeaa9bd09ac42b2662"
       "1cd88f3a6eb013772fc3bf9f83914b6467231c630202c35b3e5808c659";
@@ -716,253 +700,134 @@
       "0365db9eb6d73b53b015c40cd8db4de7dd7035c68b5ac1bf786d7a4ee2cea316eaeca21a"
       "73ac365e58713195f2ae9849348525ca855386b6d028e437a9495a01";
 
-  uint8_t buf[64];
-  unsigned buf_len = sizeof(buf);
   bssl::UniquePtr<RSA> key(RSA_new());
-  if (!key ||
-      !BN_hex2bn(&key->n, kN) ||
-      !BN_hex2bn(&key->e, kE) ||
-      !BN_hex2bn(&key->d, kD) ||
-      RSA_size(key.get()) > sizeof(buf)) {
-    return false;
-  }
+  ASSERT_TRUE(key);
+  ASSERT_TRUE(BN_hex2bn(&key->n, kN));
+  ASSERT_TRUE(BN_hex2bn(&key->e, kE));
+  ASSERT_TRUE(BN_hex2bn(&key->d, kD));
 
-  if (!RSA_check_key(key.get())) {
-    fprintf(stderr, "RSA_check_key failed with only n, d, and e given.\n");
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
+  // Keys with only n, e, and d are functional.
+  EXPECT_TRUE(RSA_check_key(key.get()));
 
   const uint8_t kDummyHash[16] = {0};
-
-  if (!RSA_sign(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, &buf_len,
-                key.get())) {
-    fprintf(stderr, "RSA_sign failed with only n, d, and e given.\n");
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
-
-  if (!RSA_verify(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, buf_len,
-                  key.get())) {
-    fprintf(stderr, "RSA_verify failed with only n, d, and e given.\n");
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
+  uint8_t buf[64];
+  unsigned buf_len = sizeof(buf);
+  ASSERT_LE(RSA_size(key.get()), sizeof(buf));
+  EXPECT_TRUE(RSA_sign(NID_sha256, kDummyHash, sizeof(kDummyHash), buf,
+                       &buf_len, key.get()));
+  EXPECT_TRUE(RSA_verify(NID_sha256, kDummyHash, sizeof(kDummyHash), buf,
+                         buf_len, key.get()));
 
   // Keys without the public exponent must continue to work when blinding is
   // disabled to support Java's RSAPrivateKeySpec API. See
   // https://bugs.chromium.org/p/boringssl/issues/detail?id=12.
   bssl::UniquePtr<RSA> key2(RSA_new());
-  if (!key2 ||
-      !BN_hex2bn(&key2->n, kN) ||
-      !BN_hex2bn(&key2->d, kD)) {
-    return false;
-  }
+  ASSERT_TRUE(key2);
+  ASSERT_TRUE(BN_hex2bn(&key2->n, kN));
+  ASSERT_TRUE(BN_hex2bn(&key2->d, kD));
   key2->flags |= RSA_FLAG_NO_BLINDING;
 
-  if (RSA_size(key2.get()) > sizeof(buf)) {
-    return false;
-  }
-
-  if (!RSA_sign(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, &buf_len,
-                key2.get())) {
-    fprintf(stderr, "RSA_sign failed with only n and d given.\n");
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
+  ASSERT_LE(RSA_size(key2.get()), sizeof(buf));
+  EXPECT_TRUE(RSA_sign(NID_sha256, kDummyHash, sizeof(kDummyHash), buf,
+                       &buf_len, key2.get()));
 
   // Verify the signature with |key|. |key2| has no public exponent.
-  if (!RSA_verify(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, buf_len,
-                  key.get())) {
-    fprintf(stderr,
-            "Could not verify signature produced from key with only n and d "
-            "given.\n");
-    ERR_print_errors_fp(stderr);
-    return false;
-  }
-
-  return true;
+  EXPECT_TRUE(RSA_verify(NID_sha256, kDummyHash, sizeof(kDummyHash), buf,
+                         buf_len, key.get()));
 }
 
-static bool TestRecoverCRTParams() {
+TEST(RSATest, RecoverCRTParams) {
   bssl::UniquePtr<BIGNUM> e(BN_new());
-  if (!e || !BN_set_word(e.get(), RSA_F4)) {
-    return false;
-  }
+  ASSERT_TRUE(e);
+  ASSERT_TRUE(BN_set_word(e.get(), RSA_F4));
 
-  ERR_clear_error();
+  bssl::UniquePtr<RSA> key1(RSA_new());
+  ASSERT_TRUE(key1);
+  ASSERT_TRUE(RSA_generate_key_ex(key1.get(), 512, e.get(), nullptr));
 
-  for (unsigned i = 0; i < 1; i++) {
-    bssl::UniquePtr<RSA> key1(RSA_new());
-    if (!key1 ||
-        !RSA_generate_key_ex(key1.get(), 512, e.get(), nullptr)) {
-      fprintf(stderr, "RSA_generate_key_ex failed.\n");
-      ERR_print_errors_fp(stderr);
-      return false;
-    }
+  EXPECT_TRUE(RSA_check_key(key1.get()));
 
-    if (!RSA_check_key(key1.get())) {
-      fprintf(stderr, "RSA_check_key failed with original key.\n");
-      ERR_print_errors_fp(stderr);
-      return false;
-    }
+  // Create a copy of the key without CRT parameters.
+  bssl::UniquePtr<RSA> key2(RSA_new());
+  ASSERT_TRUE(key2);
+  key2->n = BN_dup(key1->n);
+  key2->e = BN_dup(key1->e);
+  key2->d = BN_dup(key1->d);
+  ASSERT_TRUE(key2->n);
+  ASSERT_TRUE(key2->e);
+  ASSERT_TRUE(key2->d);
 
-    bssl::UniquePtr<RSA> key2(RSA_new());
-    if (!key2) {
-      return false;
-    }
-    key2->n = BN_dup(key1->n);
-    key2->e = BN_dup(key1->e);
-    key2->d = BN_dup(key1->d);
-    if (key2->n == nullptr || key2->e == nullptr || key2->d == nullptr) {
-      return false;
-    }
+  ASSERT_TRUE(RSA_recover_crt_params(key2.get()));
 
-    if (!RSA_recover_crt_params(key2.get())) {
-      fprintf(stderr, "RSA_recover_crt_params failed.\n");
-      ERR_print_errors_fp(stderr);
-      return false;
-    }
+  // The recovered RSA parameters should work.
+  EXPECT_TRUE(RSA_check_key(key2.get()));
 
-    uint8_t buf[128];
-    unsigned buf_len = sizeof(buf);
-    if (RSA_size(key2.get()) > buf_len) {
-      return false;
-    }
+  uint8_t buf[128];
+  unsigned buf_len = sizeof(buf);
+  ASSERT_LE(RSA_size(key2.get()), buf_len);
 
-    if (!RSA_check_key(key2.get())) {
-      fprintf(stderr, "RSA_check_key failed with recovered key.\n");
-      ERR_print_errors_fp(stderr);
-      return false;
-    }
-
-    const uint8_t kDummyHash[16] = {0};
-    if (!RSA_sign(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, &buf_len,
-                  key2.get())) {
-      fprintf(stderr, "RSA_sign failed with recovered key.\n");
-      ERR_print_errors_fp(stderr);
-      return false;
-    }
-
-    if (!RSA_verify(NID_sha256, kDummyHash, sizeof(kDummyHash), buf, buf_len,
-                    key2.get())) {
-      fprintf(stderr, "RSA_verify failed with recovered key.\n");
-      ERR_print_errors_fp(stderr);
-      return false;
-    }
-  }
-
-  return true;
+  const uint8_t kDummyHash[16] = {0};
+  EXPECT_TRUE(RSA_sign(NID_sha256, kDummyHash, sizeof(kDummyHash), buf,
+                       &buf_len, key2.get()));
+  EXPECT_TRUE(RSA_verify(NID_sha256, kDummyHash, sizeof(kDummyHash), buf,
+                         buf_len, key2.get()));
 }
 
-static bool TestASN1() {
+TEST(RSATest, ASN1) {
   // Test that private keys may be decoded.
-  bssl::UniquePtr<RSA> rsa(RSA_private_key_from_bytes(kKey1, sizeof(kKey1) - 1));
-  if (!rsa) {
-    return false;
-  }
+  bssl::UniquePtr<RSA> rsa(
+      RSA_private_key_from_bytes(kKey1, sizeof(kKey1) - 1));
+  ASSERT_TRUE(rsa);
 
   // Test that the serialization round-trips.
   uint8_t *der;
   size_t der_len;
-  if (!RSA_private_key_to_bytes(&der, &der_len, rsa.get())) {
-    return false;
-  }
+  ASSERT_TRUE(RSA_private_key_to_bytes(&der, &der_len, rsa.get()));
   bssl::UniquePtr<uint8_t> delete_der(der);
-  if (der_len != sizeof(kKey1) - 1 ||
-      OPENSSL_memcmp(der, kKey1, der_len) != 0) {
-    return false;
-  }
+  EXPECT_EQ(Bytes(kKey1, sizeof(kKey1) - 1), Bytes(der, der_len));
 
   // Test that serializing public keys works.
-  if (!RSA_public_key_to_bytes(&der, &der_len, rsa.get())) {
-    return false;
-  }
+  ASSERT_TRUE(RSA_public_key_to_bytes(&der, &der_len, rsa.get()));
   delete_der.reset(der);
 
   // Public keys may be parsed back out.
   rsa.reset(RSA_public_key_from_bytes(der, der_len));
-  if (!rsa || rsa->p != NULL || rsa->q != NULL) {
-    return false;
-  }
+  ASSERT_TRUE(rsa);
+  EXPECT_FALSE(rsa->p);
+  EXPECT_FALSE(rsa->q);
 
   // Serializing the result round-trips.
   uint8_t *der2;
   size_t der2_len;
-  if (!RSA_public_key_to_bytes(&der2, &der2_len, rsa.get())) {
-    return false;
-  }
+  ASSERT_TRUE(RSA_public_key_to_bytes(&der2, &der2_len, rsa.get()));
   bssl::UniquePtr<uint8_t> delete_der2(der2);
-  if (der_len != der2_len || OPENSSL_memcmp(der, der2, der_len) != 0) {
-    return false;
-  }
+  EXPECT_EQ(Bytes(der, der_len), Bytes(der2, der2_len));
 
   // Public keys cannot be serialized as private keys.
-  if (RSA_private_key_to_bytes(&der, &der_len, rsa.get())) {
+  int ok = RSA_private_key_to_bytes(&der, &der_len, rsa.get());
+  if (ok) {
     OPENSSL_free(der);
-    return false;
   }
+  EXPECT_FALSE(ok);
   ERR_clear_error();
 
   // Public keys with negative moduli are invalid.
   rsa.reset(RSA_public_key_from_bytes(kEstonianRSAKey,
                                       sizeof(kEstonianRSAKey)));
-  if (rsa) {
-    return false;
-  }
+  EXPECT_FALSE(rsa);
   ERR_clear_error();
 
   // But |RSA_parse_public_key_buggy| will accept it.
   CBS cbs;
   CBS_init(&cbs, kEstonianRSAKey, sizeof(kEstonianRSAKey));
   rsa.reset(RSA_parse_public_key_buggy(&cbs));
-  if (!rsa || CBS_len(&cbs) != 0) {
-    return false;
-  }
-
-  return true;
+  EXPECT_TRUE(rsa);
+  EXPECT_EQ(0u, CBS_len(&cbs));
 }
 
-static bool TestBadExponent() {
-  bssl::UniquePtr<RSA> rsa(RSA_public_key_from_bytes(kExponent1RSAKey,
-                                          sizeof(kExponent1RSAKey)));
-
-  if (rsa) {
-    fprintf(stderr, "kExponent1RSAKey parsed but should have failed.\n");
-    return false;
-  }
-
+TEST(RSATest, BadExponent) {
+  bssl::UniquePtr<RSA> rsa(
+      RSA_public_key_from_bytes(kExponent1RSAKey, sizeof(kExponent1RSAKey)));
+  EXPECT_FALSE(rsa);
   ERR_clear_error();
-  return true;
-}
-
-int main(int argc, char *argv[]) {
-  CRYPTO_library_init();
-
-  if (!TestRSA(kKey1, sizeof(kKey1) - 1, kOAEPCiphertext1,
-               sizeof(kOAEPCiphertext1) - 1) ||
-      !TestRSA(kKey2, sizeof(kKey2) - 1, kOAEPCiphertext2,
-               sizeof(kOAEPCiphertext2) - 1) ||
-      !TestRSA(kKey3, sizeof(kKey3) - 1, kOAEPCiphertext3,
-               sizeof(kOAEPCiphertext3) - 1) ||
-      !TestOnlyDGiven() ||
-      !TestRecoverCRTParams() ||
-      !TestBadKey() ||
-      !TestMultiPrimeKey(2, kTwoPrimeKey, sizeof(kTwoPrimeKey) - 1,
-                            kTwoPrimeEncryptedMessage,
-                            sizeof(kTwoPrimeEncryptedMessage)) ||
-      !TestMultiPrimeKey(3, kThreePrimeKey, sizeof(kThreePrimeKey) - 1,
-                            kThreePrimeEncryptedMessage,
-                            sizeof(kThreePrimeEncryptedMessage)) ||
-      !TestMultiPrimeKey(6, kSixPrimeKey, sizeof(kSixPrimeKey) - 1,
-                            kSixPrimeEncryptedMessage,
-                            sizeof(kSixPrimeEncryptedMessage)) ||
-      !TestMultiPrimeKeygen() ||
-      !TestASN1() ||
-      !TestBadExponent()) {
-    return 1;
-  }
-
-  printf("PASS\n");
-  return 0;
 }
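
Note on the rsa_test.cc hunks above: they convert the old main()-driven harness
into GTest, using value-parameterized tests so each key/ciphertext pair runs as
its own test case. A minimal, self-contained sketch of that pattern follows;
the names (MyParam, kParams, MyTest) are illustrative, not from this tree:

    #include <gtest/gtest.h>

    struct MyParam {
      int input;
      int expected;
    };

    static const MyParam kParams[] = {{1, 2}, {2, 4}};

    class MyTest : public testing::TestWithParam<MyParam> {};

    TEST_P(MyTest, Doubles) {
      const MyParam &param = GetParam();
      // SCOPED_TRACE tags any failure with the parameter in use, the same
      // trick the corrupt/truncated-ciphertext loops above rely on.
      SCOPED_TRACE(param.input);
      EXPECT_EQ(param.expected, param.input * 2);
    }

    // An empty first argument gives the instantiation no prefix, matching
    // the INSTANTIATE_TEST_CASE_P calls in rsa_test.cc.
    INSTANTIATE_TEST_CASE_P(, MyTest, testing::ValuesIn(kParams));
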
diff --git a/src/crypto/sha/asm/sha1-586.pl b/src/crypto/sha/asm/sha1-586.pl
index e815e2b..acf383d 100644
--- a/src/crypto/sha/asm/sha1-586.pl
+++ b/src/crypto/sha/asm/sha1-586.pl
@@ -97,10 +97,12 @@
 # Sandy Bridge	8.8		6.2/+40%	5.1(**)/+73%
 # Ivy Bridge	7.2		4.8/+51%	4.7(**)/+53%
 # Haswell	6.5		4.3/+51%	4.1(**)/+58%
+# Skylake	6.4		4.1/+55%	4.1(**)/+55%
 # Bulldozer	11.6		6.0/+92%
 # VIA Nano	10.6		7.5/+41%
 # Atom		12.5		9.3(*)/+35%
 # Silvermont	14.5		9.9(*)/+46%
+# Goldmont	8.8		6.7/+30%	1.7(***)/+415%
 #
 # (*)	Loop is 1056 instructions long and expected result is ~8.25.
 #	The discrepancy is because of front-end limitations, so
@@ -108,6 +110,8 @@
 #	limited parallelism.
 #
 # (**)	As per above comment, the result is for AVX *plus* sh[rl]d.
+#
+# (***)	SHAEXT result
 
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
diff --git a/src/crypto/sha/asm/sha1-x86_64.pl b/src/crypto/sha/asm/sha1-x86_64.pl
old mode 100644
new mode 100755
index ff960bb..9a13f6c
--- a/src/crypto/sha/asm/sha1-x86_64.pl
+++ b/src/crypto/sha/asm/sha1-x86_64.pl
@@ -73,13 +73,16 @@
 # Sandy Bridge	7.70		6.10/+26%	4.99/+54%
 # Ivy Bridge	6.06		4.67/+30%	4.60/+32%
 # Haswell	5.45		4.15/+31%	3.57/+53%
+# Skylake	5.18		4.06/+28%	3.54/+46%
 # Bulldozer	9.11		5.95/+53%
 # VIA Nano	9.32		7.15/+30%
 # Atom		10.3		9.17/+12%
 # Silvermont	13.1(*)		9.37/+40%
+# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
 #
 # (*)	obviously suboptimal result, nothing was done about it,
 #	because SSSE3 code is compiled unconditionally;
+# (**)	SHAEXT result
 
 $flavour = shift;
 $output  = shift;
@@ -246,7 +249,7 @@
 	jz	.Lialu
 ___
 $code.=<<___ if ($shaext);
-	test	\$`1<<29`,%r10d		# check SHA bit	
+	test	\$`1<<29`,%r10d		# check SHA bit
 	jnz	_shaext_shortcut
 ___
 $code.=<<___ if ($avx>1);
@@ -444,7 +447,8 @@
 my @T=("%esi","%edi");
 my $j=0;
 my $rx=0;
-my $K_XX_XX="%r11";
+my $K_XX_XX="%r14";
+my $fp="%r11";
 
 my $_rol=sub { &rol(@_) };
 my $_ror=sub { &ror(@_) };
@@ -465,7 +469,7 @@
 .align	16
 sha1_block_data_order_ssse3:
 _ssse3_shortcut:
-	mov	%rsp,%rax
+	mov	%rsp,$fp	# frame pointer
 	push	%rbx
 	push	%rbp
 	push	%r12
@@ -474,16 +478,15 @@
 	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
 ___
 $code.=<<___ if ($win64);
-	movaps	%xmm6,-40-6*16(%rax)
-	movaps	%xmm7,-40-5*16(%rax)
-	movaps	%xmm8,-40-4*16(%rax)
-	movaps	%xmm9,-40-3*16(%rax)
-	movaps	%xmm10,-40-2*16(%rax)
-	movaps	%xmm11,-40-1*16(%rax)
+	movaps	%xmm6,-40-6*16($fp)
+	movaps	%xmm7,-40-5*16($fp)
+	movaps	%xmm8,-40-4*16($fp)
+	movaps	%xmm9,-40-3*16($fp)
+	movaps	%xmm10,-40-2*16($fp)
+	movaps	%xmm11,-40-1*16($fp)
 .Lprologue_ssse3:
 ___
 $code.=<<___;
-	mov	%rax,%r14	# original %rsp
 	and	\$-64,%rsp
 	mov	%rdi,$ctx	# reassigned argument
 	mov	%rsi,$inp	# reassigned argument
@@ -890,21 +893,20 @@
 	mov	$E,16($ctx)
 ___
 $code.=<<___ if ($win64);
-	movaps	-40-6*16(%r14),%xmm6
-	movaps	-40-5*16(%r14),%xmm7
-	movaps	-40-4*16(%r14),%xmm8
-	movaps	-40-3*16(%r14),%xmm9
-	movaps	-40-2*16(%r14),%xmm10
-	movaps	-40-1*16(%r14),%xmm11
+	movaps	-40-6*16($fp),%xmm6
+	movaps	-40-5*16($fp),%xmm7
+	movaps	-40-4*16($fp),%xmm8
+	movaps	-40-3*16($fp),%xmm9
+	movaps	-40-2*16($fp),%xmm10
+	movaps	-40-1*16($fp),%xmm11
 ___
 $code.=<<___;
-	lea	(%r14),%rsi
-	mov	-40(%rsi),%r14
-	mov	-32(%rsi),%r13
-	mov	-24(%rsi),%r12
-	mov	-16(%rsi),%rbp
-	mov	-8(%rsi),%rbx
-	lea	(%rsi),%rsp
+	mov	-40($fp),%r14
+	mov	-32($fp),%r13
+	mov	-24($fp),%r12
+	mov	-16($fp),%rbp
+	mov	-8($fp),%rbx
+	lea	($fp),%rsp
 .Lepilogue_ssse3:
 	ret
 .size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
@@ -927,7 +929,7 @@
 .align	16
 sha1_block_data_order_avx:
 _avx_shortcut:
-	mov	%rsp,%rax
+	mov	%rsp,$fp
 	push	%rbx
 	push	%rbp
 	push	%r12
@@ -937,16 +939,15 @@
 	vzeroupper
 ___
 $code.=<<___ if ($win64);
-	vmovaps	%xmm6,-40-6*16(%rax)
-	vmovaps	%xmm7,-40-5*16(%rax)
-	vmovaps	%xmm8,-40-4*16(%rax)
-	vmovaps	%xmm9,-40-3*16(%rax)
-	vmovaps	%xmm10,-40-2*16(%rax)
-	vmovaps	%xmm11,-40-1*16(%rax)
+	vmovaps	%xmm6,-40-6*16($fp)
+	vmovaps	%xmm7,-40-5*16($fp)
+	vmovaps	%xmm8,-40-4*16($fp)
+	vmovaps	%xmm9,-40-3*16($fp)
+	vmovaps	%xmm10,-40-2*16($fp)
+	vmovaps	%xmm11,-40-1*16($fp)
 .Lprologue_avx:
 ___
 $code.=<<___;
-	mov	%rax,%r14	# original %rsp
 	and	\$-64,%rsp
 	mov	%rdi,$ctx	# reassigned argument
 	mov	%rsi,$inp	# reassigned argument
@@ -1254,21 +1255,20 @@
 	mov	$E,16($ctx)
 ___
 $code.=<<___ if ($win64);
-	movaps	-40-6*16(%r14),%xmm6
-	movaps	-40-5*16(%r14),%xmm7
-	movaps	-40-4*16(%r14),%xmm8
-	movaps	-40-3*16(%r14),%xmm9
-	movaps	-40-2*16(%r14),%xmm10
-	movaps	-40-1*16(%r14),%xmm11
+	movaps	-40-6*16($fp),%xmm6
+	movaps	-40-5*16($fp),%xmm7
+	movaps	-40-4*16($fp),%xmm8
+	movaps	-40-3*16($fp),%xmm9
+	movaps	-40-2*16($fp),%xmm10
+	movaps	-40-1*16($fp),%xmm11
 ___
 $code.=<<___;
-	lea	(%r14),%rsi
-	mov	-40(%rsi),%r14
-	mov	-32(%rsi),%r13
-	mov	-24(%rsi),%r12
-	mov	-16(%rsi),%rbp
-	mov	-8(%rsi),%rbx
-	lea	(%rsi),%rsp
+	mov	-40($fp),%r14
+	mov	-32($fp),%r13
+	mov	-24($fp),%r12
+	mov	-16($fp),%rbp
+	mov	-8($fp),%rbx
+	lea	($fp),%rsp
 .Lepilogue_avx:
 	ret
 .size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
@@ -1294,7 +1294,7 @@
 .align	16
 sha1_block_data_order_avx2:
 _avx2_shortcut:
-	mov	%rsp,%rax
+	mov	%rsp,$fp
 	push	%rbx
 	push	%rbp
 	push	%r12
@@ -1304,16 +1304,15 @@
 ___
 $code.=<<___ if ($win64);
 	lea	-6*16(%rsp),%rsp
-	vmovaps	%xmm6,-40-6*16(%rax)
-	vmovaps	%xmm7,-40-5*16(%rax)
-	vmovaps	%xmm8,-40-4*16(%rax)
-	vmovaps	%xmm9,-40-3*16(%rax)
-	vmovaps	%xmm10,-40-2*16(%rax)
-	vmovaps	%xmm11,-40-1*16(%rax)
+	vmovaps	%xmm6,-40-6*16($fp)
+	vmovaps	%xmm7,-40-5*16($fp)
+	vmovaps	%xmm8,-40-4*16($fp)
+	vmovaps	%xmm9,-40-3*16($fp)
+	vmovaps	%xmm10,-40-2*16($fp)
+	vmovaps	%xmm11,-40-1*16($fp)
 .Lprologue_avx2:
 ___
 $code.=<<___;
-	mov	%rax,%r14		# original %rsp
 	mov	%rdi,$ctx		# reassigned argument
 	mov	%rsi,$inp		# reassigned argument
 	mov	%rdx,$num		# reassigned argument
@@ -1733,21 +1732,20 @@
 	vzeroupper
 ___
 $code.=<<___ if ($win64);
-	movaps	-40-6*16(%r14),%xmm6
-	movaps	-40-5*16(%r14),%xmm7
-	movaps	-40-4*16(%r14),%xmm8
-	movaps	-40-3*16(%r14),%xmm9
-	movaps	-40-2*16(%r14),%xmm10
-	movaps	-40-1*16(%r14),%xmm11
+	movaps	-40-6*16($fp),%xmm6
+	movaps	-40-5*16($fp),%xmm7
+	movaps	-40-4*16($fp),%xmm8
+	movaps	-40-3*16($fp),%xmm9
+	movaps	-40-2*16($fp),%xmm10
+	movaps	-40-1*16($fp),%xmm11
 ___
 $code.=<<___;
-	lea	(%r14),%rsi
-	mov	-40(%rsi),%r14
-	mov	-32(%rsi),%r13
-	mov	-24(%rsi),%r12
-	mov	-16(%rsi),%rbp
-	mov	-8(%rsi),%rbx
-	lea	(%rsi),%rsp
+	mov	-40($fp),%r14
+	mov	-32($fp),%r13
+	mov	-24($fp),%r12
+	mov	-16($fp),%rbp
+	mov	-8($fp),%rbx
+	lea	($fp),%rsp
 .Lepilogue_avx2:
 	ret
 .size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
@@ -1890,15 +1888,13 @@
 	cmp	%r10,%rbx		# context->Rip<prologue label
 	jb	.Lcommon_seh_tail
 
-	mov	152($context),%rax	# pull context->Rsp
+	mov	208($context),%rax	# pull context->R11
 
 	mov	4(%r11),%r10d		# HandlerData[1]
 	lea	(%rsi,%r10),%r10	# epilogue label
 	cmp	%r10,%rbx		# context->Rip>=epilogue label
 	jae	.Lcommon_seh_tail
 
-	mov	232($context),%rax	# pull context->R14
-
 	lea	-40-6*16(%rax),%rsi
 	lea	512($context),%rdi	# &context.Xmm6
 	mov	\$12,%ecx
diff --git a/src/crypto/sha/asm/sha256-586.pl b/src/crypto/sha/asm/sha256-586.pl
index 8f4311b..d85004c 100644
--- a/src/crypto/sha/asm/sha256-586.pl
+++ b/src/crypto/sha/asm/sha256-586.pl
@@ -40,7 +40,7 @@
 #
 # Performance in clock cycles per processed byte (less is better):
 #
-#		gcc	icc	x86 asm(*)	SIMD	x86_64 asm(**)	
+#		gcc	icc	x86 asm(*)	SIMD	x86_64 asm(**)
 # Pentium	46	57	40/38		-	-
 # PIII		36	33	27/24		-	-
 # P4		41	38	28		-	17.3
@@ -50,14 +50,17 @@
 # Sandy Bridge	25	-	15.9		12.4	11.6
 # Ivy Bridge	24	-	15.0		11.4	10.3
 # Haswell	22	-	13.9		9.46	7.80
+# Skylake	20	-	14.9		9.50	7.70
 # Bulldozer	36	-	27/22		17.0	13.6
 # VIA Nano	36	-	25/22		16.8	16.5
 # Atom		50	-	30/25		21.9	18.9
 # Silvermont	40	-	34/31		22.9	20.6
+# Goldmont	29	-	20		16.3(***)
 #
 # (*)	numbers after slash are for unrolled loop, where applicable;
 # (**)	x86_64 assembly performance is presented for reference
 #	purposes, results are best-available;
+# (***)	SHAEXT result is 4.1, strangely enough better than 64-bit one;
 
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
@@ -263,7 +266,7 @@
 	&mov	($Coff,"ecx");
 	&mov	($Doff,"edi");
 	&mov	(&DWP(0,"esp"),"ebx");	# magic
-	&mov	($E,&DWP(16,"esi"));	
+	&mov	($E,&DWP(16,"esi"));
 	&mov	("ebx",&DWP(20,"esi"));
 	&mov	("ecx",&DWP(24,"esi"));
 	&mov	("edi",&DWP(28,"esi"));
@@ -372,7 +375,7 @@
 	&xor	($AH[1],"ecx");		# magic
 	&mov	(&DWP(8,"esp"),"ecx");
 	&mov	(&DWP(12,"esp"),"ebx");
-	&mov	($E,&DWP(16,"esi"));	
+	&mov	($E,&DWP(16,"esi"));
 	&mov	("ebx",&DWP(20,"esi"));
 	&mov	("ecx",&DWP(24,"esi"));
 	&mov	("esi",&DWP(28,"esi"));
diff --git a/src/crypto/sha/asm/sha512-586.pl b/src/crypto/sha/asm/sha512-586.pl
index d0f9101..6d909ed 100644
--- a/src/crypto/sha/asm/sha512-586.pl
+++ b/src/crypto/sha/asm/sha512-586.pl
@@ -25,10 +25,12 @@
 # Sandy Bridge	58	-	35	11.9	11.2
 # Ivy Bridge	50	-	33	11.5	8.17
 # Haswell	46	-	29	11.3	7.66
+# Skylake	40	-	26	13.3	7.25
 # Bulldozer	121	-	50	14.0	13.5
 # VIA Nano	91	-	52	33	14.7
 # Atom		126	-	68	48(***)	14.7
 # Silvermont	97	-	58	42(***)	17.5
+# Goldmont	80	-	48	19.5	12.0
 #
 # (*)	whichever best applicable.
 # (**)	x86_64 assembler performance is presented for reference
@@ -376,7 +378,7 @@
 
 &set_label("16_79_sse2",16);
     for ($j=0;$j<2;$j++) {			# 2x unroll
-	#&movq	("mm7",&QWP(8*(9+16-1),"esp"));	# prefetched in BODY_00_15 
+	#&movq	("mm7",&QWP(8*(9+16-1),"esp"));	# prefetched in BODY_00_15
 	&movq	("mm5",&QWP(8*(9+16-14),"esp"));
 	&movq	("mm1","mm7");
 	&psrlq	("mm7",1);
diff --git a/src/crypto/sha/asm/sha512-armv8.pl b/src/crypto/sha/asm/sha512-armv8.pl
index 75d4043..494e633 100644
--- a/src/crypto/sha/asm/sha512-armv8.pl
+++ b/src/crypto/sha/asm/sha512-armv8.pl
@@ -18,7 +18,7 @@
 # Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
 # Denver	2.01		10.5 (+26%)	6.70 (+8%)
 # X-Gene			20.0 (+100%)	12.8 (+300%(***))
-# 
+#
 # (*)	Software SHA256 results are of lesser relevance, presented
 #	mostly for informational purposes.
 # (**)	The result is a trade-off: it's possible to improve it by
diff --git a/src/crypto/sha/asm/sha512-x86_64.pl b/src/crypto/sha/asm/sha512-x86_64.pl
old mode 100644
new mode 100755
index 186aa9a..5716791
--- a/src/crypto/sha/asm/sha512-x86_64.pl
+++ b/src/crypto/sha/asm/sha512-x86_64.pl
@@ -34,7 +34,7 @@
 # level parallelism, on a given CPU implementation in this case.
 #
 # Special note on Intel EM64T. While Opteron CPU exhibits perfect
-# perfromance ratio of 1.5 between 64- and 32-bit flavors [see above],
+# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
 # [currently available] EM64T CPUs apparently are far from it. On the
 # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
 # sha256_block:-( This is presumably because 64-bit shifts/rotates
@@ -86,12 +86,14 @@
 # Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
 # Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
 # Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
+# Skylake	11.4	9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
 # Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
 # VIA Nano	23.0	16.5(+39%)  -		    14.7    -
 # Atom		23.0	18.9(+22%)  -		    14.7    -
 # Silvermont	27.4	20.6(+33%)  -               17.5    -
+# Goldmont	18.9	14.3(+32%)  4.16(+350%)     12.0    -
 #
-# (*)	whichever best applicable;
+# (*)	whichever best applicable, including SHAEXT;
 # (**)	switch from ror to shrd stands for fair share of improvement;
 # (***)	execution time is fully determined by remaining integer-only
 #	part, body_00_15; reducing the amount of SIMD instructions
@@ -284,13 +286,13 @@
 	jnz	.Lssse3_shortcut
 ___
 $code.=<<___;
+	mov	%rsp,%rax		# copy %rsp
 	push	%rbx
 	push	%rbp
 	push	%r12
 	push	%r13
 	push	%r14
 	push	%r15
-	mov	%rsp,%r11		# copy %rsp
 	shl	\$4,%rdx		# num*16
 	sub	\$$framesz,%rsp
 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
@@ -298,7 +300,7 @@
 	mov	$ctx,$_ctx		# save ctx, 1st arg
 	mov	$inp,$_inp		# save inp, 2nd arg
 	mov	%rdx,$_end		# save end pointer, "3rd" arg
-	mov	%r11,$_rsp		# save copy of %rsp
+	mov	%rax,$_rsp		# save copy of %rsp
 .Lprologue:
 
 	mov	$SZ*0($ctx),$A
@@ -365,13 +367,13 @@
 	jb	.Lloop
 
 	mov	$_rsp,%rsi
-	mov	(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
 .Lepilogue:
 	ret
 .size	$func,.-$func
@@ -744,13 +746,13 @@
 .align	64
 ${func}_ssse3:
 .Lssse3_shortcut:
+	mov	%rsp,%rax		# copy %rsp
 	push	%rbx
 	push	%rbp
 	push	%r12
 	push	%r13
 	push	%r14
 	push	%r15
-	mov	%rsp,%r11		# copy %rsp
 	shl	\$4,%rdx		# num*16
 	sub	\$`$framesz+$win64*16*4`,%rsp
 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
@@ -758,7 +760,7 @@
 	mov	$ctx,$_ctx		# save ctx, 1st arg
 	mov	$inp,$_inp		# save inp, 2nd arg
 	mov	%rdx,$_end		# save end pointer, "3rd" arg
-	mov	%r11,$_rsp		# save copy of %rsp
+	mov	%rax,$_rsp		# save copy of %rsp
 ___
 $code.=<<___ if ($win64);
 	movaps	%xmm6,16*$SZ+32(%rsp)
@@ -1065,13 +1067,13 @@
 	movaps	16*$SZ+80(%rsp),%xmm9
 ___
 $code.=<<___;
-	mov	(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
 .Lepilogue_ssse3:
 	ret
 .size	${func}_ssse3,.-${func}_ssse3
@@ -1088,13 +1090,13 @@
 .align	64
 ${func}_xop:
 .Lxop_shortcut:
+	mov	%rsp,%rax		# copy %rsp
 	push	%rbx
 	push	%rbp
 	push	%r12
 	push	%r13
 	push	%r14
 	push	%r15
-	mov	%rsp,%r11		# copy %rsp
 	shl	\$4,%rdx		# num*16
 	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
@@ -1102,7 +1104,7 @@
 	mov	$ctx,$_ctx		# save ctx, 1st arg
 	mov	$inp,$_inp		# save inp, 2nd arg
 	mov	%rdx,$_end		# save end pointer, "3rd" arg
-	mov	%r11,$_rsp		# save copy of %rsp
+	mov	%rax,$_rsp		# save copy of %rsp
 ___
 $code.=<<___ if ($win64);
 	movaps	%xmm6,16*$SZ+32(%rsp)
@@ -1442,13 +1444,13 @@
 	movaps	16*$SZ+112(%rsp),%xmm11
 ___
 $code.=<<___;
-	mov	(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
 .Lepilogue_xop:
 	ret
 .size	${func}_xop,.-${func}_xop
@@ -1464,13 +1466,13 @@
 .align	64
 ${func}_avx:
 .Lavx_shortcut:
+	mov	%rsp,%rax		# copy %rsp
 	push	%rbx
 	push	%rbp
 	push	%r12
 	push	%r13
 	push	%r14
 	push	%r15
-	mov	%rsp,%r11		# copy %rsp
 	shl	\$4,%rdx		# num*16
 	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
 	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
@@ -1478,7 +1480,7 @@
 	mov	$ctx,$_ctx		# save ctx, 1st arg
 	mov	$inp,$_inp		# save inp, 2nd arg
 	mov	%rdx,$_end		# save end pointer, "3rd" arg
-	mov	%r11,$_rsp		# save copy of %rsp
+	mov	%rax,$_rsp		# save copy of %rsp
 ___
 $code.=<<___ if ($win64);
 	movaps	%xmm6,16*$SZ+32(%rsp)
@@ -1750,13 +1752,13 @@
 	movaps	16*$SZ+112(%rsp),%xmm11
 ___
 $code.=<<___;
-	mov	(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
 .Lepilogue_avx:
 	ret
 .size	${func}_avx,.-${func}_avx
@@ -1766,7 +1768,7 @@
 ######################################################################
 # AVX2+BMI code path
 #
-my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp 
+my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
 my $PUSH8=8*2*$SZ;
 use integer;
 
@@ -1815,13 +1817,13 @@
 .align	64
 ${func}_avx2:
 .Lavx2_shortcut:
+	mov	%rsp,%rax		# copy %rsp
 	push	%rbx
 	push	%rbp
 	push	%r12
 	push	%r13
 	push	%r14
 	push	%r15
-	mov	%rsp,%r11		# copy %rsp
 	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
 	shl	\$4,%rdx		# num*16
 	and	\$-256*$SZ,%rsp		# align stack frame
@@ -1830,7 +1832,7 @@
 	mov	$ctx,$_ctx		# save ctx, 1st arg
 	mov	$inp,$_inp		# save inp, 2nd arg
 	mov	%rdx,$_end		# save end pointer, "3rd" arg
-	mov	%r11,$_rsp		# save copy of %rsp
+	mov	%rax,$_rsp		# save copy of %rsp
 ___
 $code.=<<___ if ($win64);
 	movaps	%xmm6,16*$SZ+32(%rsp)
@@ -2124,13 +2126,13 @@
 	movaps	16*$SZ+112(%rsp),%xmm11
 ___
 $code.=<<___;
-	mov	(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
 .Lepilogue_avx2:
 	ret
 .size	${func}_avx2,.-${func}_avx2
@@ -2192,7 +2194,6 @@
 $code.=<<___;
 	mov	%rax,%rsi		# put aside Rsp
 	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
-	lea	48(%rax),%rax
 
 	mov	-8(%rax),%rbx
 	mov	-16(%rax),%rbp
diff --git a/src/crypto/test/test_util.h b/src/crypto/test/test_util.h
index d834973..1447bf6 100644
--- a/src/crypto/test/test_util.h
+++ b/src/crypto/test/test_util.h
@@ -18,6 +18,7 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+#include <string.h>
 
 #include <iosfwd>
 
@@ -34,6 +35,9 @@
   Bytes(const uint8_t *data_arg, size_t len_arg)
       : data(data_arg), len(len_arg) {}
 
+  Bytes(const char *str)
+      : data(reinterpret_cast<const uint8_t *>(str)), len(strlen(str)) {}
+
   template <size_t N>
   Bytes(const uint8_t (&array)[N]) : data(array), len(N) {}
 
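Note on the test_util.h hunk above: the new const char * constructor lets
tests compare a string literal against a raw byte range directly, and together
with the header's existing equality and ostream support, EXPECT_EQ prints both
sides on a mismatch. A short usage sketch (test body illustrative):

    #include <stdint.h>

    #include <gtest/gtest.h>
    // Bytes comes from this header, src/crypto/test/test_util.h.

    TEST(BytesExample, LiteralComparison) {
      const uint8_t buf[] = {'a', 'b', 'c'};
      // The constructor is strlen-based, so the literal's NUL terminator is
      // excluded, mirroring the sizeof(...) - 1 idiom used for the key
      // constants in rsa_test.cc above.
      EXPECT_EQ(Bytes("abc"), Bytes(buf, sizeof(buf)));
    }
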
diff --git a/src/crypto/x509/x_name.c b/src/crypto/x509/x_name.c
index f97081d..4abdc91 100644
--- a/src/crypto/x509/x_name.c
+++ b/src/crypto/x509/x_name.c
@@ -229,12 +229,11 @@
 
     if (*val)
         x509_name_ex_free(val, NULL);
-    /* We've decoded it: now cache encoding */
-    if (!x509_name_ex_new(&nm.a, NULL) || !BUF_MEM_grow(nm.x->bytes, p - q)) {
-        sk_STACK_OF_X509_NAME_ENTRY_pop_free(intname.s,
-                                             local_sk_X509_NAME_ENTRY_pop_free);
+    if (!x509_name_ex_new(&nm.a, NULL))
         goto err;
-    }
+    /* We've decoded it: now cache encoding */
+    if (!BUF_MEM_grow(nm.x->bytes, p - q))
+        goto err;
     OPENSSL_memcpy(nm.x->bytes->data, q, p - q);
 
     /* Convert internal representation to X509_NAME structure */
@@ -245,13 +244,14 @@
             entry->set = i;
             if (!sk_X509_NAME_ENTRY_push(nm.x->entries, entry))
                 goto err;
+            sk_X509_NAME_ENTRY_set(entries, j, NULL);
         }
-        sk_X509_NAME_ENTRY_free(entries);
     }
-    sk_STACK_OF_X509_NAME_ENTRY_free(intname.s);
     ret = x509_name_canon(nm.x);
     if (!ret)
         goto err;
+    sk_STACK_OF_X509_NAME_ENTRY_pop_free(intname.s,
+                                         local_sk_X509_NAME_ENTRY_free);
     nm.x->modified = 0;
     *val = nm.a;
     *in = p;
@@ -259,6 +259,8 @@
  err:
     if (nm.x != NULL)
         X509_NAME_free(nm.x);
+    sk_STACK_OF_X509_NAME_ENTRY_pop_free(intname.s,
+                                         local_sk_X509_NAME_ENTRY_pop_free);
     OPENSSL_PUT_ERROR(X509, ERR_R_ASN1_LIB);
     return 0;
 }
@@ -307,8 +309,10 @@
             entries = sk_X509_NAME_ENTRY_new_null();
             if (!entries)
                 goto memerr;
-            if (!sk_STACK_OF_X509_NAME_ENTRY_push(intname.s, entries))
+            if (!sk_STACK_OF_X509_NAME_ENTRY_push(intname.s, entries)) {
+                sk_X509_NAME_ENTRY_free(entries);
                 goto memerr;
+            }
             set = entry->set;
         }
         if (!sk_X509_NAME_ENTRY_push(entries, entry))
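
Note on the x_name.c hunks above: they fix leak and double-free hazards by
making ownership explicit. After an entry is pushed into nm.x->entries its
slot in the source stack is cleared, and the error path now performs a single
deep free of intname.s. A generic sketch of that rule, with a hypothetical
container standing in for the sk_* stacks:

    #include <stddef.h>
    #include <stdlib.h>

    typedef struct {
      void **elems;
      size_t len;
    } LIST;

    /* On success the list takes ownership of |elem|. */
    static int list_push(LIST *list, void *elem) {
      void **p =
          (void **)realloc(list->elems, (list->len + 1) * sizeof(void *));
      if (p == NULL) {
        return 0;
      }
      p[list->len] = elem;
      list->elems = p;
      list->len++;
      return 1;
    }

    static int move_elem(LIST *dst, void **slot) {
      if (!list_push(dst, *slot)) {
        return 0;  /* |*slot| is still owned by the caller's error path. */
      }
      /* Ownership transferred: clear the donor slot so a later deep free
       * of the source container cannot free the element twice. This is
       * the role of sk_X509_NAME_ENTRY_set(entries, j, NULL) above. */
      *slot = NULL;
      return 1;
    }
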
diff --git a/src/include/openssl/ecdsa.h b/src/include/openssl/ecdsa.h
index 3890744..8a158b8 100644
--- a/src/include/openssl/ecdsa.h
+++ b/src/include/openssl/ecdsa.h
@@ -75,7 +75,7 @@
  * zero otherwise. */
 OPENSSL_EXPORT int ECDSA_sign(int type, const uint8_t *digest,
                               size_t digest_len, uint8_t *sig,
-                              unsigned int *sig_len, EC_KEY *key);
+                              unsigned int *sig_len, const EC_KEY *key);
 
 /* ECDSA_verify verifies that |sig_len| bytes from |sig| constitute a valid
  * signature by |key| of |digest|. (The |type| argument should be zero.) It
@@ -83,7 +83,7 @@
  * occurred. */
 OPENSSL_EXPORT int ECDSA_verify(int type, const uint8_t *digest,
                                 size_t digest_len, const uint8_t *sig,
-                                size_t sig_len, EC_KEY *key);
+                                size_t sig_len, const EC_KEY *key);
 
 /* ECDSA_size returns the maximum size of an ECDSA signature using |key|. It
  * returns zero on error. */
@@ -109,13 +109,13 @@
 /* ECDSA_do_sign signs |digest_len| bytes from |digest| with |key| and returns
  * the resulting signature structure, or NULL on error. */
 OPENSSL_EXPORT ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest,
-                                        size_t digest_len, EC_KEY *key);
+                                        size_t digest_len, const EC_KEY *key);
 
 /* ECDSA_do_verify verifies that |sig| constitutes a valid signature by |key|
  * of |digest|. It returns one on success or zero if the signature is invalid
  * or on error. */
 OPENSSL_EXPORT int ECDSA_do_verify(const uint8_t *digest, size_t digest_len,
-                                   const ECDSA_SIG *sig, EC_KEY *key);
+                                   const ECDSA_SIG *sig, const EC_KEY *key);
 
 
 /* Signing with precomputation.
@@ -128,22 +128,22 @@
 /* ECDSA_sign_setup precomputes parts of an ECDSA signing operation. It sets
  * |*kinv| and |*rp| to the precomputed values and uses the |ctx| argument, if
  * not NULL. It returns one on success and zero otherwise. */
-OPENSSL_EXPORT int ECDSA_sign_setup(EC_KEY *eckey, BN_CTX *ctx, BIGNUM **kinv,
-                                    BIGNUM **rp);
+OPENSSL_EXPORT int ECDSA_sign_setup(const EC_KEY *eckey, BN_CTX *ctx,
+                                    BIGNUM **kinv, BIGNUM **rp);
 
 /* ECDSA_do_sign_ex is the same as |ECDSA_do_sign| but takes precomputed values
  * as generated by |ECDSA_sign_setup|. */
 OPENSSL_EXPORT ECDSA_SIG *ECDSA_do_sign_ex(const uint8_t *digest,
                                            size_t digest_len,
                                            const BIGNUM *kinv, const BIGNUM *rp,
-                                           EC_KEY *eckey);
+                                           const EC_KEY *eckey);
 
 /* ECDSA_sign_ex is the same as |ECDSA_sign| but takes precomputed values as
  * generated by |ECDSA_sign_setup|. */
 OPENSSL_EXPORT int ECDSA_sign_ex(int type, const uint8_t *digest,
                                  size_t digest_len, uint8_t *sig,
                                  unsigned int *sig_len, const BIGNUM *kinv,
-                                 const BIGNUM *rp, EC_KEY *eckey);
+                                 const BIGNUM *rp, const EC_KEY *eckey);
 
 
 /* ASN.1 functions. */
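
Note on the ecdsa.h hunks above: with these constifications, a signing helper
can take and pass const EC_KEY * end to end. A sketch with error handling
trimmed; the curve choice and helper name are illustrative:

    #include <openssl/ec_key.h>
    #include <openssl/ecdsa.h>
    #include <openssl/nid.h>

    #include <vector>

    static bool SignAndVerify(const uint8_t *digest, size_t digest_len) {
      bssl::UniquePtr<EC_KEY> key(
          EC_KEY_new_by_curve_name(NID_X9_62_prime256v1));
      if (!key || !EC_KEY_generate_key(key.get())) {
        return false;
      }
      const EC_KEY *ckey = key.get();  // a const pointer now suffices
      std::vector<uint8_t> sig(ECDSA_size(ckey));
      unsigned sig_len;
      return ECDSA_sign(0, digest, digest_len, sig.data(), &sig_len, ckey) &&
             ECDSA_verify(0, digest, digest_len, sig.data(), sig_len, ckey);
    }
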
diff --git a/src/include/openssl/pkcs8.h b/src/include/openssl/pkcs8.h
index 141ed8d..70d6f49 100644
--- a/src/include/openssl/pkcs8.h
+++ b/src/include/openssl/pkcs8.h
@@ -66,45 +66,42 @@
 #endif
 
 
-/* PKCS8_encrypt_pbe serializes and encrypts a PKCS8_PRIV_KEY_INFO with PBES1 or
+/* PKCS8_encrypt serializes and encrypts a PKCS8_PRIV_KEY_INFO with PBES1 or
  * PBES2 as defined in PKCS #5. Only pbeWithSHAAnd128BitRC4,
  * pbeWithSHAAnd3-KeyTripleDES-CBC and pbeWithSHA1And40BitRC2, defined in PKCS
  * #12, and PBES2, are supported.  PBES2 is selected by setting |cipher| and
  * passing -1 for |pbe_nid|.  Otherwise, PBES1 is used and |cipher| is ignored.
  *
- * The |pass_raw_len| bytes pointed to by |pass_raw| are used as the password.
- * Note that any conversions from the password as supplied in a text string
- * (such as those specified in B.1 of PKCS #12) must be performed by the caller.
+ * |pass| is used as the password. If a PBES1 scheme from PKCS #12 is used, this
+ * will be converted to a raw byte string as specified in B.1 of PKCS #12. If
+ * |pass| is NULL, it will be encoded as the empty byte string rather than two
+ * zero bytes, the PKCS #12 encoding of the empty string.
  *
  * If |salt| is NULL, a random salt of |salt_len| bytes is generated. If
  * |salt_len| is zero, a default salt length is used instead.
  *
- * The resulting structure is stored in an X509_SIG which must be freed by the
- * caller.
- *
- * TODO(davidben): Really? An X509_SIG? OpenSSL probably did that because it has
- * the same structure as EncryptedPrivateKeyInfo. */
-OPENSSL_EXPORT X509_SIG *PKCS8_encrypt_pbe(int pbe_nid,
-                                           const EVP_CIPHER *cipher,
-                                           const uint8_t *pass_raw,
-                                           size_t pass_raw_len,
-                                           const uint8_t *salt, size_t salt_len,
-                                           int iterations,
-                                           PKCS8_PRIV_KEY_INFO *p8inf);
+ * The resulting structure is stored in an |X509_SIG| which must be freed by the
+ * caller. */
+OPENSSL_EXPORT X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher,
+                                       const char *pass, int pass_len,
+                                       const uint8_t *salt, size_t salt_len,
+                                       int iterations,
+                                       PKCS8_PRIV_KEY_INFO *p8inf);
 
-/* PKCS8_decrypt_pbe decrypts and decodes a PKCS8_PRIV_KEY_INFO with PBES1 or
- * PBES2 as defined in PKCS #5. Only pbeWithSHAAnd128BitRC4,
+/* PKCS8_decrypt decrypts and decodes a PKCS8_PRIV_KEY_INFO with PBES1 or PBES2
+ * as defined in PKCS #5. Only pbeWithSHAAnd128BitRC4,
+ * pbeWithSHAAnd3-KeyTripleDES-CBC and pbeWithSHA1And40BitRC2, defined in
+ * PKCS #12, and PBES2, are supported.
  *
- * The |pass_raw_len| bytes pointed to by |pass_raw| are used as the password.
- * Note that any conversions from the password as supplied in a text string
- * (such as those specified in B.1 of PKCS #12) must be performed by the caller.
+ * |pass| is used as the password. If a PBES1 scheme from PKCS #12 is used, this
+ * will be converted to a raw byte string as specified in B.1 of PKCS #12. If
+ * |pass| is NULL, it will be encoded as the empty byte string rather than two
+ * zero bytes, the PKCS #12 encoding of the empty string.
  *
  * The resulting structure must be freed by the caller. */
-OPENSSL_EXPORT PKCS8_PRIV_KEY_INFO *PKCS8_decrypt_pbe(X509_SIG *pkcs8,
-                                                      const uint8_t *pass_raw,
-                                                      size_t pass_raw_len);
+OPENSSL_EXPORT PKCS8_PRIV_KEY_INFO *PKCS8_decrypt(X509_SIG *pkcs8,
+                                                  const char *pass,
+                                                  int pass_len);
 
 /* PKCS12_get_key_and_certs parses a PKCS#12 structure from |in|, authenticates
  * and decrypts it using |password|, sets |*out_key| to the included private
@@ -117,24 +114,6 @@
 
 /* Deprecated functions. */
 
-/* PKCS8_encrypt calls |PKCS8_encrypt_pbe| after (in the PKCS#12 case) treating
- * |pass| as an ASCII string, appending U+0000, and converting to UCS-2. (So the
- * empty password encodes as two NUL bytes.) In the PBES2 case, the password is
- * unchanged.  */
-OPENSSL_EXPORT X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher,
-                                       const char *pass, int pass_len,
-                                       const uint8_t *salt, size_t salt_len,
-                                       int iterations,
-                                       PKCS8_PRIV_KEY_INFO *p8inf);
-
-/* PKCS8_decrypt calls PKCS8_decrypt_pbe after (in the PKCS#12 case) treating
- * |pass| as an ASCII string, appending U+0000, and converting to UCS-2. (So the
- *  empty password encodes as two NUL bytes.) In the PBES2 case, the password is
- * unchanged. */
-OPENSSL_EXPORT PKCS8_PRIV_KEY_INFO *PKCS8_decrypt(X509_SIG *pkcs8,
-                                                  const char *pass,
-                                                  int pass_len);
-
 /* PKCS12_PBE_add does nothing. It exists for compatibility with OpenSSL. */
 OPENSSL_EXPORT void PKCS12_PBE_add(void);
 
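Note on the pkcs8.h hunks above: with the _pbe variants removed,
PKCS8_encrypt and PKCS8_decrypt take the text password directly and perform
the PKCS #12 B.1 conversion internally when a PKCS #12 PBES1 scheme is
selected. A round-trip sketch using PBES2; the cipher, iteration count, and
password are illustrative:

    #include <string.h>

    #include <openssl/cipher.h>
    #include <openssl/pkcs8.h>

    static const char kPassword[] = "password";

    static X509_SIG *EncryptKey(PKCS8_PRIV_KEY_INFO *p8inf) {
      /* PBES2 is selected by supplying a cipher and passing -1 for
       * |pbe_nid|. A NULL |salt| with zero |salt_len| requests a random
       * salt of the default length. */
      return PKCS8_encrypt(-1, EVP_aes_128_cbc(), kPassword,
                           (int)strlen(kPassword), NULL, 0,
                           2048 /* iterations */, p8inf);
    }

    static PKCS8_PRIV_KEY_INFO *DecryptKey(X509_SIG *pkcs8) {
      return PKCS8_decrypt(pkcs8, kPassword, (int)strlen(kPassword));
    }
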
diff --git a/src/include/openssl/ssl.h b/src/include/openssl/ssl.h
index 497093d..23e5e9b 100644
--- a/src/include/openssl/ssl.h
+++ b/src/include/openssl/ssl.h
@@ -2241,11 +2241,11 @@
 
 /* SSL_enable_signed_cert_timestamps causes |ssl| (which must be the client end
  * of a connection) to request SCTs from the server. See
- * https://tools.ietf.org/html/rfc6962. It returns one.
+ * https://tools.ietf.org/html/rfc6962.
  *
  * Call |SSL_get0_signed_cert_timestamp_list| to recover the SCT after the
  * handshake. */
-OPENSSL_EXPORT int SSL_enable_signed_cert_timestamps(SSL *ssl);
+OPENSSL_EXPORT void SSL_enable_signed_cert_timestamps(SSL *ssl);
 
 /* SSL_CTX_enable_signed_cert_timestamps enables SCT requests on all client SSL
  * objects created from |ctx|.
@@ -2255,12 +2255,11 @@
 OPENSSL_EXPORT void SSL_CTX_enable_signed_cert_timestamps(SSL_CTX *ctx);
 
 /* SSL_enable_ocsp_stapling causes |ssl| (which must be the client end of a
- * connection) to request a stapled OCSP response from the server. It returns
- * one.
+ * connection) to request a stapled OCSP response from the server.
  *
  * Call |SSL_get0_ocsp_response| to recover the OCSP response after the
  * handshake. */
-OPENSSL_EXPORT int SSL_enable_ocsp_stapling(SSL *ssl);
+OPENSSL_EXPORT void SSL_enable_ocsp_stapling(SSL *ssl);
 
 /* SSL_CTX_enable_ocsp_stapling enables OCSP stapling on all client SSL objects
  * created from |ctx|.
@@ -3043,7 +3042,6 @@
 #define SSL_ST_OK 0x03
 #define SSL_ST_RENEGOTIATE (0x04 | SSL_ST_INIT)
 #define SSL_ST_TLS13 (0x05 | SSL_ST_INIT)
-#define SSL_ST_ERROR (0x06| SSL_ST_INIT)
 
 /* SSL_CB_* are possible values for the |type| parameter in the info
  * callback and the bitmasks that make them up. */
@@ -3086,8 +3084,7 @@
  *
  * |SSL_CB_ACCEPT_LOOP| (respectively, |SSL_CB_CONNECT_LOOP|) is signaled when
  * a server (respectively, client) handshake progresses. The |value| argument
- * is always one. For the duration of the callback, |SSL_state| will return the
- * previous state.
+ * is always one.
  *
  * |SSL_CB_ACCEPT_EXIT| (respectively, |SSL_CB_CONNECT_EXIT|) is signaled when
  * a server (respectively, client) handshake completes, fails, or is paused.
@@ -3589,7 +3586,10 @@
 
 typedef struct ssl_conf_ctx_st SSL_CONF_CTX;
 
-/* SSL_state returns the current state of the handshake state machine. */
+/* SSL_state returns |SSL_ST_INIT| if a handshake is in progress and |SSL_ST_OK|
+ * otherwise.
+ *
+ * Use |SSL_in_init| instead. */
 OPENSSL_EXPORT int SSL_state(const SSL *ssl);
 
 #define SSL_get_state(ssl) SSL_state(ssl)
@@ -3805,6 +3805,12 @@
    * early data. If zero, 0-RTT is disallowed. */
   uint32_t ticket_max_early_data;
 
+  /* early_alpn is the ALPN protocol from the initial handshake. This is only
+   * stored for TLS 1.3 and above in order to enforce ALPN matching for 0-RTT
+   * resumptions. */
+  uint8_t *early_alpn;
+  size_t early_alpn_len;
+
   /* extended_master_secret is true if the master secret in this session was
    * generated using EMS and thus isn't vulnerable to the Triple Handshake
    * attack. */
@@ -3965,8 +3971,6 @@
   void *msg_callback_arg;
 
   int verify_mode;
-  uint8_t sid_ctx_length;
-  uint8_t sid_ctx[SSL_MAX_SID_CTX_LENGTH];
   int (*default_verify_callback)(
       int ok, X509_STORE_CTX *ctx); /* called 'verify_callback' in the SSL */
 
@@ -4061,12 +4065,6 @@
   /* The client's Channel ID private key. */
   EVP_PKEY *tlsext_channel_id_private;
 
-  /* Signed certificate timestamp list to be sent to the client, if requested */
-  CRYPTO_BUFFER *signed_cert_timestamp_list;
-
-  /* OCSP response to be sent to the client, if requested. */
-  CRYPTO_BUFFER *ocsp_response;
-
   /* keylog_callback, if not NULL, is the key logging callback. See
    * |SSL_CTX_set_keylog_callback|. */
   void (*keylog_callback)(const SSL *ssl, const char *line);
@@ -4107,9 +4105,6 @@
   /* short_header_enabled is one if a short record header in TLS 1.3 may
    * be negotiated and zero otherwise. */
   unsigned short_header_enabled:1;
-
-  /* TODO(agl): remove once node.js no longer references this. */
-  int freelist_max_len;
 };
 
 
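Note on the ssl.h hunks above: since the enable calls can no longer fail,
client setup code drops the return-value checks, and the data is fetched with
the get0 accessors after the handshake. A sketch, assuming |ssl| is a
configured client connection:

    #include <openssl/ssl.h>

    static void RequestExtensions(SSL *ssl) {
      /* Both calls now return void; there is nothing to check. */
      SSL_enable_signed_cert_timestamps(ssl);
      SSL_enable_ocsp_stapling(ssl);
    }

    static void InspectAfterHandshake(const SSL *ssl) {
      const uint8_t *data;
      size_t len;
      SSL_get0_signed_cert_timestamp_list(ssl, &data, &len);
      /* |len| is zero if the server sent no SCT list. */
      SSL_get0_ocsp_response(ssl, &data, &len);
      /* Likewise for the stapled OCSP response. */
    }
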
diff --git a/src/include/openssl/ssl3.h b/src/include/openssl/ssl3.h
index 6a03d1b..fcaeb2d 100644
--- a/src/include/openssl/ssl3.h
+++ b/src/include/openssl/ssl3.h
@@ -307,6 +307,7 @@
 #define SSL3_ST_CW_FLUSH (0x100 | SSL_ST_CONNECT)
 #define SSL3_ST_FALSE_START (0x101 | SSL_ST_CONNECT)
 #define SSL3_ST_VERIFY_SERVER_CERT (0x102 | SSL_ST_CONNECT)
+#define SSL3_ST_FINISH_CLIENT_HANDSHAKE (0x103 | SSL_ST_CONNECT)
 /* write to server */
 #define SSL3_ST_CW_CLNT_HELLO_A (0x110 | SSL_ST_CONNECT)
 /* read from server */
diff --git a/src/include/openssl/time_support.h b/src/include/openssl/time_support.h
deleted file mode 100644
index 274b17d..0000000
--- a/src/include/openssl/time_support.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Written by Richard Levitte (richard@levitte.org) for the OpenSSL
- * project 2001.
- * Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
- * project 2008.
- */
-/* ====================================================================
- * Copyright (c) 2001 The OpenSSL Project.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    licensing@OpenSSL.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- *
- * This product includes cryptographic software written by Eric Young
- * (eay@cryptsoft.com).  This product includes software written by Tim
- * Hudson (tjh@cryptsoft.com). */
-
-#ifndef OPENSSL_HEADER_TIME_SUPPORT_H
-#define OPENSSL_HEADER_TIME_SUPPORT_H
-
-#include <openssl/base.h>
-
-#include <time.h>
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-
-/* Wrapper functions for time functions. */
-
-
-/* OPENSSL_gmtime wraps |gmtime_r|. See the manual page for that function. */
-struct tm *OPENSSL_gmtime(const time_t *timer, struct tm *result);
-
-/* OPENSSL_gmtime_adj updates |tm| by adding |offset_day| days and |offset_sec|
- * seconds. */
-int OPENSSL_gmtime_adj(struct tm *tm, int offset_day, long offset_sec);
-
-/* OPENSSL_gmtime_diff calculates the difference between |from| and |to| and
- * outputs the difference as a number of days and seconds in |*out_days| and
- * |*out_secs|. */
-int OPENSSL_gmtime_diff(int *out_days, int *out_secs, const struct tm *from,
-                        const struct tm *to);
-
-
-#if defined(__cplusplus)
-}  /* extern C */
-#endif
-
-#endif  /* OPENSSL_HEADER_TIME_SUPPORT_H */
diff --git a/src/ssl/handshake_client.c b/src/ssl/handshake_client.c
index 427213c..c4f5e8e 100644
--- a/src/ssl/handshake_client.c
+++ b/src/ssl/handshake_client.c
@@ -190,21 +190,15 @@
 int ssl3_connect(SSL_HANDSHAKE *hs) {
   SSL *const ssl = hs->ssl;
   int ret = -1;
-  int state, skip = 0;
 
   assert(ssl->handshake_func == ssl3_connect);
   assert(!ssl->server);
 
   for (;;) {
-    state = hs->state;
+    int state = hs->state;
 
     switch (hs->state) {
       case SSL_ST_INIT:
-        hs->state = SSL_ST_CONNECT;
-        skip = 1;
-        break;
-
-      case SSL_ST_CONNECT:
         ssl_do_info_callback(ssl, SSL_CB_HANDSHAKE_START, 1);
         hs->state = SSL3_ST_CW_CLNT_HELLO_A;
         break;
@@ -254,13 +248,11 @@
         break;
 
       case SSL3_ST_CR_CERT_A:
-        if (ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+        if (ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
           ret = ssl3_get_server_certificate(hs);
           if (ret <= 0) {
             goto end;
           }
-        } else {
-          skip = 1;
         }
         hs->state = SSL3_ST_CR_CERT_STATUS_A;
         break;
@@ -271,20 +263,16 @@
           if (ret <= 0) {
             goto end;
           }
-        } else {
-          skip = 1;
         }
         hs->state = SSL3_ST_VERIFY_SERVER_CERT;
         break;
 
       case SSL3_ST_VERIFY_SERVER_CERT:
-        if (ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+        if (ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
           ret = ssl3_verify_server_cert(hs);
           if (ret <= 0) {
             goto end;
           }
-        } else {
-          skip = 1;
         }
         hs->state = SSL3_ST_CR_KEY_EXCH_A;
         break;
@@ -298,13 +286,11 @@
         break;
 
       case SSL3_ST_CR_CERT_REQ_A:
-        if (ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+        if (ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
           ret = ssl3_get_certificate_request(hs);
           if (ret <= 0) {
             goto end;
           }
-        } else {
-          skip = 1;
         }
         hs->state = SSL3_ST_CR_SRVR_DONE_A;
         break;
@@ -324,8 +310,6 @@
           if (ret <= 0) {
             goto end;
           }
-        } else {
-          skip = 1;
         }
         hs->state = SSL3_ST_CW_KEY_EXCH_A;
         break;
@@ -345,8 +329,6 @@
           if (ret <= 0) {
             goto end;
           }
-        } else {
-          skip = 1;
         }
         hs->state = SSL3_ST_CW_CHANGE;
         break;
@@ -367,8 +349,6 @@
           if (ret <= 0) {
             goto end;
           }
-        } else {
-          skip = 1;
         }
         hs->state = SSL3_ST_CW_CHANNEL_ID_A;
         break;
@@ -379,8 +359,6 @@
           if (ret <= 0) {
             goto end;
           }
-        } else {
-          skip = 1;
         }
         hs->state = SSL3_ST_CW_FINISHED_A;
         break;
@@ -393,7 +371,7 @@
         hs->state = SSL3_ST_CW_FLUSH;
 
         if (ssl->session != NULL) {
-          hs->next_state = SSL_ST_OK;
+          hs->next_state = SSL3_ST_FINISH_CLIENT_HANDSHAKE;
         } else {
           /* This is a non-resumption handshake. If it involves ChannelID, then
            * record the handshake hashes at this point in the session so that
@@ -427,8 +405,6 @@
           if (ret <= 0) {
             goto end;
           }
-        } else {
-          skip = 1;
         }
         hs->state = SSL3_ST_CR_CHANGE;
         break;
@@ -456,7 +432,7 @@
         if (ssl->session != NULL) {
           hs->state = SSL3_ST_CW_CHANGE;
         } else {
-          hs->state = SSL_ST_OK;
+          hs->state = SSL3_ST_FINISH_CLIENT_HANDSHAKE;
         }
         break;
 
@@ -466,7 +442,7 @@
           goto end;
         }
         hs->state = hs->next_state;
-        if (hs->state != SSL_ST_OK) {
+        if (hs->state != SSL3_ST_FINISH_CLIENT_HANDSHAKE) {
           ssl->method->expect_flight(ssl);
         }
         break;
@@ -476,10 +452,10 @@
         if (ret <= 0) {
           goto end;
         }
-        hs->state = SSL_ST_OK;
+        hs->state = SSL3_ST_FINISH_CLIENT_HANDSHAKE;
         break;
 
-      case SSL_ST_OK:
+      case SSL3_ST_FINISH_CLIENT_HANDSHAKE:
         ssl->method->release_current_message(ssl, 1 /* free_buffer */);
 
         SSL_SESSION_free(ssl->s3->established_session);
@@ -491,21 +467,21 @@
            * of the new established_session due to False Start. The caller may
            * have taken a reference to the temporary session. */
           ssl->s3->established_session =
-              SSL_SESSION_dup(ssl->s3->new_session, SSL_SESSION_DUP_ALL);
+              SSL_SESSION_dup(hs->new_session, SSL_SESSION_DUP_ALL);
           if (ssl->s3->established_session == NULL) {
-            /* Do not stay in SSL_ST_OK, to avoid confusing |SSL_in_init|
-             * callers. */
-            hs->state = SSL_ST_ERROR;
-            skip = 1;
             ret = -1;
             goto end;
           }
           ssl->s3->established_session->not_resumable = 0;
 
-          SSL_SESSION_free(ssl->s3->new_session);
-          ssl->s3->new_session = NULL;
+          SSL_SESSION_free(hs->new_session);
+          hs->new_session = NULL;
         }
 
+        hs->state = SSL_ST_OK;
+        break;
+
+      case SSL_ST_OK: {
         const int is_initial_handshake = !ssl->s3->initial_handshake_complete;
         ssl->s3->initial_handshake_complete = 1;
         if (is_initial_handshake) {
@@ -516,11 +492,7 @@
         ret = 1;
         ssl_do_info_callback(ssl, SSL_CB_HANDSHAKE_DONE, 1);
         goto end;
-
-      case SSL_ST_ERROR:
-        OPENSSL_PUT_ERROR(SSL, SSL_R_SSL_HANDSHAKE_FAILURE);
-        ret = -1;
-        goto end;
+      }
 
       default:
         OPENSSL_PUT_ERROR(SSL, SSL_R_UNKNOWN_STATE);
@@ -528,13 +500,9 @@
         goto end;
     }
 
-    if (!ssl->s3->tmp.reuse_message && !skip && hs->state != state) {
-      int new_state = hs->state;
-      hs->state = state;
+    if (hs->state != state) {
       ssl_do_info_callback(ssl, SSL_CB_CONNECT_LOOP, 1);
-      hs->state = new_state;
     }
-    skip = 0;
   }
 
 end:
@@ -944,9 +912,9 @@
       goto f_err;
     }
     /* Note: session_id could be empty. */
-    ssl->s3->new_session->session_id_length = CBS_len(&session_id);
-    OPENSSL_memcpy(ssl->s3->new_session->session_id, CBS_data(&session_id),
-           CBS_len(&session_id));
+    hs->new_session->session_id_length = CBS_len(&session_id);
+    OPENSSL_memcpy(hs->new_session->session_id, CBS_data(&session_id),
+                   CBS_len(&session_id));
   }
 
   const SSL_CIPHER *c = SSL_get_cipher_by_value(cipher_suite);
@@ -988,9 +956,9 @@
       goto f_err;
     }
   } else {
-    ssl->s3->new_session->cipher = c;
+    hs->new_session->cipher = c;
   }
-  ssl->s3->tmp.new_cipher = c;
+  hs->new_cipher = c;
 
   /* Now that the cipher is known, initialize the handshake hash and hash the
    * ServerHello. */
@@ -1004,7 +972,7 @@
    * which requires hashing the handshake transcript. Otherwise, the handshake
    * buffer may be released. */
   if (ssl->session != NULL ||
-      !ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+      !ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
     SSL_TRANSCRIPT_free_buffer(&hs->transcript);
   }
 
@@ -1030,8 +998,7 @@
   }
 
   if (ssl->session != NULL &&
-      ssl->s3->tmp.extended_master_secret !=
-          ssl->session->extended_master_secret) {
+      hs->extended_master_secret != ssl->session->extended_master_secret) {
     al = SSL_AD_HANDSHAKE_FAILURE;
     if (ssl->session->extended_master_secret) {
       OPENSSL_PUT_ERROR(SSL, SSL_R_RESUMED_EMS_SESSION_WITHOUT_EMS_EXTENSION);
@@ -1065,27 +1032,27 @@
   CBS_init(&cbs, ssl->init_msg, ssl->init_num);
 
   uint8_t alert = SSL_AD_DECODE_ERROR;
-  sk_CRYPTO_BUFFER_pop_free(ssl->s3->new_session->certs, CRYPTO_BUFFER_free);
+  sk_CRYPTO_BUFFER_pop_free(hs->new_session->certs, CRYPTO_BUFFER_free);
   EVP_PKEY_free(hs->peer_pubkey);
   hs->peer_pubkey = NULL;
-  ssl->s3->new_session->certs = ssl_parse_cert_chain(
-      &alert, &hs->peer_pubkey, NULL, &cbs, ssl->ctx->pool);
-  if (ssl->s3->new_session->certs == NULL) {
+  hs->new_session->certs = ssl_parse_cert_chain(&alert, &hs->peer_pubkey, NULL,
+                                                &cbs, ssl->ctx->pool);
+  if (hs->new_session->certs == NULL) {
     ssl3_send_alert(ssl, SSL3_AL_FATAL, alert);
     return -1;
   }
 
-  if (sk_CRYPTO_BUFFER_num(ssl->s3->new_session->certs) == 0 ||
+  if (sk_CRYPTO_BUFFER_num(hs->new_session->certs) == 0 ||
       CBS_len(&cbs) != 0 ||
-      !ssl->ctx->x509_method->session_cache_objects(ssl->s3->new_session)) {
+      !ssl->ctx->x509_method->session_cache_objects(hs->new_session)) {
     OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR);
     ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_DECODE_ERROR);
     return -1;
   }
 
   if (!ssl_check_leaf_certificate(
-          ssl, hs->peer_pubkey,
-          sk_CRYPTO_BUFFER_value(ssl->s3->new_session->certs, 0))) {
+          hs, hs->peer_pubkey,
+          sk_CRYPTO_BUFFER_value(hs->new_session->certs, 0))) {
     ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_ILLEGAL_PARAMETER);
     return -1;
   }
@@ -1126,8 +1093,8 @@
     goto f_err;
   }
 
-  if (!CBS_stow(&ocsp_response, &ssl->s3->new_session->ocsp_response,
-                &ssl->s3->new_session->ocsp_response_length)) {
+  if (!CBS_stow(&ocsp_response, &hs->new_session->ocsp_response,
+                &hs->new_session->ocsp_response_length)) {
     al = SSL_AD_INTERNAL_ERROR;
     OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
     goto f_err;
@@ -1141,8 +1108,8 @@
 
 static int ssl3_verify_server_cert(SSL_HANDSHAKE *hs) {
   SSL *const ssl = hs->ssl;
-  if (!ssl_verify_cert_chain(ssl, &ssl->s3->new_session->verify_result,
-                             ssl->s3->new_session->x509_chain)) {
+  if (!ssl_verify_cert_chain(ssl, &hs->new_session->verify_result,
+                             hs->new_session->x509_chain)) {
     return -1;
   }
 
@@ -1163,7 +1130,7 @@
 
   if (ssl->s3->tmp.message_type != SSL3_MT_SERVER_KEY_EXCHANGE) {
     /* Some ciphers (pure PSK) have an optional ServerKeyExchange message. */
-    if (ssl_cipher_requires_server_key_exchange(ssl->s3->tmp.new_cipher)) {
+    if (ssl_cipher_requires_server_key_exchange(hs->new_cipher)) {
       OPENSSL_PUT_ERROR(SSL, SSL_R_UNEXPECTED_MESSAGE);
       ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_UNEXPECTED_MESSAGE);
       return -1;
@@ -1182,8 +1149,8 @@
   CBS_init(&server_key_exchange, ssl->init_msg, ssl->init_num);
   CBS server_key_exchange_orig = server_key_exchange;
 
-  uint32_t alg_k = ssl->s3->tmp.new_cipher->algorithm_mkey;
-  uint32_t alg_a = ssl->s3->tmp.new_cipher->algorithm_auth;
+  uint32_t alg_k = hs->new_cipher->algorithm_mkey;
+  uint32_t alg_a = hs->new_cipher->algorithm_auth;
 
   if (alg_a & SSL_aPSK) {
     CBS psk_identity_hint;
@@ -1279,7 +1246,7 @@
       OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR);
       goto f_err;
     }
-    ssl->s3->new_session->group_id = group_id;
+    hs->new_session->group_id = group_id;
 
     /* Ensure the group is consistent with preferences. */
     if (!tls1_check_group_id(ssl, group_id)) {
@@ -1307,7 +1274,7 @@
            CBS_len(&server_key_exchange_orig) - CBS_len(&server_key_exchange));
 
   /* ServerKeyExchange should be signed by the server's public key. */
-  if (ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+  if (ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
     uint16_t signature_algorithm = 0;
     if (ssl3_protocol_version(ssl) >= TLS1_2_VERSION) {
       if (!CBS_get_u16(&server_key_exchange, &signature_algorithm)) {
@@ -1318,7 +1285,7 @@
       if (!tls12_check_peer_sigalg(ssl, &al, signature_algorithm)) {
         goto f_err;
       }
-      ssl->s3->new_session->peer_signature_algorithm = signature_algorithm;
+      hs->new_session->peer_signature_algorithm = signature_algorithm;
     } else if (hs->peer_pubkey->type == EVP_PKEY_RSA) {
       signature_algorithm = SSL_SIGN_RSA_PKCS1_MD5_SHA1;
     } else if (hs->peer_pubkey->type == EVP_PKEY_EC) {
@@ -1527,8 +1494,8 @@
     goto err;
   }
 
-  uint32_t alg_k = ssl->s3->tmp.new_cipher->algorithm_mkey;
-  uint32_t alg_a = ssl->s3->tmp.new_cipher->algorithm_auth;
+  uint32_t alg_k = hs->new_cipher->algorithm_mkey;
+  uint32_t alg_a = hs->new_cipher->algorithm_auth;
 
   /* If using a PSK key exchange, prepare the pre-shared key. */
   unsigned psk_len = 0;
@@ -1551,9 +1518,9 @@
     }
     assert(psk_len <= PSK_MAX_PSK_LEN);
 
-    OPENSSL_free(ssl->s3->new_session->psk_identity);
-    ssl->s3->new_session->psk_identity = BUF_strdup(identity);
-    if (ssl->s3->new_session->psk_identity == NULL) {
+    OPENSSL_free(hs->new_session->psk_identity);
+    hs->new_session->psk_identity = BUF_strdup(identity);
+    if (hs->new_session->psk_identity == NULL) {
       OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
       goto err;
     }
@@ -1676,13 +1643,12 @@
     goto err;
   }
 
-  ssl->s3->new_session->master_key_length = tls1_generate_master_secret(
-      hs, ssl->s3->new_session->master_key, pms, pms_len);
-  if (ssl->s3->new_session->master_key_length == 0) {
+  hs->new_session->master_key_length = tls1_generate_master_secret(
+      hs, hs->new_session->master_key, pms, pms_len);
+  if (hs->new_session->master_key_length == 0) {
     goto err;
   }
-  ssl->s3->new_session->extended_master_secret =
-      ssl->s3->tmp.extended_master_secret;
+  hs->new_session->extended_master_secret = hs->extended_master_secret;
   OPENSSL_cleanse(pms, pms_len);
   OPENSSL_free(pms);
 
@@ -1740,9 +1706,9 @@
 
       uint8_t digest[EVP_MAX_MD_SIZE];
       size_t digest_len;
-      if (!SSL_TRANSCRIPT_ssl3_cert_verify_hash(
-              &hs->transcript, digest, &digest_len, ssl->s3->new_session,
-              signature_algorithm)) {
+      if (!SSL_TRANSCRIPT_ssl3_cert_verify_hash(&hs->transcript, digest,
+                                                &digest_len, hs->new_session,
+                                                signature_algorithm)) {
         goto err;
       }
 
@@ -1870,7 +1836,7 @@
   }
 
   int session_renewed = ssl->session != NULL;
-  SSL_SESSION *session = ssl->s3->new_session;
+  SSL_SESSION *session = hs->new_session;
   if (session_renewed) {
     /* The server is sending a new ticket for an existing session. Sessions are
      * immutable once established, so duplicate all but the ticket of the
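
The client-side changes above do two things: per-handshake state (|new_session|, |new_cipher|, the EMS bit) moves from |ssl->s3| onto |SSL_HANDSHAKE|, and the |skip| bookkeeping disappears because the info callback can simply compare the state before and after the switch. Below is a minimal sketch of that loop shape, with stand-in states and types rather than the real ssl3_connect ones:

#include <stdio.h>

enum { ST_INIT, ST_HELLO, ST_FINISHED, ST_OK };

struct handshake {
  int state;
};

static void info_callback(int old_state, int new_state) {
  printf("state %d -> %d\n", old_state, new_state);
}

static void run_state_machine(struct handshake *hs) {
  for (;;) {
    int state = hs->state;  /* Snapshot before dispatching. */
    switch (hs->state) {
      case ST_INIT:     hs->state = ST_HELLO;    break;
      case ST_HELLO:    hs->state = ST_FINISHED; break;
      case ST_FINISHED: hs->state = ST_OK;       break;
      case ST_OK:       return;
    }
    /* With |skip| gone, the callback fires exactly when the state
     * actually advanced, with no save/restore dance around it. */
    if (hs->state != state) {
      info_callback(state, hs->state);
    }
  }
}

int main(void) {
  struct handshake hs = {ST_INIT};
  run_state_machine(&hs);
  return 0;
}
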
diff --git a/src/ssl/handshake_server.c b/src/ssl/handshake_server.c
index c352dd9..51338e2 100644
--- a/src/ssl/handshake_server.c
+++ b/src/ssl/handshake_server.c
@@ -202,21 +202,15 @@
   SSL *const ssl = hs->ssl;
   uint32_t alg_a;
   int ret = -1;
-  int state, skip = 0;
 
   assert(ssl->handshake_func == ssl3_accept);
   assert(ssl->server);
 
   for (;;) {
-    state = hs->state;
+    int state = hs->state;
 
     switch (hs->state) {
       case SSL_ST_INIT:
-        hs->state = SSL_ST_ACCEPT;
-        skip = 1;
-        break;
-
-      case SSL_ST_ACCEPT:
         ssl_do_info_callback(ssl, SSL_CB_HANDSHAKE_START, 1);
         hs->state = SSL3_ST_SR_CLNT_HELLO_A;
         break;
@@ -269,13 +263,11 @@
         break;
 
       case SSL3_ST_SW_CERT_A:
-        if (ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+        if (ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
           ret = ssl3_send_server_certificate(hs);
           if (ret <= 0) {
             goto end;
           }
-        } else {
-          skip = 1;
         }
         hs->state = SSL3_ST_SW_CERT_STATUS_A;
         break;
@@ -286,25 +278,21 @@
           if (ret <= 0) {
             goto end;
           }
-        } else {
-          skip = 1;
         }
         hs->state = SSL3_ST_SW_KEY_EXCH_A;
         break;
 
       case SSL3_ST_SW_KEY_EXCH_A:
       case SSL3_ST_SW_KEY_EXCH_B:
-        alg_a = ssl->s3->tmp.new_cipher->algorithm_auth;
+        alg_a = hs->new_cipher->algorithm_auth;
 
         /* PSK ciphers send ServerKeyExchange if there is an identity hint. */
-        if (ssl_cipher_requires_server_key_exchange(ssl->s3->tmp.new_cipher) ||
+        if (ssl_cipher_requires_server_key_exchange(hs->new_cipher) ||
             ((alg_a & SSL_aPSK) && ssl->psk_identity_hint)) {
           ret = ssl3_send_server_key_exchange(hs);
           if (ret <= 0) {
             goto end;
           }
-        } else {
-          skip = 1;
         }
 
         hs->state = SSL3_ST_SW_CERT_REQ_A;
@@ -316,8 +304,6 @@
           if (ret <= 0) {
             goto end;
           }
-        } else {
-          skip = 1;
         }
         hs->state = SSL3_ST_SW_SRVR_DONE_A;
         break;
@@ -379,8 +365,6 @@
           if (ret <= 0) {
             goto end;
           }
-        } else {
-          skip = 1;
         }
         hs->state = SSL3_ST_SR_CHANNEL_ID_A;
         break;
@@ -391,8 +375,6 @@
           if (ret <= 0) {
             goto end;
           }
-        } else {
-          skip = 1;
         }
         hs->state = SSL3_ST_SR_FINISHED_A;
         break;
@@ -411,7 +393,7 @@
         }
 
         /* If this is a full handshake with ChannelID then record the handshake
-         * hashes in |ssl->s3->new_session| in case we need them to verify a
+         * hashes in |hs->new_session| in case we need them to verify a
          * ChannelID signature on a resumption of this session in the future. */
         if (ssl->session == NULL && ssl->s3->tlsext_channel_id_valid) {
           ret = tls1_record_handshake_hashes_for_channel_id(hs);
@@ -427,8 +409,6 @@
           if (ret <= 0) {
             goto end;
           }
-        } else {
-          skip = 1;
         }
         hs->state = SSL3_ST_SW_CHANGE;
         break;
@@ -481,12 +461,11 @@
 
         /* If we aren't retaining peer certificates then we can discard it
          * now. */
-        if (ssl->s3->new_session != NULL &&
+        if (hs->new_session != NULL &&
             ssl->retain_only_sha256_of_client_certs) {
-          sk_CRYPTO_BUFFER_pop_free(ssl->s3->new_session->certs,
-                                    CRYPTO_BUFFER_free);
-          ssl->s3->new_session->certs = NULL;
-          ssl->ctx->x509_method->session_clear(ssl->s3->new_session);
+          sk_CRYPTO_BUFFER_pop_free(hs->new_session->certs, CRYPTO_BUFFER_free);
+          hs->new_session->certs = NULL;
+          ssl->ctx->x509_method->session_clear(hs->new_session);
         }
 
         SSL_SESSION_free(ssl->s3->established_session);
@@ -494,9 +473,9 @@
           SSL_SESSION_up_ref(ssl->session);
           ssl->s3->established_session = ssl->session;
         } else {
-          ssl->s3->established_session = ssl->s3->new_session;
+          ssl->s3->established_session = hs->new_session;
           ssl->s3->established_session->not_resumable = 0;
-          ssl->s3->new_session = NULL;
+          hs->new_session = NULL;
         }
 
         if (hs->v2_clienthello) {
@@ -518,13 +497,9 @@
         goto end;
     }
 
-    if (!ssl->s3->tmp.reuse_message && !skip && hs->state != state) {
-      int new_state = hs->state;
-      hs->state = state;
+    if (hs->state != state) {
       ssl_do_info_callback(ssl, SSL_CB_ACCEPT_LOOP, 1);
-      hs->state = new_state;
     }
-    skip = 0;
   }
 
 end:
@@ -921,9 +896,9 @@
 
   /* Negotiate the cipher suite. This must be done after |cert_cb| so the
    * certificate is finalized. */
-  ssl->s3->tmp.new_cipher =
+  hs->new_cipher =
       ssl3_choose_cipher(hs, &client_hello, ssl_get_cipher_preferences(ssl));
-  if (ssl->s3->tmp.new_cipher == NULL) {
+  if (hs->new_cipher == NULL) {
     OPENSSL_PUT_ERROR(SSL, SSL_R_NO_SHARED_CIPHER);
     ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE);
     return -1;
@@ -958,8 +933,7 @@
   }
 
   if (session != NULL) {
-    if (session->extended_master_secret &&
-        !ssl->s3->tmp.extended_master_secret) {
+    if (session->extended_master_secret && !hs->extended_master_secret) {
       /* A ClientHello without EMS that attempts to resume a session with EMS
        * is fatal to the connection. */
       al = SSL_AD_HANDSHAKE_FAILURE;
@@ -967,11 +941,10 @@
       goto f_err;
     }
 
-    if (!ssl_session_is_resumable(ssl, session) ||
+    if (!ssl_session_is_resumable(hs, session) ||
         /* If the client offers the EMS extension, but the previous session
          * didn't use it, then negotiate a new session. */
-        ssl->s3->tmp.extended_master_secret !=
-            session->extended_master_secret) {
+        hs->extended_master_secret != session->extended_master_secret) {
       SSL_SESSION_free(session);
       session = NULL;
     }
@@ -992,7 +965,7 @@
 
     /* Clear the session ID if we want the session to be single-use. */
     if (!(ssl->ctx->session_cache_mode & SSL_SESS_CACHE_SERVER)) {
-      ssl->s3->new_session->session_id_length = 0;
+      hs->new_session->session_id_length = 0;
     }
   }
 
@@ -1005,13 +978,13 @@
   }
 
   if (ssl->session == NULL) {
-    ssl->s3->new_session->cipher = ssl->s3->tmp.new_cipher;
+    hs->new_session->cipher = hs->new_cipher;
 
     /* On new sessions, stash the SNI value in the session. */
     if (hs->hostname != NULL) {
-      OPENSSL_free(ssl->s3->new_session->tlsext_hostname);
-      ssl->s3->new_session->tlsext_hostname = BUF_strdup(hs->hostname);
-      if (ssl->s3->new_session->tlsext_hostname == NULL) {
+      OPENSSL_free(hs->new_session->tlsext_hostname);
+      hs->new_session->tlsext_hostname = BUF_strdup(hs->hostname);
+      if (hs->new_session->tlsext_hostname == NULL) {
         al = SSL_AD_INTERNAL_ERROR;
         goto f_err;
       }
@@ -1025,14 +998,14 @@
       hs->cert_request = 0;
     }
     /* CertificateRequest may only be sent in certificate-based ciphers. */
-    if (!ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+    if (!ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
       hs->cert_request = 0;
     }
 
     if (!hs->cert_request) {
       /* OpenSSL returns X509_V_OK when no certificates are requested. This is
        * classed by them as a bug, but it's assumed by at least NGINX. */
-      ssl->s3->new_session->verify_result = X509_V_OK;
+      hs->new_session->verify_result = X509_V_OK;
     }
   }
 
@@ -1045,7 +1018,7 @@
   /* Now that all parameters are known, initialize the handshake hash and hash
    * the ClientHello. */
   if (!SSL_TRANSCRIPT_init_hash(&hs->transcript, ssl3_protocol_version(ssl),
-                                ssl->s3->tmp.new_cipher->algorithm_prf) ||
+                                hs->new_cipher->algorithm_prf) ||
       !ssl_hash_current_message(hs)) {
     goto f_err;
   }
@@ -1073,7 +1046,7 @@
   /* We only accept ChannelIDs on connections with ECDHE in order to avoid a
    * known attack while we fix ChannelID itself. */
   if (ssl->s3->tlsext_channel_id_valid &&
-      (ssl->s3->tmp.new_cipher->algorithm_mkey & SSL_kECDHE) == 0) {
+      (hs->new_cipher->algorithm_mkey & SSL_kECDHE) == 0) {
     ssl->s3->tlsext_channel_id_valid = 0;
   }
 
@@ -1098,7 +1071,7 @@
   /* TODO(davidben): Implement the TLS 1.1 and 1.2 downgrade sentinels once TLS
    * 1.3 is finalized and we are not implementing a draft version. */
 
-  const SSL_SESSION *session = ssl->s3->new_session;
+  const SSL_SESSION *session = hs->new_session;
   if (ssl->session != NULL) {
     session = ssl->session;
   }
@@ -1110,7 +1083,7 @@
       !CBB_add_u8_length_prefixed(&body, &session_id) ||
       !CBB_add_bytes(&session_id, session->session_id,
                      session->session_id_length) ||
-      !CBB_add_u16(&body, ssl_cipher_get_value(ssl->s3->tmp.new_cipher)) ||
+      !CBB_add_u16(&body, ssl_cipher_get_value(hs->new_cipher)) ||
       !CBB_add_u8(&body, 0 /* no compression */) ||
       !ssl_add_serverhello_tlsext(hs, &body) ||
       !ssl_add_message_cbb(ssl, &cbb)) {
@@ -1142,8 +1115,9 @@
                                  SSL3_MT_CERTIFICATE_STATUS) ||
       !CBB_add_u8(&body, TLSEXT_STATUSTYPE_ocsp) ||
       !CBB_add_u24_length_prefixed(&body, &ocsp_response) ||
-      !CBB_add_bytes(&ocsp_response, CRYPTO_BUFFER_data(ssl->ocsp_response),
-                     CRYPTO_BUFFER_len(ssl->ocsp_response)) ||
+      !CBB_add_bytes(&ocsp_response,
+                     CRYPTO_BUFFER_data(ssl->cert->ocsp_response),
+                     CRYPTO_BUFFER_len(ssl->cert->ocsp_response)) ||
       !ssl_add_message_cbb(ssl, &cbb)) {
     OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
     CBB_cleanup(&cbb);
@@ -1160,8 +1134,8 @@
 
   /* Put together the parameters. */
   if (hs->state == SSL3_ST_SW_KEY_EXCH_A) {
-    uint32_t alg_k = ssl->s3->tmp.new_cipher->algorithm_mkey;
-    uint32_t alg_a = ssl->s3->tmp.new_cipher->algorithm_auth;
+    uint32_t alg_k = hs->new_cipher->algorithm_mkey;
+    uint32_t alg_a = hs->new_cipher->algorithm_auth;
 
     /* Pre-allocate enough room to comfortably fit an ECDHE public key. */
     if (!CBB_init(&cbb, 128)) {
@@ -1214,7 +1188,7 @@
         ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE);
         goto err;
       }
-      ssl->s3->new_session->group_id = group_id;
+      hs->new_session->group_id = group_id;
 
       /* Set up ECDH, generate a key, and emit the public half. */
       if (!SSL_ECDH_CTX_init(&hs->ecdh_ctx, group_id) ||
@@ -1242,7 +1216,7 @@
   }
 
   /* Add a signature. */
-  if (ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+  if (ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
     if (!ssl_has_private_key(ssl)) {
       ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR);
       goto err;
@@ -1439,7 +1413,7 @@
 
       /* OpenSSL returns X509_V_OK when no certificates are received. This is
        * classed by them as a bug, but it's assumed by at least NGINX. */
-      ssl->s3->new_session->verify_result = X509_V_OK;
+      hs->new_session->verify_result = X509_V_OK;
       ssl->s3->tmp.reuse_message = 1;
       return 1;
     }
@@ -1456,29 +1430,28 @@
   CBS certificate_msg;
   CBS_init(&certificate_msg, ssl->init_msg, ssl->init_num);
 
-  sk_CRYPTO_BUFFER_pop_free(ssl->s3->new_session->certs, CRYPTO_BUFFER_free);
+  sk_CRYPTO_BUFFER_pop_free(hs->new_session->certs, CRYPTO_BUFFER_free);
   EVP_PKEY_free(hs->peer_pubkey);
   hs->peer_pubkey = NULL;
   uint8_t alert = SSL_AD_DECODE_ERROR;
-  ssl->s3->new_session->certs =
-      ssl_parse_cert_chain(&alert, &hs->peer_pubkey,
-                           ssl->retain_only_sha256_of_client_certs
-                               ? ssl->s3->new_session->peer_sha256
-                               : NULL,
-                           &certificate_msg, ssl->ctx->pool);
-  if (ssl->s3->new_session->certs == NULL) {
+  hs->new_session->certs = ssl_parse_cert_chain(
+      &alert, &hs->peer_pubkey,
+      ssl->retain_only_sha256_of_client_certs ? hs->new_session->peer_sha256
+                                              : NULL,
+      &certificate_msg, ssl->ctx->pool);
+  if (hs->new_session->certs == NULL) {
     ssl3_send_alert(ssl, SSL3_AL_FATAL, alert);
     return -1;
   }
 
   if (CBS_len(&certificate_msg) != 0 ||
-      !ssl->ctx->x509_method->session_cache_objects(ssl->s3->new_session)) {
+      !ssl->ctx->x509_method->session_cache_objects(hs->new_session)) {
     OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR);
     ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_DECODE_ERROR);
     return -1;
   }
 
-  if (sk_CRYPTO_BUFFER_num(ssl->s3->new_session->certs) == 0) {
+  if (sk_CRYPTO_BUFFER_num(hs->new_session->certs) == 0) {
     /* No client certificate so the handshake buffer may be discarded. */
     SSL_TRANSCRIPT_free_buffer(&hs->transcript);
 
@@ -1499,17 +1472,17 @@
 
     /* OpenSSL returns X509_V_OK when no certificates are received. This is
      * classed by them as a bug, but it's assumed by at least NGINX. */
-    ssl->s3->new_session->verify_result = X509_V_OK;
+    hs->new_session->verify_result = X509_V_OK;
     return 1;
   }
 
   /* The hash will have been filled in. */
   if (ssl->retain_only_sha256_of_client_certs) {
-    ssl->s3->new_session->peer_sha256_valid = 1;
+    hs->new_session->peer_sha256_valid = 1;
   }
 
-  if (!ssl_verify_cert_chain(ssl, &ssl->s3->new_session->verify_result,
-                             ssl->s3->new_session->x509_chain)) {
+  if (!ssl_verify_cert_chain(ssl, &hs->new_session->verify_result,
+                             hs->new_session->x509_chain)) {
     return -1;
   }
   return 1;
@@ -1541,8 +1514,8 @@
   }
 
   CBS_init(&client_key_exchange, ssl->init_msg, ssl->init_num);
-  alg_k = ssl->s3->tmp.new_cipher->algorithm_mkey;
-  alg_a = ssl->s3->tmp.new_cipher->algorithm_auth;
+  alg_k = hs->new_cipher->algorithm_mkey;
+  alg_a = hs->new_cipher->algorithm_auth;
 
   /* If using a PSK key exchange, prepare the pre-shared key. */
   if (alg_a & SSL_aPSK) {
@@ -1570,15 +1543,15 @@
       goto f_err;
     }
 
-    if (!CBS_strdup(&psk_identity, &ssl->s3->new_session->psk_identity)) {
+    if (!CBS_strdup(&psk_identity, &hs->new_session->psk_identity)) {
       al = SSL_AD_INTERNAL_ERROR;
       OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
       goto f_err;
     }
 
     /* Look up the key for the identity. */
-    psk_len = ssl->psk_server_callback(ssl, ssl->s3->new_session->psk_identity,
-                                       psk, sizeof(psk));
+    psk_len = ssl->psk_server_callback(ssl, hs->new_session->psk_identity, psk,
+                                       sizeof(psk));
     if (psk_len > PSK_MAX_PSK_LEN) {
       OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
       al = SSL_AD_INTERNAL_ERROR;
@@ -1763,14 +1736,12 @@
   }
 
   /* Compute the master secret */
-  ssl->s3->new_session->master_key_length =
-      tls1_generate_master_secret(hs, ssl->s3->new_session->master_key,
-                                  premaster_secret, premaster_secret_len);
-  if (ssl->s3->new_session->master_key_length == 0) {
+  hs->new_session->master_key_length = tls1_generate_master_secret(
+      hs, hs->new_session->master_key, premaster_secret, premaster_secret_len);
+  if (hs->new_session->master_key_length == 0) {
     goto err;
   }
-  ssl->s3->new_session->extended_master_secret =
-      ssl->s3->tmp.extended_master_secret;
+  hs->new_session->extended_master_secret = hs->extended_master_secret;
 
   OPENSSL_cleanse(premaster_secret, premaster_secret_len);
   OPENSSL_free(premaster_secret);
@@ -1823,7 +1794,7 @@
     if (!tls12_check_peer_sigalg(ssl, &al, signature_algorithm)) {
       goto f_err;
     }
-    ssl->s3->new_session->peer_signature_algorithm = signature_algorithm;
+    hs->new_session->peer_signature_algorithm = signature_algorithm;
   } else if (hs->peer_pubkey->type == EVP_PKEY_RSA) {
     signature_algorithm = SSL_SIGN_RSA_PKCS1_MD5_SHA1;
   } else if (hs->peer_pubkey->type == EVP_PKEY_EC) {
@@ -1849,7 +1820,7 @@
     uint8_t digest[EVP_MAX_MD_SIZE];
     size_t digest_len;
     if (!SSL_TRANSCRIPT_ssl3_cert_verify_hash(&hs->transcript, digest,
-                                              &digest_len, ssl->s3->new_session,
+                                              &digest_len, hs->new_session,
                                               signature_algorithm)) {
       goto err;
     }
@@ -1946,8 +1917,8 @@
   SSL_SESSION *session_copy = NULL;
   if (ssl->session == NULL) {
     /* Fix the timeout to measure from the ticket issuance time. */
-    ssl_session_rebase_time(ssl, ssl->s3->new_session);
-    session = ssl->s3->new_session;
+    ssl_session_rebase_time(ssl, hs->new_session);
+    session = hs->new_session;
   } else {
     /* We are renewing an existing session. Duplicate the session to adjust the
      * timeout. */
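
The server applies the same state-machine cleanup, and its resumption decision now reads handshake-local state: |ssl_session_is_resumable(hs, session)| plus an explicit consistency check against |hs->extended_master_secret|. A hedged, self-contained restatement of that predicate, with simplified stand-in structs in place of the real SSL_SESSION and SSL_HANDSHAKE:

#include <stdio.h>

struct session { int extended_master_secret; int is_server; };
struct handshake { int extended_master_secret; int is_server; };

/* Illustrative only: the real check also validates the session-ID
 * context, timeout, protocol version, cipher, and the form of any
 * client certificate. */
static int resumption_ok(const struct handshake *hs,
                         const struct session *sess) {
  return hs->is_server == sess->is_server &&
         /* A session negotiated with EMS may only be resumed by a
          * ClientHello that also offers EMS, and vice versa. */
         hs->extended_master_secret == sess->extended_master_secret;
}

int main(void) {
  struct handshake hs = {1, 1};
  struct session sess = {0, 1};
  printf("resumable: %d\n", resumption_ok(&hs, &sess));  /* prints 0 */
  return 0;
}
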
diff --git a/src/ssl/internal.h b/src/ssl/internal.h
index 5b93f47..b2c9fcd 100644
--- a/src/ssl/internal.h
+++ b/src/ssl/internal.h
@@ -854,9 +854,9 @@
 int ssl_add_client_CA_list(SSL *ssl, CBB *cbb);
 
 /* ssl_check_leaf_certificate returns one if |pkey| and |leaf| are suitable as
- * a server's leaf certificate for |ssl|. Otherwise, it returns zero and pushes
+ * a server's leaf certificate for |hs|. Otherwise, it returns zero and pushes
  * an error on the error queue. */
-int ssl_check_leaf_certificate(SSL *ssl, EVP_PKEY *pkey,
+int ssl_check_leaf_certificate(SSL_HANDSHAKE *hs, EVP_PKEY *pkey,
                                const CRYPTO_BUFFER *leaf);
 
 
@@ -1049,6 +1049,13 @@
   /* peer_pubkey is the public key parsed from the peer's leaf certificate. */
   EVP_PKEY *peer_pubkey;
 
+  /* new_session is the new mutable session being established by the current
+   * handshake. It should not be cached. */
+  SSL_SESSION *new_session;
+
+  /* new_cipher is the cipher being negotiated in this handshake. */
+  const SSL_CIPHER *new_cipher;
+
   /* key_block is the record-layer key block for TLS 1.2 and earlier. */
   uint8_t *key_block;
   uint8_t key_block_len;
@@ -1100,6 +1107,10 @@
   /* v2_clienthello is one if we received a V2ClientHello. */
   unsigned v2_clienthello:1;
 
+  /* extended_master_secret is one if the extended master secret extension is
+   * negotiated in this handshake. */
+  unsigned extended_master_secret:1;
+
   /* client_version is the value sent or received in the ClientHello version. */
   uint16_t client_version;
 } /* SSL_HANDSHAKE */;
@@ -1323,6 +1334,17 @@
   /* Optional X509_STORE for certificate validation. If NULL the parent SSL_CTX
    * store is used instead. */
   X509_STORE *verify_store;
+
+  /* Signed certificate timestamp list to be sent to the client, if requested */
+  CRYPTO_BUFFER *signed_cert_timestamp_list;
+
+  /* OCSP response to be sent to the client, if requested. */
+  CRYPTO_BUFFER *ocsp_response;
+
+  /* sid_ctx partitions the session space within a shared session cache or
+   * ticket key. Only sessions with a matching value will be accepted. */
+  uint8_t sid_ctx_length;
+  uint8_t sid_ctx[SSL_MAX_SID_CTX_LENGTH];
 } CERT;
 
 /* SSL_METHOD is a compatibility structure to support the legacy version-locked
@@ -1594,9 +1616,6 @@
    * TODO(davidben): Move everything not needed after the handshake completes to
    * |hs| and remove this. */
   struct {
-    /* used to hold the new cipher we are going to use */
-    const SSL_CIPHER *new_cipher;
-
     int message_type;
 
     int reuse_message;
@@ -1604,20 +1623,8 @@
     uint8_t new_mac_secret_len;
     uint8_t new_key_len;
     uint8_t new_fixed_iv_len;
-
-    /* extended_master_secret indicates whether the extended master secret
-     * computation is used in this handshake. Note that this is different from
-     * whether it was used for the current session. If this is a resumption
-     * handshake then EMS might be negotiated in the client and server hello
-     * messages, but it doesn't matter if the session that's being resumed
-     * didn't use it to create the master secret initially. */
-    char extended_master_secret;
   } tmp;
 
-  /* new_session is the new mutable session being established by the current
-   * handshake. It should not be cached. */
-  SSL_SESSION *new_session;
-
   /* established_session is the session established by the connection. This
    * session is only filled upon the completion of the handshake and is
    * immutable. */
@@ -1798,11 +1805,6 @@
    * milliseconds. It's used to initialize the timer any time it's restarted. */
   unsigned initial_timeout_duration_ms;
 
-  /* the session_id_context is used to ensure sessions are only reused
-   * in the appropriate context */
-  uint8_t sid_ctx_length;
-  uint8_t sid_ctx[SSL_MAX_SID_CTX_LENGTH];
-
   /* session is the configured session to be offered by the client. This session
    * is immutable. */
   SSL_SESSION *session;
@@ -1887,12 +1889,6 @@
    * hash of the peer's certificate and then discard it to save memory and
    * session space. Only effective on the server side. */
   unsigned retain_only_sha256_of_client_certs:1;
-
-  /* Signed certificate timestamp list to be sent to the client, if requested */
-  CRYPTO_BUFFER *signed_cert_timestamp_list;
-
-  /* OCSP response to be sent to the client, if requested. */
-  CRYPTO_BUFFER *ocsp_response;
 };
 
 /* From draft-ietf-tls-tls13-18, used in determining PSK modes. */
@@ -1936,9 +1932,10 @@
  * it has expired. */
 int ssl_session_is_time_valid(const SSL *ssl, const SSL_SESSION *session);
 
-/* ssl_session_is_resumable returns one if |session| is resumable for |ssl| and
+/* ssl_session_is_resumable returns one if |session| is resumable for |hs| and
  * zero otherwise. */
-int ssl_session_is_resumable(const SSL *ssl, const SSL_SESSION *session);
+int ssl_session_is_resumable(const SSL_HANDSHAKE *hs,
+                             const SSL_SESSION *session);
 
 /* SSL_SESSION_get_digest returns the digest used in |session|. If the digest is
  * invalid, it returns NULL. */
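
The internal.h hunks above are the heart of this change: handshake-scoped fields move onto |SSL_HANDSHAKE| so they are released with the handshake, while certificate-scoped configuration (SCT list, OCSP response, session-ID context) moves onto |CERT| so it follows the certificate through SNI context switches. A compilable stand-in sketch of the resulting ownership split, with field types simplified from the real structs:

#include <stdint.h>

#define MAX_SID_CTX 32  /* SSL_MAX_SID_CTX_LENGTH in the real headers. */

typedef struct {
  void *new_session;       /* mutable session; dies with the handshake */
  const void *new_cipher;  /* cipher negotiated in this handshake */
  unsigned extended_master_secret : 1;
} HANDSHAKE_SKETCH;

typedef struct {
  void *signed_cert_timestamp_list;  /* CRYPTO_BUFFER in the real code */
  void *ocsp_response;               /* CRYPTO_BUFFER in the real code */
  uint8_t sid_ctx_length;
  uint8_t sid_ctx[MAX_SID_CTX];
} CERT_SKETCH;

int main(void) { return 0; }
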
diff --git a/src/ssl/s3_both.c b/src/ssl/s3_both.c
index d3f9421..7fd09c6 100644
--- a/src/ssl/s3_both.c
+++ b/src/ssl/s3_both.c
@@ -167,6 +167,7 @@
   OPENSSL_free(hs->cookie);
   OPENSSL_free(hs->key_share_bytes);
   OPENSSL_free(hs->public_key);
+  SSL_SESSION_free(hs->new_session);
   OPENSSL_free(hs->peer_sigalgs);
   OPENSSL_free(hs->peer_supported_group_list);
   OPENSSL_free(hs->peer_key);
@@ -678,7 +679,6 @@
 }
 
 int ssl3_get_message(SSL *ssl) {
-again:
   /* Re-create the handshake buffer if needed. */
   if (ssl->init_buf == NULL) {
     ssl->init_buf = BUF_MEM_new();
@@ -733,16 +733,6 @@
   ssl->s3->tmp.message_type = ((const uint8_t *)ssl->init_buf->data)[0];
   ssl->init_msg = (uint8_t*)ssl->init_buf->data + SSL3_HM_HEADER_LENGTH;
   ssl->init_num = ssl->init_buf->length - SSL3_HM_HEADER_LENGTH;
-
-  /* Ignore stray HelloRequest messages in the handshake before TLS 1.3. Per RFC
-   * 5246, section 7.4.1.1, the server may send HelloRequest at any time. */
-  if (!ssl->server && SSL_in_init(ssl) &&
-      (!ssl->s3->have_version || ssl3_protocol_version(ssl) < TLS1_3_VERSION) &&
-      ssl->s3->tmp.message_type == SSL3_MT_HELLO_REQUEST &&
-      ssl->init_num == 0) {
-    goto again;
-  }
-
   return 1;
 }
 
diff --git a/src/ssl/s3_lib.c b/src/ssl/s3_lib.c
index 1c723cd..57a27c7 100644
--- a/src/ssl/s3_lib.c
+++ b/src/ssl/s3_lib.c
@@ -197,7 +197,6 @@
   ssl_read_buffer_clear(ssl);
   ssl_write_buffer_clear(ssl);
 
-  SSL_SESSION_free(ssl->s3->new_session);
   SSL_SESSION_free(ssl->s3->established_session);
   ssl_handshake_free(ssl->s3->hs);
   OPENSSL_free(ssl->s3->next_proto_negotiated);
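
These two hunks are the matching halves of the ownership move: |new_session| is now released in ssl_handshake_free (s3_both.c) instead of the connection-level ssl3_free (s3_lib.c), so a session abandoned mid-handshake cannot outlive the handshake that created it. The pattern, in a hedged stand-alone form with a dummy payload type:

#include <stdlib.h>

struct session { int placeholder; };
struct handshake { struct session *new_session; };

static void session_free(struct session *sess) { free(sess); }

/* Everything the handshake allocated is released here, in one place,
 * whether the handshake completed or was torn down early. */
static void handshake_free(struct handshake *hs) {
  if (hs == NULL) {
    return;
  }
  session_free(hs->new_session);  /* free(NULL) is a no-op. */
  free(hs);
}

int main(void) {
  struct handshake *hs = calloc(1, sizeof(*hs));
  if (hs != NULL) {
    hs->new_session = calloc(1, sizeof(*hs->new_session));
    handshake_free(hs);
  }
  return 0;
}
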
diff --git a/src/ssl/ssl_asn1.c b/src/ssl/ssl_asn1.c
index 3582864..3533225 100644
--- a/src/ssl/ssl_asn1.c
+++ b/src/ssl/ssl_asn1.c
@@ -130,6 +130,7 @@
  *     peerSignatureAlgorithm  [23] INTEGER OPTIONAL,
  *     ticketMaxEarlyData      [24] INTEGER OPTIONAL,
  *     authTimeout             [25] INTEGER OPTIONAL, -- defaults to timeout
+ *     earlyALPN               [26] OCTET STRING OPTIONAL,
  * }
  *
  * Note: historically this serialization has included other optional
@@ -186,6 +187,8 @@
     CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 24;
 static const int kAuthTimeoutTag =
     CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 25;
+static const int kEarlyALPNTag =
+    CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 26;
 
 static int SSL_SESSION_to_bytes_full(const SSL_SESSION *in, uint8_t **out_data,
                                      size_t *out_len, int for_ticket) {
@@ -412,6 +415,16 @@
     goto err;
   }
 
+  if (in->early_alpn) {
+    if (!CBB_add_asn1(&session, &child, kEarlyALPNTag) ||
+        !CBB_add_asn1(&child, &child2, CBS_ASN1_OCTETSTRING) ||
+        !CBB_add_bytes(&child2, (const uint8_t *)in->early_alpn,
+                       in->early_alpn_len)) {
+      OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
+      goto err;
+    }
+  }
+
   if (!CBB_finish(&cbb, out_data, out_len)) {
     OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
     goto err;
@@ -800,6 +813,8 @@
                              kTicketMaxEarlyDataTag, 0) ||
       !SSL_SESSION_parse_long(&session, &ret->auth_timeout, kAuthTimeoutTag,
                               ret->timeout) ||
+      !SSL_SESSION_parse_octet_string(&session, &ret->early_alpn,
+                                      &ret->early_alpn_len, kEarlyALPNTag) ||
       CBS_len(&session) != 0) {
     OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_SSL_SESSION);
     goto err;
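
The ssl_asn1.c hunks add an optional [26] earlyALPN OCTET STRING to the SSL_SESSION serialization, encoded and parsed with BoringSSL's CBB/CBS helpers. A minimal sketch of the encoding side, mirroring the hunk above — the CBB calls are the real API, but error handling and the surrounding SEQUENCE are trimmed:

#include <openssl/bytestring.h>

static const int kEarlyALPNTag =
    CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 26;

static int add_early_alpn(CBB *session, const uint8_t *early_alpn,
                          size_t early_alpn_len) {
  if (early_alpn == NULL) {
    return 1;  /* The field is OPTIONAL; omit it entirely. */
  }
  CBB child, child2;
  return CBB_add_asn1(session, &child, kEarlyALPNTag) &&
         CBB_add_asn1(&child, &child2, CBS_ASN1_OCTETSTRING) &&
         CBB_add_bytes(&child2, early_alpn, early_alpn_len) &&
         CBB_flush(session);
}
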
diff --git a/src/ssl/ssl_cert.c b/src/ssl/ssl_cert.c
index 4177a48..c60c6fa 100644
--- a/src/ssl/ssl_cert.c
+++ b/src/ssl/ssl_cert.c
@@ -203,6 +203,19 @@
     ret->verify_store = cert->verify_store;
   }
 
+  if (cert->signed_cert_timestamp_list != NULL) {
+    CRYPTO_BUFFER_up_ref(cert->signed_cert_timestamp_list);
+    ret->signed_cert_timestamp_list = cert->signed_cert_timestamp_list;
+  }
+
+  if (cert->ocsp_response != NULL) {
+    CRYPTO_BUFFER_up_ref(cert->ocsp_response);
+    ret->ocsp_response = cert->ocsp_response;
+  }
+
+  ret->sid_ctx_length = cert->sid_ctx_length;
+  OPENSSL_memcpy(ret->sid_ctx, cert->sid_ctx, sizeof(ret->sid_ctx));
+
   return ret;
 
 err:
@@ -235,6 +248,8 @@
   ssl_cert_clear_certs(c);
   OPENSSL_free(c->sigalgs);
   X509_STORE_free(c->verify_store);
+  CRYPTO_BUFFER_free(c->signed_cert_timestamp_list);
+  CRYPTO_BUFFER_free(c->ocsp_response);
 
   OPENSSL_free(c);
 }
@@ -883,20 +898,20 @@
   ssl_cert_set_cert_cb(ssl->cert, cb, arg);
 }
 
-int ssl_check_leaf_certificate(SSL *ssl, EVP_PKEY *pkey,
+int ssl_check_leaf_certificate(SSL_HANDSHAKE *hs, EVP_PKEY *pkey,
                                const CRYPTO_BUFFER *leaf) {
+  SSL *const ssl = hs->ssl;
   assert(ssl3_protocol_version(ssl) < TLS1_3_VERSION);
 
   /* Check the certificate's type matches the cipher. */
-  const SSL_CIPHER *cipher = ssl->s3->tmp.new_cipher;
-  int expected_type = ssl_cipher_get_key_type(cipher);
+  int expected_type = ssl_cipher_get_key_type(hs->new_cipher);
   assert(expected_type != EVP_PKEY_NONE);
   if (pkey->type != expected_type) {
     OPENSSL_PUT_ERROR(SSL, SSL_R_WRONG_CERTIFICATE_TYPE);
     return 0;
   }
 
-  if (cipher->algorithm_auth & SSL_aECDSA) {
+  if (hs->new_cipher->algorithm_auth & SSL_aECDSA) {
     CBS leaf_cbs;
     CBS_init(&leaf_cbs, CRYPTO_BUFFER_data(leaf), CRYPTO_BUFFER_len(leaf));
     /* ECDSA and ECDH certificates use the same public key format. Instead,
@@ -956,3 +971,42 @@
   SSL_CTX_set_cert_cb(ctx, do_client_cert_cb, NULL);
   ctx->client_cert_cb = cb;
 }
+
+static int set_signed_cert_timestamp_list(CERT *cert, const uint8_t *list,
+                                           size_t list_len) {
+  CBS sct_list;
+  CBS_init(&sct_list, list, list_len);
+  if (!ssl_is_sct_list_valid(&sct_list)) {
+    OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_SCT_LIST);
+    return 0;
+  }
+
+  CRYPTO_BUFFER_free(cert->signed_cert_timestamp_list);
+  cert->signed_cert_timestamp_list =
+      CRYPTO_BUFFER_new(CBS_data(&sct_list), CBS_len(&sct_list), NULL);
+  return cert->signed_cert_timestamp_list != NULL;
+}
+
+int SSL_CTX_set_signed_cert_timestamp_list(SSL_CTX *ctx, const uint8_t *list,
+                                           size_t list_len) {
+  return set_signed_cert_timestamp_list(ctx->cert, list, list_len);
+}
+
+int SSL_set_signed_cert_timestamp_list(SSL *ssl, const uint8_t *list,
+                                       size_t list_len) {
+  return set_signed_cert_timestamp_list(ssl->cert, list, list_len);
+}
+
+int SSL_CTX_set_ocsp_response(SSL_CTX *ctx, const uint8_t *response,
+                              size_t response_len) {
+  CRYPTO_BUFFER_free(ctx->cert->ocsp_response);
+  ctx->cert->ocsp_response = CRYPTO_BUFFER_new(response, response_len, NULL);
+  return ctx->cert->ocsp_response != NULL;
+}
+
+int SSL_set_ocsp_response(SSL *ssl, const uint8_t *response,
+                          size_t response_len) {
+  CRYPTO_BUFFER_free(ssl->cert->ocsp_response);
+  ssl->cert->ocsp_response = CRYPTO_BUFFER_new(response, response_len, NULL);
+  return ssl->cert->ocsp_response != NULL;
+}
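
With the setters relocated onto |CERT|, an SCT list or OCSP response configured on a second SSL_CTX now survives an SNI-callback context switch — exactly what the new ssl_test.cc coverage below checks. A hedged usage sketch; the payload bytes (taken from that test) are structurally valid for the setter's length checks but are not real SCT or OCSP data:

#include <openssl/ssl.h>

static int configure_staples(SSL_CTX *ctx) {
  static const uint8_t kSCTList[] = {0, 6, 0, 4, 5, 6, 7, 8};
  static const uint8_t kOCSPResponse[] = {1, 2, 3, 4};
  return SSL_CTX_set_signed_cert_timestamp_list(ctx, kSCTList,
                                                sizeof(kSCTList)) &&
         SSL_CTX_set_ocsp_response(ctx, kOCSPResponse,
                                   sizeof(kOCSPResponse));
}
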
diff --git a/src/ssl/ssl_lib.c b/src/ssl/ssl_lib.c
index c946b77..d0151bb 100644
--- a/src/ssl/ssl_lib.c
+++ b/src/ssl/ssl_lib.c
@@ -363,8 +363,6 @@
   OPENSSL_free(ctx->psk_identity_hint);
   OPENSSL_free(ctx->supported_group_list);
   OPENSSL_free(ctx->alpn_client_proto_list);
-  CRYPTO_BUFFER_free(ctx->signed_cert_timestamp_list);
-  CRYPTO_BUFFER_free(ctx->ocsp_response);
   EVP_PKEY_free(ctx->tlsext_channel_id_private);
 
   OPENSSL_free(ctx);
@@ -405,9 +403,6 @@
   ssl->msg_callback = ctx->msg_callback;
   ssl->msg_callback_arg = ctx->msg_callback_arg;
   ssl->verify_mode = ctx->verify_mode;
-  ssl->sid_ctx_length = ctx->sid_ctx_length;
-  assert(ssl->sid_ctx_length <= sizeof ssl->sid_ctx);
-  OPENSSL_memcpy(&ssl->sid_ctx, &ctx->sid_ctx, sizeof(ssl->sid_ctx));
   ssl->verify_callback = ctx->default_verify_callback;
   ssl->retain_only_sha256_of_client_certs =
       ctx->retain_only_sha256_of_client_certs;
@@ -472,18 +467,6 @@
   ssl->signed_cert_timestamps_enabled = ctx->signed_cert_timestamps_enabled;
   ssl->ocsp_stapling_enabled = ctx->ocsp_stapling_enabled;
 
-  /* If the context has an SCT list, use it. */
-  if (ctx->signed_cert_timestamp_list != NULL) {
-    CRYPTO_BUFFER_up_ref(ctx->signed_cert_timestamp_list);
-    ssl->signed_cert_timestamp_list = ctx->signed_cert_timestamp_list;
-  }
-
-  /* If the context has an OCSP response, use it. */
-  if (ctx->ocsp_response != NULL) {
-    CRYPTO_BUFFER_up_ref(ctx->ocsp_response);
-    ssl->ocsp_response = ctx->ocsp_response;
-  }
-
   return ssl;
 
 err:
@@ -522,8 +505,6 @@
   OPENSSL_free(ssl->psk_identity_hint);
   sk_X509_NAME_pop_free(ssl->client_CA, X509_NAME_free);
   sk_SRTP_PROTECTION_PROFILE_free(ssl->srtp_profiles);
-  CRYPTO_BUFFER_free(ssl->signed_cert_timestamp_list);
-  CRYPTO_BUFFER_free(ssl->ocsp_response);
 
   if (ssl->method != NULL) {
     ssl->method->ssl_free(ssl);
@@ -800,10 +781,11 @@
     return -1;
   }
 
-  /* We can't shutdown properly if we are in the middle of a handshake. */
+  /* If we are in the middle of a handshake, silently succeed. Consumers often
+   * call this function before |SSL_free|, whether the handshake succeeded or
+   * not. We assume the caller has already handled failed handshakes. */
   if (SSL_in_init(ssl)) {
-    OPENSSL_PUT_ERROR(SSL, SSL_R_SHUTDOWN_WHILE_IN_INIT);
-    return -1;
+    return 1;
   }
 
   if (ssl->quiet_shutdown) {
@@ -1088,37 +1070,32 @@
   return 0;
 }
 
-int SSL_CTX_set_session_id_context(SSL_CTX *ctx, const uint8_t *sid_ctx,
+static int set_session_id_context(CERT *cert, const uint8_t *sid_ctx,
                                    size_t sid_ctx_len) {
-  if (sid_ctx_len > sizeof(ctx->sid_ctx)) {
+  if (sid_ctx_len > sizeof(cert->sid_ctx)) {
     OPENSSL_PUT_ERROR(SSL, SSL_R_SSL_SESSION_ID_CONTEXT_TOO_LONG);
     return 0;
   }
 
-  assert(sizeof(ctx->sid_ctx) < 256);
-  ctx->sid_ctx_length = (uint8_t)sid_ctx_len;
-  OPENSSL_memcpy(ctx->sid_ctx, sid_ctx, sid_ctx_len);
-
+  OPENSSL_COMPILE_ASSERT(sizeof(cert->sid_ctx) < 256, sid_ctx_too_large);
+  cert->sid_ctx_length = (uint8_t)sid_ctx_len;
+  OPENSSL_memcpy(cert->sid_ctx, sid_ctx, sid_ctx_len);
   return 1;
 }
 
+int SSL_CTX_set_session_id_context(SSL_CTX *ctx, const uint8_t *sid_ctx,
+                                   size_t sid_ctx_len) {
+  return set_session_id_context(ctx->cert, sid_ctx, sid_ctx_len);
+}
+
 int SSL_set_session_id_context(SSL *ssl, const uint8_t *sid_ctx,
                                size_t sid_ctx_len) {
-  if (sid_ctx_len > sizeof(ssl->sid_ctx)) {
-    OPENSSL_PUT_ERROR(SSL, SSL_R_SSL_SESSION_ID_CONTEXT_TOO_LONG);
-    return 0;
-  }
-
-  assert(sizeof(ssl->sid_ctx) < 256);
-  ssl->sid_ctx_length = (uint8_t)sid_ctx_len;
-  OPENSSL_memcpy(ssl->sid_ctx, sid_ctx, sid_ctx_len);
-
-  return 1;
+  return set_session_id_context(ssl->cert, sid_ctx, sid_ctx_len);
 }
 
 const uint8_t *SSL_get0_session_id_context(const SSL *ssl, size_t *out_len) {
-  *out_len = ssl->sid_ctx_length;
-  return ssl->sid_ctx;
+  *out_len = ssl->cert->sid_ctx_length;
+  return ssl->cert->sid_ctx;
 }
 
 void ssl_cipher_preference_list_free(
@@ -1247,11 +1224,26 @@
 int SSL_get_verify_mode(const SSL *ssl) { return ssl->verify_mode; }
 
 int SSL_get_extms_support(const SSL *ssl) {
+  /* TLS 1.3 does not require extended master secret and always reports as
+   * supporting it. */
   if (!ssl->s3->have_version) {
     return 0;
   }
-  return ssl3_protocol_version(ssl) >= TLS1_3_VERSION ||
-         ssl->s3->tmp.extended_master_secret == 1;
+  if (ssl3_protocol_version(ssl) >= TLS1_3_VERSION) {
+    return 1;
+  }
+
+  /* If the initial handshake completed, query the established session. */
+  if (ssl->s3->established_session != NULL) {
+    return ssl->s3->established_session->extended_master_secret;
+  }
+
+  /* Otherwise, query the in-progress handshake. */
+  if (ssl->s3->hs != NULL) {
+    return ssl->s3->hs->extended_master_secret;
+  }
+  assert(0);
+  return 0;
 }
 
 int SSL_CTX_get_read_ahead(const SSL_CTX *ctx) { return 0; }
@@ -1583,18 +1575,16 @@
   ctx->signed_cert_timestamps_enabled = 1;
 }
 
-int SSL_enable_signed_cert_timestamps(SSL *ssl) {
+void SSL_enable_signed_cert_timestamps(SSL *ssl) {
   ssl->signed_cert_timestamps_enabled = 1;
-  return 1;
 }
 
 void SSL_CTX_enable_ocsp_stapling(SSL_CTX *ctx) {
   ctx->ocsp_stapling_enabled = 1;
 }
 
-int SSL_enable_ocsp_stapling(SSL *ssl) {
+void SSL_enable_ocsp_stapling(SSL *ssl) {
   ssl->ocsp_stapling_enabled = 1;
-  return 1;
 }
 
 void SSL_get0_signed_cert_timestamp_list(const SSL *ssl, const uint8_t **out,
@@ -1624,52 +1614,6 @@
   *out_len = session->ocsp_response_length;
 }
 
-int SSL_CTX_set_signed_cert_timestamp_list(SSL_CTX *ctx, const uint8_t *list,
-                                           size_t list_len) {
-  CBS sct_list;
-  CBS_init(&sct_list, list, list_len);
-  if (!ssl_is_sct_list_valid(&sct_list)) {
-    OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_SCT_LIST);
-    return 0;
-  }
-
-  CRYPTO_BUFFER_free(ctx->signed_cert_timestamp_list);
-  ctx->signed_cert_timestamp_list = CRYPTO_BUFFER_new(CBS_data(&sct_list),
-                                                      CBS_len(&sct_list),
-                                                      NULL);
-  return ctx->signed_cert_timestamp_list != NULL;
-}
-
-int SSL_set_signed_cert_timestamp_list(SSL *ssl, const uint8_t *list,
-                                       size_t list_len) {
-  CBS sct_list;
-  CBS_init(&sct_list, list, list_len);
-  if (!ssl_is_sct_list_valid(&sct_list)) {
-    OPENSSL_PUT_ERROR(SSL, SSL_R_INVALID_SCT_LIST);
-    return 0;
-  }
-
-  CRYPTO_BUFFER_free(ssl->signed_cert_timestamp_list);
-  ssl->signed_cert_timestamp_list = CRYPTO_BUFFER_new(CBS_data(&sct_list),
-                                                      CBS_len(&sct_list),
-                                                      NULL);
-  return ssl->signed_cert_timestamp_list != NULL;
-}
-
-int SSL_CTX_set_ocsp_response(SSL_CTX *ctx, const uint8_t *response,
-                              size_t response_len) {
-  CRYPTO_BUFFER_free(ctx->ocsp_response);
-  ctx->ocsp_response = CRYPTO_BUFFER_new(response, response_len, NULL);
-  return ctx->ocsp_response != NULL;
-}
-
-int SSL_set_ocsp_response(SSL *ssl, const uint8_t *response,
-                          size_t response_len) {
-  CRYPTO_BUFFER_free(ssl->ocsp_response);
-  ssl->ocsp_response = CRYPTO_BUFFER_new(response, response_len, NULL);
-  return ssl->ocsp_response != NULL;
-}
-
 int SSL_set_tlsext_host_name(SSL *ssl, const char *name) {
   OPENSSL_free(ssl->tlsext_hostname);
   ssl->tlsext_hostname = NULL;
@@ -2076,10 +2020,6 @@
   SSL_CTX_free(ssl->ctx);
   ssl->ctx = ctx;
 
-  ssl->sid_ctx_length = ctx->sid_ctx_length;
-  assert(ssl->sid_ctx_length <= sizeof(ssl->sid_ctx));
-  OPENSSL_memcpy(ssl->sid_ctx, ctx->sid_ctx, sizeof(ssl->sid_ctx));
-
   return ssl->ctx;
 }
 
@@ -2094,12 +2034,7 @@
 }
 
 int SSL_state(const SSL *ssl) {
-  if (ssl->s3->hs == NULL) {
-    assert(ssl->s3->initial_handshake_complete);
-    return SSL_ST_OK;
-  }
-
-  return ssl->s3->hs->state;
+  return SSL_in_init(ssl) ? SSL_ST_INIT : SSL_ST_OK;
 }
 
 void SSL_set_state(SSL *ssl, int state) { }
@@ -2345,11 +2280,12 @@
 }
 
 int SSL_is_init_finished(const SSL *ssl) {
-  return SSL_state(ssl) == SSL_ST_OK;
+  return !SSL_in_init(ssl);
 }
 
 int SSL_in_init(const SSL *ssl) {
-  return (SSL_state(ssl) & SSL_ST_INIT) != 0;
+  SSL_HANDSHAKE *hs = ssl->s3->hs;
+  return hs != NULL && hs->state != SSL_ST_OK;
 }
 
 int SSL_in_false_start(const SSL *ssl) {
@@ -2575,10 +2511,11 @@
 }
 
 const SSL_CIPHER *SSL_get_pending_cipher(const SSL *ssl) {
-  if (!SSL_in_init(ssl)) {
+  SSL_HANDSHAKE *hs = ssl->s3->hs;
+  if (hs == NULL) {
     return NULL;
   }
-  return ssl->s3->tmp.new_cipher;
+  return hs->new_cipher;
 }
 
 void SSL_set_retain_only_sha256_of_client_certs(SSL *ssl, int enabled) {
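
A behavioral note on the SSL_shutdown hunk above: shutting down a connection whose handshake never completed now returns 1 instead of failing with SSL_R_SHUTDOWN_WHILE_IN_INIT, matching how callers typically pair it with |SSL_free|. A hedged sketch of the resulting teardown idiom:

#include <openssl/ssl.h>

static void close_connection(SSL *ssl) {
  /* Succeeds quietly even if the handshake is still in progress; any
   * handshake failure is assumed to have been handled already. */
  (void)SSL_shutdown(ssl);
  SSL_free(ssl);
}
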
diff --git a/src/ssl/ssl_session.c b/src/ssl/ssl_session.c
index b71b994..bbe88c3 100644
--- a/src/ssl/ssl_session.c
+++ b/src/ssl/ssl_session.c
@@ -280,6 +280,15 @@
     new_session->ticket_age_add = session->ticket_age_add;
     new_session->ticket_max_early_data = session->ticket_max_early_data;
     new_session->extended_master_secret = session->extended_master_secret;
+
+    if (session->early_alpn != NULL) {
+      new_session->early_alpn =
+          BUF_memdup(session->early_alpn, session->early_alpn_len);
+      if (new_session->early_alpn == NULL) {
+        goto err;
+      }
+    }
+    new_session->early_alpn_len = session->early_alpn_len;
   }
 
   /* Copy the ticket. */
@@ -373,6 +382,7 @@
   OPENSSL_free(session->tlsext_signed_cert_timestamp_list);
   OPENSSL_free(session->ocsp_response);
   OPENSSL_free(session->psk_identity);
+  OPENSSL_free(session->early_alpn);
   OPENSSL_cleanse(session, sizeof(*session));
   OPENSSL_free(session);
 }
@@ -458,8 +468,8 @@
   if (!SSL_in_init(ssl)) {
     return ssl->s3->established_session;
   }
-  if (ssl->s3->new_session != NULL) {
-    return ssl->s3->new_session;
+  if (ssl->s3->hs->new_session != NULL) {
+    return ssl->s3->hs->new_session;
   }
   return ssl->session;
 }
@@ -550,19 +560,20 @@
     session->session_id_length = 0;
   }
 
-  if (ssl->sid_ctx_length > sizeof(session->sid_ctx)) {
+  if (ssl->cert->sid_ctx_length > sizeof(session->sid_ctx)) {
     OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
     goto err;
   }
-  OPENSSL_memcpy(session->sid_ctx, ssl->sid_ctx, ssl->sid_ctx_length);
-  session->sid_ctx_length = ssl->sid_ctx_length;
+  OPENSSL_memcpy(session->sid_ctx, ssl->cert->sid_ctx,
+                 ssl->cert->sid_ctx_length);
+  session->sid_ctx_length = ssl->cert->sid_ctx_length;
 
   /* The session is marked not resumable until it is completely filled in. */
   session->not_resumable = 1;
   session->verify_result = X509_V_ERR_INVALID_CALL;
 
-  SSL_SESSION_free(ssl->s3->new_session);
-  ssl->s3->new_session = session;
+  SSL_SESSION_free(hs->new_session);
+  hs->new_session = session;
   ssl_set_session(ssl, NULL);
   return 1;
 
@@ -668,9 +679,9 @@
     return 0;
   }
 
-  return session->sid_ctx_length == ssl->sid_ctx_length &&
-         OPENSSL_memcmp(session->sid_ctx, ssl->sid_ctx, ssl->sid_ctx_length) ==
-             0;
+  return session->sid_ctx_length == ssl->cert->sid_ctx_length &&
+         OPENSSL_memcmp(session->sid_ctx, ssl->cert->sid_ctx,
+                        ssl->cert->sid_ctx_length) == 0;
 }
 
 int ssl_session_is_time_valid(const SSL *ssl, const SSL_SESSION *session) {
@@ -689,18 +700,20 @@
   return session->timeout > (long)now.tv_sec - session->time;
 }
 
-int ssl_session_is_resumable(const SSL *ssl, const SSL_SESSION *session) {
+int ssl_session_is_resumable(const SSL_HANDSHAKE *hs,
+                             const SSL_SESSION *session) {
+  const SSL *const ssl = hs->ssl;
   return ssl_session_is_context_valid(ssl, session) &&
          /* The session must have been created by the same type of end point as
           * we're now using it with. */
-         session->is_server == ssl->server &&
+         ssl->server == session->is_server &&
          /* The session must not be expired. */
          ssl_session_is_time_valid(ssl, session) &&
          /* Only resume if the session's version matches the negotiated
            * version. */
          ssl->version == session->ssl_version &&
          /* Only resume if the session's cipher matches the negotiated one. */
-         ssl->s3->tmp.new_cipher == session->cipher &&
+         hs->new_cipher == session->cipher &&
          /* If the session contains a client certificate (either the full
           * certificate or just the hash) then require that the form of the
           * certificate matches the current configuration. */
@@ -898,7 +911,9 @@
 
 int SSL_set_session(SSL *ssl, SSL_SESSION *session) {
   /* SSL_set_session may only be called before the handshake has started. */
-  if (SSL_state(ssl) != SSL_ST_INIT || ssl->s3->initial_handshake_complete) {
+  if (ssl->s3->initial_handshake_complete ||
+      ssl->s3->hs == NULL ||
+      ssl->s3->hs->state != SSL_ST_INIT) {
     abort();
   }
 
diff --git a/src/ssl/ssl_stat.c b/src/ssl/ssl_stat.c
index 479288a..571b4a9 100644
--- a/src/ssl/ssl_stat.c
+++ b/src/ssl/ssl_stat.c
@@ -83,11 +83,22 @@
 
 #include <openssl/ssl.h>
 
+#include <assert.h>
+
 #include "internal.h"
 
 
+static int ssl_state(const SSL *ssl) {
+  if (ssl->s3->hs == NULL) {
+    assert(ssl->s3->initial_handshake_complete);
+    return SSL_ST_OK;
+  }
+
+  return ssl->s3->hs->state;
+}
+
 const char *SSL_state_string_long(const SSL *ssl) {
-  switch (SSL_state(ssl)) {
+  switch (ssl_state(ssl)) {
     case SSL_ST_ACCEPT:
       return "before accept initialization";
 
@@ -203,7 +214,7 @@
 }
 
 const char *SSL_state_string(const SSL *ssl) {
-  switch (SSL_state(ssl)) {
+  switch (ssl_state(ssl)) {
     case SSL_ST_ACCEPT:
       return "AINIT ";
 
diff --git a/src/ssl/ssl_test.cc b/src/ssl/ssl_test.cc
index dfab976..4e0c274 100644
--- a/src/ssl/ssl_test.cc
+++ b/src/ssl/ssl_test.cc
@@ -2415,6 +2415,9 @@
   // Test that switching the |SSL_CTX| at the SNI callback behaves correctly.
   static const uint16_t kECDSAWithSHA256 = SSL_SIGN_ECDSA_SECP256R1_SHA256;
 
+  static const uint8_t kSCTList[] = {0, 6, 0, 4, 5, 6, 7, 8};
+  static const uint8_t kOCSPResponse[] = {1, 2, 3, 4};
+
   bssl::UniquePtr<SSL_CTX> server_ctx(SSL_CTX_new(method));
   bssl::UniquePtr<SSL_CTX> server_ctx2(SSL_CTX_new(method));
   bssl::UniquePtr<SSL_CTX> client_ctx(SSL_CTX_new(method));
@@ -2423,6 +2426,10 @@
       !SSL_CTX_use_PrivateKey(server_ctx.get(), key.get()) ||
       !SSL_CTX_use_certificate(server_ctx2.get(), cert2.get()) ||
       !SSL_CTX_use_PrivateKey(server_ctx2.get(), key2.get()) ||
+      !SSL_CTX_set_signed_cert_timestamp_list(server_ctx2.get(), kSCTList,
+                                              sizeof(kSCTList)) ||
+      !SSL_CTX_set_ocsp_response(server_ctx2.get(), kOCSPResponse,
+                                 sizeof(kOCSPResponse)) ||
       // Historically signing preferences would be lost in some cases with the
       // SNI callback, which triggers the TLS 1.2 SHA-1 default. To ensure
       // this doesn't happen when |version| is TLS 1.2, configure the private
@@ -2441,6 +2448,9 @@
   SSL_CTX_set_tlsext_servername_callback(server_ctx.get(), SwitchContext);
   SSL_CTX_set_tlsext_servername_arg(server_ctx.get(), server_ctx2.get());
 
+  SSL_CTX_enable_signed_cert_timestamps(client_ctx.get());
+  SSL_CTX_enable_ocsp_stapling(client_ctx.get());
+
   bssl::UniquePtr<SSL> client, server;
   if (!ConnectClientAndServer(&client, &server, client_ctx.get(),
                               server_ctx.get(), nullptr)) {
@@ -2455,6 +2465,22 @@
     return false;
   }
 
+  // The client should have received |server_ctx2|'s SCT list.
+  const uint8_t *data;
+  size_t len;
+  SSL_get0_signed_cert_timestamp_list(client.get(), &data, &len);
+  if (Bytes(kSCTList) != Bytes(data, len)) {
+    fprintf(stderr, "Incorrect SCT list received.\n");
+    return false;
+  }
+
+  // The client should have received |server_ctx2|'s OCSP response.
+  SSL_get0_ocsp_response(client.get(), &data, &len);
+  if (Bytes(kOCSPResponse) != Bytes(data, len)) {
+    fprintf(stderr, "Incorrect OCSP response received.\n");
+    return false;
+  }
+
   return true;
 }
 
diff --git a/src/ssl/t1_enc.c b/src/ssl/t1_enc.c
index d01992e..9f11e05 100644
--- a/src/ssl/t1_enc.c
+++ b/src/ssl/t1_enc.c
@@ -330,8 +330,8 @@
   }
 
   SSL_SESSION *session = ssl->session;
-  if (ssl->s3->new_session != NULL) {
-    session = ssl->s3->new_session;
+  if (hs->new_session != NULL) {
+    session = hs->new_session;
   }
 
   const EVP_AEAD *aead = NULL;
@@ -427,10 +427,9 @@
     iv = server_write_iv;
   }
 
-  SSL_AEAD_CTX *aead_ctx =
-      SSL_AEAD_CTX_new(is_read ? evp_aead_open : evp_aead_seal,
-                       ssl3_protocol_version(ssl), ssl->s3->tmp.new_cipher, key,
-                       key_len, mac_secret, mac_secret_len, iv, iv_len);
+  SSL_AEAD_CTX *aead_ctx = SSL_AEAD_CTX_new(
+      is_read ? evp_aead_open : evp_aead_seal, ssl3_protocol_version(ssl),
+      hs->new_cipher, key, key_len, mac_secret, mac_secret_len, iv, iv_len);
   if (aead_ctx == NULL) {
     return 0;
   }
@@ -474,7 +473,7 @@
                                 const uint8_t *premaster,
                                 size_t premaster_len) {
   const SSL *ssl = hs->ssl;
-  if (ssl->s3->tmp.extended_master_secret) {
+  if (hs->extended_master_secret) {
     uint8_t digests[EVP_MAX_MD_SIZE];
     size_t digests_len;
     if (!SSL_TRANSCRIPT_get_hash(&hs->transcript, digests, &digests_len) ||
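For context, the |hs->extended_master_secret| branch above selects RFC 7627's transcript-bound derivation; a sketch of the two derivations in the RFC's PRF notation:

    /* With the extension (RFC 7627), the master secret is bound to the
     * transcript hash fetched via SSL_TRANSCRIPT_get_hash() above:
     *
     *   master_secret = PRF(pre_master_secret,
     *                       "extended master secret", session_hash)[0..47]
     *
     * Without it (RFC 5246), it is bound only to the hello randoms:
     *
     *   master_secret = PRF(pre_master_secret, "master secret",
     *                       ClientHello.random + ServerHello.random)[0..47]
     */
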
diff --git a/src/ssl/t1_lib.c b/src/ssl/t1_lib.c
index 7723ccd..d6ef1ff 100644
--- a/src/ssl/t1_lib.c
+++ b/src/ssl/t1_lib.c
@@ -616,9 +616,9 @@
   assert(ssl->tlsext_hostname != NULL);
 
   if (ssl->session == NULL) {
-    OPENSSL_free(ssl->s3->new_session->tlsext_hostname);
-    ssl->s3->new_session->tlsext_hostname = BUF_strdup(ssl->tlsext_hostname);
-    if (!ssl->s3->new_session->tlsext_hostname) {
+    OPENSSL_free(hs->new_session->tlsext_hostname);
+    hs->new_session->tlsext_hostname = BUF_strdup(ssl->tlsext_hostname);
+    if (!hs->new_session->tlsext_hostname) {
       *out_alert = SSL_AD_INTERNAL_ERROR;
       return 0;
     }
@@ -870,38 +870,32 @@
 static int ext_ems_parse_serverhello(SSL_HANDSHAKE *hs, uint8_t *out_alert,
                                      CBS *contents) {
   SSL *const ssl = hs->ssl;
-  /* Whether EMS is negotiated may not change on renegotiation. */
-  if (ssl->s3->initial_handshake_complete) {
-    if ((contents != NULL) != ssl->s3->tmp.extended_master_secret) {
-      OPENSSL_PUT_ERROR(SSL, SSL_R_RENEGOTIATION_EMS_MISMATCH);
-      *out_alert = SSL_AD_ILLEGAL_PARAMETER;
+
+  if (contents != NULL) {
+    if (ssl3_protocol_version(ssl) >= TLS1_3_VERSION ||
+        ssl->version == SSL3_VERSION ||
+        CBS_len(contents) != 0) {
       return 0;
     }
 
-    return 1;
+    hs->extended_master_secret = 1;
   }
 
-  if (contents == NULL) {
-    return 1;
-  }
-
-  if (ssl3_protocol_version(ssl) >= TLS1_3_VERSION ||
-      ssl->version == SSL3_VERSION) {
+  /* Whether EMS is negotiated may not change on renegotiation. */
+  if (ssl->s3->established_session != NULL &&
+      hs->extended_master_secret !=
+          ssl->s3->established_session->extended_master_secret) {
+    OPENSSL_PUT_ERROR(SSL, SSL_R_RENEGOTIATION_EMS_MISMATCH);
+    *out_alert = SSL_AD_ILLEGAL_PARAMETER;
     return 0;
   }
 
-  if (CBS_len(contents) != 0) {
-    return 0;
-  }
-
-  ssl->s3->tmp.extended_master_secret = 1;
   return 1;
 }
 
 static int ext_ems_parse_clienthello(SSL_HANDSHAKE *hs, uint8_t *out_alert,
                                      CBS *contents) {
-  SSL *const ssl = hs->ssl;
-  uint16_t version = ssl3_protocol_version(ssl);
+  uint16_t version = ssl3_protocol_version(hs->ssl);
   if (version >= TLS1_3_VERSION ||
       version == SSL3_VERSION) {
     return 1;
@@ -915,12 +909,12 @@
     return 0;
   }
 
-  ssl->s3->tmp.extended_master_secret = 1;
+  hs->extended_master_secret = 1;
   return 1;
 }
 
 static int ext_ems_add_serverhello(SSL_HANDSHAKE *hs, CBB *out) {
-  if (!hs->ssl->s3->tmp.extended_master_secret) {
+  if (!hs->extended_master_secret) {
     return 1;
   }
 
@@ -1118,7 +1112,7 @@
 
   /* OCSP stapling is forbidden on non-certificate ciphers. */
   if (CBS_len(contents) != 0 ||
-      !ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+      !ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
     return 0;
   }
 
@@ -1152,9 +1146,9 @@
   SSL *const ssl = hs->ssl;
   if (ssl3_protocol_version(ssl) >= TLS1_3_VERSION ||
       !hs->ocsp_stapling_requested ||
-      ssl->ocsp_response == NULL ||
+      ssl->cert->ocsp_response == NULL ||
       ssl->s3->session_reused ||
-      !ssl_cipher_uses_certificate_auth(ssl->s3->tmp.new_cipher)) {
+      !ssl_cipher_uses_certificate_auth(hs->new_cipher)) {
     return 1;
   }
 
@@ -1341,10 +1335,8 @@
    *
    * TODO(davidben): Enforce this anyway. */
   if (!ssl->s3->session_reused &&
-      !CBS_stow(
-          contents,
-          &ssl->s3->new_session->tlsext_signed_cert_timestamp_list,
-          &ssl->s3->new_session->tlsext_signed_cert_timestamp_list_length)) {
+      !CBS_stow(contents, &hs->new_session->tlsext_signed_cert_timestamp_list,
+                &hs->new_session->tlsext_signed_cert_timestamp_list_length)) {
     *out_alert = SSL_AD_INTERNAL_ERROR;
     return 0;
   }
@@ -1371,16 +1363,17 @@
   /* The extension shouldn't be sent when resuming sessions. */
   if (ssl3_protocol_version(ssl) >= TLS1_3_VERSION ||
       ssl->s3->session_reused ||
-      ssl->signed_cert_timestamp_list == NULL) {
+      ssl->cert->signed_cert_timestamp_list == NULL) {
     return 1;
   }
 
   CBB contents;
   return CBB_add_u16(out, TLSEXT_TYPE_certificate_timestamp) &&
          CBB_add_u16_length_prefixed(out, &contents) &&
-         CBB_add_bytes(&contents,
-                       CRYPTO_BUFFER_data(ssl->signed_cert_timestamp_list),
-                       CRYPTO_BUFFER_len(ssl->signed_cert_timestamp_list)) &&
+         CBB_add_bytes(
+             &contents,
+             CRYPTO_BUFFER_data(ssl->cert->signed_cert_timestamp_list),
+             CRYPTO_BUFFER_len(ssl->cert->signed_cert_timestamp_list)) &&
          CBB_flush(out);
 }
 
@@ -1852,8 +1845,8 @@
     return 1;
   }
 
-  const uint32_t alg_k = ssl->s3->tmp.new_cipher->algorithm_mkey;
-  const uint32_t alg_a = ssl->s3->tmp.new_cipher->algorithm_auth;
+  const uint32_t alg_k = hs->new_cipher->algorithm_mkey;
+  const uint32_t alg_a = hs->new_cipher->algorithm_auth;
   const int using_ecc = (alg_k & SSL_kECDHE) || (alg_a & SSL_aECDSA);
 
   if (!using_ecc) {
@@ -2218,7 +2211,6 @@
 int ssl_ext_key_share_parse_serverhello(SSL_HANDSHAKE *hs, uint8_t **out_secret,
                                         size_t *out_secret_len,
                                         uint8_t *out_alert, CBS *contents) {
-  SSL *const ssl = hs->ssl;
   CBS peer_key;
   uint16_t group_id;
   if (!CBS_get_u16(contents, &group_id) ||
@@ -2240,7 +2232,7 @@
     return 0;
   }
 
-  ssl->s3->new_session->group_id = group_id;
+  hs->new_session->group_id = group_id;
   SSL_ECDH_CTX_cleanup(&hs->ecdh_ctx);
   return 1;
 }
@@ -2322,7 +2314,6 @@
 }
 
 int ssl_ext_key_share_add_serverhello(SSL_HANDSHAKE *hs, CBB *out) {
-  SSL *const ssl = hs->ssl;
   uint16_t group_id;
   CBB kse_bytes, public_key;
   if (!tls1_get_shared_group(hs, &group_id) ||
@@ -2339,7 +2330,7 @@
   hs->public_key = NULL;
   hs->public_key_len = 0;
 
-  ssl->s3->new_session->group_id = group_id;
+  hs->new_session->group_id = group_id;
   return 1;
 }
 
@@ -3518,7 +3509,7 @@
 }
 
 /* tls1_record_handshake_hashes_for_channel_id records the current handshake
- * hashes in |ssl->s3->new_session| so that Channel ID resumptions can sign that
+ * hashes in |hs->new_session| so that Channel ID resumptions can sign that
  * data. */
 int tls1_record_handshake_hashes_for_channel_id(SSL_HANDSHAKE *hs) {
   SSL *const ssl = hs->ssl;
@@ -3530,18 +3521,18 @@
   }
 
   OPENSSL_COMPILE_ASSERT(
-      sizeof(ssl->s3->new_session->original_handshake_hash) == EVP_MAX_MD_SIZE,
+      sizeof(hs->new_session->original_handshake_hash) == EVP_MAX_MD_SIZE,
       original_handshake_hash_is_too_small);
 
   size_t digest_len;
   if (!SSL_TRANSCRIPT_get_hash(&hs->transcript,
-                               ssl->s3->new_session->original_handshake_hash,
+                               hs->new_session->original_handshake_hash,
                                &digest_len)) {
     return -1;
   }
 
   OPENSSL_COMPILE_ASSERT(EVP_MAX_MD_SIZE <= 0xff, max_md_size_is_too_large);
-  ssl->s3->new_session->original_handshake_hash_len = (uint8_t)digest_len;
+  hs->new_session->original_handshake_hash_len = (uint8_t)digest_len;
 
   return 1;
 }
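Distilled from the ext_ems_parse_serverhello() rewrite above: the renegotiation invariant is now checked against |ssl->s3->established_session| rather than a flag in |ssl->s3->tmp|. A standalone sketch (field names follow the diff):

    /* EMS may not change across renegotiation: once a session is
     * established, the new handshake's EMS bit must match it. */
    static int ems_renego_ok(const SSL_HANDSHAKE *hs, const SSL *ssl) {
      if (ssl->s3->established_session != NULL &&
          hs->extended_master_secret !=
              ssl->s3->established_session->extended_master_secret) {
        return 0;  /* maps to SSL_R_RENEGOTIATION_EMS_MISMATCH */
      }
      return 1;
    }
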
diff --git a/src/ssl/test/bssl_shim.cc b/src/ssl/test/bssl_shim.cc
index 381f4c2..dd61ffb 100644
--- a/src/ssl/test/bssl_shim.cc
+++ b/src/ssl/test/bssl_shim.cc
@@ -1584,13 +1584,11 @@
       !SSL_set_srtp_profiles(ssl.get(), config->srtp_profiles.c_str())) {
     return false;
   }
-  if (config->enable_ocsp_stapling &&
-      !SSL_enable_ocsp_stapling(ssl.get())) {
-    return false;
+  if (config->enable_ocsp_stapling) {
+    SSL_enable_ocsp_stapling(ssl.get());
   }
-  if (config->enable_signed_cert_timestamps &&
-      !SSL_enable_signed_cert_timestamps(ssl.get())) {
-    return false;
+  if (config->enable_signed_cert_timestamps) {
+    SSL_enable_signed_cert_timestamps(ssl.get());
   }
   if (config->min_version != 0 &&
       !SSL_set_min_proto_version(ssl.get(), (uint16_t)config->min_version)) {
diff --git a/src/ssl/test/runner/runner.go b/src/ssl/test/runner/runner.go
index d6e984a..d7bad5b 100644
--- a/src/ssl/test/runner/runner.go
+++ b/src/ssl/test/runner/runner.go
@@ -6395,7 +6395,7 @@
 		// this case. https://crbug.com/boringssl/130
 	})
 
-	// Stray HelloRequests during the handshake are ignored in TLS 1.2.
+	// We reject stray HelloRequests during the handshake in TLS 1.2.
 	testCases = append(testCases, testCase{
 		name: "StrayHelloRequest",
 		config: Config{
@@ -6404,6 +6404,8 @@
 				SendHelloRequestBeforeEveryHandshakeMessage: true,
 			},
 		},
+		shouldFail:    true,
+		expectedError: ":UNEXPECTED_MESSAGE:",
 	})
 	testCases = append(testCases, testCase{
 		name: "StrayHelloRequest-Packed",
@@ -6414,6 +6416,8 @@
 				SendHelloRequestBeforeEveryHandshakeMessage: true,
 			},
 		},
+		shouldFail:    true,
+		expectedError: ":UNEXPECTED_MESSAGE:",
 	})
 
 	// Test renegotiation works if HelloRequest and server Finished come in
diff --git a/src/ssl/tls13_both.c b/src/ssl/tls13_both.c
index 19dd555..91cae9a 100644
--- a/src/ssl/tls13_both.c
+++ b/src/ssl/tls13_both.c
@@ -211,7 +211,7 @@
       if (retain_sha256) {
         /* Retain the hash of the leaf certificate if requested. */
         SHA256(CBS_data(&certificate), CBS_len(&certificate),
-               ssl->s3->new_session->peer_sha256);
+               hs->new_session->peer_sha256);
       }
     }
 
@@ -262,8 +262,8 @@
       }
 
       if (sk_CRYPTO_BUFFER_num(certs) == 1 &&
-          !CBS_stow(&ocsp_response, &ssl->s3->new_session->ocsp_response,
-                    &ssl->s3->new_session->ocsp_response_length)) {
+          !CBS_stow(&ocsp_response, &hs->new_session->ocsp_response,
+                    &hs->new_session->ocsp_response_length)) {
         ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR);
         goto err;
       }
@@ -283,10 +283,9 @@
       }
 
       if (sk_CRYPTO_BUFFER_num(certs) == 1 &&
-          !CBS_stow(&sct,
-                    &ssl->s3->new_session->tlsext_signed_cert_timestamp_list,
-                    &ssl->s3->new_session
-                         ->tlsext_signed_cert_timestamp_list_length)) {
+          !CBS_stow(
+              &sct, &hs->new_session->tlsext_signed_cert_timestamp_list,
+              &hs->new_session->tlsext_signed_cert_timestamp_list_length)) {
         ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR);
         goto err;
       }
@@ -303,17 +302,17 @@
   hs->peer_pubkey = pkey;
   pkey = NULL;
 
-  sk_CRYPTO_BUFFER_pop_free(ssl->s3->new_session->certs, CRYPTO_BUFFER_free);
-  ssl->s3->new_session->certs = certs;
+  sk_CRYPTO_BUFFER_pop_free(hs->new_session->certs, CRYPTO_BUFFER_free);
+  hs->new_session->certs = certs;
   certs = NULL;
 
-  if (!ssl->ctx->x509_method->session_cache_objects(ssl->s3->new_session)) {
+  if (!ssl->ctx->x509_method->session_cache_objects(hs->new_session)) {
     OPENSSL_PUT_ERROR(SSL, SSL_R_DECODE_ERROR);
     ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_DECODE_ERROR);
     goto err;
   }
 
-  if (sk_CRYPTO_BUFFER_num(ssl->s3->new_session->certs) == 0) {
+  if (sk_CRYPTO_BUFFER_num(hs->new_session->certs) == 0) {
     if (!allow_anonymous) {
       OPENSSL_PUT_ERROR(SSL, SSL_R_PEER_DID_NOT_RETURN_A_CERTIFICATE);
       ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_CERTIFICATE_REQUIRED);
@@ -322,17 +321,17 @@
 
     /* OpenSSL returns X509_V_OK when no certificates are requested. This is
      * classed by them as a bug, but it's assumed by at least NGINX. */
-    ssl->s3->new_session->verify_result = X509_V_OK;
+    hs->new_session->verify_result = X509_V_OK;
 
     /* No certificate, so nothing more to do. */
     ret = 1;
     goto err;
   }
 
-  ssl->s3->new_session->peer_sha256_valid = retain_sha256;
+  hs->new_session->peer_sha256_valid = retain_sha256;
 
-  if (!ssl_verify_cert_chain(ssl, &ssl->s3->new_session->verify_result,
-                             ssl->s3->new_session->x509_chain)) {
+  if (!ssl_verify_cert_chain(ssl, &hs->new_session->verify_result,
+                             hs->new_session->x509_chain)) {
     goto err;
   }
 
@@ -370,7 +369,7 @@
     ssl3_send_alert(ssl, SSL3_AL_FATAL, al);
     goto err;
   }
-  ssl->s3->new_session->peer_signature_algorithm = signature_algorithm;
+  hs->new_session->peer_signature_algorithm = signature_algorithm;
 
   if (!tls13_get_cert_verify_signature_input(
           hs, &msg, &msg_len,
@@ -452,13 +451,14 @@
     goto err;
   }
 
-  if (hs->scts_requested && ssl->signed_cert_timestamp_list != NULL) {
+  if (hs->scts_requested && ssl->cert->signed_cert_timestamp_list != NULL) {
     CBB contents;
     if (!CBB_add_u16(&extensions, TLSEXT_TYPE_certificate_timestamp) ||
         !CBB_add_u16_length_prefixed(&extensions, &contents) ||
-        !CBB_add_bytes(&contents,
-                       CRYPTO_BUFFER_data(ssl->signed_cert_timestamp_list),
-                       CRYPTO_BUFFER_len(ssl->signed_cert_timestamp_list)) ||
+        !CBB_add_bytes(
+            &contents,
+            CRYPTO_BUFFER_data(ssl->cert->signed_cert_timestamp_list),
+            CRYPTO_BUFFER_len(ssl->cert->signed_cert_timestamp_list)) ||
         !CBB_flush(&extensions)) {
       OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
       goto err;
@@ -466,14 +466,15 @@
   }
 
   if (hs->ocsp_stapling_requested &&
-      ssl->ocsp_response != NULL) {
+      ssl->cert->ocsp_response != NULL) {
     CBB contents, ocsp_response;
     if (!CBB_add_u16(&extensions, TLSEXT_TYPE_status_request) ||
         !CBB_add_u16_length_prefixed(&extensions, &contents) ||
         !CBB_add_u8(&contents, TLSEXT_STATUSTYPE_ocsp) ||
         !CBB_add_u24_length_prefixed(&contents, &ocsp_response) ||
-        !CBB_add_bytes(&ocsp_response, CRYPTO_BUFFER_data(ssl->ocsp_response),
-                       CRYPTO_BUFFER_len(ssl->ocsp_response)) ||
+        !CBB_add_bytes(&ocsp_response,
+                       CRYPTO_BUFFER_data(ssl->cert->ocsp_response),
+                       CRYPTO_BUFFER_len(ssl->cert->ocsp_response)) ||
         !CBB_flush(&extensions)) {
       OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
       goto err;
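The CBB calls above nest length prefixes automatically; the status_request body built in this hunk, pulled out as a self-contained sketch (the helper name is hypothetical):

    #include <openssl/bytestring.h>
    #include <openssl/ssl.h>

    /* u16 extension type, u16-prefixed body holding a u8 status type and
     * a u24-prefixed DER OCSP response. */
    static int add_ocsp_extension(CBB *extensions, const uint8_t *resp,
                                  size_t resp_len) {
      CBB contents, ocsp;
      return CBB_add_u16(extensions, TLSEXT_TYPE_status_request) &&
             CBB_add_u16_length_prefixed(extensions, &contents) &&
             CBB_add_u8(&contents, TLSEXT_STATUSTYPE_ocsp) &&
             CBB_add_u24_length_prefixed(&contents, &ocsp) &&
             CBB_add_bytes(&ocsp, resp, resp_len) &&
             CBB_flush(extensions);
    }
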
diff --git a/src/ssl/tls13_client.c b/src/ssl/tls13_client.c
index 50f7e5a..8e994e5 100644
--- a/src/ssl/tls13_client.c
+++ b/src/ssl/tls13_client.c
@@ -251,24 +251,34 @@
 
     ssl->s3->session_reused = 1;
     /* Only authentication information carries over in TLS 1.3. */
-    ssl->s3->new_session =
-        SSL_SESSION_dup(ssl->session, SSL_SESSION_DUP_AUTH_ONLY);
-    if (ssl->s3->new_session == NULL) {
+    hs->new_session = SSL_SESSION_dup(ssl->session, SSL_SESSION_DUP_AUTH_ONLY);
+    if (hs->new_session == NULL) {
       ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR);
       return ssl_hs_error;
     }
     ssl_set_session(ssl, NULL);
 
     /* Resumption incorporates fresh key material, so refresh the timeout. */
-    ssl_session_renew_timeout(ssl, ssl->s3->new_session,
+    ssl_session_renew_timeout(ssl, hs->new_session,
                               ssl->initial_ctx->session_psk_dhe_timeout);
   } else if (!ssl_get_new_session(hs, 0)) {
     ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR);
     return ssl_hs_error;
   }
 
-  ssl->s3->new_session->cipher = cipher;
-  ssl->s3->tmp.new_cipher = cipher;
+  hs->new_session->cipher = cipher;
+  hs->new_cipher = cipher;
+
+  /* Store the initial negotiated ALPN in the session. */
+  if (ssl->s3->alpn_selected != NULL) {
+    hs->new_session->early_alpn =
+        BUF_memdup(ssl->s3->alpn_selected, ssl->s3->alpn_selected_len);
+    if (hs->new_session->early_alpn == NULL) {
+      ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR);
+      return ssl_hs_error;
+    }
+    hs->new_session->early_alpn_len = ssl->s3->alpn_selected_len;
+  }
 
   /* The PRF hash is now known. Set up the key schedule. */
   if (!tls13_init_key_schedule(hs)) {
@@ -277,8 +287,8 @@
 
   /* Incorporate the PSK into the running secret. */
   if (ssl->s3->session_reused) {
-    if (!tls13_advance_key_schedule(hs, ssl->s3->new_session->master_key,
-                                    ssl->s3->new_session->master_key_length)) {
+    if (!tls13_advance_key_schedule(hs, hs->new_session->master_key,
+                                    hs->new_session->master_key_length)) {
       return ssl_hs_error;
     }
   } else if (!tls13_advance_key_schedule(hs, kZeroes, hs->hash_len)) {
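The ALPN value stashed in |hs->new_session->early_alpn| above is the same protocol that applications read back through the standard accessor; a minimal usage sketch (the printing helper is hypothetical):

    #include <stdio.h>
    #include <openssl/ssl.h>

    /* Read back the negotiated ALPN after the handshake. The returned
     * buffer is not NUL-terminated. */
    static void print_alpn(const SSL *ssl) {
      const uint8_t *alpn;
      unsigned alpn_len;
      SSL_get0_alpn_selected(ssl, &alpn, &alpn_len);
      if (alpn_len > 0) {
        fwrite(alpn, 1, alpn_len, stderr);  /* e.g. "h2" or "http/1.1" */
      }
    }
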
diff --git a/src/ssl/tls13_enc.c b/src/ssl/tls13_enc.c
index 4d140e3..412705d 100644
--- a/src/ssl/tls13_enc.c
+++ b/src/ssl/tls13_enc.c
@@ -30,7 +30,7 @@
 
 int tls13_init_key_schedule(SSL_HANDSHAKE *hs) {
   if (!SSL_TRANSCRIPT_init_hash(&hs->transcript, ssl3_protocol_version(hs->ssl),
-                                hs->ssl->s3->tmp.new_cipher->algorithm_prf)) {
+                                hs->new_cipher->algorithm_prf)) {
     return 0;
   }
 
@@ -237,17 +237,15 @@
 static const char kTLS13LabelResumption[] = "resumption master secret";
 
 int tls13_derive_resumption_secret(SSL_HANDSHAKE *hs) {
-  SSL *const ssl = hs->ssl;
-  if (ssl->s3->hs->hash_len > SSL_MAX_MASTER_KEY_LENGTH) {
+  if (hs->hash_len > SSL_MAX_MASTER_KEY_LENGTH) {
     OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
     return 0;
   }
 
-  ssl->s3->new_session->master_key_length = hs->hash_len;
-  return derive_secret(hs, ssl->s3->new_session->master_key,
-                       ssl->s3->new_session->master_key_length,
-                       (const uint8_t *)kTLS13LabelResumption,
-                       strlen(kTLS13LabelResumption));
+  hs->new_session->master_key_length = hs->hash_len;
+  return derive_secret(
+      hs, hs->new_session->master_key, hs->new_session->master_key_length,
+      (const uint8_t *)kTLS13LabelResumption, strlen(kTLS13LabelResumption));
 }
 
 static const char kTLS13LabelFinished[] = "finished";
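derive_secret() above follows TLS 1.3's Derive-Secret construction; in the draft's notation, with the kTLS13LabelResumption string above as the label:

    /* Derive-Secret(Secret, Label, Messages) =
     *     HKDF-Expand-Label(Secret, Label,
     *                       Transcript-Hash(Messages), Hash.length)
     *
     * The output is Hash.length bytes, which is why |hs->hash_len| is
     * checked against SSL_MAX_MASTER_KEY_LENGTH before it is stored in
     * |hs->new_session->master_key|. */
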
diff --git a/src/ssl/tls13_server.c b/src/ssl/tls13_server.c
index 0278b50..402c234 100644
--- a/src/ssl/tls13_server.c
+++ b/src/ssl/tls13_server.c
@@ -150,8 +150,8 @@
   }
 
   /* Negotiate the cipher suite. */
-  ssl->s3->tmp.new_cipher = choose_tls13_cipher(ssl, &client_hello);
-  if (ssl->s3->tmp.new_cipher == NULL) {
+  hs->new_cipher = choose_tls13_cipher(ssl, &client_hello);
+  if (hs->new_cipher == NULL) {
     OPENSSL_PUT_ERROR(SSL, SSL_R_NO_SHARED_CIPHER);
     ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE);
     return ssl_hs_error;
@@ -189,7 +189,7 @@
   }
 
   if (session != NULL &&
-      !ssl_session_is_resumable(ssl, session)) {
+      !ssl_session_is_resumable(hs, session)) {
     SSL_SESSION_free(session);
     session = NULL;
   }
@@ -202,13 +202,13 @@
       return ssl_hs_error;
     }
 
-    ssl->s3->new_session->cipher = ssl->s3->tmp.new_cipher;
+    hs->new_session->cipher = hs->new_cipher;
 
     /* On new sessions, stash the SNI value in the session. */
     if (hs->hostname != NULL) {
-      OPENSSL_free(ssl->s3->new_session->tlsext_hostname);
-      ssl->s3->new_session->tlsext_hostname = BUF_strdup(hs->hostname);
-      if (ssl->s3->new_session->tlsext_hostname == NULL) {
+      OPENSSL_free(hs->new_session->tlsext_hostname);
+      hs->new_session->tlsext_hostname = BUF_strdup(hs->hostname);
+      if (hs->new_session->tlsext_hostname == NULL) {
         ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR);
         return ssl_hs_error;
       }
@@ -222,8 +222,8 @@
     }
 
     /* Only authentication information carries over in TLS 1.3. */
-    ssl->s3->new_session = SSL_SESSION_dup(session, SSL_SESSION_DUP_AUTH_ONLY);
-    if (ssl->s3->new_session == NULL) {
+    hs->new_session = SSL_SESSION_dup(session, SSL_SESSION_DUP_AUTH_ONLY);
+    if (hs->new_session == NULL) {
       ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR);
       return ssl_hs_error;
     }
@@ -231,7 +231,7 @@
     SSL_SESSION_free(session);
 
     /* Resumption incorporates fresh key material, so refresh the timeout. */
-    ssl_session_renew_timeout(ssl, ssl->s3->new_session,
+    ssl_session_renew_timeout(ssl, hs->new_session,
                               ssl->initial_ctx->session_psk_dhe_timeout);
   }
 
@@ -251,10 +251,21 @@
     return ssl_hs_error;
   }
 
+  /* Store the initial negotiated ALPN in the session. */
+  if (ssl->s3->alpn_selected != NULL) {
+    hs->new_session->early_alpn =
+        BUF_memdup(ssl->s3->alpn_selected, ssl->s3->alpn_selected_len);
+    if (hs->new_session->early_alpn == NULL) {
+      ssl3_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_INTERNAL_ERROR);
+      return ssl_hs_error;
+    }
+    hs->new_session->early_alpn_len = ssl->s3->alpn_selected_len;
+  }
+
   /* Incorporate the PSK into the running secret. */
   if (ssl->s3->session_reused) {
-    if (!tls13_advance_key_schedule(hs, ssl->s3->new_session->master_key,
-                                    ssl->s3->new_session->master_key_length)) {
+    if (!tls13_advance_key_schedule(hs, hs->new_session->master_key,
+                                    hs->new_session->master_key_length)) {
       return ssl_hs_error;
     }
   } else if (!tls13_advance_key_schedule(hs, kZeroes, hs->hash_len)) {
@@ -340,7 +351,7 @@
       !CBB_add_u16(&body, ssl->version) ||
       !RAND_bytes(ssl->s3->server_random, sizeof(ssl->s3->server_random)) ||
       !CBB_add_bytes(&body, ssl->s3->server_random, SSL3_RANDOM_SIZE) ||
-      !CBB_add_u16(&body, ssl_cipher_get_value(ssl->s3->tmp.new_cipher)) ||
+      !CBB_add_u16(&body, ssl_cipher_get_value(hs->new_cipher)) ||
       !CBB_add_u16_length_prefixed(&body, &extensions) ||
       !ssl_ext_pre_shared_key_add_serverhello(hs, &extensions) ||
       !ssl_ext_key_share_add_serverhello(hs, &extensions)) {
@@ -472,7 +483,7 @@
   if (!hs->cert_request) {
     /* OpenSSL returns X509_V_OK when no certificates are requested. This is
      * classed by them as a bug, but it's assumed by at least NGINX. */
-    ssl->s3->new_session->verify_result = X509_V_OK;
+    hs->new_session->verify_result = X509_V_OK;
 
     /* Skip this state. */
     hs->tls13_state = state_process_channel_id;
@@ -495,7 +506,7 @@
 static enum ssl_hs_wait_t do_process_client_certificate_verify(
     SSL_HANDSHAKE *hs) {
   SSL *const ssl = hs->ssl;
-  if (sk_CRYPTO_BUFFER_num(ssl->s3->new_session->certs) == 0) {
+  if (sk_CRYPTO_BUFFER_num(hs->new_session->certs) == 0) {
     /* Skip this state. */
     hs->tls13_state = state_process_channel_id;
     return ssl_hs_ok;
@@ -543,7 +554,7 @@
 
   /* Rebase the session timestamp so that it is measured from ticket
    * issuance. */
-  ssl_session_rebase_time(ssl, ssl->s3->new_session);
+  ssl_session_rebase_time(ssl, hs->new_session);
   hs->tls13_state = state_send_new_session_ticket;
   return ssl_hs_ok;
 }
@@ -561,7 +572,7 @@
     return ssl_hs_ok;
   }
 
-  SSL_SESSION *session = ssl->s3->new_session;
+  SSL_SESSION *session = hs->new_session;
   CBB cbb;
   CBB_zero(&cbb);
 
diff --git a/src/tool/transport_common.cc b/src/tool/transport_common.cc
index cd3e0d6..5f1a366 100644
--- a/src/tool/transport_common.cc
+++ b/src/tool/transport_common.cc
@@ -285,6 +285,11 @@
     size_t ocsp_staple_len;
     SSL_get0_ocsp_response(ssl, &ocsp_staple, &ocsp_staple_len);
     fprintf(stderr, "  OCSP staple: %s\n", ocsp_staple_len > 0 ? "yes" : "no");
+
+    const uint8_t *sct_list;
+    size_t sct_list_len;
+    SSL_get0_signed_cert_timestamp_list(ssl, &sct_list, &sct_list_len);
+    fprintf(stderr, "  SCT list: %s\n", sct_list_len > 0 ? "yes" : "no");
   }
 
   // Print the server cert subject and issuer names.
diff --git a/src/util/all_tests.json b/src/util/all_tests.json
index 76637b2..fc49c69 100644
--- a/src/util/all_tests.json
+++ b/src/util/all_tests.json
@@ -5,7 +5,6 @@
 	["crypto/bio/bio_test"],
 	["crypto/bn/bn_test", "crypto/bn/bn_tests.txt"],
 	["crypto/bytestring/bytestring_test"],
-	["crypto/chacha/chacha_test"],
 	["crypto/cipher/aead_test", "aes-128-gcm", "crypto/cipher/test/aes_128_gcm_tests.txt"],
 	["crypto/cipher/aead_test", "aes-256-gcm", "crypto/cipher/test/aes_256_gcm_tests.txt"],
 	["crypto/cipher/aead_test", "aes-128-gcm-siv", "crypto/cipher/test/aes_128_gcm_siv_tests.txt"],
@@ -33,7 +32,6 @@
 	["crypto/curve25519/x25519_test"],
 	["crypto/curve25519/spake25519_test"],
 	["crypto/digest/digest_test"],
-	["crypto/ec/ec_test"],
 	["crypto/ec/example_mul"],
 	["crypto/ec/p256-x86_64_test", "crypto/ec/p256-x86_64_tests.txt"],
 	["crypto/ecdh/ecdh_test", "crypto/ecdh/ecdh_tests.txt"],
@@ -53,7 +51,6 @@
 	["crypto/poly1305/poly1305_test", "crypto/poly1305/poly1305_tests.txt"],
 	["crypto/pool/pool_test"],
 	["crypto/refcount_test"],
-	["crypto/rsa/rsa_test"],
 	["crypto/thread_test"],
 	["crypto/x509/pkcs7_test"],
 	["crypto/x509/x509_test"],
diff --git a/src/util/doc.config b/src/util/doc.config
index ddd56db..f7e8baa 100644
--- a/src/util/doc.config
+++ b/src/util/doc.config
@@ -16,8 +16,7 @@
       "include/openssl/obj.h",
       "include/openssl/pool.h",
       "include/openssl/rand.h",
-      "include/openssl/stack.h",
-      "include/openssl/time_support.h"
+      "include/openssl/stack.h"
     ]
   },{
     "Name": "Low-level crypto primitives",
diff --git a/src/util/generate_build_files.py b/src/util/generate_build_files.py
index 8be7c90..a3435f2 100644
--- a/src/util/generate_build_files.py
+++ b/src/util/generate_build_files.py
@@ -50,20 +50,6 @@
     ],
 }
 
-# For now, GTest-based tests are specified manually. Once everything has updated
-# to support GTest, these will be determined automatically by looking for files
-# ending with _test.cc.
-CRYPTO_TEST_SOURCES = [
-    'src/crypto/dh/dh_test.cc',
-    'src/crypto/dsa/dsa_test.cc',
-]
-DECREPIT_TEST_SOURCES = [
-    'src/decrepit/decrepit_test.cc',
-]
-SSL_TEST_SOURCES = [
-    'src/ssl/ssl_test.cc',
-]
-
 PREFIX = None
 
 
@@ -464,13 +450,6 @@
   non-test sources."""
   if is_dir:
     return dent != 'test'
-  # For now, GTest-based tests are specified manually.
-  if dent in [os.path.basename(p) for p in CRYPTO_TEST_SOURCES]:
-    return False
-  if dent in [os.path.basename(p) for p in DECREPIT_TEST_SOURCES]:
-    return False
-  if dent in [os.path.basename(p) for p in SSL_TEST_SOURCES]:
-    return False
   return '_test.' in dent or dent.startswith('example_')
 
 
@@ -624,6 +603,11 @@
   return asmfiles
 
 
+def IsGTest(path):
+  with open(path) as f:
+    return "#include <gtest/gtest.h>" in f.read()
+
+
 def main(platforms):
   crypto_c_files = FindCFiles(os.path.join('src', 'crypto'), NoTests)
   ssl_source_files = FindCFiles(os.path.join('src', 'ssl'), NoTests)
@@ -643,8 +627,17 @@
       FindHeaderFiles(os.path.join('src', 'crypto', 'test'), AllFiles) +
       FindHeaderFiles(os.path.join('src', 'ssl', 'test'), AllFiles))
 
-  test_c_files = FindCFiles(os.path.join('src', 'crypto'), OnlyTests)
-  test_c_files += FindCFiles(os.path.join('src', 'ssl'), OnlyTests)
+  test_c_files = []
+  crypto_test_files = ['src/crypto/test/gtest_main.cc']
+  # TODO(davidben): Remove this loop once all tests are converted.
+  for path in FindCFiles(os.path.join('src', 'crypto'), OnlyTests):
+    if IsGTest(path):
+      crypto_test_files.append(path)
+    else:
+      test_c_files.append(path)
+
+  ssl_test_files = FindCFiles(os.path.join('src', 'ssl'), OnlyTests)
+  ssl_test_files.append('src/crypto/test/gtest_main.cc')
 
   fuzz_c_files = FindCFiles(os.path.join('src', 'fuzz'), NoTests)
 
@@ -689,15 +682,14 @@
       'crypto': crypto_c_files,
       'crypto_headers': crypto_h_files,
       'crypto_internal_headers': crypto_internal_h_files,
-      'crypto_test': sorted(CRYPTO_TEST_SOURCES +
-                            ['src/crypto/test/gtest_main.cc']),
+      'crypto_test': sorted(crypto_test_files),
       'fuzz': fuzz_c_files,
       'ssl': ssl_source_files,
       'ssl_c': [s for s in ssl_source_files if s.endswith('.c')],
       'ssl_cc': [s for s in ssl_source_files if s.endswith('.cc')],
       'ssl_headers': ssl_h_files,
       'ssl_internal_headers': ssl_internal_h_files,
-      'ssl_test': sorted(SSL_TEST_SOURCES + ['src/crypto/test/gtest_main.cc']),
+      'ssl_test': sorted(ssl_test_files),
       'tool': tool_c_files,
       'tool_headers': tool_h_files,
       'test': test_c_files,
diff --git a/win-x86/crypto/bn/x86-mont.asm b/win-x86/crypto/bn/x86-mont.asm
index de7b949..b1a4d59 100644
--- a/win-x86/crypto/bn/x86-mont.asm
+++ b/win-x86/crypto/bn/x86-mont.asm
@@ -29,36 +29,51 @@
 	jl	NEAR L$000just_leave
 	lea	esi,[20+esp]
 	lea	edx,[24+esp]
-	mov	ebp,esp
 	add	edi,2
 	neg	edi
-	lea	esp,[edi*4+esp-32]
+	lea	ebp,[edi*4+esp-32]
 	neg	edi
-	mov	eax,esp
+	mov	eax,ebp
 	sub	eax,edx
 	and	eax,2047
-	sub	esp,eax
-	xor	edx,esp
+	sub	ebp,eax
+	xor	edx,ebp
 	and	edx,2048
 	xor	edx,2048
-	sub	esp,edx
-	and	esp,-64
+	sub	ebp,edx
+	and	ebp,-64
+	mov	eax,esp
+	sub	eax,ebp
+	and	eax,-4096
+	mov	edx,esp
+	lea	esp,[eax*1+ebp]
+	mov	eax,DWORD [esp]
+	cmp	esp,ebp
+	ja	NEAR L$001page_walk
+	jmp	NEAR L$002page_walk_done
+align	16
+L$001page_walk:
+	lea	esp,[esp-4096]
+	mov	eax,DWORD [esp]
+	cmp	esp,ebp
+	ja	NEAR L$001page_walk
+L$002page_walk_done:
 	mov	eax,DWORD [esi]
 	mov	ebx,DWORD [4+esi]
 	mov	ecx,DWORD [8+esi]
-	mov	edx,DWORD [12+esi]
+	mov	ebp,DWORD [12+esi]
 	mov	esi,DWORD [16+esi]
 	mov	esi,DWORD [esi]
 	mov	DWORD [4+esp],eax
 	mov	DWORD [8+esp],ebx
 	mov	DWORD [12+esp],ecx
-	mov	DWORD [16+esp],edx
+	mov	DWORD [16+esp],ebp
 	mov	DWORD [20+esp],esi
 	lea	ebx,[edi-3]
-	mov	DWORD [24+esp],ebp
+	mov	DWORD [24+esp],edx
 	lea	eax,[_OPENSSL_ia32cap_P]
 	bt	DWORD [eax],26
-	jnc	NEAR L$001non_sse2
+	jnc	NEAR L$003non_sse2
 	mov	eax,-1
 	movd	mm7,eax
 	mov	esi,DWORD [8+esp]
@@ -82,7 +97,7 @@
 	psrlq	mm3,32
 	inc	ecx
 align	16
-L$0021st:
+L$0041st:
 	pmuludq	mm0,mm4
 	pmuludq	mm1,mm5
 	paddq	mm2,mm0
@@ -97,7 +112,7 @@
 	psrlq	mm3,32
 	lea	ecx,[1+ecx]
 	cmp	ecx,ebx
-	jl	NEAR L$0021st
+	jl	NEAR L$0041st
 	pmuludq	mm0,mm4
 	pmuludq	mm1,mm5
 	paddq	mm2,mm0
@@ -111,7 +126,7 @@
 	paddq	mm3,mm2
 	movq	[32+ebx*4+esp],mm3
 	inc	edx
-L$003outer:
+L$005outer:
 	xor	ecx,ecx
 	movd	mm4,DWORD [edx*4+edi]
 	movd	mm5,DWORD [esi]
@@ -133,7 +148,7 @@
 	paddq	mm2,mm6
 	inc	ecx
 	dec	ebx
-L$004inner:
+L$006inner:
 	pmuludq	mm0,mm4
 	pmuludq	mm1,mm5
 	paddq	mm2,mm0
@@ -150,7 +165,7 @@
 	paddq	mm2,mm6
 	dec	ebx
 	lea	ecx,[1+ecx]
-	jnz	NEAR L$004inner
+	jnz	NEAR L$006inner
 	mov	ebx,ecx
 	pmuludq	mm0,mm4
 	pmuludq	mm1,mm5
@@ -168,11 +183,11 @@
 	movq	[32+ebx*4+esp],mm3
 	lea	edx,[1+edx]
 	cmp	edx,ebx
-	jle	NEAR L$003outer
+	jle	NEAR L$005outer
 	emms
-	jmp	NEAR L$005common_tail
+	jmp	NEAR L$007common_tail
 align	16
-L$001non_sse2:
+L$003non_sse2:
 	mov	esi,DWORD [8+esp]
 	lea	ebp,[1+ebx]
 	mov	edi,DWORD [12+esp]
@@ -183,12 +198,12 @@
 	lea	eax,[4+ebx*4+edi]
 	or	ebp,edx
 	mov	edi,DWORD [edi]
-	jz	NEAR L$006bn_sqr_mont
+	jz	NEAR L$008bn_sqr_mont
 	mov	DWORD [28+esp],eax
 	mov	eax,DWORD [esi]
 	xor	edx,edx
 align	16
-L$007mull:
+L$009mull:
 	mov	ebp,edx
 	mul	edi
 	add	ebp,eax
@@ -197,7 +212,7 @@
 	mov	eax,DWORD [ecx*4+esi]
 	cmp	ecx,ebx
 	mov	DWORD [28+ecx*4+esp],ebp
-	jl	NEAR L$007mull
+	jl	NEAR L$009mull
 	mov	ebp,edx
 	mul	edi
 	mov	edi,DWORD [20+esp]
@@ -215,9 +230,9 @@
 	mov	eax,DWORD [4+esi]
 	adc	edx,0
 	inc	ecx
-	jmp	NEAR L$0082ndmadd
+	jmp	NEAR L$0102ndmadd
 align	16
-L$0091stmadd:
+L$0111stmadd:
 	mov	ebp,edx
 	mul	edi
 	add	ebp,DWORD [32+ecx*4+esp]
@@ -228,7 +243,7 @@
 	adc	edx,0
 	cmp	ecx,ebx
 	mov	DWORD [28+ecx*4+esp],ebp
-	jl	NEAR L$0091stmadd
+	jl	NEAR L$0111stmadd
 	mov	ebp,edx
 	mul	edi
 	add	eax,DWORD [32+ebx*4+esp]
@@ -251,7 +266,7 @@
 	adc	edx,0
 	mov	ecx,1
 align	16
-L$0082ndmadd:
+L$0102ndmadd:
 	mov	ebp,edx
 	mul	edi
 	add	ebp,DWORD [32+ecx*4+esp]
@@ -262,7 +277,7 @@
 	adc	edx,0
 	cmp	ecx,ebx
 	mov	DWORD [24+ecx*4+esp],ebp
-	jl	NEAR L$0082ndmadd
+	jl	NEAR L$0102ndmadd
 	mov	ebp,edx
 	mul	edi
 	add	ebp,DWORD [32+ebx*4+esp]
@@ -278,16 +293,16 @@
 	mov	DWORD [32+ebx*4+esp],edx
 	cmp	ecx,DWORD [28+esp]
 	mov	DWORD [36+ebx*4+esp],eax
-	je	NEAR L$005common_tail
+	je	NEAR L$007common_tail
 	mov	edi,DWORD [ecx]
 	mov	esi,DWORD [8+esp]
 	mov	DWORD [12+esp],ecx
 	xor	ecx,ecx
 	xor	edx,edx
 	mov	eax,DWORD [esi]
-	jmp	NEAR L$0091stmadd
+	jmp	NEAR L$0111stmadd
 align	16
-L$006bn_sqr_mont:
+L$008bn_sqr_mont:
 	mov	DWORD [esp],ebx
 	mov	DWORD [12+esp],ecx
 	mov	eax,edi
@@ -298,7 +313,7 @@
 	and	ebx,1
 	inc	ecx
 align	16
-L$010sqr:
+L$012sqr:
 	mov	eax,DWORD [ecx*4+esi]
 	mov	ebp,edx
 	mul	edi
@@ -310,7 +325,7 @@
 	cmp	ecx,DWORD [esp]
 	mov	ebx,eax
 	mov	DWORD [28+ecx*4+esp],ebp
-	jl	NEAR L$010sqr
+	jl	NEAR L$012sqr
 	mov	eax,DWORD [ecx*4+esi]
 	mov	ebp,edx
 	mul	edi
@@ -334,7 +349,7 @@
 	mov	eax,DWORD [4+esi]
 	mov	ecx,1
 align	16
-L$0113rdmadd:
+L$0133rdmadd:
 	mov	ebp,edx
 	mul	edi
 	add	ebp,DWORD [32+ecx*4+esp]
@@ -353,7 +368,7 @@
 	adc	edx,0
 	cmp	ecx,ebx
 	mov	DWORD [24+ecx*4+esp],ebp
-	jl	NEAR L$0113rdmadd
+	jl	NEAR L$0133rdmadd
 	mov	ebp,edx
 	mul	edi
 	add	ebp,DWORD [32+ebx*4+esp]
@@ -369,7 +384,7 @@
 	mov	DWORD [32+ebx*4+esp],edx
 	cmp	ecx,ebx
 	mov	DWORD [36+ebx*4+esp],eax
-	je	NEAR L$005common_tail
+	je	NEAR L$007common_tail
 	mov	edi,DWORD [4+ecx*4+esi]
 	lea	ecx,[1+ecx]
 	mov	eax,edi
@@ -381,12 +396,12 @@
 	xor	ebp,ebp
 	cmp	ecx,ebx
 	lea	ecx,[1+ecx]
-	je	NEAR L$012sqrlast
+	je	NEAR L$014sqrlast
 	mov	ebx,edx
 	shr	edx,1
 	and	ebx,1
 align	16
-L$013sqradd:
+L$015sqradd:
 	mov	eax,DWORD [ecx*4+esi]
 	mov	ebp,edx
 	mul	edi
@@ -402,13 +417,13 @@
 	cmp	ecx,DWORD [esp]
 	mov	DWORD [28+ecx*4+esp],ebp
 	mov	ebx,eax
-	jle	NEAR L$013sqradd
+	jle	NEAR L$015sqradd
 	mov	ebp,edx
 	add	edx,edx
 	shr	ebp,31
 	add	edx,ebx
 	adc	ebp,0
-L$012sqrlast:
+L$014sqrlast:
 	mov	edi,DWORD [20+esp]
 	mov	esi,DWORD [16+esp]
 	imul	edi,DWORD [32+esp]
@@ -423,9 +438,9 @@
 	adc	edx,0
 	mov	ecx,1
 	mov	eax,DWORD [4+esi]
-	jmp	NEAR L$0113rdmadd
+	jmp	NEAR L$0133rdmadd
 align	16
-L$005common_tail:
+L$007common_tail:
 	mov	ebp,DWORD [16+esp]
 	mov	edi,DWORD [4+esp]
 	lea	esi,[32+esp]
@@ -433,25 +448,26 @@
 	mov	ecx,ebx
 	xor	edx,edx
 align	16
-L$014sub:
+L$016sub:
 	sbb	eax,DWORD [edx*4+ebp]
 	mov	DWORD [edx*4+edi],eax
 	dec	ecx
 	mov	eax,DWORD [4+edx*4+esi]
 	lea	edx,[1+edx]
-	jge	NEAR L$014sub
+	jge	NEAR L$016sub
 	sbb	eax,0
+	and	esi,eax
+	not	eax
+	mov	ebp,edi
+	and	ebp,eax
+	or	esi,ebp
 align	16
-L$015copy:
-	mov	edx,DWORD [ebx*4+esi]
-	mov	ebp,DWORD [ebx*4+edi]
-	xor	edx,ebp
-	and	edx,eax
-	xor	edx,ebp
-	mov	DWORD [ebx*4+esi],ecx
-	mov	DWORD [ebx*4+edi],edx
+L$017copy:
+	mov	eax,DWORD [ebx*4+esi]
+	mov	DWORD [ebx*4+edi],eax
+	mov	DWORD [32+ebx*4+esp],ecx
 	dec	ebx
-	jge	NEAR L$015copy
+	jge	NEAR L$017copy
 	mov	esp,DWORD [24+esp]
 	mov	eax,1
 L$000just_leave:
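The new L$001page_walk loop above (mirroring the linux-x86 version of this file) probes the freshly reserved stack one page at a time instead of dropping the stack pointer in a single jump; a C rendering of the idea:

    /* Touch one word in every 4096-byte page from the old stack pointer
     * down to the new bottom, so each OS guard page is hit in order and
     * the stack can grow safely. */
    static void page_walk(const volatile char *top,
                          const volatile char *bottom) {
      (void)*top;        /* probe the starting page */
      while (top > bottom) {
        top -= 4096;
        (void)*top;      /* probe one word per page */
      }
    }

The rewritten tail makes a related change: L$016sub/L$017copy now select the copy source pointer once from the borrow mask (the and/not/or sequence) and then copy unconditionally, rather than masking every word inside the loop.
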
diff --git a/win-x86_64/crypto/aes/aes-x86_64.asm b/win-x86_64/crypto/aes/aes-x86_64.asm
index 53394f0..3db1846 100644
--- a/win-x86_64/crypto/aes/aes-x86_64.asm
+++ b/win-x86_64/crypto/aes/aes-x86_64.asm
@@ -344,6 +344,7 @@
 	mov	rdx,r8
 
 
+	mov	rax,rsp
 	push	rbx
 	push	rbp
 	push	r12
@@ -352,7 +353,6 @@
 	push	r15
 
 
-	mov	r10,rsp
 	lea	rcx,[((-63))+rdx]
 	and	rsp,-64
 	sub	rcx,rsp
@@ -362,7 +362,7 @@
 	sub	rsp,32
 
 	mov	QWORD[16+rsp],rsi
-	mov	QWORD[24+rsp],r10
+	mov	QWORD[24+rsp],rax
 $L$enc_prologue:
 
 	mov	r15,rdx
@@ -394,13 +394,13 @@
 	mov	DWORD[8+r9],ecx
 	mov	DWORD[12+r9],edx
 
-	mov	r15,QWORD[rsi]
-	mov	r14,QWORD[8+rsi]
-	mov	r13,QWORD[16+rsi]
-	mov	r12,QWORD[24+rsi]
-	mov	rbp,QWORD[32+rsi]
-	mov	rbx,QWORD[40+rsi]
-	lea	rsp,[48+rsi]
+	mov	r15,QWORD[((-48))+rsi]
+	mov	r14,QWORD[((-40))+rsi]
+	mov	r13,QWORD[((-32))+rsi]
+	mov	r12,QWORD[((-24))+rsi]
+	mov	rbp,QWORD[((-16))+rsi]
+	mov	rbx,QWORD[((-8))+rsi]
+	lea	rsp,[rsi]
 $L$enc_epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
@@ -800,6 +800,7 @@
 	mov	rdx,r8
 
 
+	mov	rax,rsp
 	push	rbx
 	push	rbp
 	push	r12
@@ -808,7 +809,6 @@
 	push	r15
 
 
-	mov	r10,rsp
 	lea	rcx,[((-63))+rdx]
 	and	rsp,-64
 	sub	rcx,rsp
@@ -818,7 +818,7 @@
 	sub	rsp,32
 
 	mov	QWORD[16+rsp],rsi
-	mov	QWORD[24+rsp],r10
+	mov	QWORD[24+rsp],rax
 $L$dec_prologue:
 
 	mov	r15,rdx
@@ -852,13 +852,13 @@
 	mov	DWORD[8+r9],ecx
 	mov	DWORD[12+r9],edx
 
-	mov	r15,QWORD[rsi]
-	mov	r14,QWORD[8+rsi]
-	mov	r13,QWORD[16+rsi]
-	mov	r12,QWORD[24+rsi]
-	mov	rbp,QWORD[32+rsi]
-	mov	rbx,QWORD[40+rsi]
-	lea	rsp,[48+rsi]
+	mov	r15,QWORD[((-48))+rsi]
+	mov	r14,QWORD[((-40))+rsi]
+	mov	r13,QWORD[((-32))+rsi]
+	mov	r12,QWORD[((-24))+rsi]
+	mov	rbp,QWORD[((-16))+rsi]
+	mov	rbx,QWORD[((-8))+rsi]
+	lea	rsp,[rsi]
 $L$dec_epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
@@ -1367,10 +1367,9 @@
 	mov	r9d,r9d
 
 	lea	r14,[$L$AES_Te]
+	lea	r10,[$L$AES_Td]
 	cmp	r9,0
-	jne	NEAR $L$cbc_picked_te
-	lea	r14,[$L$AES_Td]
-$L$cbc_picked_te:
+	cmove	r14,r10
 
 	mov	r10d,DWORD[OPENSSL_ia32cap_P]
 	cmp	rdx,512
@@ -2626,7 +2625,6 @@
 	jae	NEAR $L$in_block_prologue
 
 	mov	rax,QWORD[24+rax]
-	lea	rax,[48+rax]
 
 	mov	rbx,QWORD[((-8))+rax]
 	mov	rbp,QWORD[((-16))+rax]
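The hunk above also replaces the branch that picked between $L$AES_Te and $L$AES_Td with a cmove; roughly, in C terms (the names here are hypothetical stand-ins for the two tables):

    #include <stdint.h>

    /* Branchless table pick: compilers typically lower this ternary to a
     * conditional move, mirroring the cmove above. */
    static const uint32_t *pick_table(int enc, const uint32_t *te,
                                      const uint32_t *td) {
      return enc ? te : td;
    }
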
diff --git a/win-x86_64/crypto/aes/aesni-x86_64.asm b/win-x86_64/crypto/aes/aesni-x86_64.asm
index cf313d1..d5d454d 100644
--- a/win-x86_64/crypto/aes/aesni-x86_64.asm
+++ b/win-x86_64/crypto/aes/aesni-x86_64.asm
@@ -1129,22 +1129,21 @@
 
 ALIGN	16
 $L$ctr32_bulk:
-	lea	rax,[rsp]
+	lea	r11,[rsp]
 	push	rbp
 	sub	rsp,288
 	and	rsp,-16
-	movaps	XMMWORD[(-168)+rax],xmm6
-	movaps	XMMWORD[(-152)+rax],xmm7
-	movaps	XMMWORD[(-136)+rax],xmm8
-	movaps	XMMWORD[(-120)+rax],xmm9
-	movaps	XMMWORD[(-104)+rax],xmm10
-	movaps	XMMWORD[(-88)+rax],xmm11
-	movaps	XMMWORD[(-72)+rax],xmm12
-	movaps	XMMWORD[(-56)+rax],xmm13
-	movaps	XMMWORD[(-40)+rax],xmm14
-	movaps	XMMWORD[(-24)+rax],xmm15
+	movaps	XMMWORD[(-168)+r11],xmm6
+	movaps	XMMWORD[(-152)+r11],xmm7
+	movaps	XMMWORD[(-136)+r11],xmm8
+	movaps	XMMWORD[(-120)+r11],xmm9
+	movaps	XMMWORD[(-104)+r11],xmm10
+	movaps	XMMWORD[(-88)+r11],xmm11
+	movaps	XMMWORD[(-72)+r11],xmm12
+	movaps	XMMWORD[(-56)+r11],xmm13
+	movaps	XMMWORD[(-40)+r11],xmm14
+	movaps	XMMWORD[(-24)+r11],xmm15
 $L$ctr32_body:
-	lea	rbp,[((-8))+rax]
 
 
 
@@ -1153,7 +1152,7 @@
 	movdqu	xmm0,XMMWORD[rcx]
 	mov	r8d,DWORD[12+r8]
 	pxor	xmm2,xmm0
-	mov	r11d,DWORD[12+rcx]
+	mov	ebp,DWORD[12+rcx]
 	movdqa	XMMWORD[rsp],xmm2
 	bswap	r8d
 	movdqa	xmm3,xmm2
@@ -1169,8 +1168,8 @@
 	lea	rdx,[2+r8]
 	bswap	eax
 	bswap	edx
-	xor	eax,r11d
-	xor	edx,r11d
+	xor	eax,ebp
+	xor	edx,ebp
 DB	102,15,58,34,216,3
 	lea	rax,[3+r8]
 	movdqa	XMMWORD[16+rsp],xmm3
@@ -1179,25 +1178,25 @@
 	mov	rdx,r10
 	lea	r10,[4+r8]
 	movdqa	XMMWORD[32+rsp],xmm4
-	xor	eax,r11d
+	xor	eax,ebp
 	bswap	r10d
 DB	102,15,58,34,232,3
-	xor	r10d,r11d
+	xor	r10d,ebp
 	movdqa	XMMWORD[48+rsp],xmm5
 	lea	r9,[5+r8]
 	mov	DWORD[((64+12))+rsp],r10d
 	bswap	r9d
 	lea	r10,[6+r8]
 	mov	eax,DWORD[240+rcx]
-	xor	r9d,r11d
+	xor	r9d,ebp
 	bswap	r10d
 	mov	DWORD[((80+12))+rsp],r9d
-	xor	r10d,r11d
+	xor	r10d,ebp
 	lea	r9,[7+r8]
 	mov	DWORD[((96+12))+rsp],r10d
 	bswap	r9d
 	mov	r10d,DWORD[((OPENSSL_ia32cap_P+4))]
-	xor	r9d,r11d
+	xor	r9d,ebp
 	and	r10d,71303168
 	mov	DWORD[((112+12))+rsp],r9d
 
@@ -1221,7 +1220,7 @@
 $L$ctr32_6x:
 	shl	eax,4
 	mov	r10d,48
-	bswap	r11d
+	bswap	ebp
 	lea	rcx,[32+rax*1+rcx]
 	sub	r10,rax
 	jmp	NEAR $L$ctr32_loop6
@@ -1232,32 +1231,32 @@
 	movups	xmm0,XMMWORD[((-48))+r10*1+rcx]
 DB	102,15,56,220,209
 	mov	eax,r8d
-	xor	eax,r11d
+	xor	eax,ebp
 DB	102,15,56,220,217
 DB	0x0f,0x38,0xf1,0x44,0x24,12
 	lea	eax,[1+r8]
 DB	102,15,56,220,225
-	xor	eax,r11d
+	xor	eax,ebp
 DB	0x0f,0x38,0xf1,0x44,0x24,28
 DB	102,15,56,220,233
 	lea	eax,[2+r8]
-	xor	eax,r11d
+	xor	eax,ebp
 DB	102,15,56,220,241
 DB	0x0f,0x38,0xf1,0x44,0x24,44
 	lea	eax,[3+r8]
 DB	102,15,56,220,249
 	movups	xmm1,XMMWORD[((-32))+r10*1+rcx]
-	xor	eax,r11d
+	xor	eax,ebp
 
 DB	102,15,56,220,208
 DB	0x0f,0x38,0xf1,0x44,0x24,60
 	lea	eax,[4+r8]
 DB	102,15,56,220,216
-	xor	eax,r11d
+	xor	eax,ebp
 DB	0x0f,0x38,0xf1,0x44,0x24,76
 DB	102,15,56,220,224
 	lea	eax,[5+r8]
-	xor	eax,r11d
+	xor	eax,ebp
 DB	102,15,56,220,232
 DB	0x0f,0x38,0xf1,0x44,0x24,92
 	mov	rax,r10
@@ -1318,7 +1317,7 @@
 	bswap	r9d
 	movups	xmm0,XMMWORD[((32-128))+rcx]
 DB	102,15,56,220,225
-	xor	r9d,r11d
+	xor	r9d,ebp
 	nop
 DB	102,15,56,220,233
 	mov	DWORD[((0+12))+rsp],r9d
@@ -1331,7 +1330,7 @@
 	bswap	r9d
 DB	102,15,56,220,208
 DB	102,15,56,220,216
-	xor	r9d,r11d
+	xor	r9d,ebp
 DB	0x66,0x90
 DB	102,15,56,220,224
 DB	102,15,56,220,232
@@ -1345,7 +1344,7 @@
 	bswap	r9d
 DB	102,15,56,220,209
 DB	102,15,56,220,217
-	xor	r9d,r11d
+	xor	r9d,ebp
 DB	0x66,0x90
 DB	102,15,56,220,225
 DB	102,15,56,220,233
@@ -1359,7 +1358,7 @@
 	bswap	r9d
 DB	102,15,56,220,208
 DB	102,15,56,220,216
-	xor	r9d,r11d
+	xor	r9d,ebp
 DB	0x66,0x90
 DB	102,15,56,220,224
 DB	102,15,56,220,232
@@ -1373,7 +1372,7 @@
 	bswap	r9d
 DB	102,15,56,220,209
 DB	102,15,56,220,217
-	xor	r9d,r11d
+	xor	r9d,ebp
 DB	0x66,0x90
 DB	102,15,56,220,225
 DB	102,15,56,220,233
@@ -1387,7 +1386,7 @@
 	bswap	r9d
 DB	102,15,56,220,208
 DB	102,15,56,220,216
-	xor	r9d,r11d
+	xor	r9d,ebp
 DB	0x66,0x90
 DB	102,15,56,220,224
 DB	102,15,56,220,232
@@ -1401,7 +1400,7 @@
 	bswap	r9d
 DB	102,15,56,220,209
 DB	102,15,56,220,217
-	xor	r9d,r11d
+	xor	r9d,ebp
 DB	0x66,0x90
 DB	102,15,56,220,225
 DB	102,15,56,220,233
@@ -1416,7 +1415,7 @@
 DB	102,15,56,220,208
 DB	102,15,56,220,216
 DB	102,15,56,220,224
-	xor	r9d,r11d
+	xor	r9d,ebp
 	movdqu	xmm10,XMMWORD[rdi]
 DB	102,15,56,220,232
 	mov	DWORD[((112+12))+rsp],r9d
@@ -1651,32 +1650,32 @@
 
 $L$ctr32_done:
 	xorps	xmm0,xmm0
-	xor	r11d,r11d
+	xor	ebp,ebp
 	pxor	xmm1,xmm1
 	pxor	xmm2,xmm2
 	pxor	xmm3,xmm3
 	pxor	xmm4,xmm4
 	pxor	xmm5,xmm5
-	movaps	xmm6,XMMWORD[((-160))+rbp]
-	movaps	XMMWORD[(-160)+rbp],xmm0
-	movaps	xmm7,XMMWORD[((-144))+rbp]
-	movaps	XMMWORD[(-144)+rbp],xmm0
-	movaps	xmm8,XMMWORD[((-128))+rbp]
-	movaps	XMMWORD[(-128)+rbp],xmm0
-	movaps	xmm9,XMMWORD[((-112))+rbp]
-	movaps	XMMWORD[(-112)+rbp],xmm0
-	movaps	xmm10,XMMWORD[((-96))+rbp]
-	movaps	XMMWORD[(-96)+rbp],xmm0
-	movaps	xmm11,XMMWORD[((-80))+rbp]
-	movaps	XMMWORD[(-80)+rbp],xmm0
-	movaps	xmm12,XMMWORD[((-64))+rbp]
-	movaps	XMMWORD[(-64)+rbp],xmm0
-	movaps	xmm13,XMMWORD[((-48))+rbp]
-	movaps	XMMWORD[(-48)+rbp],xmm0
-	movaps	xmm14,XMMWORD[((-32))+rbp]
-	movaps	XMMWORD[(-32)+rbp],xmm0
-	movaps	xmm15,XMMWORD[((-16))+rbp]
-	movaps	XMMWORD[(-16)+rbp],xmm0
+	movaps	xmm6,XMMWORD[((-168))+r11]
+	movaps	XMMWORD[(-168)+r11],xmm0
+	movaps	xmm7,XMMWORD[((-152))+r11]
+	movaps	XMMWORD[(-152)+r11],xmm0
+	movaps	xmm8,XMMWORD[((-136))+r11]
+	movaps	XMMWORD[(-136)+r11],xmm0
+	movaps	xmm9,XMMWORD[((-120))+r11]
+	movaps	XMMWORD[(-120)+r11],xmm0
+	movaps	xmm10,XMMWORD[((-104))+r11]
+	movaps	XMMWORD[(-104)+r11],xmm0
+	movaps	xmm11,XMMWORD[((-88))+r11]
+	movaps	XMMWORD[(-88)+r11],xmm0
+	movaps	xmm12,XMMWORD[((-72))+r11]
+	movaps	XMMWORD[(-72)+r11],xmm0
+	movaps	xmm13,XMMWORD[((-56))+r11]
+	movaps	XMMWORD[(-56)+r11],xmm0
+	movaps	xmm14,XMMWORD[((-40))+r11]
+	movaps	XMMWORD[(-40)+r11],xmm0
+	movaps	xmm15,XMMWORD[((-24))+r11]
+	movaps	XMMWORD[(-24)+r11],xmm0
 	movaps	XMMWORD[rsp],xmm0
 	movaps	XMMWORD[16+rsp],xmm0
 	movaps	XMMWORD[32+rsp],xmm0
@@ -1685,8 +1684,8 @@
 	movaps	XMMWORD[80+rsp],xmm0
 	movaps	XMMWORD[96+rsp],xmm0
 	movaps	XMMWORD[112+rsp],xmm0
-	lea	rsp,[rbp]
-	pop	rbp
+	mov	rbp,QWORD[((-8))+r11]
+	lea	rsp,[r11]
 $L$ctr32_epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
@@ -1708,22 +1707,21 @@
 	mov	r9,QWORD[48+rsp]
 
 
-	lea	rax,[rsp]
+	lea	r11,[rsp]
 	push	rbp
 	sub	rsp,272
 	and	rsp,-16
-	movaps	XMMWORD[(-168)+rax],xmm6
-	movaps	XMMWORD[(-152)+rax],xmm7
-	movaps	XMMWORD[(-136)+rax],xmm8
-	movaps	XMMWORD[(-120)+rax],xmm9
-	movaps	XMMWORD[(-104)+rax],xmm10
-	movaps	XMMWORD[(-88)+rax],xmm11
-	movaps	XMMWORD[(-72)+rax],xmm12
-	movaps	XMMWORD[(-56)+rax],xmm13
-	movaps	XMMWORD[(-40)+rax],xmm14
-	movaps	XMMWORD[(-24)+rax],xmm15
+	movaps	XMMWORD[(-168)+r11],xmm6
+	movaps	XMMWORD[(-152)+r11],xmm7
+	movaps	XMMWORD[(-136)+r11],xmm8
+	movaps	XMMWORD[(-120)+r11],xmm9
+	movaps	XMMWORD[(-104)+r11],xmm10
+	movaps	XMMWORD[(-88)+r11],xmm11
+	movaps	XMMWORD[(-72)+r11],xmm12
+	movaps	XMMWORD[(-56)+r11],xmm13
+	movaps	XMMWORD[(-40)+r11],xmm14
+	movaps	XMMWORD[(-24)+r11],xmm15
 $L$xts_enc_body:
-	lea	rbp,[((-8))+rax]
 	movups	xmm2,XMMWORD[r9]
 	mov	eax,DWORD[240+r8]
 	mov	r10d,DWORD[240+rcx]
@@ -1739,7 +1737,7 @@
 	jnz	NEAR $L$oop_enc1_8
 DB	102,15,56,221,209
 	movups	xmm0,XMMWORD[rcx]
-	mov	r11,rcx
+	mov	rbp,rcx
 	mov	eax,r10d
 	shl	r10d,4
 	mov	r9,rdx
@@ -1795,9 +1793,9 @@
 	jc	NEAR $L$xts_enc_short
 
 	mov	eax,16+96
-	lea	rcx,[32+r10*1+r11]
+	lea	rcx,[32+r10*1+rbp]
 	sub	rax,r10
-	movups	xmm1,XMMWORD[16+r11]
+	movups	xmm1,XMMWORD[16+rbp]
 	mov	r10,rax
 	lea	r8,[$L$xts_magic]
 	jmp	NEAR $L$xts_enc_grandloop
@@ -1822,7 +1820,7 @@
 	movdqa	xmm9,XMMWORD[96+rsp]
 	pxor	xmm6,xmm14
 DB	102,15,56,220,233
-	movups	xmm0,XMMWORD[32+r11]
+	movups	xmm0,XMMWORD[32+rbp]
 	lea	rdi,[96+rdi]
 	pxor	xmm7,xmm8
 
@@ -1831,7 +1829,7 @@
 	pxor	xmm11,xmm9
 	movdqa	XMMWORD[rsp],xmm10
 DB	102,15,56,220,249
-	movups	xmm1,XMMWORD[48+r11]
+	movups	xmm1,XMMWORD[48+rbp]
 	pxor	xmm12,xmm9
 
 DB	102,15,56,220,208
@@ -1846,7 +1844,7 @@
 	movdqa	XMMWORD[64+rsp],xmm14
 DB	102,15,56,220,240
 DB	102,15,56,220,248
-	movups	xmm0,XMMWORD[64+r11]
+	movups	xmm0,XMMWORD[64+rbp]
 	movdqa	XMMWORD[80+rsp],xmm8
 	pshufd	xmm9,xmm15,0x5f
 	jmp	NEAR $L$xts_enc_loop6
@@ -1878,7 +1876,7 @@
 	psrad	xmm14,31
 DB	102,15,56,220,217
 	pand	xmm14,xmm8
-	movups	xmm10,XMMWORD[r11]
+	movups	xmm10,XMMWORD[rbp]
 DB	102,15,56,220,225
 DB	102,15,56,220,233
 DB	102,15,56,220,241
@@ -1946,10 +1944,10 @@
 DB	102,15,56,220,225
 DB	102,15,56,220,233
 	pxor	xmm15,xmm0
-	movups	xmm0,XMMWORD[r11]
+	movups	xmm0,XMMWORD[rbp]
 DB	102,15,56,220,241
 DB	102,15,56,220,249
-	movups	xmm1,XMMWORD[16+r11]
+	movups	xmm1,XMMWORD[16+rbp]
 
 	pxor	xmm14,xmm15
 DB	102,15,56,221,84,36,0
@@ -1976,7 +1974,7 @@
 
 	mov	eax,16+96
 	sub	eax,r10d
-	mov	rcx,r11
+	mov	rcx,rbp
 	shr	eax,4
 
 $L$xts_enc_short:
@@ -2132,7 +2130,7 @@
 	jnz	NEAR $L$xts_enc_steal
 
 	sub	rsi,r9
-	mov	rcx,r11
+	mov	rcx,rbp
 	mov	eax,r10d
 
 	movups	xmm2,XMMWORD[((-16))+rsi]
@@ -2158,26 +2156,26 @@
 	pxor	xmm3,xmm3
 	pxor	xmm4,xmm4
 	pxor	xmm5,xmm5
-	movaps	xmm6,XMMWORD[((-160))+rbp]
-	movaps	XMMWORD[(-160)+rbp],xmm0
-	movaps	xmm7,XMMWORD[((-144))+rbp]
-	movaps	XMMWORD[(-144)+rbp],xmm0
-	movaps	xmm8,XMMWORD[((-128))+rbp]
-	movaps	XMMWORD[(-128)+rbp],xmm0
-	movaps	xmm9,XMMWORD[((-112))+rbp]
-	movaps	XMMWORD[(-112)+rbp],xmm0
-	movaps	xmm10,XMMWORD[((-96))+rbp]
-	movaps	XMMWORD[(-96)+rbp],xmm0
-	movaps	xmm11,XMMWORD[((-80))+rbp]
-	movaps	XMMWORD[(-80)+rbp],xmm0
-	movaps	xmm12,XMMWORD[((-64))+rbp]
-	movaps	XMMWORD[(-64)+rbp],xmm0
-	movaps	xmm13,XMMWORD[((-48))+rbp]
-	movaps	XMMWORD[(-48)+rbp],xmm0
-	movaps	xmm14,XMMWORD[((-32))+rbp]
-	movaps	XMMWORD[(-32)+rbp],xmm0
-	movaps	xmm15,XMMWORD[((-16))+rbp]
-	movaps	XMMWORD[(-16)+rbp],xmm0
+	movaps	xmm6,XMMWORD[((-168))+r11]
+	movaps	XMMWORD[(-168)+r11],xmm0
+	movaps	xmm7,XMMWORD[((-152))+r11]
+	movaps	XMMWORD[(-152)+r11],xmm0
+	movaps	xmm8,XMMWORD[((-136))+r11]
+	movaps	XMMWORD[(-136)+r11],xmm0
+	movaps	xmm9,XMMWORD[((-120))+r11]
+	movaps	XMMWORD[(-120)+r11],xmm0
+	movaps	xmm10,XMMWORD[((-104))+r11]
+	movaps	XMMWORD[(-104)+r11],xmm0
+	movaps	xmm11,XMMWORD[((-88))+r11]
+	movaps	XMMWORD[(-88)+r11],xmm0
+	movaps	xmm12,XMMWORD[((-72))+r11]
+	movaps	XMMWORD[(-72)+r11],xmm0
+	movaps	xmm13,XMMWORD[((-56))+r11]
+	movaps	XMMWORD[(-56)+r11],xmm0
+	movaps	xmm14,XMMWORD[((-40))+r11]
+	movaps	XMMWORD[(-40)+r11],xmm0
+	movaps	xmm15,XMMWORD[((-24))+r11]
+	movaps	XMMWORD[(-24)+r11],xmm0
 	movaps	XMMWORD[rsp],xmm0
 	movaps	XMMWORD[16+rsp],xmm0
 	movaps	XMMWORD[32+rsp],xmm0
@@ -2185,8 +2183,8 @@
 	movaps	XMMWORD[64+rsp],xmm0
 	movaps	XMMWORD[80+rsp],xmm0
 	movaps	XMMWORD[96+rsp],xmm0
-	lea	rsp,[rbp]
-	pop	rbp
+	mov	rbp,QWORD[((-8))+r11]
+	lea	rsp,[r11]
 $L$xts_enc_epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
@@ -2208,22 +2206,21 @@
 	mov	r9,QWORD[48+rsp]
 
 
-	lea	rax,[rsp]
+	lea	r11,[rsp]
 	push	rbp
 	sub	rsp,272
 	and	rsp,-16
-	movaps	XMMWORD[(-168)+rax],xmm6
-	movaps	XMMWORD[(-152)+rax],xmm7
-	movaps	XMMWORD[(-136)+rax],xmm8
-	movaps	XMMWORD[(-120)+rax],xmm9
-	movaps	XMMWORD[(-104)+rax],xmm10
-	movaps	XMMWORD[(-88)+rax],xmm11
-	movaps	XMMWORD[(-72)+rax],xmm12
-	movaps	XMMWORD[(-56)+rax],xmm13
-	movaps	XMMWORD[(-40)+rax],xmm14
-	movaps	XMMWORD[(-24)+rax],xmm15
+	movaps	XMMWORD[(-168)+r11],xmm6
+	movaps	XMMWORD[(-152)+r11],xmm7
+	movaps	XMMWORD[(-136)+r11],xmm8
+	movaps	XMMWORD[(-120)+r11],xmm9
+	movaps	XMMWORD[(-104)+r11],xmm10
+	movaps	XMMWORD[(-88)+r11],xmm11
+	movaps	XMMWORD[(-72)+r11],xmm12
+	movaps	XMMWORD[(-56)+r11],xmm13
+	movaps	XMMWORD[(-40)+r11],xmm14
+	movaps	XMMWORD[(-24)+r11],xmm15
 $L$xts_dec_body:
-	lea	rbp,[((-8))+rax]
 	movups	xmm2,XMMWORD[r9]
 	mov	eax,DWORD[240+r8]
 	mov	r10d,DWORD[240+rcx]
@@ -2245,7 +2242,7 @@
 	sub	rdx,rax
 
 	movups	xmm0,XMMWORD[rcx]
-	mov	r11,rcx
+	mov	rbp,rcx
 	mov	eax,r10d
 	shl	r10d,4
 	mov	r9,rdx
@@ -2301,9 +2298,9 @@
 	jc	NEAR $L$xts_dec_short
 
 	mov	eax,16+96
-	lea	rcx,[32+r10*1+r11]
+	lea	rcx,[32+r10*1+rbp]
 	sub	rax,r10
-	movups	xmm1,XMMWORD[16+r11]
+	movups	xmm1,XMMWORD[16+rbp]
 	mov	r10,rax
 	lea	r8,[$L$xts_magic]
 	jmp	NEAR $L$xts_dec_grandloop
@@ -2328,7 +2325,7 @@
 	movdqa	xmm9,XMMWORD[96+rsp]
 	pxor	xmm6,xmm14
 DB	102,15,56,222,233
-	movups	xmm0,XMMWORD[32+r11]
+	movups	xmm0,XMMWORD[32+rbp]
 	lea	rdi,[96+rdi]
 	pxor	xmm7,xmm8
 
@@ -2337,7 +2334,7 @@
 	pxor	xmm11,xmm9
 	movdqa	XMMWORD[rsp],xmm10
 DB	102,15,56,222,249
-	movups	xmm1,XMMWORD[48+r11]
+	movups	xmm1,XMMWORD[48+rbp]
 	pxor	xmm12,xmm9
 
 DB	102,15,56,222,208
@@ -2352,7 +2349,7 @@
 	movdqa	XMMWORD[64+rsp],xmm14
 DB	102,15,56,222,240
 DB	102,15,56,222,248
-	movups	xmm0,XMMWORD[64+r11]
+	movups	xmm0,XMMWORD[64+rbp]
 	movdqa	XMMWORD[80+rsp],xmm8
 	pshufd	xmm9,xmm15,0x5f
 	jmp	NEAR $L$xts_dec_loop6
@@ -2384,7 +2381,7 @@
 	psrad	xmm14,31
 DB	102,15,56,222,217
 	pand	xmm14,xmm8
-	movups	xmm10,XMMWORD[r11]
+	movups	xmm10,XMMWORD[rbp]
 DB	102,15,56,222,225
 DB	102,15,56,222,233
 DB	102,15,56,222,241
@@ -2452,10 +2449,10 @@
 DB	102,15,56,222,225
 DB	102,15,56,222,233
 	pxor	xmm15,xmm0
-	movups	xmm0,XMMWORD[r11]
+	movups	xmm0,XMMWORD[rbp]
 DB	102,15,56,222,241
 DB	102,15,56,222,249
-	movups	xmm1,XMMWORD[16+r11]
+	movups	xmm1,XMMWORD[16+rbp]
 
 	pxor	xmm14,xmm15
 DB	102,15,56,223,84,36,0
@@ -2482,7 +2479,7 @@
 
 	mov	eax,16+96
 	sub	eax,r10d
-	mov	rcx,r11
+	mov	rcx,rbp
 	shr	eax,4
 
 $L$xts_dec_short:
@@ -2639,7 +2636,7 @@
 	jz	NEAR $L$xts_dec_ret
 $L$xts_dec_done2:
 	mov	rdx,r9
-	mov	rcx,r11
+	mov	rcx,rbp
 	mov	eax,r10d
 
 	movups	xmm2,XMMWORD[rdi]
@@ -2669,7 +2666,7 @@
 	jnz	NEAR $L$xts_dec_steal
 
 	sub	rsi,r9
-	mov	rcx,r11
+	mov	rcx,rbp
 	mov	eax,r10d
 
 	movups	xmm2,XMMWORD[rsi]
@@ -2695,26 +2692,26 @@
 	pxor	xmm3,xmm3
 	pxor	xmm4,xmm4
 	pxor	xmm5,xmm5
-	movaps	xmm6,XMMWORD[((-160))+rbp]
-	movaps	XMMWORD[(-160)+rbp],xmm0
-	movaps	xmm7,XMMWORD[((-144))+rbp]
-	movaps	XMMWORD[(-144)+rbp],xmm0
-	movaps	xmm8,XMMWORD[((-128))+rbp]
-	movaps	XMMWORD[(-128)+rbp],xmm0
-	movaps	xmm9,XMMWORD[((-112))+rbp]
-	movaps	XMMWORD[(-112)+rbp],xmm0
-	movaps	xmm10,XMMWORD[((-96))+rbp]
-	movaps	XMMWORD[(-96)+rbp],xmm0
-	movaps	xmm11,XMMWORD[((-80))+rbp]
-	movaps	XMMWORD[(-80)+rbp],xmm0
-	movaps	xmm12,XMMWORD[((-64))+rbp]
-	movaps	XMMWORD[(-64)+rbp],xmm0
-	movaps	xmm13,XMMWORD[((-48))+rbp]
-	movaps	XMMWORD[(-48)+rbp],xmm0
-	movaps	xmm14,XMMWORD[((-32))+rbp]
-	movaps	XMMWORD[(-32)+rbp],xmm0
-	movaps	xmm15,XMMWORD[((-16))+rbp]
-	movaps	XMMWORD[(-16)+rbp],xmm0
+	movaps	xmm6,XMMWORD[((-168))+r11]
+	movaps	XMMWORD[(-168)+r11],xmm0
+	movaps	xmm7,XMMWORD[((-152))+r11]
+	movaps	XMMWORD[(-152)+r11],xmm0
+	movaps	xmm8,XMMWORD[((-136))+r11]
+	movaps	XMMWORD[(-136)+r11],xmm0
+	movaps	xmm9,XMMWORD[((-120))+r11]
+	movaps	XMMWORD[(-120)+r11],xmm0
+	movaps	xmm10,XMMWORD[((-104))+r11]
+	movaps	XMMWORD[(-104)+r11],xmm0
+	movaps	xmm11,XMMWORD[((-88))+r11]
+	movaps	XMMWORD[(-88)+r11],xmm0
+	movaps	xmm12,XMMWORD[((-72))+r11]
+	movaps	XMMWORD[(-72)+r11],xmm0
+	movaps	xmm13,XMMWORD[((-56))+r11]
+	movaps	XMMWORD[(-56)+r11],xmm0
+	movaps	xmm14,XMMWORD[((-40))+r11]
+	movaps	XMMWORD[(-40)+r11],xmm0
+	movaps	xmm15,XMMWORD[((-24))+r11]
+	movaps	XMMWORD[(-24)+r11],xmm0
 	movaps	XMMWORD[rsp],xmm0
 	movaps	XMMWORD[16+rsp],xmm0
 	movaps	XMMWORD[32+rsp],xmm0
@@ -2722,13 +2719,901 @@
 	movaps	XMMWORD[64+rsp],xmm0
 	movaps	XMMWORD[80+rsp],xmm0
 	movaps	XMMWORD[96+rsp],xmm0
-	lea	rsp,[rbp]
-	pop	rbp
+	mov	rbp,QWORD[((-8))+r11]
+	lea	rsp,[r11]
 $L$xts_dec_epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
 	DB	0F3h,0C3h		;repret
 $L$SEH_end_aesni_xts_decrypt:
+global	aesni_ocb_encrypt
+
+ALIGN	32
+aesni_ocb_encrypt:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aesni_ocb_encrypt:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+	lea	rax,[rsp]
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	lea	rsp,[((-160))+rsp]
+	movaps	XMMWORD[rsp],xmm6
+	movaps	XMMWORD[16+rsp],xmm7
+	movaps	XMMWORD[32+rsp],xmm8
+	movaps	XMMWORD[48+rsp],xmm9
+	movaps	XMMWORD[64+rsp],xmm10
+	movaps	XMMWORD[80+rsp],xmm11
+	movaps	XMMWORD[96+rsp],xmm12
+	movaps	XMMWORD[112+rsp],xmm13
+	movaps	XMMWORD[128+rsp],xmm14
+	movaps	XMMWORD[144+rsp],xmm15
+$L$ocb_enc_body:
+	mov	rbx,QWORD[56+rax]
+	mov	rbp,QWORD[((56+8))+rax]
+
+	mov	r10d,DWORD[240+rcx]
+	mov	r11,rcx
+	shl	r10d,4
+	movups	xmm9,XMMWORD[rcx]
+	movups	xmm1,XMMWORD[16+r10*1+rcx]
+
+	movdqu	xmm15,XMMWORD[r9]
+	pxor	xmm9,xmm1
+	pxor	xmm15,xmm1
+
+	mov	eax,16+32
+	lea	rcx,[32+r10*1+r11]
+	movups	xmm1,XMMWORD[16+r11]
+	sub	rax,r10
+	mov	r10,rax
+
+	movdqu	xmm10,XMMWORD[rbx]
+	movdqu	xmm8,XMMWORD[rbp]
+
+	test	r8,1
+	jnz	NEAR $L$ocb_enc_odd
+
+	bsf	r12,r8
+	add	r8,1
+	shl	r12,4
+	movdqu	xmm7,XMMWORD[r12*1+rbx]
+	movdqu	xmm2,XMMWORD[rdi]
+	lea	rdi,[16+rdi]
+
+	call	__ocb_encrypt1
+
+	movdqa	xmm15,xmm7
+	movups	XMMWORD[rsi],xmm2
+	lea	rsi,[16+rsi]
+	sub	rdx,1
+	jz	NEAR $L$ocb_enc_done
+
+$L$ocb_enc_odd:
+	lea	r12,[1+r8]
+	lea	r13,[3+r8]
+	lea	r14,[5+r8]
+	lea	r8,[6+r8]
+	bsf	r12,r12
+	bsf	r13,r13
+	bsf	r14,r14
+	shl	r12,4
+	shl	r13,4
+	shl	r14,4
+
+	sub	rdx,6
+	jc	NEAR $L$ocb_enc_short
+	jmp	NEAR $L$ocb_enc_grandloop
+
+ALIGN	32
+$L$ocb_enc_grandloop:
+	movdqu	xmm2,XMMWORD[rdi]
+	movdqu	xmm3,XMMWORD[16+rdi]
+	movdqu	xmm4,XMMWORD[32+rdi]
+	movdqu	xmm5,XMMWORD[48+rdi]
+	movdqu	xmm6,XMMWORD[64+rdi]
+	movdqu	xmm7,XMMWORD[80+rdi]
+	lea	rdi,[96+rdi]
+
+	call	__ocb_encrypt6
+
+	movups	XMMWORD[rsi],xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	movups	XMMWORD[32+rsi],xmm4
+	movups	XMMWORD[48+rsi],xmm5
+	movups	XMMWORD[64+rsi],xmm6
+	movups	XMMWORD[80+rsi],xmm7
+	lea	rsi,[96+rsi]
+	sub	rdx,6
+	jnc	NEAR $L$ocb_enc_grandloop
+
+$L$ocb_enc_short:
+	add	rdx,6
+	jz	NEAR $L$ocb_enc_done
+
+	movdqu	xmm2,XMMWORD[rdi]
+	cmp	rdx,2
+	jb	NEAR $L$ocb_enc_one
+	movdqu	xmm3,XMMWORD[16+rdi]
+	je	NEAR $L$ocb_enc_two
+
+	movdqu	xmm4,XMMWORD[32+rdi]
+	cmp	rdx,4
+	jb	NEAR $L$ocb_enc_three
+	movdqu	xmm5,XMMWORD[48+rdi]
+	je	NEAR $L$ocb_enc_four
+
+	movdqu	xmm6,XMMWORD[64+rdi]
+	pxor	xmm7,xmm7
+
+	call	__ocb_encrypt6
+
+	movdqa	xmm15,xmm14
+	movups	XMMWORD[rsi],xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	movups	XMMWORD[32+rsi],xmm4
+	movups	XMMWORD[48+rsi],xmm5
+	movups	XMMWORD[64+rsi],xmm6
+
+	jmp	NEAR $L$ocb_enc_done
+
+ALIGN	16
+$L$ocb_enc_one:
+	movdqa	xmm7,xmm10
+
+	call	__ocb_encrypt1
+
+	movdqa	xmm15,xmm7
+	movups	XMMWORD[rsi],xmm2
+	jmp	NEAR $L$ocb_enc_done
+
+ALIGN	16
+$L$ocb_enc_two:
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+
+	call	__ocb_encrypt4
+
+	movdqa	xmm15,xmm11
+	movups	XMMWORD[rsi],xmm2
+	movups	XMMWORD[16+rsi],xmm3
+
+	jmp	NEAR $L$ocb_enc_done
+
+ALIGN	16
+$L$ocb_enc_three:
+	pxor	xmm5,xmm5
+
+	call	__ocb_encrypt4
+
+	movdqa	xmm15,xmm12
+	movups	XMMWORD[rsi],xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	movups	XMMWORD[32+rsi],xmm4
+
+	jmp	NEAR $L$ocb_enc_done
+
+ALIGN	16
+$L$ocb_enc_four:
+	call	__ocb_encrypt4
+
+	movdqa	xmm15,xmm13
+	movups	XMMWORD[rsi],xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	movups	XMMWORD[32+rsi],xmm4
+	movups	XMMWORD[48+rsi],xmm5
+
+$L$ocb_enc_done:
+	pxor	xmm15,xmm0
+	movdqu	XMMWORD[rbp],xmm8
+	movdqu	XMMWORD[r9],xmm15
+
+	xorps	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	movaps	xmm6,XMMWORD[rsp]
+	movaps	XMMWORD[rsp],xmm0
+	movaps	xmm7,XMMWORD[16+rsp]
+	movaps	XMMWORD[16+rsp],xmm0
+	movaps	xmm8,XMMWORD[32+rsp]
+	movaps	XMMWORD[32+rsp],xmm0
+	movaps	xmm9,XMMWORD[48+rsp]
+	movaps	XMMWORD[48+rsp],xmm0
+	movaps	xmm10,XMMWORD[64+rsp]
+	movaps	XMMWORD[64+rsp],xmm0
+	movaps	xmm11,XMMWORD[80+rsp]
+	movaps	XMMWORD[80+rsp],xmm0
+	movaps	xmm12,XMMWORD[96+rsp]
+	movaps	XMMWORD[96+rsp],xmm0
+	movaps	xmm13,XMMWORD[112+rsp]
+	movaps	XMMWORD[112+rsp],xmm0
+	movaps	xmm14,XMMWORD[128+rsp]
+	movaps	XMMWORD[128+rsp],xmm0
+	movaps	xmm15,XMMWORD[144+rsp]
+	movaps	XMMWORD[144+rsp],xmm0
+	lea	rax,[((160+40))+rsp]
+$L$ocb_enc_pop:
+	mov	r14,QWORD[((-40))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	rbx,QWORD[((-8))+rax]
+	lea	rsp,[rax]
+$L$ocb_enc_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+$L$SEH_end_aesni_ocb_encrypt:
+
+
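+; __ocb_encrypt6: encrypt the six blocks in xmm2..xmm7, accumulating the
+; plaintext into the checksum (xmm8) and advancing the per-block offsets
+; (xmm10..xmm15) before the AES rounds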
+ALIGN	32
+__ocb_encrypt6:
+	pxor	xmm15,xmm9
+	movdqu	xmm11,XMMWORD[r12*1+rbx]
+	movdqa	xmm12,xmm10
+	movdqu	xmm13,XMMWORD[r13*1+rbx]
+	movdqa	xmm14,xmm10
+	pxor	xmm10,xmm15
+	movdqu	xmm15,XMMWORD[r14*1+rbx]
+	pxor	xmm11,xmm10
+	pxor	xmm8,xmm2
+	pxor	xmm2,xmm10
+	pxor	xmm12,xmm11
+	pxor	xmm8,xmm3
+	pxor	xmm3,xmm11
+	pxor	xmm13,xmm12
+	pxor	xmm8,xmm4
+	pxor	xmm4,xmm12
+	pxor	xmm14,xmm13
+	pxor	xmm8,xmm5
+	pxor	xmm5,xmm13
+	pxor	xmm15,xmm14
+	pxor	xmm8,xmm6
+	pxor	xmm6,xmm14
+	pxor	xmm8,xmm7
+	pxor	xmm7,xmm15
+	movups	xmm0,XMMWORD[32+r11]
+
+	lea	r12,[1+r8]
+	lea	r13,[3+r8]
+	lea	r14,[5+r8]
+	add	r8,6
+	pxor	xmm10,xmm9
+	bsf	r12,r12
+	bsf	r13,r13
+	bsf	r14,r14
+
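+	; the DB sequences below are hand-encoded AES-NI instructions
+	; (66 0F 38 DC /r = aesenc, DD /r = aesenclast); e.g.
+	; 102,15,56,220,209 is aesenc xmm2,xmm1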
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+DB	102,15,56,220,225
+DB	102,15,56,220,233
+	pxor	xmm11,xmm9
+	pxor	xmm12,xmm9
+DB	102,15,56,220,241
+	pxor	xmm13,xmm9
+	pxor	xmm14,xmm9
+DB	102,15,56,220,249
+	movups	xmm1,XMMWORD[48+r11]
+	pxor	xmm15,xmm9
+
+DB	102,15,56,220,208
+DB	102,15,56,220,216
+DB	102,15,56,220,224
+DB	102,15,56,220,232
+DB	102,15,56,220,240
+DB	102,15,56,220,248
+	movups	xmm0,XMMWORD[64+r11]
+	shl	r12,4
+	shl	r13,4
+	jmp	NEAR $L$ocb_enc_loop6
+
+ALIGN	32
+$L$ocb_enc_loop6:
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+DB	102,15,56,220,225
+DB	102,15,56,220,233
+DB	102,15,56,220,241
+DB	102,15,56,220,249
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+
+DB	102,15,56,220,208
+DB	102,15,56,220,216
+DB	102,15,56,220,224
+DB	102,15,56,220,232
+DB	102,15,56,220,240
+DB	102,15,56,220,248
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$ocb_enc_loop6
+
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+DB	102,15,56,220,225
+DB	102,15,56,220,233
+DB	102,15,56,220,241
+DB	102,15,56,220,249
+	movups	xmm1,XMMWORD[16+r11]
+	shl	r14,4
+
+DB	102,65,15,56,221,210
+	movdqu	xmm10,XMMWORD[rbx]
+	mov	rax,r10
+DB	102,65,15,56,221,219
+DB	102,65,15,56,221,228
+DB	102,65,15,56,221,237
+DB	102,65,15,56,221,246
+DB	102,65,15,56,221,255
+	DB	0F3h,0C3h		;repret
+
+
+
+ALIGN	32
+__ocb_encrypt4:
+	pxor	xmm15,xmm9
+	movdqu	xmm11,XMMWORD[r12*1+rbx]
+	movdqa	xmm12,xmm10
+	movdqu	xmm13,XMMWORD[r13*1+rbx]
+	pxor	xmm10,xmm15
+	pxor	xmm11,xmm10
+	pxor	xmm8,xmm2
+	pxor	xmm2,xmm10
+	pxor	xmm12,xmm11
+	pxor	xmm8,xmm3
+	pxor	xmm3,xmm11
+	pxor	xmm13,xmm12
+	pxor	xmm8,xmm4
+	pxor	xmm4,xmm12
+	pxor	xmm8,xmm5
+	pxor	xmm5,xmm13
+	movups	xmm0,XMMWORD[32+r11]
+
+	pxor	xmm10,xmm9
+	pxor	xmm11,xmm9
+	pxor	xmm12,xmm9
+	pxor	xmm13,xmm9
+
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+DB	102,15,56,220,225
+DB	102,15,56,220,233
+	movups	xmm1,XMMWORD[48+r11]
+
+DB	102,15,56,220,208
+DB	102,15,56,220,216
+DB	102,15,56,220,224
+DB	102,15,56,220,232
+	movups	xmm0,XMMWORD[64+r11]
+	jmp	NEAR $L$ocb_enc_loop4
+
+ALIGN	32
+$L$ocb_enc_loop4:
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+DB	102,15,56,220,225
+DB	102,15,56,220,233
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+
+DB	102,15,56,220,208
+DB	102,15,56,220,216
+DB	102,15,56,220,224
+DB	102,15,56,220,232
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$ocb_enc_loop4
+
+DB	102,15,56,220,209
+DB	102,15,56,220,217
+DB	102,15,56,220,225
+DB	102,15,56,220,233
+	movups	xmm1,XMMWORD[16+r11]
+	mov	rax,r10
+
+DB	102,65,15,56,221,210
+DB	102,65,15,56,221,219
+DB	102,65,15,56,221,228
+DB	102,65,15,56,221,237
+	DB	0F3h,0C3h		;repret
+
+
+
+ALIGN	32
+__ocb_encrypt1:
+	pxor	xmm7,xmm15
+	pxor	xmm7,xmm9
+	pxor	xmm8,xmm2
+	pxor	xmm2,xmm7
+	movups	xmm0,XMMWORD[32+r11]
+
+DB	102,15,56,220,209
+	movups	xmm1,XMMWORD[48+r11]
+	pxor	xmm7,xmm9
+
+DB	102,15,56,220,208
+	movups	xmm0,XMMWORD[64+r11]
+	jmp	NEAR $L$ocb_enc_loop1
+
+ALIGN	32
+$L$ocb_enc_loop1:
+DB	102,15,56,220,209
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+
+DB	102,15,56,220,208
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$ocb_enc_loop1
+
+DB	102,15,56,220,209
+	movups	xmm1,XMMWORD[16+r11]
+	mov	rax,r10
+
+DB	102,15,56,221,215
+	DB	0F3h,0C3h		;repret
+
+
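+; aesni_ocb_decrypt mirrors the encrypt path; the structural difference is
+; that the checksum (xmm8) is accumulated over the plaintext after the
+; cipher rounds (the pxor following each store) rather than before them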
+global	aesni_ocb_decrypt
+
+ALIGN	32
+aesni_ocb_decrypt:
+	mov	QWORD[8+rsp],rdi	;WIN64 prologue
+	mov	QWORD[16+rsp],rsi
+	mov	rax,rsp
+$L$SEH_begin_aesni_ocb_decrypt:
+	mov	rdi,rcx
+	mov	rsi,rdx
+	mov	rdx,r8
+	mov	rcx,r9
+	mov	r8,QWORD[40+rsp]
+	mov	r9,QWORD[48+rsp]
+
+
+	lea	rax,[rsp]
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	lea	rsp,[((-160))+rsp]
+	movaps	XMMWORD[rsp],xmm6
+	movaps	XMMWORD[16+rsp],xmm7
+	movaps	XMMWORD[32+rsp],xmm8
+	movaps	XMMWORD[48+rsp],xmm9
+	movaps	XMMWORD[64+rsp],xmm10
+	movaps	XMMWORD[80+rsp],xmm11
+	movaps	XMMWORD[96+rsp],xmm12
+	movaps	XMMWORD[112+rsp],xmm13
+	movaps	XMMWORD[128+rsp],xmm14
+	movaps	XMMWORD[144+rsp],xmm15
+$L$ocb_dec_body:
+	mov	rbx,QWORD[56+rax]
+	mov	rbp,QWORD[((56+8))+rax]
+
+	mov	r10d,DWORD[240+rcx]
+	mov	r11,rcx
+	shl	r10d,4
+	movups	xmm9,XMMWORD[rcx]
+	movups	xmm1,XMMWORD[16+r10*1+rcx]
+
+	movdqu	xmm15,XMMWORD[r9]
+	pxor	xmm9,xmm1
+	pxor	xmm15,xmm1
+
+	mov	eax,16+32
+	lea	rcx,[32+r10*1+r11]
+	movups	xmm1,XMMWORD[16+r11]
+	sub	rax,r10
+	mov	r10,rax
+
+	movdqu	xmm10,XMMWORD[rbx]
+	movdqu	xmm8,XMMWORD[rbp]
+
+	test	r8,1
+	jnz	NEAR $L$ocb_dec_odd
+
+	bsf	r12,r8
+	add	r8,1
+	shl	r12,4
+	movdqu	xmm7,XMMWORD[r12*1+rbx]
+	movdqu	xmm2,XMMWORD[rdi]
+	lea	rdi,[16+rdi]
+
+	call	__ocb_decrypt1
+
+	movdqa	xmm15,xmm7
+	movups	XMMWORD[rsi],xmm2
+	xorps	xmm8,xmm2
+	lea	rsi,[16+rsi]
+	sub	rdx,1
+	jz	NEAR $L$ocb_dec_done
+
+$L$ocb_dec_odd:
+	lea	r12,[1+r8]
+	lea	r13,[3+r8]
+	lea	r14,[5+r8]
+	lea	r8,[6+r8]
+	bsf	r12,r12
+	bsf	r13,r13
+	bsf	r14,r14
+	shl	r12,4
+	shl	r13,4
+	shl	r14,4
+
+	sub	rdx,6
+	jc	NEAR $L$ocb_dec_short
+	jmp	NEAR $L$ocb_dec_grandloop
+
+ALIGN	32
+$L$ocb_dec_grandloop:
+	movdqu	xmm2,XMMWORD[rdi]
+	movdqu	xmm3,XMMWORD[16+rdi]
+	movdqu	xmm4,XMMWORD[32+rdi]
+	movdqu	xmm5,XMMWORD[48+rdi]
+	movdqu	xmm6,XMMWORD[64+rdi]
+	movdqu	xmm7,XMMWORD[80+rdi]
+	lea	rdi,[96+rdi]
+
+	call	__ocb_decrypt6
+
+	movups	XMMWORD[rsi],xmm2
+	pxor	xmm8,xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	pxor	xmm8,xmm3
+	movups	XMMWORD[32+rsi],xmm4
+	pxor	xmm8,xmm4
+	movups	XMMWORD[48+rsi],xmm5
+	pxor	xmm8,xmm5
+	movups	XMMWORD[64+rsi],xmm6
+	pxor	xmm8,xmm6
+	movups	XMMWORD[80+rsi],xmm7
+	pxor	xmm8,xmm7
+	lea	rsi,[96+rsi]
+	sub	rdx,6
+	jnc	NEAR $L$ocb_dec_grandloop
+
+$L$ocb_dec_short:
+	add	rdx,6
+	jz	NEAR $L$ocb_dec_done
+
+	movdqu	xmm2,XMMWORD[rdi]
+	cmp	rdx,2
+	jb	NEAR $L$ocb_dec_one
+	movdqu	xmm3,XMMWORD[16+rdi]
+	je	NEAR $L$ocb_dec_two
+
+	movdqu	xmm4,XMMWORD[32+rdi]
+	cmp	rdx,4
+	jb	NEAR $L$ocb_dec_three
+	movdqu	xmm5,XMMWORD[48+rdi]
+	je	NEAR $L$ocb_dec_four
+
+	movdqu	xmm6,XMMWORD[64+rdi]
+	pxor	xmm7,xmm7
+
+	call	__ocb_decrypt6
+
+	movdqa	xmm15,xmm14
+	movups	XMMWORD[rsi],xmm2
+	pxor	xmm8,xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	pxor	xmm8,xmm3
+	movups	XMMWORD[32+rsi],xmm4
+	pxor	xmm8,xmm4
+	movups	XMMWORD[48+rsi],xmm5
+	pxor	xmm8,xmm5
+	movups	XMMWORD[64+rsi],xmm6
+	pxor	xmm8,xmm6
+
+	jmp	NEAR $L$ocb_dec_done
+
+ALIGN	16
+$L$ocb_dec_one:
+	movdqa	xmm7,xmm10
+
+	call	__ocb_decrypt1
+
+	movdqa	xmm15,xmm7
+	movups	XMMWORD[rsi],xmm2
+	xorps	xmm8,xmm2
+	jmp	NEAR $L$ocb_dec_done
+
+ALIGN	16
+$L$ocb_dec_two:
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+
+	call	__ocb_decrypt4
+
+	movdqa	xmm15,xmm11
+	movups	XMMWORD[rsi],xmm2
+	xorps	xmm8,xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	xorps	xmm8,xmm3
+
+	jmp	NEAR $L$ocb_dec_done
+
+ALIGN	16
+$L$ocb_dec_three:
+	pxor	xmm5,xmm5
+
+	call	__ocb_decrypt4
+
+	movdqa	xmm15,xmm12
+	movups	XMMWORD[rsi],xmm2
+	xorps	xmm8,xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	xorps	xmm8,xmm3
+	movups	XMMWORD[32+rsi],xmm4
+	xorps	xmm8,xmm4
+
+	jmp	NEAR $L$ocb_dec_done
+
+ALIGN	16
+$L$ocb_dec_four:
+	call	__ocb_decrypt4
+
+	movdqa	xmm15,xmm13
+	movups	XMMWORD[rsi],xmm2
+	pxor	xmm8,xmm2
+	movups	XMMWORD[16+rsi],xmm3
+	pxor	xmm8,xmm3
+	movups	XMMWORD[32+rsi],xmm4
+	pxor	xmm8,xmm4
+	movups	XMMWORD[48+rsi],xmm5
+	pxor	xmm8,xmm5
+
+$L$ocb_dec_done:
+	pxor	xmm15,xmm0
+	movdqu	XMMWORD[rbp],xmm8
+	movdqu	XMMWORD[r9],xmm15
+
+	xorps	xmm0,xmm0
+	pxor	xmm1,xmm1
+	pxor	xmm2,xmm2
+	pxor	xmm3,xmm3
+	pxor	xmm4,xmm4
+	pxor	xmm5,xmm5
+	movaps	xmm6,XMMWORD[rsp]
+	movaps	XMMWORD[rsp],xmm0
+	movaps	xmm7,XMMWORD[16+rsp]
+	movaps	XMMWORD[16+rsp],xmm0
+	movaps	xmm8,XMMWORD[32+rsp]
+	movaps	XMMWORD[32+rsp],xmm0
+	movaps	xmm9,XMMWORD[48+rsp]
+	movaps	XMMWORD[48+rsp],xmm0
+	movaps	xmm10,XMMWORD[64+rsp]
+	movaps	XMMWORD[64+rsp],xmm0
+	movaps	xmm11,XMMWORD[80+rsp]
+	movaps	XMMWORD[80+rsp],xmm0
+	movaps	xmm12,XMMWORD[96+rsp]
+	movaps	XMMWORD[96+rsp],xmm0
+	movaps	xmm13,XMMWORD[112+rsp]
+	movaps	XMMWORD[112+rsp],xmm0
+	movaps	xmm14,XMMWORD[128+rsp]
+	movaps	XMMWORD[128+rsp],xmm0
+	movaps	xmm15,XMMWORD[144+rsp]
+	movaps	XMMWORD[144+rsp],xmm0
+	lea	rax,[((160+40))+rsp]
+$L$ocb_dec_pop:
+	mov	r14,QWORD[((-40))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	rbx,QWORD[((-8))+rax]
+	lea	rsp,[rax]
+$L$ocb_dec_epilogue:
+	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
+	mov	rsi,QWORD[16+rsp]
+	DB	0F3h,0C3h		;repret
+$L$SEH_end_aesni_ocb_decrypt:
+
+
+ALIGN	32
+__ocb_decrypt6:
+	pxor	xmm15,xmm9
+	movdqu	xmm11,XMMWORD[r12*1+rbx]
+	movdqa	xmm12,xmm10
+	movdqu	xmm13,XMMWORD[r13*1+rbx]
+	movdqa	xmm14,xmm10
+	pxor	xmm10,xmm15
+	movdqu	xmm15,XMMWORD[r14*1+rbx]
+	pxor	xmm11,xmm10
+	pxor	xmm2,xmm10
+	pxor	xmm12,xmm11
+	pxor	xmm3,xmm11
+	pxor	xmm13,xmm12
+	pxor	xmm4,xmm12
+	pxor	xmm14,xmm13
+	pxor	xmm5,xmm13
+	pxor	xmm15,xmm14
+	pxor	xmm6,xmm14
+	pxor	xmm7,xmm15
+	movups	xmm0,XMMWORD[32+r11]
+
+	lea	r12,[1+r8]
+	lea	r13,[3+r8]
+	lea	r14,[5+r8]
+	add	r8,6
+	pxor	xmm10,xmm9
+	bsf	r12,r12
+	bsf	r13,r13
+	bsf	r14,r14
+
+DB	102,15,56,222,209
+DB	102,15,56,222,217
+DB	102,15,56,222,225
+DB	102,15,56,222,233
+	pxor	xmm11,xmm9
+	pxor	xmm12,xmm9
+DB	102,15,56,222,241
+	pxor	xmm13,xmm9
+	pxor	xmm14,xmm9
+DB	102,15,56,222,249
+	movups	xmm1,XMMWORD[48+r11]
+	pxor	xmm15,xmm9
+
+DB	102,15,56,222,208
+DB	102,15,56,222,216
+DB	102,15,56,222,224
+DB	102,15,56,222,232
+DB	102,15,56,222,240
+DB	102,15,56,222,248
+	movups	xmm0,XMMWORD[64+r11]
+	shl	r12,4
+	shl	r13,4
+	jmp	NEAR $L$ocb_dec_loop6
+
+ALIGN	32
+$L$ocb_dec_loop6:
+DB	102,15,56,222,209
+DB	102,15,56,222,217
+DB	102,15,56,222,225
+DB	102,15,56,222,233
+DB	102,15,56,222,241
+DB	102,15,56,222,249
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+
+DB	102,15,56,222,208
+DB	102,15,56,222,216
+DB	102,15,56,222,224
+DB	102,15,56,222,232
+DB	102,15,56,222,240
+DB	102,15,56,222,248
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$ocb_dec_loop6
+
+DB	102,15,56,222,209
+DB	102,15,56,222,217
+DB	102,15,56,222,225
+DB	102,15,56,222,233
+DB	102,15,56,222,241
+DB	102,15,56,222,249
+	movups	xmm1,XMMWORD[16+r11]
+	shl	r14,4
+
+DB	102,65,15,56,223,210
+	movdqu	xmm10,XMMWORD[rbx]
+	mov	rax,r10
+DB	102,65,15,56,223,219
+DB	102,65,15,56,223,228
+DB	102,65,15,56,223,237
+DB	102,65,15,56,223,246
+DB	102,65,15,56,223,255
+	DB	0F3h,0C3h		;repret
+
+
+
+ALIGN	32
+__ocb_decrypt4:
+	pxor	xmm15,xmm9
+	movdqu	xmm11,XMMWORD[r12*1+rbx]
+	movdqa	xmm12,xmm10
+	movdqu	xmm13,XMMWORD[r13*1+rbx]
+	pxor	xmm10,xmm15
+	pxor	xmm11,xmm10
+	pxor	xmm2,xmm10
+	pxor	xmm12,xmm11
+	pxor	xmm3,xmm11
+	pxor	xmm13,xmm12
+	pxor	xmm4,xmm12
+	pxor	xmm5,xmm13
+	movups	xmm0,XMMWORD[32+r11]
+
+	pxor	xmm10,xmm9
+	pxor	xmm11,xmm9
+	pxor	xmm12,xmm9
+	pxor	xmm13,xmm9
+
+DB	102,15,56,222,209
+DB	102,15,56,222,217
+DB	102,15,56,222,225
+DB	102,15,56,222,233
+	movups	xmm1,XMMWORD[48+r11]
+
+DB	102,15,56,222,208
+DB	102,15,56,222,216
+DB	102,15,56,222,224
+DB	102,15,56,222,232
+	movups	xmm0,XMMWORD[64+r11]
+	jmp	NEAR $L$ocb_dec_loop4
+
+ALIGN	32
+$L$ocb_dec_loop4:
+DB	102,15,56,222,209
+DB	102,15,56,222,217
+DB	102,15,56,222,225
+DB	102,15,56,222,233
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+
+DB	102,15,56,222,208
+DB	102,15,56,222,216
+DB	102,15,56,222,224
+DB	102,15,56,222,232
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$ocb_dec_loop4
+
+DB	102,15,56,222,209
+DB	102,15,56,222,217
+DB	102,15,56,222,225
+DB	102,15,56,222,233
+	movups	xmm1,XMMWORD[16+r11]
+	mov	rax,r10
+
+DB	102,65,15,56,223,210
+DB	102,65,15,56,223,219
+DB	102,65,15,56,223,228
+DB	102,65,15,56,223,237
+	DB	0F3h,0C3h		;repret
+
+
+
+ALIGN	32
+__ocb_decrypt1:
+	pxor	xmm7,xmm15
+	pxor	xmm7,xmm9
+	pxor	xmm2,xmm7
+	movups	xmm0,XMMWORD[32+r11]
+
+DB	102,15,56,222,209
+	movups	xmm1,XMMWORD[48+r11]
+	pxor	xmm7,xmm9
+
+DB	102,15,56,222,208
+	movups	xmm0,XMMWORD[64+r11]
+	jmp	NEAR $L$ocb_dec_loop1
+
+ALIGN	32
+$L$ocb_dec_loop1:
+DB	102,15,56,222,209
+	movups	xmm1,XMMWORD[rax*1+rcx]
+	add	rax,32
+
+DB	102,15,56,222,208
+	movups	xmm0,XMMWORD[((-16))+rax*1+rcx]
+	jnz	NEAR $L$ocb_dec_loop1
+
+DB	102,15,56,222,209
+	movups	xmm1,XMMWORD[16+r11]
+	mov	rax,r10
+
+DB	102,15,56,223,215
+	DB	0F3h,0C3h		;repret
+
 global	aesni_cbc_encrypt
 
 ALIGN	16
@@ -2837,7 +3722,7 @@
 	jmp	NEAR $L$cbc_ret
 ALIGN	16
 $L$cbc_decrypt_bulk:
-	lea	rax,[rsp]
+	lea	r11,[rsp]
 	push	rbp
 	sub	rsp,176
 	and	rsp,-16
@@ -2852,7 +3737,7 @@
 	movaps	XMMWORD[144+rsp],xmm14
 	movaps	XMMWORD[160+rsp],xmm15
 $L$cbc_decrypt_body:
-	lea	rbp,[((-8))+rax]
+	mov	rbp,rcx
 	movups	xmm10,XMMWORD[r8]
 	mov	eax,r10d
 	cmp	rdx,0x50
@@ -2892,7 +3777,7 @@
 	pxor	xmm3,xmm0
 	movups	xmm1,XMMWORD[((16-112))+rcx]
 	pxor	xmm4,xmm0
-	xor	r11,r11
+	mov	rbp,-1
 	cmp	rdx,0x70
 	pxor	xmm5,xmm0
 	pxor	xmm6,xmm0
@@ -2908,10 +3793,10 @@
 DB	102,15,56,222,241
 DB	102,15,56,222,249
 DB	102,68,15,56,222,193
-	setnc	r11b
-	shl	r11,7
+	adc	rbp,0
+	and	rbp,128
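+	; branch-free select: rbp was preloaded with -1 and adc folds in the
+	; carry from the cmp rdx,0x70 above, so rbp = (rdx >= 0x70) ? 128 : 0;
+	; this replaces the old setnc/shl pair and lets rbp (rather than r11,
+	; now the frame pointer) carry the conditional input pointer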
 DB	102,68,15,56,222,201
-	add	r11,rdi
+	add	rbp,rdi
 	movups	xmm1,XMMWORD[((48-112))+rcx]
 DB	102,15,56,222,208
 DB	102,15,56,222,216
@@ -3049,18 +3934,18 @@
 	movdqu	xmm0,XMMWORD[112+rdi]
 DB	102,65,15,56,223,228
 	lea	rdi,[128+rdi]
-	movdqu	xmm11,XMMWORD[r11]
+	movdqu	xmm11,XMMWORD[rbp]
 DB	102,65,15,56,223,237
 DB	102,65,15,56,223,246
-	movdqu	xmm12,XMMWORD[16+r11]
-	movdqu	xmm13,XMMWORD[32+r11]
+	movdqu	xmm12,XMMWORD[16+rbp]
+	movdqu	xmm13,XMMWORD[32+rbp]
 DB	102,65,15,56,223,255
 DB	102,68,15,56,223,193
-	movdqu	xmm14,XMMWORD[48+r11]
-	movdqu	xmm15,XMMWORD[64+r11]
+	movdqu	xmm14,XMMWORD[48+rbp]
+	movdqu	xmm15,XMMWORD[64+rbp]
 DB	102,69,15,56,223,202
 	movdqa	xmm10,xmm0
-	movdqu	xmm1,XMMWORD[80+r11]
+	movdqu	xmm1,XMMWORD[80+rbp]
 	movups	xmm0,XMMWORD[((-112))+rcx]
 
 	movups	XMMWORD[rsi],xmm2
@@ -3179,7 +4064,7 @@
 	pxor	xmm5,xmm13
 	movdqu	XMMWORD[32+rsi],xmm4
 	pxor	xmm6,xmm14
-	mov	rcx,r11
+	mov	rcx,rbp
 	movdqu	XMMWORD[48+rsi],xmm5
 	pxor	xmm7,xmm15
 	mov	eax,r10d
@@ -3348,8 +4233,8 @@
 	movaps	XMMWORD[144+rsp],xmm0
 	movaps	xmm15,XMMWORD[160+rsp]
 	movaps	XMMWORD[160+rsp],xmm0
-	lea	rsp,[rbp]
-	pop	rbp
+	mov	rbp,QWORD[((-8))+r11]
+	lea	rsp,[r11]
 $L$cbc_ret:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
@@ -3865,13 +4750,75 @@
 	cmp	rbx,r10
 	jae	NEAR $L$common_seh_tail
 
-	mov	rax,QWORD[160+r8]
-	lea	rsi,[((-160))+rax]
+	mov	rax,QWORD[208+r8]
+
+	lea	rsi,[((-168))+rax]
 	lea	rdi,[512+r8]
 	mov	ecx,20
 	DD	0xa548f3fc
 
-	jmp	NEAR $L$common_rbp_tail
+	mov	rbp,QWORD[((-8))+rax]
+	mov	QWORD[160+r8],rbp
+	jmp	NEAR $L$common_seh_tail
+
+
+
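+; Win64 SEH unwind handler for the OCB routines: restores xmm6..xmm15 and
+; the five pushed GPRs (rbx, rbp, r12..r14) when an exception unwinds
+; through aesni_ocb_encrypt/aesni_ocb_decrypt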
+ALIGN	16
+ocb_se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	mov	r10d,DWORD[8+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$ocb_no_xmm
+
+	mov	rax,QWORD[152+r8]
+
+	lea	rsi,[rax]
+	lea	rdi,[512+r8]
+	mov	ecx,20
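+	; cld; rep movsq encoded as raw bytes: copy 20 qwords (xmm6..xmm15)
+	; from the stack save area back into the CONTEXT record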
+	DD	0xa548f3fc
+	lea	rax,[((160+40))+rax]
+
+$L$ocb_no_xmm:
+	mov	rbx,QWORD[((-8))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r14,QWORD[((-40))+rax]
+
+	mov	QWORD[144+r8],rbx
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+	mov	QWORD[232+r8],r14
+
+	jmp	NEAR $L$common_seh_tail
 
 
 ALIGN	16
@@ -3894,9 +4841,13 @@
 	cmp	rbx,r10
 	jb	NEAR $L$common_seh_tail
 
+	mov	rax,QWORD[120+r8]
+
 	lea	r10,[$L$cbc_decrypt_body]
 	cmp	rbx,r10
-	jb	NEAR $L$restore_cbc_rax
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[152+r8]
 
 	lea	r10,[$L$cbc_ret]
 	cmp	rbx,r10
@@ -3907,15 +4858,10 @@
 	mov	ecx,20
 	DD	0xa548f3fc
 
-$L$common_rbp_tail:
-	mov	rax,QWORD[160+r8]
-	mov	rbp,QWORD[rax]
-	lea	rax,[8+rax]
-	mov	QWORD[160+r8],rbp
-	jmp	NEAR $L$common_seh_tail
+	mov	rax,QWORD[208+r8]
 
-$L$restore_cbc_rax:
-	mov	rax,QWORD[120+r8]
+	mov	rbp,QWORD[((-8))+rax]
+	mov	QWORD[160+r8],rbp
 
 $L$common_seh_tail:
 	mov	rdi,QWORD[8+rax]
@@ -3982,6 +4928,14 @@
 	DD	$L$SEH_begin_aesni_xts_decrypt wrt ..imagebase
 	DD	$L$SEH_end_aesni_xts_decrypt wrt ..imagebase
 	DD	$L$SEH_info_xts_dec wrt ..imagebase
+
+	DD	$L$SEH_begin_aesni_ocb_encrypt wrt ..imagebase
+	DD	$L$SEH_end_aesni_ocb_encrypt wrt ..imagebase
+	DD	$L$SEH_info_ocb_enc wrt ..imagebase
+
+	DD	$L$SEH_begin_aesni_ocb_decrypt wrt ..imagebase
+	DD	$L$SEH_end_aesni_ocb_decrypt wrt ..imagebase
+	DD	$L$SEH_info_ocb_dec wrt ..imagebase
 	DD	$L$SEH_begin_aesni_cbc_encrypt wrt ..imagebase
 	DD	$L$SEH_end_aesni_cbc_encrypt wrt ..imagebase
 	DD	$L$SEH_info_cbc wrt ..imagebase
@@ -4019,6 +4973,18 @@
 DB	9,0,0,0
 	DD	ctr_xts_se_handler wrt ..imagebase
 	DD	$L$xts_dec_body wrt ..imagebase,$L$xts_dec_epilogue wrt ..imagebase
+$L$SEH_info_ocb_enc:
+DB	9,0,0,0
+	DD	ocb_se_handler wrt ..imagebase
+	DD	$L$ocb_enc_body wrt ..imagebase,$L$ocb_enc_epilogue wrt ..imagebase
+	DD	$L$ocb_enc_pop wrt ..imagebase
+	DD	0
+$L$SEH_info_ocb_dec:
+DB	9,0,0,0
+	DD	ocb_se_handler wrt ..imagebase
+	DD	$L$ocb_dec_body wrt ..imagebase,$L$ocb_dec_epilogue wrt ..imagebase
+	DD	$L$ocb_dec_pop wrt ..imagebase
+	DD	0
 $L$SEH_info_cbc:
 DB	9,0,0,0
 	DD	cbc_se_handler wrt ..imagebase
diff --git a/win-x86_64/crypto/aes/bsaes-x86_64.asm b/win-x86_64/crypto/aes/bsaes-x86_64.asm
index 6d75248..9c6d129 100644
--- a/win-x86_64/crypto/aes/bsaes-x86_64.asm
+++ b/win-x86_64/crypto/aes/bsaes-x86_64.asm
@@ -1319,7 +1319,7 @@
 	cmp	rbp,rax
 	ja	NEAR $L$cbc_dec_bzero
 
-	lea	rsp,[rbp]
+	lea	rax,[120+rbp]
 	movaps	xmm6,XMMWORD[64+rbp]
 	movaps	xmm7,XMMWORD[80+rbp]
 	movaps	xmm8,XMMWORD[96+rbp]
@@ -1330,15 +1330,15 @@
 	movaps	xmm13,XMMWORD[176+rbp]
 	movaps	xmm14,XMMWORD[192+rbp]
 	movaps	xmm15,XMMWORD[208+rbp]
-	lea	rsp,[160+rbp]
-	mov	r15,QWORD[72+rsp]
-	mov	r14,QWORD[80+rsp]
-	mov	r13,QWORD[88+rsp]
-	mov	r12,QWORD[96+rsp]
-	mov	rbx,QWORD[104+rsp]
-	mov	rax,QWORD[112+rsp]
-	lea	rsp,[120+rsp]
-	mov	rbp,rax
+	lea	rax,[160+rax]
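+; the register-restore tail is now addressed off rax and carries its own
+; label so the unwind handler ($L$in_tail below) can resume mid-epilogue;
+; the ctr_enc/xts_enc/xts_dec epilogues get the same treatment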
+$L$cbc_dec_tail:
+	mov	r15,QWORD[((-48))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	rbx,QWORD[((-16))+rax]
+	mov	rbp,QWORD[((-8))+rax]
+	lea	rsp,[rax]
 $L$cbc_dec_epilogue:
 	DB	0F3h,0C3h		;repret
 
@@ -1543,7 +1543,7 @@
 	cmp	rbp,rax
 	ja	NEAR $L$ctr_enc_bzero
 
-	lea	rsp,[rbp]
+	lea	rax,[120+rbp]
 	movaps	xmm6,XMMWORD[64+rbp]
 	movaps	xmm7,XMMWORD[80+rbp]
 	movaps	xmm8,XMMWORD[96+rbp]
@@ -1554,15 +1554,15 @@
 	movaps	xmm13,XMMWORD[176+rbp]
 	movaps	xmm14,XMMWORD[192+rbp]
 	movaps	xmm15,XMMWORD[208+rbp]
-	lea	rsp,[160+rbp]
-	mov	r15,QWORD[72+rsp]
-	mov	r14,QWORD[80+rsp]
-	mov	r13,QWORD[88+rsp]
-	mov	r12,QWORD[96+rsp]
-	mov	rbx,QWORD[104+rsp]
-	mov	rax,QWORD[112+rsp]
-	lea	rsp,[120+rsp]
-	mov	rbp,rax
+	lea	rax,[160+rax]
+$L$ctr_enc_tail:
+	mov	r15,QWORD[((-48))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	rbx,QWORD[((-16))+rax]
+	mov	rbp,QWORD[((-8))+rax]
+	lea	rsp,[rax]
 $L$ctr_enc_epilogue:
 	DB	0F3h,0C3h		;repret
 
@@ -2019,7 +2019,7 @@
 	cmp	rbp,rax
 	ja	NEAR $L$xts_enc_bzero
 
-	lea	rsp,[rbp]
+	lea	rax,[120+rbp]
 	movaps	xmm6,XMMWORD[64+rbp]
 	movaps	xmm7,XMMWORD[80+rbp]
 	movaps	xmm8,XMMWORD[96+rbp]
@@ -2030,15 +2030,15 @@
 	movaps	xmm13,XMMWORD[176+rbp]
 	movaps	xmm14,XMMWORD[192+rbp]
 	movaps	xmm15,XMMWORD[208+rbp]
-	lea	rsp,[160+rbp]
-	mov	r15,QWORD[72+rsp]
-	mov	r14,QWORD[80+rsp]
-	mov	r13,QWORD[88+rsp]
-	mov	r12,QWORD[96+rsp]
-	mov	rbx,QWORD[104+rsp]
-	mov	rax,QWORD[112+rsp]
-	lea	rsp,[120+rsp]
-	mov	rbp,rax
+	lea	rax,[160+rax]
+$L$xts_enc_tail:
+	mov	r15,QWORD[((-48))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	rbx,QWORD[((-16))+rax]
+	mov	rbp,QWORD[((-8))+rax]
+	lea	rsp,[rax]
 $L$xts_enc_epilogue:
 	DB	0F3h,0C3h		;repret
 
@@ -2522,7 +2522,7 @@
 	cmp	rbp,rax
 	ja	NEAR $L$xts_dec_bzero
 
-	lea	rsp,[rbp]
+	lea	rax,[120+rbp]
 	movaps	xmm6,XMMWORD[64+rbp]
 	movaps	xmm7,XMMWORD[80+rbp]
 	movaps	xmm8,XMMWORD[96+rbp]
@@ -2533,15 +2533,15 @@
 	movaps	xmm13,XMMWORD[176+rbp]
 	movaps	xmm14,XMMWORD[192+rbp]
 	movaps	xmm15,XMMWORD[208+rbp]
-	lea	rsp,[160+rbp]
-	mov	r15,QWORD[72+rsp]
-	mov	r14,QWORD[80+rsp]
-	mov	r13,QWORD[88+rsp]
-	mov	r12,QWORD[96+rsp]
-	mov	rbx,QWORD[104+rsp]
-	mov	rax,QWORD[112+rsp]
-	lea	rsp,[120+rsp]
-	mov	rbp,rax
+	lea	rax,[160+rax]
+$L$xts_dec_tail:
+	mov	r15,QWORD[((-48))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	rbx,QWORD[((-16))+rax]
+	mov	rbp,QWORD[((-8))+rax]
+	lea	rsp,[rax]
 $L$xts_dec_epilogue:
 	DB	0F3h,0C3h		;repret
 
@@ -2628,30 +2628,33 @@
 	mov	r10d,DWORD[r11]
 	lea	r10,[r10*1+rsi]
 	cmp	rbx,r10
-	jb	NEAR $L$in_prologue
-
-	mov	rax,QWORD[152+r8]
+	jbe	NEAR $L$in_prologue
 
 	mov	r10d,DWORD[4+r11]
 	lea	r10,[r10*1+rsi]
 	cmp	rbx,r10
 	jae	NEAR $L$in_prologue
 
+	mov	r10d,DWORD[8+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$in_tail
+
 	mov	rax,QWORD[160+r8]
 
 	lea	rsi,[64+rax]
 	lea	rdi,[512+r8]
 	mov	ecx,20
 	DD	0xa548f3fc
-	lea	rax,[160+rax]
+	lea	rax,[((160+120))+rax]
 
-	mov	rbp,QWORD[112+rax]
-	mov	rbx,QWORD[104+rax]
-	mov	r12,QWORD[96+rax]
-	mov	r13,QWORD[88+rax]
-	mov	r14,QWORD[80+rax]
-	mov	r15,QWORD[72+rax]
-	lea	rax,[120+rax]
+$L$in_tail:
+	mov	rbp,QWORD[((-48))+rax]
+	mov	rbx,QWORD[((-40))+rax]
+	mov	r12,QWORD[((-32))+rax]
+	mov	r13,QWORD[((-24))+rax]
+	mov	r14,QWORD[((-16))+rax]
+	mov	r15,QWORD[((-8))+rax]
 	mov	QWORD[144+r8],rbx
 	mov	QWORD[160+r8],rbp
 	mov	QWORD[216+r8],r12
@@ -2719,15 +2722,23 @@
 DB	9,0,0,0
 	DD	se_handler wrt ..imagebase
 	DD	$L$cbc_dec_body wrt ..imagebase,$L$cbc_dec_epilogue wrt ..imagebase
+	DD	$L$cbc_dec_tail wrt ..imagebase
+	DD	0
 $L$ctr_enc_info:
 DB	9,0,0,0
 	DD	se_handler wrt ..imagebase
 	DD	$L$ctr_enc_body wrt ..imagebase,$L$ctr_enc_epilogue wrt ..imagebase
+	DD	$L$ctr_enc_tail wrt ..imagebase
+	DD	0
 $L$xts_enc_info:
 DB	9,0,0,0
 	DD	se_handler wrt ..imagebase
 	DD	$L$xts_enc_body wrt ..imagebase,$L$xts_enc_epilogue wrt ..imagebase
+	DD	$L$xts_enc_tail wrt ..imagebase
+	DD	0
 $L$xts_dec_info:
 DB	9,0,0,0
 	DD	se_handler wrt ..imagebase
 	DD	$L$xts_dec_body wrt ..imagebase,$L$xts_dec_epilogue wrt ..imagebase
+	DD	$L$xts_dec_tail wrt ..imagebase
+	DD	0
diff --git a/win-x86_64/crypto/bn/x86_64-mont.asm b/win-x86_64/crypto/bn/x86_64-mont.asm
index 4d8e1cb..1a9da51 100644
--- a/win-x86_64/crypto/bn/x86_64-mont.asm
+++ b/win-x86_64/crypto/bn/x86_64-mont.asm
@@ -23,6 +23,10 @@
 	mov	r9,QWORD[48+rsp]
 
 
+
+	mov	r9d,r9d
+	mov	rax,rsp
+
 	test	r9d,3
 	jnz	NEAR $L$mul_enter
 	cmp	r9d,8
@@ -36,20 +40,50 @@
 ALIGN	16
 $L$mul_enter:
 	push	rbx
+
 	push	rbp
+
 	push	r12
+
 	push	r13
+
 	push	r14
+
 	push	r15
 
-	mov	r9d,r9d
-	lea	r10,[2+r9]
-	mov	r11,rsp
-	neg	r10
-	lea	rsp,[r10*8+rsp]
-	and	rsp,-1024
 
-	mov	QWORD[8+r9*8+rsp],r11
+	neg	r9
+	mov	r11,rsp
+	lea	r10,[((-16))+r9*8+rsp]
+	neg	r9
+	and	r10,-1024
+
+
+
+
+
+
+
+
+
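+	; probe the newly reserved stack one page at a time ("page walk") so
+	; every guard page is touched in order; dropping rsp past an untouched
+	; guard page would fault once the frame exceeds a page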
+	sub	r11,r10
+	and	r11,-4096
+	lea	rsp,[r11*1+r10]
+	mov	r11,QWORD[rsp]
+	cmp	rsp,r10
+	ja	NEAR $L$mul_page_walk
+	jmp	NEAR $L$mul_page_walk_done
+
+ALIGN	16
+$L$mul_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r11,QWORD[rsp]
+	cmp	rsp,r10
+	ja	NEAR $L$mul_page_walk
+$L$mul_page_walk_done:
+
+	mov	QWORD[8+r9*8+rsp],rax
+
 $L$mul_body:
 	mov	r12,rdx
 	mov	r8,QWORD[r8]
@@ -201,33 +235,43 @@
 
 	sbb	rax,0
 	xor	r14,r14
+	and	rsi,rax
+	not	rax
+	mov	rcx,rdi
+	and	rcx,rax
 	mov	r15,r9
+	or	rsi,rcx
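+	; rax is an all-zeros/all-ones borrow mask; the and/not/or sequence
+	; selects the copy source pointer branch-free, keeping the final
+	; conditional subtraction constant-time ($L$copy then reads from the
+	; selected source and clobbers the scratch area)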
 ALIGN	16
 $L$copy:
-	mov	rsi,QWORD[r14*8+rsp]
-	mov	rcx,QWORD[r14*8+rdi]
-	xor	rsi,rcx
-	and	rsi,rax
-	xor	rsi,rcx
+	mov	rax,QWORD[r14*8+rsi]
 	mov	QWORD[r14*8+rsp],r14
-	mov	QWORD[r14*8+rdi],rsi
+	mov	QWORD[r14*8+rdi],rax
 	lea	r14,[1+r14]
 	sub	r15,1
 	jnz	NEAR $L$copy
 
 	mov	rsi,QWORD[8+r9*8+rsp]
+
 	mov	rax,1
-	mov	r15,QWORD[rsi]
-	mov	r14,QWORD[8+rsi]
-	mov	r13,QWORD[16+rsi]
-	mov	r12,QWORD[24+rsi]
-	mov	rbp,QWORD[32+rsi]
-	mov	rbx,QWORD[40+rsi]
-	lea	rsp,[48+rsi]
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
 $L$mul_epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
 	DB	0F3h,0C3h		;repret
+
 $L$SEH_end_bn_mul_mont:
 
 ALIGN	16
@@ -244,22 +288,47 @@
 	mov	r9,QWORD[48+rsp]
 
 
-$L$mul4x_enter:
-	push	rbx
-	push	rbp
-	push	r12
-	push	r13
-	push	r14
-	push	r15
 
 	mov	r9d,r9d
-	lea	r10,[4+r9]
-	mov	r11,rsp
-	neg	r10
-	lea	rsp,[r10*8+rsp]
-	and	rsp,-1024
+	mov	rax,rsp
 
-	mov	QWORD[8+r9*8+rsp],r11
+$L$mul4x_enter:
+	push	rbx
+
+	push	rbp
+
+	push	r12
+
+	push	r13
+
+	push	r14
+
+	push	r15
+
+
+	neg	r9
+	mov	r11,rsp
+	lea	r10,[((-32))+r9*8+rsp]
+	neg	r9
+	and	r10,-1024
+
+	sub	r11,r10
+	and	r11,-4096
+	lea	rsp,[r11*1+r10]
+	mov	r11,QWORD[rsp]
+	cmp	rsp,r10
+	ja	NEAR $L$mul4x_page_walk
+	jmp	NEAR $L$mul4x_page_walk_done
+
+$L$mul4x_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r11,QWORD[rsp]
+	cmp	rsp,r10
+	ja	NEAR $L$mul4x_page_walk
+$L$mul4x_page_walk_done:
+
+	mov	QWORD[8+r9*8+rsp],rax
+
 $L$mul4x_body:
 	mov	QWORD[16+r9*8+rsp],rdi
 	mov	r12,rdx
@@ -559,9 +628,11 @@
 	cmp	r14,r9
 	jb	NEAR $L$outer4x
 	mov	rdi,QWORD[16+r9*8+rsp]
+	lea	r15,[((-4))+r9]
 	mov	rax,QWORD[rsp]
+	pxor	xmm0,xmm0
 	mov	rdx,QWORD[8+rsp]
-	shr	r9,2
+	shr	r15,2
 	lea	rsi,[rsp]
 	xor	r14,r14
 
@@ -569,7 +640,6 @@
 	mov	rbx,QWORD[16+rsi]
 	mov	rbp,QWORD[24+rsi]
 	sbb	rdx,QWORD[8+rcx]
-	lea	r15,[((-1))+r9]
 	jmp	NEAR $L$sub4x
 ALIGN	16
 $L$sub4x:
@@ -597,49 +667,57 @@
 	mov	QWORD[16+r14*8+rdi],rbx
 
 	sbb	rax,0
-DB 66h, 48h, 0fh, 6eh, 0c0h
-	punpcklqdq	xmm0,xmm0
 	mov	QWORD[24+r14*8+rdi],rbp
 	xor	r14,r14
+	and	rsi,rax
+	not	rax
+	mov	rcx,rdi
+	and	rcx,rax
+	lea	r15,[((-4))+r9]
+	or	rsi,rcx
+	shr	r15,2
 
-	mov	r15,r9
-	pxor	xmm5,xmm5
+	movdqu	xmm1,XMMWORD[rsi]
+	movdqa	XMMWORD[rsp],xmm0
+	movdqu	XMMWORD[rdi],xmm1
 	jmp	NEAR $L$copy4x
 ALIGN	16
 $L$copy4x:
-	movdqu	xmm2,XMMWORD[r14*1+rsp]
-	movdqu	xmm4,XMMWORD[16+r14*1+rsp]
-	movdqu	xmm1,XMMWORD[r14*1+rdi]
-	movdqu	xmm3,XMMWORD[16+r14*1+rdi]
-	pxor	xmm2,xmm1
-	pxor	xmm4,xmm3
-	pand	xmm2,xmm0
-	pand	xmm4,xmm0
-	pxor	xmm2,xmm1
-	pxor	xmm4,xmm3
-	movdqu	XMMWORD[r14*1+rdi],xmm2
-	movdqu	XMMWORD[16+r14*1+rdi],xmm4
-	movdqa	XMMWORD[r14*1+rsp],xmm5
-	movdqa	XMMWORD[16+r14*1+rsp],xmm5
-
+	movdqu	xmm2,XMMWORD[16+r14*1+rsi]
+	movdqu	xmm1,XMMWORD[32+r14*1+rsi]
+	movdqa	XMMWORD[16+r14*1+rsp],xmm0
+	movdqu	XMMWORD[16+r14*1+rdi],xmm2
+	movdqa	XMMWORD[32+r14*1+rsp],xmm0
+	movdqu	XMMWORD[32+r14*1+rdi],xmm1
 	lea	r14,[32+r14]
 	dec	r15
 	jnz	NEAR $L$copy4x
 
-	shl	r9,2
+	movdqu	xmm2,XMMWORD[16+r14*1+rsi]
+	movdqa	XMMWORD[16+r14*1+rsp],xmm0
+	movdqu	XMMWORD[16+r14*1+rdi],xmm2
 	mov	rsi,QWORD[8+r9*8+rsp]
+
 	mov	rax,1
-	mov	r15,QWORD[rsi]
-	mov	r14,QWORD[8+rsi]
-	mov	r13,QWORD[16+rsi]
-	mov	r12,QWORD[24+rsi]
-	mov	rbp,QWORD[32+rsi]
-	mov	rbx,QWORD[40+rsi]
-	lea	rsp,[48+rsi]
+	mov	r15,QWORD[((-48))+rsi]
+
+	mov	r14,QWORD[((-40))+rsi]
+
+	mov	r13,QWORD[((-32))+rsi]
+
+	mov	r12,QWORD[((-24))+rsi]
+
+	mov	rbp,QWORD[((-16))+rsi]
+
+	mov	rbx,QWORD[((-8))+rsi]
+
+	lea	rsp,[rsi]
+
 $L$mul4x_epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
 	DB	0F3h,0C3h		;repret
+
 $L$SEH_end_bn_mul4x_mont:
 EXTERN	bn_sqr8x_internal
 
@@ -658,15 +736,24 @@
 	mov	r9,QWORD[48+rsp]
 
 
-$L$sqr8x_enter:
+
 	mov	rax,rsp
+
+$L$sqr8x_enter:
 	push	rbx
+
 	push	rbp
+
 	push	r12
+
 	push	r13
+
 	push	r14
+
 	push	r15
 
+$L$sqr8x_prologue:
+
 	mov	r10d,r9d
 	shl	r9d,3
 	shl	r10,3+2
@@ -678,30 +765,49 @@
 
 
 	lea	r11,[((-64))+r9*2+rsp]
+	mov	rbp,rsp
 	mov	r8,QWORD[r8]
 	sub	r11,rsi
 	and	r11,4095
 	cmp	r10,r11
 	jb	NEAR $L$sqr8x_sp_alt
-	sub	rsp,r11
-	lea	rsp,[((-64))+r9*2+rsp]
+	sub	rbp,r11
+	lea	rbp,[((-64))+r9*2+rbp]
 	jmp	NEAR $L$sqr8x_sp_done
 
 ALIGN	32
 $L$sqr8x_sp_alt:
 	lea	r10,[((4096-64))+r9*2]
-	lea	rsp,[((-64))+r9*2+rsp]
+	lea	rbp,[((-64))+r9*2+rbp]
 	sub	r11,r10
 	mov	r10,0
 	cmovc	r11,r10
-	sub	rsp,r11
+	sub	rbp,r11
 $L$sqr8x_sp_done:
-	and	rsp,-64
+	and	rbp,-64
+	mov	r11,rsp
+	sub	r11,rbp
+	and	r11,-4096
+	lea	rsp,[rbp*1+r11]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$sqr8x_page_walk
+	jmp	NEAR $L$sqr8x_page_walk_done
+
+ALIGN	16
+$L$sqr8x_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$sqr8x_page_walk
+$L$sqr8x_page_walk_done:
+
 	mov	r10,r9
 	neg	r9
 
 	mov	QWORD[32+rsp],r8
 	mov	QWORD[40+rsp],rax
+
 $L$sqr8x_body:
 
 DB	102,72,15,110,209
@@ -748,6 +854,7 @@
 	pxor	xmm0,xmm0
 	pshufd	xmm1,xmm1,0
 	mov	rsi,QWORD[40+rsp]
+
 	jmp	NEAR $L$sqr8x_cond_copy
 
 ALIGN	32
@@ -777,16 +884,24 @@
 
 	mov	rax,1
 	mov	r15,QWORD[((-48))+rsi]
+
 	mov	r14,QWORD[((-40))+rsi]
+
 	mov	r13,QWORD[((-32))+rsi]
+
 	mov	r12,QWORD[((-24))+rsi]
+
 	mov	rbp,QWORD[((-16))+rsi]
+
 	mov	rbx,QWORD[((-8))+rsi]
+
 	lea	rsp,[rsi]
+
 $L$sqr8x_epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
 	DB	0F3h,0C3h		;repret
+
 $L$SEH_end_bn_sqr8x_mont:
 DB	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
 DB	112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
@@ -829,22 +944,8 @@
 
 	mov	r10,QWORD[192+r8]
 	mov	rax,QWORD[8+r10*8+rax]
-	lea	rax,[48+rax]
 
-	mov	rbx,QWORD[((-8))+rax]
-	mov	rbp,QWORD[((-16))+rax]
-	mov	r12,QWORD[((-24))+rax]
-	mov	r13,QWORD[((-32))+rax]
-	mov	r14,QWORD[((-40))+rax]
-	mov	r15,QWORD[((-48))+rax]
-	mov	QWORD[144+r8],rbx
-	mov	QWORD[160+r8],rbp
-	mov	QWORD[216+r8],r12
-	mov	QWORD[224+r8],r13
-	mov	QWORD[232+r8],r14
-	mov	QWORD[240+r8],r15
-
-	jmp	NEAR $L$common_seh_tail
+	jmp	NEAR $L$common_pop_regs
 
 
 
@@ -872,15 +973,21 @@
 	cmp	rbx,r10
 	jb	NEAR $L$common_seh_tail
 
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_pop_regs
+
 	mov	rax,QWORD[152+r8]
 
-	mov	r10d,DWORD[4+r11]
+	mov	r10d,DWORD[8+r11]
 	lea	r10,[r10*1+rsi]
 	cmp	rbx,r10
 	jae	NEAR $L$common_seh_tail
 
 	mov	rax,QWORD[40+rax]
 
+$L$common_pop_regs:
 	mov	rbx,QWORD[((-8))+rax]
 	mov	rbp,QWORD[((-16))+rax]
 	mov	r12,QWORD[((-24))+rax]
@@ -960,4 +1067,5 @@
 $L$SEH_info_bn_sqr8x_mont:
 DB	9,0,0,0
 	DD	sqr_handler wrt ..imagebase
-	DD	$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase
+	DD	$L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase
+ALIGN	8
diff --git a/win-x86_64/crypto/bn/x86_64-mont5.asm b/win-x86_64/crypto/bn/x86_64-mont5.asm
index 58f19ac..b330641 100644
--- a/win-x86_64/crypto/bn/x86_64-mont5.asm
+++ b/win-x86_64/crypto/bn/x86_64-mont5.asm
@@ -23,30 +23,64 @@
 	mov	r9,QWORD[48+rsp]
 
 
+
+	mov	r9d,r9d
+	mov	rax,rsp
+
 	test	r9d,7
 	jnz	NEAR $L$mul_enter
 	jmp	NEAR $L$mul4x_enter
 
 ALIGN	16
 $L$mul_enter:
-	mov	r9d,r9d
-	mov	rax,rsp
 	movd	xmm5,DWORD[56+rsp]
-	lea	r10,[$L$inc]
 	push	rbx
+
 	push	rbp
+
 	push	r12
+
 	push	r13
+
 	push	r14
+
 	push	r15
 
-	lea	r11,[2+r9]
-	neg	r11
-	lea	rsp,[((-264))+r11*8+rsp]
-	and	rsp,-1024
 
+	neg	r9
+	mov	r11,rsp
+	lea	r10,[((-280))+r9*8+rsp]
+	neg	r9
+	and	r10,-1024
+
+
+
+
+
+
+
+
+
+	sub	r11,r10
+	and	r11,-4096
+	lea	rsp,[r11*1+r10]
+	mov	r11,QWORD[rsp]
+	cmp	rsp,r10
+	ja	NEAR $L$mul_page_walk
+	jmp	NEAR $L$mul_page_walk_done
+
+$L$mul_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r11,QWORD[rsp]
+	cmp	rsp,r10
+	ja	NEAR $L$mul_page_walk
+$L$mul_page_walk_done:
+
+	lea	r10,[$L$inc]
 	mov	QWORD[8+r9*8+rsp],rax
+
 $L$mul_body:
+
 	lea	r12,[128+rdx]
 	movdqa	xmm0,XMMWORD[r10]
 	movdqa	xmm1,XMMWORD[16+r10]
@@ -385,34 +419,44 @@
 
 	sbb	rax,0
 	xor	r14,r14
+	and	rsi,rax
+	not	rax
+	mov	rcx,rdi
+	and	rcx,rax
 	mov	r15,r9
+	or	rsi,rcx
 ALIGN	16
 $L$copy:
-	mov	rsi,QWORD[r14*8+rsp]
-	mov	rcx,QWORD[r14*8+rdi]
-	xor	rsi,rcx
-	and	rsi,rax
-	xor	rsi,rcx
+	mov	rax,QWORD[r14*8+rsi]
 	mov	QWORD[r14*8+rsp],r14
-	mov	QWORD[r14*8+rdi],rsi
+	mov	QWORD[r14*8+rdi],rax
 	lea	r14,[1+r14]
 	sub	r15,1
 	jnz	NEAR $L$copy
 
 	mov	rsi,QWORD[8+r9*8+rsp]
+
 	mov	rax,1
 
 	mov	r15,QWORD[((-48))+rsi]
+
 	mov	r14,QWORD[((-40))+rsi]
+
 	mov	r13,QWORD[((-32))+rsi]
+
 	mov	r12,QWORD[((-24))+rsi]
+
 	mov	rbp,QWORD[((-16))+rsi]
+
 	mov	rbx,QWORD[((-8))+rsi]
+
 	lea	rsp,[rsi]
+
 $L$mul_epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
 	DB	0F3h,0C3h		;repret
+
 $L$SEH_end_bn_mul_mont_gather5:
 
 ALIGN	32
@@ -429,16 +473,25 @@
 	mov	r9,QWORD[48+rsp]
 
 
-$L$mul4x_enter:
+
 DB	0x67
 	mov	rax,rsp
+
+$L$mul4x_enter:
 	push	rbx
+
 	push	rbp
+
 	push	r12
+
 	push	r13
+
 	push	r14
+
 	push	r15
 
+$L$mul4x_prologue:
+
 DB	0x67
 	shl	r9d,3
 	lea	r10,[r9*2+r9]
@@ -454,45 +507,72 @@
 
 
 	lea	r11,[((-320))+r9*2+rsp]
+	mov	rbp,rsp
 	sub	r11,rdi
 	and	r11,4095
 	cmp	r10,r11
 	jb	NEAR $L$mul4xsp_alt
-	sub	rsp,r11
-	lea	rsp,[((-320))+r9*2+rsp]
+	sub	rbp,r11
+	lea	rbp,[((-320))+r9*2+rbp]
 	jmp	NEAR $L$mul4xsp_done
 
 ALIGN	32
 $L$mul4xsp_alt:
 	lea	r10,[((4096-320))+r9*2]
-	lea	rsp,[((-320))+r9*2+rsp]
+	lea	rbp,[((-320))+r9*2+rbp]
 	sub	r11,r10
 	mov	r10,0
 	cmovc	r11,r10
-	sub	rsp,r11
+	sub	rbp,r11
 $L$mul4xsp_done:
-	and	rsp,-64
+	and	rbp,-64
+	mov	r11,rsp
+	sub	r11,rbp
+	and	r11,-4096
+	lea	rsp,[rbp*1+r11]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$mul4x_page_walk
+	jmp	NEAR $L$mul4x_page_walk_done
+
+$L$mul4x_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$mul4x_page_walk
+$L$mul4x_page_walk_done:
+
 	neg	r9
 
 	mov	QWORD[40+rsp],rax
+
 $L$mul4x_body:
 
 	call	mul4x_internal
 
 	mov	rsi,QWORD[40+rsp]
+
 	mov	rax,1
 
 	mov	r15,QWORD[((-48))+rsi]
+
 	mov	r14,QWORD[((-40))+rsi]
+
 	mov	r13,QWORD[((-32))+rsi]
+
 	mov	r12,QWORD[((-24))+rsi]
+
 	mov	rbp,QWORD[((-16))+rsi]
+
 	mov	rbx,QWORD[((-8))+rsi]
+
 	lea	rsp,[rsi]
+
 $L$mul4x_epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
 	DB	0F3h,0C3h		;repret
+
 $L$SEH_end_bn_mul4x_mont_gather5:
 
 
@@ -1036,14 +1116,23 @@
 	mov	r9,QWORD[48+rsp]
 
 
+
 	mov	rax,rsp
+
 	push	rbx
+
 	push	rbp
+
 	push	r12
+
 	push	r13
+
 	push	r14
+
 	push	r15
 
+$L$power5_prologue:
+
 	shl	r9d,3
 	lea	r10d,[r9*2+r9]
 	neg	r9
@@ -1057,24 +1146,41 @@
 
 
 	lea	r11,[((-320))+r9*2+rsp]
+	mov	rbp,rsp
 	sub	r11,rdi
 	and	r11,4095
 	cmp	r10,r11
 	jb	NEAR $L$pwr_sp_alt
-	sub	rsp,r11
-	lea	rsp,[((-320))+r9*2+rsp]
+	sub	rbp,r11
+	lea	rbp,[((-320))+r9*2+rbp]
 	jmp	NEAR $L$pwr_sp_done
 
 ALIGN	32
 $L$pwr_sp_alt:
 	lea	r10,[((4096-320))+r9*2]
-	lea	rsp,[((-320))+r9*2+rsp]
+	lea	rbp,[((-320))+r9*2+rbp]
 	sub	r11,r10
 	mov	r10,0
 	cmovc	r11,r10
-	sub	rsp,r11
+	sub	rbp,r11
 $L$pwr_sp_done:
-	and	rsp,-64
+	and	rbp,-64
+	mov	r11,rsp
+	sub	r11,rbp
+	and	r11,-4096
+	lea	rsp,[rbp*1+r11]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$pwr_page_walk
+	jmp	NEAR $L$pwr_page_walk_done
+
+$L$pwr_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$pwr_page_walk
+$L$pwr_page_walk_done:
+
 	mov	r10,r9
 	neg	r9
 
@@ -1089,6 +1195,7 @@
 
 	mov	QWORD[32+rsp],r8
 	mov	QWORD[40+rsp],rax
+
 $L$power5_body:
 DB	102,72,15,110,207
 DB	102,72,15,110,209
@@ -1115,18 +1222,27 @@
 	call	mul4x_internal
 
 	mov	rsi,QWORD[40+rsp]
+
 	mov	rax,1
 	mov	r15,QWORD[((-48))+rsi]
+
 	mov	r14,QWORD[((-40))+rsi]
+
 	mov	r13,QWORD[((-32))+rsi]
+
 	mov	r12,QWORD[((-24))+rsi]
+
 	mov	rbp,QWORD[((-16))+rsi]
+
 	mov	rbx,QWORD[((-8))+rsi]
+
 	lea	rsp,[rsi]
+
 $L$power5_epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
 	DB	0F3h,0C3h		;repret
+
 $L$SEH_end_bn_power5:
 
 global	bn_sqr8x_internal
@@ -1989,15 +2105,24 @@
 	mov	r9,QWORD[48+rsp]
 
 
+
 DB	0x67
 	mov	rax,rsp
+
 	push	rbx
+
 	push	rbp
+
 	push	r12
+
 	push	r13
+
 	push	r14
+
 	push	r15
 
+$L$from_prologue:
+
 	shl	r9d,3
 	lea	r10,[r9*2+r9]
 	neg	r9
@@ -2011,24 +2136,41 @@
 
 
 	lea	r11,[((-320))+r9*2+rsp]
+	mov	rbp,rsp
 	sub	r11,rdi
 	and	r11,4095
 	cmp	r10,r11
 	jb	NEAR $L$from_sp_alt
-	sub	rsp,r11
-	lea	rsp,[((-320))+r9*2+rsp]
+	sub	rbp,r11
+	lea	rbp,[((-320))+r9*2+rbp]
 	jmp	NEAR $L$from_sp_done
 
 ALIGN	32
 $L$from_sp_alt:
 	lea	r10,[((4096-320))+r9*2]
-	lea	rsp,[((-320))+r9*2+rsp]
+	lea	rbp,[((-320))+r9*2+rbp]
 	sub	r11,r10
 	mov	r10,0
 	cmovc	r11,r10
-	sub	rsp,r11
+	sub	rbp,r11
 $L$from_sp_done:
-	and	rsp,-64
+	and	rbp,-64
+	mov	r11,rsp
+	sub	r11,rbp
+	and	r11,-4096
+	lea	rsp,[rbp*1+r11]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$from_page_walk
+	jmp	NEAR $L$from_page_walk_done
+
+$L$from_page_walk:
+	lea	rsp,[((-4096))+rsp]
+	mov	r10,QWORD[rsp]
+	cmp	rsp,rbp
+	ja	NEAR $L$from_page_walk
+$L$from_page_walk_done:
+
 	mov	r10,r9
 	neg	r9
 
@@ -2043,6 +2185,7 @@
 
 	mov	QWORD[32+rsp],r8
 	mov	QWORD[40+rsp],rax
+
 $L$from_body:
 	mov	r11,r9
 	lea	rax,[48+rsp]
@@ -2078,11 +2221,12 @@
 
 	pxor	xmm0,xmm0
 	lea	rax,[48+rsp]
-	mov	rsi,QWORD[40+rsp]
 	jmp	NEAR $L$from_mont_zero
 
 ALIGN	32
 $L$from_mont_zero:
+	mov	rsi,QWORD[40+rsp]
+
 	movdqa	XMMWORD[rax],xmm0
 	movdqa	XMMWORD[16+rax],xmm0
 	movdqa	XMMWORD[32+rax],xmm0
@@ -2093,16 +2237,24 @@
 
 	mov	rax,1
 	mov	r15,QWORD[((-48))+rsi]
+
 	mov	r14,QWORD[((-40))+rsi]
+
 	mov	r13,QWORD[((-32))+rsi]
+
 	mov	r12,QWORD[((-24))+rsi]
+
 	mov	rbp,QWORD[((-16))+rsi]
+
 	mov	rbx,QWORD[((-8))+rsi]
+
 	lea	rsp,[rsi]
+
 $L$from_epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
 	DB	0F3h,0C3h		;repret
+
 $L$SEH_end_bn_from_mont8x:
 global	bn_scatter5
 
@@ -2321,9 +2473,14 @@
 	cmp	rbx,r10
 	jb	NEAR $L$common_seh_tail
 
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_pop_regs
+
 	mov	rax,QWORD[152+r8]
 
-	mov	r10d,DWORD[4+r11]
+	mov	r10d,DWORD[8+r11]
 	lea	r10,[r10*1+rsi]
 	cmp	rbx,r10
 	jae	NEAR $L$common_seh_tail
@@ -2335,11 +2492,11 @@
 	mov	r10,QWORD[192+r8]
 	mov	rax,QWORD[8+r10*8+rax]
 
-	jmp	NEAR $L$body_proceed
+	jmp	NEAR $L$common_pop_regs
 
 $L$body_40:
 	mov	rax,QWORD[40+rax]
-$L$body_proceed:
+$L$common_pop_regs:
 	mov	rbx,QWORD[((-8))+rax]
 	mov	rbp,QWORD[((-16))+rax]
 	mov	r12,QWORD[((-24))+rax]
@@ -2419,22 +2576,22 @@
 $L$SEH_info_bn_mul_mont_gather5:
 DB	9,0,0,0
 	DD	mul_handler wrt ..imagebase
-	DD	$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
+	DD	$L$mul_body wrt ..imagebase,$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase
 ALIGN	8
 $L$SEH_info_bn_mul4x_mont_gather5:
 DB	9,0,0,0
 	DD	mul_handler wrt ..imagebase
-	DD	$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
+	DD	$L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase
 ALIGN	8
 $L$SEH_info_bn_power5:
 DB	9,0,0,0
 	DD	mul_handler wrt ..imagebase
-	DD	$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase
+	DD	$L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase
 ALIGN	8
 $L$SEH_info_bn_from_mont8x:
 DB	9,0,0,0
 	DD	mul_handler wrt ..imagebase
-	DD	$L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase
+	DD	$L$from_prologue wrt ..imagebase,$L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase
 ALIGN	8
 $L$SEH_info_bn_gather5:
 DB	0x01,0x0b,0x03,0x0a
diff --git a/win-x86_64/crypto/chacha/chacha-x86_64.asm b/win-x86_64/crypto/chacha/chacha-x86_64.asm
index afebd2e..cb36246 100644
--- a/win-x86_64/crypto/chacha/chacha-x86_64.asm
+++ b/win-x86_64/crypto/chacha/chacha-x86_64.asm
@@ -27,6 +27,15 @@
 $L$sigma:
 DB	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107
 DB	0
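+; block-counter lane constants for the vectorized paths: $L$zeroz/$L$fourz
+; initialize and step four 128-bit counter lanes, $L$incz/$L$sixteen the
+; sixteen-lane equivalents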
+ALIGN	64
+$L$zeroz:
+	DD	0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0
+$L$fourz:
+	DD	4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0
+$L$incz:
+	DD	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+$L$sixteen:
+	DD	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
 DB	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
 DB	95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32
 DB	98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115
@@ -59,6 +68,7 @@
 	push	r14
 	push	r15
 	sub	rsp,64+24
+$L$ctr32_body:
 
 
 	movdqu	xmm1,XMMWORD[rcx]
@@ -296,13 +306,14 @@
 	jnz	NEAR $L$oop_tail
 
 $L$done:
-	add	rsp,64+24
-	pop	r15
-	pop	r14
-	pop	r13
-	pop	r12
-	pop	rbp
-	pop	rbx
+	lea	rsi,[((64+24+48))+rsp]
+	mov	r15,QWORD[((-48))+rsi]
+	mov	r14,QWORD[((-40))+rsi]
+	mov	r13,QWORD[((-32))+rsi]
+	mov	r12,QWORD[((-24))+rsi]
+	mov	rbp,QWORD[((-16))+rsi]
+	mov	rbx,QWORD[((-8))+rsi]
+	lea	rsp,[rsi]
 $L$no_data:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
@@ -323,20 +334,15 @@
 
 
 $L$ChaCha20_ssse3:
+	mov	r9,rsp
 	cmp	rdx,128
 	ja	NEAR $L$ChaCha20_4x
 
 $L$do_sse3_after_all:
-	push	rbx
-	push	rbp
-	push	r12
-	push	r13
-	push	r14
-	push	r15
-
-	sub	rsp,64+72
-	movaps	XMMWORD[(64+32)+rsp],xmm6
-	movaps	XMMWORD[(64+48)+rsp],xmm7
+	sub	rsp,64+40
+	movaps	XMMWORD[(-40)+r9],xmm6
+	movaps	XMMWORD[(-24)+r9],xmm7
+$L$ssse3_body:
 	movdqa	xmm0,XMMWORD[$L$sigma]
 	movdqu	xmm1,XMMWORD[rcx]
 	movdqu	xmm2,XMMWORD[16+rcx]
@@ -348,7 +354,7 @@
 	movdqa	XMMWORD[16+rsp],xmm1
 	movdqa	XMMWORD[32+rsp],xmm2
 	movdqa	XMMWORD[48+rsp],xmm3
-	mov	ebp,10
+	mov	r8,10
 	jmp	NEAR $L$oop_ssse3
 
 ALIGN	32
@@ -358,7 +364,7 @@
 	movdqa	xmm1,XMMWORD[16+rsp]
 	movdqa	xmm2,XMMWORD[32+rsp]
 	paddd	xmm3,XMMWORD[48+rsp]
-	mov	ebp,10
+	mov	r8,10
 	movdqa	XMMWORD[48+rsp],xmm3
 	jmp	NEAR $L$oop_ssse3
 
@@ -407,7 +413,7 @@
 	pshufd	xmm2,xmm2,78
 	pshufd	xmm1,xmm1,147
 	pshufd	xmm3,xmm3,57
-	dec	ebp
+	dec	r8
 	jnz	NEAR $L$oop_ssse3
 	paddd	xmm0,XMMWORD[rsp]
 	paddd	xmm1,XMMWORD[16+rsp]
@@ -444,27 +450,22 @@
 	movdqa	XMMWORD[16+rsp],xmm1
 	movdqa	XMMWORD[32+rsp],xmm2
 	movdqa	XMMWORD[48+rsp],xmm3
-	xor	rbx,rbx
+	xor	r8,r8
 
 $L$oop_tail_ssse3:
-	movzx	eax,BYTE[rbx*1+rsi]
-	movzx	ecx,BYTE[rbx*1+rsp]
-	lea	rbx,[1+rbx]
+	movzx	eax,BYTE[r8*1+rsi]
+	movzx	ecx,BYTE[r8*1+rsp]
+	lea	r8,[1+r8]
 	xor	eax,ecx
-	mov	BYTE[((-1))+rbx*1+rdi],al
+	mov	BYTE[((-1))+r8*1+rdi],al
 	dec	rdx
 	jnz	NEAR $L$oop_tail_ssse3
 
 $L$done_ssse3:
-	movaps	xmm6,XMMWORD[((64+32))+rsp]
-	movaps	xmm7,XMMWORD[((64+48))+rsp]
-	add	rsp,64+72
-	pop	r15
-	pop	r14
-	pop	r13
-	pop	r12
-	pop	rbp
-	pop	rbx
+	movaps	xmm6,XMMWORD[((-40))+r9]
+	movaps	xmm7,XMMWORD[((-24))+r9]
+	lea	rsp,[r9]
+$L$ssse3_epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
 	DB	0F3h,0C3h		;repret
@@ -484,6 +485,7 @@
 
 
 $L$ChaCha20_4x:
+	mov	r9,rsp
 	mov	r11,r10
 	shr	r10,32
 	test	r10,32
@@ -496,18 +498,18 @@
 	je	NEAR $L$do_sse3_after_all
 
 $L$proceed4x:
-	lea	r11,[((-120))+rsp]
-	sub	rsp,0x148+160
-	movaps	XMMWORD[(-48)+r11],xmm6
-	movaps	XMMWORD[(-32)+r11],xmm7
-	movaps	XMMWORD[(-16)+r11],xmm8
-	movaps	XMMWORD[r11],xmm9
-	movaps	XMMWORD[16+r11],xmm10
-	movaps	XMMWORD[32+r11],xmm11
-	movaps	XMMWORD[48+r11],xmm12
-	movaps	XMMWORD[64+r11],xmm13
-	movaps	XMMWORD[80+r11],xmm14
-	movaps	XMMWORD[96+r11],xmm15
+	sub	rsp,0x140+168
+	movaps	XMMWORD[(-168)+r9],xmm6
+	movaps	XMMWORD[(-152)+r9],xmm7
+	movaps	XMMWORD[(-136)+r9],xmm8
+	movaps	XMMWORD[(-120)+r9],xmm9
+	movaps	XMMWORD[(-104)+r9],xmm10
+	movaps	XMMWORD[(-88)+r9],xmm11
+	movaps	XMMWORD[(-72)+r9],xmm12
+	movaps	XMMWORD[(-56)+r9],xmm13
+	movaps	XMMWORD[(-40)+r9],xmm14
+	movaps	XMMWORD[(-24)+r9],xmm15
+$L$4x_body:
 	movdqa	xmm11,XMMWORD[$L$sigma]
 	movdqu	xmm15,XMMWORD[rcx]
 	movdqu	xmm7,XMMWORD[16+rcx]
@@ -1034,18 +1036,18 @@
 	jnz	NEAR $L$oop_tail4x
 
 $L$done4x:
-	lea	r11,[((320+48))+rsp]
-	movaps	xmm6,XMMWORD[((-48))+r11]
-	movaps	xmm7,XMMWORD[((-32))+r11]
-	movaps	xmm8,XMMWORD[((-16))+r11]
-	movaps	xmm9,XMMWORD[r11]
-	movaps	xmm10,XMMWORD[16+r11]
-	movaps	xmm11,XMMWORD[32+r11]
-	movaps	xmm12,XMMWORD[48+r11]
-	movaps	xmm13,XMMWORD[64+r11]
-	movaps	xmm14,XMMWORD[80+r11]
-	movaps	xmm15,XMMWORD[96+r11]
-	add	rsp,0x148+160
+	movaps	xmm6,XMMWORD[((-168))+r9]
+	movaps	xmm7,XMMWORD[((-152))+r9]
+	movaps	xmm8,XMMWORD[((-136))+r9]
+	movaps	xmm9,XMMWORD[((-120))+r9]
+	movaps	xmm10,XMMWORD[((-104))+r9]
+	movaps	xmm11,XMMWORD[((-88))+r9]
+	movaps	xmm12,XMMWORD[((-72))+r9]
+	movaps	xmm13,XMMWORD[((-56))+r9]
+	movaps	xmm14,XMMWORD[((-40))+r9]
+	movaps	xmm15,XMMWORD[((-24))+r9]
+	lea	rsp,[r9]
+$L$4x_epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
 	DB	0F3h,0C3h		;repret
@@ -1065,22 +1067,21 @@
 
 
 $L$ChaCha20_8x:
-	mov	r10,rsp
-	sub	rsp,0x280+176
+	mov	r9,rsp
+	sub	rsp,0x280+168
 	and	rsp,-32
-	lea	r11,[((656+48))+rsp]
-	movaps	XMMWORD[(-48)+r11],xmm6
-	movaps	XMMWORD[(-32)+r11],xmm7
-	movaps	XMMWORD[(-16)+r11],xmm8
-	movaps	XMMWORD[r11],xmm9
-	movaps	XMMWORD[16+r11],xmm10
-	movaps	XMMWORD[32+r11],xmm11
-	movaps	XMMWORD[48+r11],xmm12
-	movaps	XMMWORD[64+r11],xmm13
-	movaps	XMMWORD[80+r11],xmm14
-	movaps	XMMWORD[96+r11],xmm15
+	movaps	XMMWORD[(-168)+r9],xmm6
+	movaps	XMMWORD[(-152)+r9],xmm7
+	movaps	XMMWORD[(-136)+r9],xmm8
+	movaps	XMMWORD[(-120)+r9],xmm9
+	movaps	XMMWORD[(-104)+r9],xmm10
+	movaps	XMMWORD[(-88)+r9],xmm11
+	movaps	XMMWORD[(-72)+r9],xmm12
+	movaps	XMMWORD[(-56)+r9],xmm13
+	movaps	XMMWORD[(-40)+r9],xmm14
+	movaps	XMMWORD[(-24)+r9],xmm15
+$L$8x_body:
 	vzeroupper
-	mov	QWORD[640+rsp],r10
 
 
 
@@ -1671,19 +1672,220 @@
 
 $L$done8x:
 	vzeroall
-	lea	r11,[((656+48))+rsp]
-	movaps	xmm6,XMMWORD[((-48))+r11]
-	movaps	xmm7,XMMWORD[((-32))+r11]
-	movaps	xmm8,XMMWORD[((-16))+r11]
-	movaps	xmm9,XMMWORD[r11]
-	movaps	xmm10,XMMWORD[16+r11]
-	movaps	xmm11,XMMWORD[32+r11]
-	movaps	xmm12,XMMWORD[48+r11]
-	movaps	xmm13,XMMWORD[64+r11]
-	movaps	xmm14,XMMWORD[80+r11]
-	movaps	xmm15,XMMWORD[96+r11]
-	mov	rsp,QWORD[640+rsp]
+	movaps	xmm6,XMMWORD[((-168))+r9]
+	movaps	xmm7,XMMWORD[((-152))+r9]
+	movaps	xmm8,XMMWORD[((-136))+r9]
+	movaps	xmm9,XMMWORD[((-120))+r9]
+	movaps	xmm10,XMMWORD[((-104))+r9]
+	movaps	xmm11,XMMWORD[((-88))+r9]
+	movaps	xmm12,XMMWORD[((-72))+r9]
+	movaps	xmm13,XMMWORD[((-56))+r9]
+	movaps	xmm14,XMMWORD[((-40))+r9]
+	movaps	xmm15,XMMWORD[((-24))+r9]
+	lea	rsp,[r9]
+$L$8x_epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
 	DB	0F3h,0C3h		;repret
 $L$SEH_end_ChaCha20_8x:
+EXTERN	__imp_RtlVirtualUnwind
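+; Win64 SEH handlers for the ChaCha20 entry points: se_handler (scalar path)
+; restores the six pushed GPRs; ssse3_handler and full_handler additionally
+; copy two and ten saved XMM registers, respectively, back into the CONTEXT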
+
+ALIGN	16
+se_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	lea	r10,[$L$ctr32_body]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[152+r8]
+
+	lea	r10,[$L$no_data]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	lea	rax,[((64+24+48))+rax]
+
+	mov	rbx,QWORD[((-8))+rax]
+	mov	rbp,QWORD[((-16))+rax]
+	mov	r12,QWORD[((-24))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r15,QWORD[((-48))+rax]
+	mov	QWORD[144+r8],rbx
+	mov	QWORD[160+r8],rbp
+	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+	mov	QWORD[232+r8],r14
+	mov	QWORD[240+r8],r15
+
+$L$common_seh_tail:
+	mov	rdi,QWORD[8+rax]
+	mov	rsi,QWORD[16+rax]
+	mov	QWORD[152+r8],rax
+	mov	QWORD[168+r8],rsi
+	mov	QWORD[176+r8],rdi
+
+	mov	rdi,QWORD[40+r9]
+	mov	rsi,r8
+	mov	ecx,154
+	DD	0xa548f3fc
+
+	mov	rsi,r9
+	xor	rcx,rcx
+	mov	rdx,QWORD[8+rsi]
+	mov	r8,QWORD[rsi]
+	mov	r9,QWORD[16+rsi]
+	mov	r10,QWORD[40+rsi]
+	lea	r11,[56+rsi]
+	lea	r12,[24+rsi]
+	mov	QWORD[32+rsp],r10
+	mov	QWORD[40+rsp],r11
+	mov	QWORD[48+rsp],r12
+	mov	QWORD[56+rsp],rcx
+	call	QWORD[__imp_RtlVirtualUnwind]
+
+	mov	eax,1
+	add	rsp,64
+	popfq
+	pop	r15
+	pop	r14
+	pop	r13
+	pop	r12
+	pop	rbp
+	pop	rbx
+	pop	rdi
+	pop	rsi
+	DB	0F3h,0C3h		;repret
+
+
+
+ALIGN	16
+ssse3_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[192+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	lea	rsi,[((-40))+rax]
+	lea	rdi,[512+r8]
+	mov	ecx,4
+	DD	0xa548f3fc
+
+	jmp	NEAR $L$common_seh_tail
+
+
+
+ALIGN	16
+full_handler:
+	push	rsi
+	push	rdi
+	push	rbx
+	push	rbp
+	push	r12
+	push	r13
+	push	r14
+	push	r15
+	pushfq
+	sub	rsp,64
+
+	mov	rax,QWORD[120+r8]
+	mov	rbx,QWORD[248+r8]
+
+	mov	rsi,QWORD[8+r9]
+	mov	r11,QWORD[56+r9]
+
+	mov	r10d,DWORD[r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jb	NEAR $L$common_seh_tail
+
+	mov	rax,QWORD[192+r8]
+
+	mov	r10d,DWORD[4+r11]
+	lea	r10,[r10*1+rsi]
+	cmp	rbx,r10
+	jae	NEAR $L$common_seh_tail
+
+	lea	rsi,[((-168))+rax]
+	lea	rdi,[512+r8]
+	mov	ecx,20
+	DD	0xa548f3fc
+
+	jmp	NEAR $L$common_seh_tail
+
+
+section	.pdata rdata align=4
+ALIGN	4
+	DD	$L$SEH_begin_ChaCha20_ctr32 wrt ..imagebase
+	DD	$L$SEH_end_ChaCha20_ctr32 wrt ..imagebase
+	DD	$L$SEH_info_ChaCha20_ctr32 wrt ..imagebase
+
+	DD	$L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase
+	DD	$L$SEH_end_ChaCha20_ssse3 wrt ..imagebase
+	DD	$L$SEH_info_ChaCha20_ssse3 wrt ..imagebase
+
+	DD	$L$SEH_begin_ChaCha20_4x wrt ..imagebase
+	DD	$L$SEH_end_ChaCha20_4x wrt ..imagebase
+	DD	$L$SEH_info_ChaCha20_4x wrt ..imagebase
+	DD	$L$SEH_begin_ChaCha20_8x wrt ..imagebase
+	DD	$L$SEH_end_ChaCha20_8x wrt ..imagebase
+	DD	$L$SEH_info_ChaCha20_8x wrt ..imagebase
+section	.xdata rdata align=8
+ALIGN	8
+$L$SEH_info_ChaCha20_ctr32:
+DB	9,0,0,0
+	DD	se_handler wrt ..imagebase
+
+$L$SEH_info_ChaCha20_ssse3:
+DB	9,0,0,0
+	DD	ssse3_handler wrt ..imagebase
+	DD	$L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase
+
+$L$SEH_info_ChaCha20_4x:
+DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase
+$L$SEH_info_ChaCha20_8x:
+DB	9,0,0,0
+	DD	full_handler wrt ..imagebase
+	DD	$L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase
diff --git a/win-x86_64/crypto/modes/ghash-x86_64.asm b/win-x86_64/crypto/modes/ghash-x86_64.asm
index e5204bf..b01f98c 100644
--- a/win-x86_64/crypto/modes/ghash-x86_64.asm
+++ b/win-x86_64/crypto/modes/ghash-x86_64.asm
@@ -21,6 +21,10 @@
 	push	rbx
 	push	rbp
 	push	r12
+	push	r13
+	push	r14
+	push	r15
+	sub	rsp,280
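+	; the frame and register save set now match gcm_ghash_4bit, so the
+	; shared unwind handler below treats both functions identically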
 $L$gmult_prologue:
 
 	movzx	r8,BYTE[15+rdi]
@@ -97,8 +101,9 @@
 	mov	QWORD[8+rdi],r8
 	mov	QWORD[rdi],r9
 
-	mov	rbx,QWORD[16+rsp]
-	lea	rsp,[24+rsp]
+	lea	rsi,[((280+48))+rsp]
+	mov	rbx,QWORD[((-8))+rsi]
+	lea	rsp,[rsi]
 $L$gmult_epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
@@ -669,14 +674,14 @@
 	mov	QWORD[8+rdi],r8
 	mov	QWORD[rdi],r9
 
-	lea	rsi,[280+rsp]
-	mov	r15,QWORD[rsi]
-	mov	r14,QWORD[8+rsi]
-	mov	r13,QWORD[16+rsi]
-	mov	r12,QWORD[24+rsi]
-	mov	rbp,QWORD[32+rsi]
-	mov	rbx,QWORD[40+rsi]
-	lea	rsp,[48+rsi]
+	lea	rsi,[((280+48))+rsp]
+	mov	r15,QWORD[((-48))+rsi]
+	mov	r14,QWORD[((-40))+rsi]
+	mov	r13,QWORD[((-32))+rsi]
+	mov	r12,QWORD[((-24))+rsi]
+	mov	rbp,QWORD[((-16))+rsi]
+	mov	rbx,QWORD[((-8))+rsi]
+	lea	rsp,[rsi]
 $L$ghash_epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
@@ -1916,14 +1921,20 @@
 	cmp	rbx,r10
 	jae	NEAR $L$in_prologue
 
-	lea	rax,[24+rax]
+	lea	rax,[((48+280))+rax]
 
 	mov	rbx,QWORD[((-8))+rax]
 	mov	rbp,QWORD[((-16))+rax]
 	mov	r12,QWORD[((-24))+rax]
+	mov	r13,QWORD[((-32))+rax]
+	mov	r14,QWORD[((-40))+rax]
+	mov	r15,QWORD[((-48))+rax]
 	mov	QWORD[144+r8],rbx
 	mov	QWORD[160+r8],rbp
 	mov	QWORD[216+r8],r12
+	mov	QWORD[224+r8],r13
+	mov	QWORD[232+r8],r14
+	mov	QWORD[240+r8],r15
 
 $L$in_prologue:
 	mov	rdi,QWORD[8+rax]
diff --git a/win-x86_64/crypto/sha/sha1-x86_64.asm b/win-x86_64/crypto/sha/sha1-x86_64.asm
index 168f78d..5484574 100644
--- a/win-x86_64/crypto/sha/sha1-x86_64.asm
+++ b/win-x86_64/crypto/sha/sha1-x86_64.asm
@@ -1263,21 +1263,20 @@
 
 
 _ssse3_shortcut:
-	mov	rax,rsp
+	mov	r11,rsp
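+	; r11 now carries the frame pointer and r14 the K_XX_XX table pointer
+	; (their roles were previously reversed); the unwind handler picks the
+	; frame up from CONTEXT.R11 (offset 208) instead of CONTEXT.R14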
 	push	rbx
 	push	rbp
 	push	r12
 	push	r13
 	push	r14
 	lea	rsp,[((-160))+rsp]
-	movaps	XMMWORD[(-40-96)+rax],xmm6
-	movaps	XMMWORD[(-40-80)+rax],xmm7
-	movaps	XMMWORD[(-40-64)+rax],xmm8
-	movaps	XMMWORD[(-40-48)+rax],xmm9
-	movaps	XMMWORD[(-40-32)+rax],xmm10
-	movaps	XMMWORD[(-40-16)+rax],xmm11
+	movaps	XMMWORD[(-40-96)+r11],xmm6
+	movaps	XMMWORD[(-40-80)+r11],xmm7
+	movaps	XMMWORD[(-40-64)+r11],xmm8
+	movaps	XMMWORD[(-40-48)+r11],xmm9
+	movaps	XMMWORD[(-40-32)+r11],xmm10
+	movaps	XMMWORD[(-40-16)+r11],xmm11
 $L$prologue_ssse3:
-	mov	r14,rax
 	and	rsp,-64
 	mov	r8,rdi
 	mov	r9,rsi
@@ -1285,7 +1284,7 @@
 
 	shl	r10,6
 	add	r10,r9
-	lea	r11,[((K_XX_XX+64))]
+	lea	r14,[((K_XX_XX+64))]
 
 	mov	eax,DWORD[r8]
 	mov	ebx,DWORD[4+r8]
@@ -1297,8 +1296,8 @@
 	xor	edi,edx
 	and	esi,edi
 
-	movdqa	xmm6,XMMWORD[64+r11]
-	movdqa	xmm9,XMMWORD[((-64))+r11]
+	movdqa	xmm6,XMMWORD[64+r14]
+	movdqa	xmm9,XMMWORD[((-64))+r14]
 	movdqu	xmm0,XMMWORD[r9]
 	movdqu	xmm1,XMMWORD[16+r9]
 	movdqu	xmm2,XMMWORD[32+r9]
@@ -1374,7 +1373,7 @@
 	pslld	xmm9,2
 	pxor	xmm4,xmm10
 	xor	edx,ebp
-	movdqa	xmm10,XMMWORD[((-64))+r11]
+	movdqa	xmm10,XMMWORD[((-64))+r14]
 	rol	ecx,5
 	add	ebx,edi
 	and	esi,edx
@@ -1435,7 +1434,7 @@
 	pslld	xmm10,2
 	pxor	xmm5,xmm8
 	xor	ebp,eax
-	movdqa	xmm8,XMMWORD[((-32))+r11]
+	movdqa	xmm8,XMMWORD[((-32))+r14]
 	rol	edx,5
 	add	ecx,edi
 	and	esi,ebp
@@ -1496,7 +1495,7 @@
 	pslld	xmm8,2
 	pxor	xmm6,xmm9
 	xor	eax,ebx
-	movdqa	xmm9,XMMWORD[((-32))+r11]
+	movdqa	xmm9,XMMWORD[((-32))+r14]
 	rol	ebp,5
 	add	edx,edi
 	and	esi,eax
@@ -1557,7 +1556,7 @@
 	pslld	xmm9,2
 	pxor	xmm7,xmm10
 	xor	ebx,ecx
-	movdqa	xmm10,XMMWORD[((-32))+r11]
+	movdqa	xmm10,XMMWORD[((-32))+r14]
 	rol	eax,5
 	add	ebp,edi
 	and	esi,ebx
@@ -1668,7 +1667,7 @@
 	pxor	xmm2,xmm3
 	add	eax,esi
 	xor	edi,edx
-	movdqa	xmm10,XMMWORD[r11]
+	movdqa	xmm10,XMMWORD[r14]
 	ror	ecx,7
 	paddd	xmm9,xmm1
 	add	eax,ebx
@@ -1903,7 +1902,7 @@
 	pxor	xmm7,xmm0
 	rol	ebx,5
 	add	eax,esi
-	movdqa	xmm9,XMMWORD[32+r11]
+	movdqa	xmm9,XMMWORD[32+r14]
 	xor	edi,ecx
 	paddd	xmm8,xmm6
 	xor	ecx,edx
@@ -2194,8 +2193,8 @@
 	add	ecx,edx
 	cmp	r9,r10
 	je	NEAR $L$done_ssse3
-	movdqa	xmm6,XMMWORD[64+r11]
-	movdqa	xmm9,XMMWORD[((-64))+r11]
+	movdqa	xmm6,XMMWORD[64+r14]
+	movdqa	xmm9,XMMWORD[((-64))+r14]
 	movdqu	xmm0,XMMWORD[r9]
 	movdqu	xmm1,XMMWORD[16+r9]
 	movdqu	xmm2,XMMWORD[32+r9]
@@ -2432,19 +2431,18 @@
 	mov	DWORD[8+r8],ecx
 	mov	DWORD[12+r8],edx
 	mov	DWORD[16+r8],ebp
-	movaps	xmm6,XMMWORD[((-40-96))+r14]
-	movaps	xmm7,XMMWORD[((-40-80))+r14]
-	movaps	xmm8,XMMWORD[((-40-64))+r14]
-	movaps	xmm9,XMMWORD[((-40-48))+r14]
-	movaps	xmm10,XMMWORD[((-40-32))+r14]
-	movaps	xmm11,XMMWORD[((-40-16))+r14]
-	lea	rsi,[r14]
-	mov	r14,QWORD[((-40))+rsi]
-	mov	r13,QWORD[((-32))+rsi]
-	mov	r12,QWORD[((-24))+rsi]
-	mov	rbp,QWORD[((-16))+rsi]
-	mov	rbx,QWORD[((-8))+rsi]
-	lea	rsp,[rsi]
+	movaps	xmm6,XMMWORD[((-40-96))+r11]
+	movaps	xmm7,XMMWORD[((-40-80))+r11]
+	movaps	xmm8,XMMWORD[((-40-64))+r11]
+	movaps	xmm9,XMMWORD[((-40-48))+r11]
+	movaps	xmm10,XMMWORD[((-40-32))+r11]
+	movaps	xmm11,XMMWORD[((-40-16))+r11]
+	mov	r14,QWORD[((-40))+r11]
+	mov	r13,QWORD[((-32))+r11]
+	mov	r12,QWORD[((-24))+r11]
+	mov	rbp,QWORD[((-16))+r11]
+	mov	rbx,QWORD[((-8))+r11]
+	lea	rsp,[r11]
 $L$epilogue_ssse3:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
@@ -2463,7 +2461,7 @@
 
 
 _avx_shortcut:
-	mov	rax,rsp
+	mov	r11,rsp
 	push	rbx
 	push	rbp
 	push	r12
@@ -2471,14 +2469,13 @@
 	push	r14
 	lea	rsp,[((-160))+rsp]
 	vzeroupper
-	vmovaps	XMMWORD[(-40-96)+rax],xmm6
-	vmovaps	XMMWORD[(-40-80)+rax],xmm7
-	vmovaps	XMMWORD[(-40-64)+rax],xmm8
-	vmovaps	XMMWORD[(-40-48)+rax],xmm9
-	vmovaps	XMMWORD[(-40-32)+rax],xmm10
-	vmovaps	XMMWORD[(-40-16)+rax],xmm11
+	vmovaps	XMMWORD[(-40-96)+r11],xmm6
+	vmovaps	XMMWORD[(-40-80)+r11],xmm7
+	vmovaps	XMMWORD[(-40-64)+r11],xmm8
+	vmovaps	XMMWORD[(-40-48)+r11],xmm9
+	vmovaps	XMMWORD[(-40-32)+r11],xmm10
+	vmovaps	XMMWORD[(-40-16)+r11],xmm11
 $L$prologue_avx:
-	mov	r14,rax
 	and	rsp,-64
 	mov	r8,rdi
 	mov	r9,rsi
@@ -2486,7 +2483,7 @@
 
 	shl	r10,6
 	add	r10,r9
-	lea	r11,[((K_XX_XX+64))]
+	lea	r14,[((K_XX_XX+64))]
 
 	mov	eax,DWORD[r8]
 	mov	ebx,DWORD[4+r8]
@@ -2498,8 +2495,8 @@
 	xor	edi,edx
 	and	esi,edi
 
-	vmovdqa	xmm6,XMMWORD[64+r11]
-	vmovdqa	xmm11,XMMWORD[((-64))+r11]
+	vmovdqa	xmm6,XMMWORD[64+r14]
+	vmovdqa	xmm11,XMMWORD[((-64))+r14]
 	vmovdqu	xmm0,XMMWORD[r9]
 	vmovdqu	xmm1,XMMWORD[16+r9]
 	vmovdqu	xmm2,XMMWORD[32+r9]
@@ -2624,7 +2621,7 @@
 	vpxor	xmm5,xmm5,xmm10
 	xor	ebp,eax
 	shld	edx,edx,5
-	vmovdqa	xmm11,XMMWORD[((-32))+r11]
+	vmovdqa	xmm11,XMMWORD[((-32))+r14]
 	add	ecx,edi
 	and	esi,ebp
 	xor	ebp,eax
@@ -2837,7 +2834,7 @@
 	add	eax,esi
 	xor	edi,edx
 	vpaddd	xmm9,xmm11,xmm1
-	vmovdqa	xmm11,XMMWORD[r11]
+	vmovdqa	xmm11,XMMWORD[r14]
 	shrd	ecx,ecx,7
 	add	eax,ebx
 	vpxor	xmm2,xmm2,xmm8
@@ -3056,7 +3053,7 @@
 	mov	edi,ebx
 	xor	esi,edx
 	vpaddd	xmm9,xmm11,xmm6
-	vmovdqa	xmm11,XMMWORD[32+r11]
+	vmovdqa	xmm11,XMMWORD[32+r14]
 	shld	ebx,ebx,5
 	add	eax,esi
 	vpxor	xmm7,xmm7,xmm8
@@ -3335,8 +3332,8 @@
 	add	ecx,edx
 	cmp	r9,r10
 	je	NEAR $L$done_avx
-	vmovdqa	xmm6,XMMWORD[64+r11]
-	vmovdqa	xmm11,XMMWORD[((-64))+r11]
+	vmovdqa	xmm6,XMMWORD[64+r14]
+	vmovdqa	xmm11,XMMWORD[((-64))+r14]
 	vmovdqu	xmm0,XMMWORD[r9]
 	vmovdqu	xmm1,XMMWORD[16+r9]
 	vmovdqu	xmm2,XMMWORD[32+r9]
@@ -3572,19 +3569,18 @@
 	mov	DWORD[8+r8],ecx
 	mov	DWORD[12+r8],edx
 	mov	DWORD[16+r8],ebp
-	movaps	xmm6,XMMWORD[((-40-96))+r14]
-	movaps	xmm7,XMMWORD[((-40-80))+r14]
-	movaps	xmm8,XMMWORD[((-40-64))+r14]
-	movaps	xmm9,XMMWORD[((-40-48))+r14]
-	movaps	xmm10,XMMWORD[((-40-32))+r14]
-	movaps	xmm11,XMMWORD[((-40-16))+r14]
-	lea	rsi,[r14]
-	mov	r14,QWORD[((-40))+rsi]
-	mov	r13,QWORD[((-32))+rsi]
-	mov	r12,QWORD[((-24))+rsi]
-	mov	rbp,QWORD[((-16))+rsi]
-	mov	rbx,QWORD[((-8))+rsi]
-	lea	rsp,[rsi]
+	movaps	xmm6,XMMWORD[((-40-96))+r11]
+	movaps	xmm7,XMMWORD[((-40-80))+r11]
+	movaps	xmm8,XMMWORD[((-40-64))+r11]
+	movaps	xmm9,XMMWORD[((-40-48))+r11]
+	movaps	xmm10,XMMWORD[((-40-32))+r11]
+	movaps	xmm11,XMMWORD[((-40-16))+r11]
+	mov	r14,QWORD[((-40))+r11]
+	mov	r13,QWORD[((-32))+r11]
+	mov	r12,QWORD[((-24))+r11]
+	mov	rbp,QWORD[((-16))+r11]
+	mov	rbx,QWORD[((-8))+r11]
+	lea	rsp,[r11]
 $L$epilogue_avx:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
@@ -3677,15 +3673,13 @@
 	cmp	rbx,r10
 	jb	NEAR $L$common_seh_tail
 
-	mov	rax,QWORD[152+r8]
+	mov	rax,QWORD[208+r8]
 
 	mov	r10d,DWORD[4+r11]
 	lea	r10,[r10*1+rsi]
 	cmp	rbx,r10
 	jae	NEAR $L$common_seh_tail
 
-	mov	rax,QWORD[232+r8]
-
 	lea	rsi,[((-40-96))+rax]
 	lea	rdi,[512+r8]
 	mov	ecx,12
diff --git a/win-x86_64/crypto/sha/sha256-x86_64.asm b/win-x86_64/crypto/sha/sha256-x86_64.asm
index efaf9b5..6e3d154 100644
--- a/win-x86_64/crypto/sha/sha256-x86_64.asm
+++ b/win-x86_64/crypto/sha/sha256-x86_64.asm
@@ -30,13 +30,13 @@
 	je	NEAR $L$avx_shortcut
 	test	r10d,512
 	jnz	NEAR $L$ssse3_shortcut
+	mov	rax,rsp
 	push	rbx
 	push	rbp
 	push	r12
 	push	r13
 	push	r14
 	push	r15
-	mov	r11,rsp
 	shl	rdx,4
 	sub	rsp,16*4+4*8
 	lea	rdx,[rdx*4+rsi]
@@ -44,7 +44,7 @@
 	mov	QWORD[((64+0))+rsp],rdi
 	mov	QWORD[((64+8))+rsp],rsi
 	mov	QWORD[((64+16))+rsp],rdx
-	mov	QWORD[((64+24))+rsp],r11
+	mov	QWORD[((64+24))+rsp],rax
 $L$prologue:
 
 	mov	eax,DWORD[rdi]
@@ -1709,13 +1709,13 @@
 	jb	NEAR $L$loop
 
 	mov	rsi,QWORD[((64+24))+rsp]
-	mov	r15,QWORD[rsi]
-	mov	r14,QWORD[8+rsi]
-	mov	r13,QWORD[16+rsi]
-	mov	r12,QWORD[24+rsi]
-	mov	rbp,QWORD[32+rsi]
-	mov	rbx,QWORD[40+rsi]
-	lea	rsp,[48+rsi]
+	mov	r15,QWORD[((-48))+rsi]
+	mov	r14,QWORD[((-40))+rsi]
+	mov	r13,QWORD[((-32))+rsi]
+	mov	r12,QWORD[((-24))+rsi]
+	mov	rbp,QWORD[((-16))+rsi]
+	mov	rbx,QWORD[((-8))+rsi]
+	lea	rsp,[rsi]
 $L$epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
@@ -1781,13 +1781,13 @@
 
 
 $L$ssse3_shortcut:
+	mov	rax,rsp
 	push	rbx
 	push	rbp
 	push	r12
 	push	r13
 	push	r14
 	push	r15
-	mov	r11,rsp
 	shl	rdx,4
 	sub	rsp,160
 	lea	rdx,[rdx*4+rsi]
@@ -1795,7 +1795,7 @@
 	mov	QWORD[((64+0))+rsp],rdi
 	mov	QWORD[((64+8))+rsp],rsi
 	mov	QWORD[((64+16))+rsp],rdx
-	mov	QWORD[((64+24))+rsp],r11
+	mov	QWORD[((64+24))+rsp],rax
 	movaps	XMMWORD[(64+32)+rsp],xmm6
 	movaps	XMMWORD[(64+48)+rsp],xmm7
 	movaps	XMMWORD[(64+64)+rsp],xmm8
@@ -2870,13 +2870,13 @@
 	movaps	xmm7,XMMWORD[((64+48))+rsp]
 	movaps	xmm8,XMMWORD[((64+64))+rsp]
 	movaps	xmm9,XMMWORD[((64+80))+rsp]
-	mov	r15,QWORD[rsi]
-	mov	r14,QWORD[8+rsi]
-	mov	r13,QWORD[16+rsi]
-	mov	r12,QWORD[24+rsi]
-	mov	rbp,QWORD[32+rsi]
-	mov	rbx,QWORD[40+rsi]
-	lea	rsp,[48+rsi]
+	mov	r15,QWORD[((-48))+rsi]
+	mov	r14,QWORD[((-40))+rsi]
+	mov	r13,QWORD[((-32))+rsi]
+	mov	r12,QWORD[((-24))+rsi]
+	mov	rbp,QWORD[((-16))+rsi]
+	mov	rbx,QWORD[((-8))+rsi]
+	lea	rsp,[rsi]
 $L$epilogue_ssse3:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
@@ -2895,13 +2895,13 @@
 
 
 $L$avx_shortcut:
+	mov	rax,rsp
 	push	rbx
 	push	rbp
 	push	r12
 	push	r13
 	push	r14
 	push	r15
-	mov	r11,rsp
 	shl	rdx,4
 	sub	rsp,160
 	lea	rdx,[rdx*4+rsi]
@@ -2909,7 +2909,7 @@
 	mov	QWORD[((64+0))+rsp],rdi
 	mov	QWORD[((64+8))+rsp],rsi
 	mov	QWORD[((64+16))+rsp],rdx
-	mov	QWORD[((64+24))+rsp],r11
+	mov	QWORD[((64+24))+rsp],rax
 	movaps	XMMWORD[(64+32)+rsp],xmm6
 	movaps	XMMWORD[(64+48)+rsp],xmm7
 	movaps	XMMWORD[(64+64)+rsp],xmm8
@@ -3946,13 +3946,13 @@
 	movaps	xmm7,XMMWORD[((64+48))+rsp]
 	movaps	xmm8,XMMWORD[((64+64))+rsp]
 	movaps	xmm9,XMMWORD[((64+80))+rsp]
-	mov	r15,QWORD[rsi]
-	mov	r14,QWORD[8+rsi]
-	mov	r13,QWORD[16+rsi]
-	mov	r12,QWORD[24+rsi]
-	mov	rbp,QWORD[32+rsi]
-	mov	rbx,QWORD[40+rsi]
-	lea	rsp,[48+rsi]
+	mov	r15,QWORD[((-48))+rsi]
+	mov	r14,QWORD[((-40))+rsi]
+	mov	r13,QWORD[((-32))+rsi]
+	mov	r12,QWORD[((-24))+rsi]
+	mov	rbp,QWORD[((-16))+rsi]
+	mov	rbx,QWORD[((-8))+rsi]
+	lea	rsp,[rsi]
 $L$epilogue_avx:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
@@ -3992,7 +3992,6 @@
 	jae	NEAR $L$in_prologue
 	mov	rsi,rax
 	mov	rax,QWORD[((64+24))+rax]
-	lea	rax,[48+rax]
 
 	mov	rbx,QWORD[((-8))+rax]
 	mov	rbp,QWORD[((-16))+rax]
diff --git a/win-x86_64/crypto/sha/sha512-x86_64.asm b/win-x86_64/crypto/sha/sha512-x86_64.asm
index 71449cd..d0d7a43 100644
--- a/win-x86_64/crypto/sha/sha512-x86_64.asm
+++ b/win-x86_64/crypto/sha/sha512-x86_64.asm
@@ -30,13 +30,13 @@
 	or	r10d,r9d
 	cmp	r10d,1342177792
 	je	NEAR $L$avx_shortcut
+	mov	rax,rsp
 	push	rbx
 	push	rbp
 	push	r12
 	push	r13
 	push	r14
 	push	r15
-	mov	r11,rsp
 	shl	rdx,4
 	sub	rsp,16*8+4*8
 	lea	rdx,[rdx*8+rsi]
@@ -44,7 +44,7 @@
 	mov	QWORD[((128+0))+rsp],rdi
 	mov	QWORD[((128+8))+rsp],rsi
 	mov	QWORD[((128+16))+rsp],rdx
-	mov	QWORD[((128+24))+rsp],r11
+	mov	QWORD[((128+24))+rsp],rax
 $L$prologue:
 
 	mov	rax,QWORD[rdi]
@@ -1709,13 +1709,13 @@
 	jb	NEAR $L$loop
 
 	mov	rsi,QWORD[((128+24))+rsp]
-	mov	r15,QWORD[rsi]
-	mov	r14,QWORD[8+rsi]
-	mov	r13,QWORD[16+rsi]
-	mov	r12,QWORD[24+rsi]
-	mov	rbp,QWORD[32+rsi]
-	mov	rbx,QWORD[40+rsi]
-	lea	rsp,[48+rsi]
+	mov	r15,QWORD[((-48))+rsi]
+	mov	r14,QWORD[((-40))+rsi]
+	mov	r13,QWORD[((-32))+rsi]
+	mov	r12,QWORD[((-24))+rsi]
+	mov	rbp,QWORD[((-16))+rsi]
+	mov	rbx,QWORD[((-8))+rsi]
+	lea	rsp,[rsi]
 $L$epilogue:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
@@ -1825,13 +1825,13 @@
 
 
 $L$xop_shortcut:
+	mov	rax,rsp
 	push	rbx
 	push	rbp
 	push	r12
 	push	r13
 	push	r14
 	push	r15
-	mov	r11,rsp
 	shl	rdx,4
 	sub	rsp,256
 	lea	rdx,[rdx*8+rsi]
@@ -1839,7 +1839,7 @@
 	mov	QWORD[((128+0))+rsp],rdi
 	mov	QWORD[((128+8))+rsp],rsi
 	mov	QWORD[((128+16))+rsp],rdx
-	mov	QWORD[((128+24))+rsp],r11
+	mov	QWORD[((128+24))+rsp],rax
 	movaps	XMMWORD[(128+32)+rsp],xmm6
 	movaps	XMMWORD[(128+48)+rsp],xmm7
 	movaps	XMMWORD[(128+64)+rsp],xmm8
@@ -2906,13 +2906,13 @@
 	movaps	xmm9,XMMWORD[((128+80))+rsp]
 	movaps	xmm10,XMMWORD[((128+96))+rsp]
 	movaps	xmm11,XMMWORD[((128+112))+rsp]
-	mov	r15,QWORD[rsi]
-	mov	r14,QWORD[8+rsi]
-	mov	r13,QWORD[16+rsi]
-	mov	r12,QWORD[24+rsi]
-	mov	rbp,QWORD[32+rsi]
-	mov	rbx,QWORD[40+rsi]
-	lea	rsp,[48+rsi]
+	mov	r15,QWORD[((-48))+rsi]
+	mov	r14,QWORD[((-40))+rsi]
+	mov	r13,QWORD[((-32))+rsi]
+	mov	r12,QWORD[((-24))+rsi]
+	mov	rbp,QWORD[((-16))+rsi]
+	mov	rbx,QWORD[((-8))+rsi]
+	lea	rsp,[rsi]
 $L$epilogue_xop:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
@@ -2931,13 +2931,13 @@
 
 
 $L$avx_shortcut:
+	mov	rax,rsp
 	push	rbx
 	push	rbp
 	push	r12
 	push	r13
 	push	r14
 	push	r15
-	mov	r11,rsp
 	shl	rdx,4
 	sub	rsp,256
 	lea	rdx,[rdx*8+rsi]
@@ -2945,7 +2945,7 @@
 	mov	QWORD[((128+0))+rsp],rdi
 	mov	QWORD[((128+8))+rsp],rsi
 	mov	QWORD[((128+16))+rsp],rdx
-	mov	QWORD[((128+24))+rsp],r11
+	mov	QWORD[((128+24))+rsp],rax
 	movaps	XMMWORD[(128+32)+rsp],xmm6
 	movaps	XMMWORD[(128+48)+rsp],xmm7
 	movaps	XMMWORD[(128+64)+rsp],xmm8
@@ -4076,13 +4076,13 @@
 	movaps	xmm9,XMMWORD[((128+80))+rsp]
 	movaps	xmm10,XMMWORD[((128+96))+rsp]
 	movaps	xmm11,XMMWORD[((128+112))+rsp]
-	mov	r15,QWORD[rsi]
-	mov	r14,QWORD[8+rsi]
-	mov	r13,QWORD[16+rsi]
-	mov	r12,QWORD[24+rsi]
-	mov	rbp,QWORD[32+rsi]
-	mov	rbx,QWORD[40+rsi]
-	lea	rsp,[48+rsi]
+	mov	r15,QWORD[((-48))+rsi]
+	mov	r14,QWORD[((-40))+rsi]
+	mov	r13,QWORD[((-32))+rsi]
+	mov	r12,QWORD[((-24))+rsi]
+	mov	rbp,QWORD[((-16))+rsi]
+	mov	rbx,QWORD[((-8))+rsi]
+	lea	rsp,[rsi]
 $L$epilogue_avx:
 	mov	rdi,QWORD[8+rsp]	;WIN64 epilogue
 	mov	rsi,QWORD[16+rsp]
@@ -4122,7 +4122,6 @@
 	jae	NEAR $L$in_prologue
 	mov	rsi,rax
 	mov	rax,QWORD[((128+24))+rax]
-	lea	rax,[48+rax]
 
 	mov	rbx,QWORD[((-8))+rax]
 	mov	rbp,QWORD[((-16))+rax]