external/boringssl: Sync to 5e578c9dba73460c3eb17f771c77fc8e36f7812e.

This includes the following changes:

https://boringssl.googlesource.com/boringssl/+log/58e449904e248f34bdfc2be7a609c58bcb0257b7..5e578c9dba73460c3eb17f771c77fc8e36f7812e

Test: BoringSSL CTS Presubmits
Change-Id: Ic1541b034545fa58a284ca35134b3719303455c7
diff --git a/linux-aarch64/crypto/fipsmodule/sha512-armv8.S b/linux-aarch64/crypto/fipsmodule/sha512-armv8.S
index 4645722..bb052b7 100644
--- a/linux-aarch64/crypto/fipsmodule/sha512-armv8.S
+++ b/linux-aarch64/crypto/fipsmodule/sha512-armv8.S
@@ -1,5 +1,46 @@
 #if defined(__aarch64__)
-#include <openssl/arm_arch.h>
+// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License").  You may not use
+// this file except in compliance with the License.  You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+//		SHA256-hw	SHA256(*)	SHA512
+// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
+// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
+// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
+// Denver	2.01		10.5 (+26%)	6.70 (+8%)
+// X-Gene			20.0 (+100%)	12.8 (+300%(***))
+// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
+//
+// (*)	Software SHA256 results are of lesser relevance, presented
+//	mostly for informational purposes.
+// (**)	The result is a trade-off: it's possible to improve it by
+//	10% (or by 1 cycle per round), but at the cost of 20% loss
+//	on Cortex-A53 (or by 4 cycles per round).
+// (***)	Super-impressive coefficients over gcc-generated code are
+//	indication of some compiler "pathology", most notably code
+//	generated with -mgeneral-regs-only is significanty faster
+//	and the gap is only 40-90%.
+
+#ifndef	__KERNEL__
+# include <openssl/arm_arch.h>
+#endif
 
 .text
 
@@ -1016,11 +1057,19 @@
 .quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
 .quad	0	// terminator
 .size	.LK512,.-.LK512
+#ifndef	__KERNEL__
 .align	3
 .LOPENSSL_armcap_P:
+# ifdef	__ILP32__
+.long	OPENSSL_armcap_P-.
+# else
 .quad	OPENSSL_armcap_P-.
+# endif
+#endif
 .byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
 .align	2
 .align	2
+#ifndef	__KERNEL__
 .comm	OPENSSL_armcap_P,4,4
 #endif
+#endif