crypto: serpent - add AVX2/x86_64 assembler implementation of serpent cipher

This patch adds an AVX2/x86-64 implementation of the Serpent cipher, which
requires 16 parallel blocks of input (256 bytes). The implementation is based
on the AVX implementation, extended to use the 256-bit wide YMM registers.
Since Serpent does not use table look-ups, this implementation should be close
to two times faster than the AVX implementation.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 1ba48dd..9ad3d78 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1131,6 +1131,29 @@
 	  See also:
 	  <http://www.cl.cam.ac.uk/~rja14/serpent.html>
 
+config CRYPTO_SERPENT_AVX2_X86_64
+	tristate "Serpent cipher algorithm (x86_64/AVX2)"
+	depends on X86 && 64BIT
+	select CRYPTO_ALGAPI
+	select CRYPTO_CRYPTD
+	select CRYPTO_ABLK_HELPER_X86
+	select CRYPTO_GLUE_HELPER_X86
+	select CRYPTO_SERPENT
+	select CRYPTO_SERPENT_AVX_X86_64
+	select CRYPTO_LRW
+	select CRYPTO_XTS
+	help
+	  Serpent cipher algorithm, by Anderson, Biham & Knudsen.
+
+	  Keys are allowed to be from 0 to 256 bits in length, in steps
+	  of 8 bits.
+
+	  This module provides the Serpent cipher algorithm that processes 16
+	  blocks in parallel using the AVX2 instruction set.
+
+	  See also:
+	  <http://www.cl.cam.ac.uk/~rja14/serpent.html>
+
 config CRYPTO_TEA
 	tristate "TEA, XTEA and XETA cipher algorithms"
 	select CRYPTO_ALGAPI
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index fea7841..f5e13de 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -1645,6 +1645,9 @@
 		.alg = "__cbc-serpent-avx",
 		.test = alg_test_null,
 	}, {
+		.alg = "__cbc-serpent-avx2",
+		.test = alg_test_null,
+	}, {
 		.alg = "__cbc-serpent-sse2",
 		.test = alg_test_null,
 	}, {
@@ -1673,6 +1676,9 @@
 		.alg = "__driver-cbc-serpent-avx",
 		.test = alg_test_null,
 	}, {
+		.alg = "__driver-cbc-serpent-avx2",
+		.test = alg_test_null,
+	}, {
 		.alg = "__driver-cbc-serpent-sse2",
 		.test = alg_test_null,
 	}, {
@@ -1701,6 +1707,9 @@
 		.alg = "__driver-ecb-serpent-avx",
 		.test = alg_test_null,
 	}, {
+		.alg = "__driver-ecb-serpent-avx2",
+		.test = alg_test_null,
+	}, {
 		.alg = "__driver-ecb-serpent-sse2",
 		.test = alg_test_null,
 	}, {
@@ -1969,6 +1978,9 @@
 		.alg = "cryptd(__driver-cbc-camellia-aesni)",
 		.test = alg_test_null,
 	}, {
+		.alg = "cryptd(__driver-cbc-serpent-avx2)",
+		.test = alg_test_null,
+	}, {
 		.alg = "cryptd(__driver-ecb-aes-aesni)",
 		.test = alg_test_null,
 		.fips_allowed = 1,
@@ -1988,6 +2000,9 @@
 		.alg = "cryptd(__driver-ecb-serpent-avx)",
 		.test = alg_test_null,
 	}, {
+		.alg = "cryptd(__driver-ecb-serpent-avx2)",
+		.test = alg_test_null,
+	}, {
 		.alg = "cryptd(__driver-ecb-serpent-sse2)",
 		.test = alg_test_null,
 	}, {