arm64/crypto: issue aese/aesmc instructions in pairs

This changes the AES core transform implementations to issue aese/aesmc
(and aesd/aesimc) in pairs. This enables a micro-architectural optimization
in recent Cortex-A5x cores that improves performance by 50-90%.

Measured performance in cycles per byte (Cortex-A57):

                CBC enc         CBC dec         CTR
  before        3.64            1.34            1.32
  after         1.95            0.85            0.93

Note that this results in a ~5% performance decrease for older cores.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
diff --git a/arch/arm64/crypto/aes-ce-ccm-core.S b/arch/arm64/crypto/aes-ce-ccm-core.S
index 432e484..a2a7fbc 100644
--- a/arch/arm64/crypto/aes-ce-ccm-core.S
+++ b/arch/arm64/crypto/aes-ce-ccm-core.S
@@ -101,19 +101,19 @@
 0:	mov	v4.16b, v3.16b
 1:	ld1	{v5.2d}, [x2], #16		/* load next round key */
 	aese	v0.16b, v4.16b
-	aese	v1.16b, v4.16b
 	aesmc	v0.16b, v0.16b
+	aese	v1.16b, v4.16b
 	aesmc	v1.16b, v1.16b
 2:	ld1	{v3.2d}, [x2], #16		/* load next round key */
 	aese	v0.16b, v5.16b
-	aese	v1.16b, v5.16b
 	aesmc	v0.16b, v0.16b
+	aese	v1.16b, v5.16b
 	aesmc	v1.16b, v1.16b
 3:	ld1	{v4.2d}, [x2], #16		/* load next round key */
 	subs	w3, w3, #3
 	aese	v0.16b, v3.16b
-	aese	v1.16b, v3.16b
 	aesmc	v0.16b, v0.16b
+	aese	v1.16b, v3.16b
 	aesmc	v1.16b, v1.16b
 	bpl	1b
 	aese	v0.16b, v4.16b
@@ -146,19 +146,19 @@
 	ld1	{v5.2d}, [x10], #16		/* load 2nd round key */
 2:	/* inner loop: 3 rounds, 2x interleaved */
 	aese	v0.16b, v4.16b
-	aese	v1.16b, v4.16b
 	aesmc	v0.16b, v0.16b
+	aese	v1.16b, v4.16b
 	aesmc	v1.16b, v1.16b
 3:	ld1	{v3.2d}, [x10], #16		/* load next round key */
 	aese	v0.16b, v5.16b
-	aese	v1.16b, v5.16b
 	aesmc	v0.16b, v0.16b
+	aese	v1.16b, v5.16b
 	aesmc	v1.16b, v1.16b
 4:	ld1	{v4.2d}, [x10], #16		/* load next round key */
 	subs	w7, w7, #3
 	aese	v0.16b, v3.16b
-	aese	v1.16b, v3.16b
 	aesmc	v0.16b, v0.16b
+	aese	v1.16b, v3.16b
 	aesmc	v1.16b, v1.16b
 	ld1	{v5.2d}, [x10], #16		/* load next round key */
 	bpl	2b