/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * Based on arch/x86/crypto/ghash-pmullni-intel_asm.S
 *
 * Copyright (c) 2009 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 *           Vinodh Gopal
 *           Erdinc Ozturk
 *           Deniz Karakoyunlu
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */
| 18 | |
| 19 | #include <linux/linkage.h> |
| 20 | #include <asm/assembler.h> |
| 21 | |
/*
 * NEON register assignments used by pmull_ghash_update.
 *
 * IN1 and T1 intentionally name the same register (v2): the input block
 * held in IN1 is XORed into DATA before T1 is first written as the high
 * half of the carry-less product, so their live ranges never overlap.
 */

	/* state that lives across loop iterations */
	SHASH	.req	v1		// hash key, loaded from x3
	DATA	.req	v0		// running digest, loaded from/stored to x1
	VZR	.req	v5		// cleared on entry, used as an all-zero operand

	/* per-block input */
	IN1	.req	v2		// current 16-byte input block

	/* temporaries for the multiply and reduction */
	T1	.req	v2		// high 128 bits of the product (aliases IN1)
	T2	.req	v3
	T3	.req	v4

	/* code goes in .text; "+crypto" lets the assembler accept pmull/pmull2 */
	.text
	.arch		armv8-a+crypto
| 32 | |
/*
 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
 *			   struct ghash_key const *k, const char *head)
 *
 * Fold 16-byte input blocks into the GHASH digest. For every block the
 * digest is XORed with the block and the result is multiplied by the hash
 * key in GF(2^128), followed by a two-phase reduction.
 *
 *   w0 (blocks) - number of 16-byte blocks to consume from src
 *   x1 (dg)     - digest; read on entry and written back before returning
 *   x2 (src)    - input data, post-incremented by 16 for each block read
 *   x3 (k)      - hash key, 16 bytes are read from this address
 *   x4 (head)   - optional extra block that is processed before any block
 *                 from src (it does not count towards 'blocks'), or NULL
 *
 * The block counter is only tested after a block has been processed, so
 * when head is NULL the caller must pass blocks >= 1.
 *
 * The digest and the key are consumed as pairs of 64-bit quantities (dg is
 * declared as u64[], and pmull/pmull2 operate on the 64-bit halves of
 * SHASH), so they are loaded and stored using 64-bit lanes (.2d). Using
 * byte lanes (.16b) is equivalent on little-endian, but would byte-reverse
 * each 64-bit half on big-endian kernels.
 */
ENTRY(pmull_ghash_update)
	ld1		{DATA.2d}, [x1]			// current digest
	ld1		{SHASH.2d}, [x3]		// hash key
	eor		VZR.16b, VZR.16b, VZR.16b	// VZR := 0

	/* do the head block first, if supplied */
	cbz		x4, 0f
	ld1		{IN1.2d}, [x4]
	b		1f				// head does not consume 'blocks'

0:	ld1		{IN1.2d}, [x2], #16		// next block, src += 16
	sub		w0, w0, #1
1:	ext		IN1.16b, IN1.16b, IN1.16b, #8	// swap the two 64-bit halves
CPU_LE(	rev64		IN1.16b, IN1.16b	)	// byte-reverse each half (LE only)
	eor		DATA.16b, DATA.16b, IN1.16b	// fold the block into the digest

	/*
	 * multiply DATA by SHASH in GF(2^128)
	 *
	 * Karatsuba: three 64x64 carry-less multiplies instead of four.
	 * T2/T3 first receive (hi ^ lo) of DATA and SHASH respectively.
	 */
	ext		T2.16b, DATA.16b, DATA.16b, #8
	ext		T3.16b, SHASH.16b, SHASH.16b, #8
	eor		T2.16b, T2.16b, DATA.16b
	eor		T3.16b, T3.16b, SHASH.16b

	/* T1 overwrites IN1 (same register), which is dead by now */
	pmull2		T1.1q, SHASH.2d, DATA.2d	// a1 * b1
	pmull		DATA.1q, SHASH.1d, DATA.1d	// a0 * b0
	pmull		T2.1q, T2.1d, T3.1d		// (a1 + a0)(b1 + b0)
	eor		T2.16b, T2.16b, T1.16b		// (a0 * b1) + (a1 * b0)
	eor		T2.16b, T2.16b, DATA.16b

	/* add the middle term, split across the low and high halves */
	ext		T3.16b, VZR.16b, T2.16b, #8	// low 64 bits of T2, moved up
	ext		T2.16b, T2.16b, VZR.16b, #8	// high 64 bits of T2, moved down
	eor		DATA.16b, DATA.16b, T3.16b
	eor		T1.16b, T1.16b, T2.16b		// <T1:DATA> is result of
							// carry-less multiplication

	/*
	 * first phase of the reduction
	 *
	 * The shift/xor chain leaves, in each 64-bit lane,
	 * T3 = (DATA << 63) ^ (DATA << 62) ^ (DATA << 57).
	 */
	shl		T3.2d, DATA.2d, #1
	eor		T3.16b, T3.16b, DATA.16b
	shl		T3.2d, T3.2d, #5
	eor		T3.16b, T3.16b, DATA.16b
	shl		T3.2d, T3.2d, #57
	ext		T2.16b, VZR.16b, T3.16b, #8
	ext		T3.16b, T3.16b, VZR.16b, #8
	eor		DATA.16b, DATA.16b, T2.16b
	eor		T1.16b, T1.16b, T3.16b

	/*
	 * second phase of the reduction
	 *
	 * The shift/xor chain leaves, in each 64-bit lane,
	 * T2 = (DATA >> 7) ^ (DATA >> 2) ^ (DATA >> 1).
	 */
	ushr		T2.2d, DATA.2d, #5
	eor		T2.16b, T2.16b, DATA.16b
	ushr		T2.2d, T2.2d, #1
	eor		T2.16b, T2.16b, DATA.16b
	ushr		T2.2d, T2.2d, #1
	eor		T1.16b, T1.16b, T2.16b
	eor		DATA.16b, DATA.16b, T1.16b	// DATA := reduced product

	cbnz		w0, 0b				// more blocks in src?

	st1		{DATA.2d}, [x1]			// write back the digest
	ret
ENDPROC(pmull_ghash_update)