/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */
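
/*
 * The core routines below operate on eight AES blocks at a time: after
 * the 'bitslice' transposition further down, each of the vector
 * registers v0-v7 holds one bit position of every byte of all eight
 * blocks, so a full AES round is evaluated with plain bitwise NEON
 * operations on all eight blocks in parallel.
 */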

#include <linux/linkage.h>
#include <asm/assembler.h>

        .text

        rounds          .req    x11
        bskey           .req    x12

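/*
 * SubBytes is evaluated as a linear change of basis, an inversion in
 * GF(2^8) over a tower of subfields (inv_gf256 below), and a change of
 * basis back, following the compact S-box circuit described in the
 * paper referenced above. in_bs_ch/out_bs_ch are the input/output
 * basis changes of the forward S-box; inv_in_bs_ch/inv_out_bs_ch are
 * the corresponding ones for the inverse S-box.
 */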
        .macro  in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
        eor     \b2, \b2, \b1
        eor     \b5, \b5, \b6
        eor     \b3, \b3, \b0
        eor     \b6, \b6, \b2
        eor     \b5, \b5, \b0
        eor     \b6, \b6, \b3
        eor     \b3, \b3, \b7
        eor     \b7, \b7, \b5
        eor     \b3, \b3, \b4
        eor     \b4, \b4, \b5
        eor     \b2, \b2, \b7
        eor     \b3, \b3, \b1
        eor     \b1, \b1, \b5
        .endm

        .macro  out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
        eor     \b0, \b0, \b6
        eor     \b1, \b1, \b4
        eor     \b4, \b4, \b6
        eor     \b2, \b2, \b0
        eor     \b6, \b6, \b1
        eor     \b1, \b1, \b5
        eor     \b5, \b5, \b3
        eor     \b3, \b3, \b7
        eor     \b7, \b7, \b5
        eor     \b2, \b2, \b5
        eor     \b4, \b4, \b7
        .endm

        .macro  inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
        eor     \b1, \b1, \b7
        eor     \b4, \b4, \b7
        eor     \b7, \b7, \b5
        eor     \b1, \b1, \b3
        eor     \b2, \b2, \b5
        eor     \b3, \b3, \b7
        eor     \b6, \b6, \b1
        eor     \b2, \b2, \b0
        eor     \b5, \b5, \b3
        eor     \b4, \b4, \b6
        eor     \b0, \b0, \b6
        eor     \b1, \b1, \b4
        .endm

        .macro  inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
        eor     \b1, \b1, \b5
        eor     \b2, \b2, \b7
        eor     \b3, \b3, \b1
        eor     \b4, \b4, \b5
        eor     \b7, \b7, \b5
        eor     \b3, \b3, \b4
        eor     \b5, \b5, \b0
        eor     \b3, \b3, \b7
        eor     \b6, \b6, \b2
        eor     \b2, \b2, \b1
        eor     \b6, \b6, \b3
        eor     \b3, \b3, \b0
        eor     \b5, \b5, \b6
        .endm

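/*
 * Subfield multiplication helpers for the tower-field inversion below:
 * mul_gf4 multiplies two GF(2^2) elements held as bit planes,
 * mul_gf4_n_gf4 performs two such multiplications side by side, and
 * mul_gf16_2 uses them to multiply both GF(2^4) halves of the state by
 * a common GF(2^4) factor. Since every operand is a bit plane, each
 * eor/and here acts on that bit of all bytes of all eight blocks at
 * once.
 */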
        .macro  mul_gf4, x0, x1, y0, y1, t0, t1
        eor     \t0, \y0, \y1
        and     \t0, \t0, \x0
        eor     \x0, \x0, \x1
        and     \t1, \x1, \y0
        and     \x0, \x0, \y1
        eor     \x1, \t1, \t0
        eor     \x0, \x0, \t1
        .endm

        .macro  mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
        eor     \t0, \y0, \y1
        eor     \t1, \y2, \y3
        and     \t0, \t0, \x0
        and     \t1, \t1, \x2
        eor     \x0, \x0, \x1
        eor     \x2, \x2, \x3
        and     \x1, \x1, \y0
        and     \x3, \x3, \y2
        and     \x0, \x0, \y1
        and     \x2, \x2, \y3
        eor     \x1, \x1, \x0
        eor     \x2, \x2, \x3
        eor     \x0, \x0, \t0
        eor     \x3, \x3, \t1
        .endm

        .macro  mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
                            y0, y1, y2, y3, t0, t1, t2, t3
        eor     \t0, \x0, \x2
        eor     \t1, \x1, \x3
        mul_gf4 \x0, \x1, \y0, \y1, \t2, \t3
        eor     \y0, \y0, \y2
        eor     \y1, \y1, \y3
        mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
        eor     \x0, \x0, \t0
        eor     \x2, \x2, \t0
        eor     \x1, \x1, \t1
        eor     \x3, \x3, \t1
        eor     \t0, \x4, \x6
        eor     \t1, \x5, \x7
        mul_gf4_n_gf4 \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
        eor     \y0, \y0, \y2
        eor     \y1, \y1, \y3
        mul_gf4 \x4, \x5, \y0, \y1, \t2, \t3
        eor     \x4, \x4, \t0
        eor     \x6, \x6, \t0
        eor     \x5, \x5, \t1
        eor     \x7, \x7, \t1
        .endm

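/*
 * inv_gf256 computes the multiplicative inverse in GF(2^8), expressed
 * over the GF(((2^2)^2)^2) tower: a short and/orr/bsl circuit derives
 * the inverse of the GF(2^4) "norm" of the input, and mul_gf16_2 then
 * multiplies it back into both halves. This is the shared non-linear
 * core of the forward and inverse S-box.
 */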
        .macro  inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
                           t0, t1, t2, t3, s0, s1, s2, s3
        eor     \t3, \x4, \x6
        eor     \t0, \x5, \x7
        eor     \t1, \x1, \x3
        eor     \s1, \x7, \x6
        eor     \s0, \x0, \x2
        eor     \s3, \t3, \t0
        orr     \t2, \t0, \t1
        and     \s2, \t3, \s0
        orr     \t3, \t3, \s0
        eor     \s0, \s0, \t1
        and     \t0, \t0, \t1
        eor     \t1, \x3, \x2
        and     \s3, \s3, \s0
        and     \s1, \s1, \t1
        eor     \t1, \x4, \x5
        eor     \s0, \x1, \x0
        eor     \t3, \t3, \s1
        eor     \t2, \t2, \s1
        and     \s1, \t1, \s0
        orr     \t1, \t1, \s0
        eor     \t3, \t3, \s3
        eor     \t0, \t0, \s1
        eor     \t2, \t2, \s2
        eor     \t1, \t1, \s3
        eor     \t0, \t0, \s2
        and     \s0, \x7, \x3
        eor     \t1, \t1, \s2
        and     \s1, \x6, \x2
        and     \s2, \x5, \x1
        orr     \s3, \x4, \x0
        eor     \t3, \t3, \s0
        eor     \t1, \t1, \s2
        eor     \s0, \t0, \s3
        eor     \t2, \t2, \s1
        and     \s2, \t3, \t1
        eor     \s1, \t2, \s2
        eor     \s3, \s0, \s2
        bsl     \s1, \t1, \s0
        not     \t0, \s0
        bsl     \s0, \s1, \s3
        bsl     \t0, \s1, \s3
        bsl     \s3, \t3, \t2
        eor     \t3, \t3, \t2
        and     \s2, \s0, \s3
        eor     \t1, \t1, \t0
        eor     \s2, \s2, \t3
        mul_gf16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
                   \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
        .endm

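/*
 * The complete bit-sliced SubBytes/InvSubBytes: input basis change,
 * shared GF(2^8) inversion, output basis change. The permuted register
 * order passed to inv_gf256 and to the output basis change accounts
 * for where the preceding step leaves each bit plane.
 */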
        .macro  sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
                      t0, t1, t2, t3, s0, s1, s2, s3
        in_bs_ch        \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
                        \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
        inv_gf256       \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
                        \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
                        \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
                        \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
        out_bs_ch       \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
                        \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
        .endm

        .macro  inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
                          t0, t1, t2, t3, s0, s1, s2, s3
        inv_in_bs_ch    \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
                        \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
        inv_gf256       \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
                        \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
                        \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
                        \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
        inv_out_bs_ch   \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
                        \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
        .endm

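/*
 * Load the next round's bit-sliced round key (eight 128-bit vectors)
 * into v16-v23: encryption walks the key schedule forwards in steps
 * of 128 bytes, decryption walks it backwards.
 */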
        .macro  enc_next_rk
        ldp     q16, q17, [bskey], #128
        ldp     q18, q19, [bskey, #-96]
        ldp     q20, q21, [bskey, #-64]
        ldp     q22, q23, [bskey, #-32]
        .endm

        .macro  dec_next_rk
        ldp     q16, q17, [bskey, #-128]!
        ldp     q18, q19, [bskey, #32]
        ldp     q20, q21, [bskey, #64]
        ldp     q22, q23, [bskey, #96]
        .endm

        .macro  add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
        eor     \x0\().16b, \x0\().16b, v16.16b
        eor     \x1\().16b, \x1\().16b, v17.16b
        eor     \x2\().16b, \x2\().16b, v18.16b
        eor     \x3\().16b, \x3\().16b, v19.16b
        eor     \x4\().16b, \x4\().16b, v20.16b
        eor     \x5\().16b, \x5\().16b, v21.16b
        eor     \x6\().16b, \x6\().16b, v22.16b
        eor     \x7\().16b, \x7\().16b, v23.16b
        .endm

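/*
 * ShiftRows (and its inverse) is a fixed byte permutation, so it is
 * applied to each bit plane with a single tbl per register, using one
 * of the permutation masks defined below (SR, ISR, or a combined
 * variant).
 */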
        .macro  shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
        tbl     \x0\().16b, {\x0\().16b}, \mask\().16b
        tbl     \x1\().16b, {\x1\().16b}, \mask\().16b
        tbl     \x2\().16b, {\x2\().16b}, \mask\().16b
        tbl     \x3\().16b, {\x3\().16b}, \mask\().16b
        tbl     \x4\().16b, {\x4\().16b}, \mask\().16b
        tbl     \x5\().16b, {\x5\().16b}, \mask\().16b
        tbl     \x6\().16b, {\x6\().16b}, \mask\().16b
        tbl     \x7\().16b, {\x7\().16b}, \mask\().16b
        .endm

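/*
 * MixColumns on the bit-sliced state. Following the paper, the whole
 * column mixing is expressed as XORs of byte-rotated copies of the bit
 * planes, which is what the ext #12/#8 rotations below implement.
 * Passing a non-blank \inv argument selects the alternative output
 * arrangement used by inv_mix_cols.
 */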
        .macro  mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
                          t0, t1, t2, t3, t4, t5, t6, t7, inv
        ext     \t0\().16b, \x0\().16b, \x0\().16b, #12
        ext     \t1\().16b, \x1\().16b, \x1\().16b, #12
        eor     \x0\().16b, \x0\().16b, \t0\().16b
        ext     \t2\().16b, \x2\().16b, \x2\().16b, #12
        eor     \x1\().16b, \x1\().16b, \t1\().16b
        ext     \t3\().16b, \x3\().16b, \x3\().16b, #12
        eor     \x2\().16b, \x2\().16b, \t2\().16b
        ext     \t4\().16b, \x4\().16b, \x4\().16b, #12
        eor     \x3\().16b, \x3\().16b, \t3\().16b
        ext     \t5\().16b, \x5\().16b, \x5\().16b, #12
        eor     \x4\().16b, \x4\().16b, \t4\().16b
        ext     \t6\().16b, \x6\().16b, \x6\().16b, #12
        eor     \x5\().16b, \x5\().16b, \t5\().16b
        ext     \t7\().16b, \x7\().16b, \x7\().16b, #12
        eor     \x6\().16b, \x6\().16b, \t6\().16b
        eor     \t1\().16b, \t1\().16b, \x0\().16b
        eor     \x7\().16b, \x7\().16b, \t7\().16b
        ext     \x0\().16b, \x0\().16b, \x0\().16b, #8
        eor     \t2\().16b, \t2\().16b, \x1\().16b
        eor     \t0\().16b, \t0\().16b, \x7\().16b
        eor     \t1\().16b, \t1\().16b, \x7\().16b
        ext     \x1\().16b, \x1\().16b, \x1\().16b, #8
        eor     \t5\().16b, \t5\().16b, \x4\().16b
        eor     \x0\().16b, \x0\().16b, \t0\().16b
        eor     \t6\().16b, \t6\().16b, \x5\().16b
        eor     \x1\().16b, \x1\().16b, \t1\().16b
        ext     \t0\().16b, \x4\().16b, \x4\().16b, #8
        eor     \t4\().16b, \t4\().16b, \x3\().16b
        ext     \t1\().16b, \x5\().16b, \x5\().16b, #8
        eor     \t7\().16b, \t7\().16b, \x6\().16b
        ext     \x4\().16b, \x3\().16b, \x3\().16b, #8
        eor     \t3\().16b, \t3\().16b, \x2\().16b
        ext     \x5\().16b, \x7\().16b, \x7\().16b, #8
        eor     \t4\().16b, \t4\().16b, \x7\().16b
        ext     \x3\().16b, \x6\().16b, \x6\().16b, #8
        eor     \t3\().16b, \t3\().16b, \x7\().16b
        ext     \x6\().16b, \x2\().16b, \x2\().16b, #8
        eor     \x7\().16b, \t1\().16b, \t5\().16b
        .ifb    \inv
        eor     \x2\().16b, \t0\().16b, \t4\().16b
        eor     \x4\().16b, \x4\().16b, \t3\().16b
        eor     \x5\().16b, \x5\().16b, \t7\().16b
        eor     \x3\().16b, \x3\().16b, \t6\().16b
        eor     \x6\().16b, \x6\().16b, \t2\().16b
        .else
        eor     \t3\().16b, \t3\().16b, \x4\().16b
        eor     \x5\().16b, \x5\().16b, \t7\().16b
        eor     \x2\().16b, \x3\().16b, \t6\().16b
        eor     \x3\().16b, \t0\().16b, \t4\().16b
        eor     \x4\().16b, \x6\().16b, \t2\().16b
        mov     \x6\().16b, \t3\().16b
        .endif
        .endm

        .macro  inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
                              t0, t1, t2, t3, t4, t5, t6, t7
        ext     \t0\().16b, \x0\().16b, \x0\().16b, #8
        ext     \t6\().16b, \x6\().16b, \x6\().16b, #8
        ext     \t7\().16b, \x7\().16b, \x7\().16b, #8
        eor     \t0\().16b, \t0\().16b, \x0\().16b
        ext     \t1\().16b, \x1\().16b, \x1\().16b, #8
        eor     \t6\().16b, \t6\().16b, \x6\().16b
        ext     \t2\().16b, \x2\().16b, \x2\().16b, #8
        eor     \t7\().16b, \t7\().16b, \x7\().16b
        ext     \t3\().16b, \x3\().16b, \x3\().16b, #8
        eor     \t1\().16b, \t1\().16b, \x1\().16b
        ext     \t4\().16b, \x4\().16b, \x4\().16b, #8
        eor     \t2\().16b, \t2\().16b, \x2\().16b
        ext     \t5\().16b, \x5\().16b, \x5\().16b, #8
        eor     \t3\().16b, \t3\().16b, \x3\().16b
        eor     \t4\().16b, \t4\().16b, \x4\().16b
        eor     \t5\().16b, \t5\().16b, \x5\().16b
        eor     \x0\().16b, \x0\().16b, \t6\().16b
        eor     \x1\().16b, \x1\().16b, \t6\().16b
        eor     \x2\().16b, \x2\().16b, \t0\().16b
        eor     \x4\().16b, \x4\().16b, \t2\().16b
        eor     \x3\().16b, \x3\().16b, \t1\().16b
        eor     \x1\().16b, \x1\().16b, \t7\().16b
        eor     \x2\().16b, \x2\().16b, \t7\().16b
        eor     \x4\().16b, \x4\().16b, \t6\().16b
        eor     \x5\().16b, \x5\().16b, \t3\().16b
        eor     \x3\().16b, \x3\().16b, \t6\().16b
        eor     \x6\().16b, \x6\().16b, \t4\().16b
        eor     \x4\().16b, \x4\().16b, \t7\().16b
        eor     \x5\().16b, \x5\().16b, \t7\().16b
        eor     \x7\().16b, \x7\().16b, \t5\().16b
        mix_cols        \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
                        \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
        .endm

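/*
 * Conversion between the byte-wise and the bit-sliced representation:
 * 'bitslice' transposes the 8x8 bit matrix spread across the eight
 * vector registers using masked swaps ('swapmove'). The transposition
 * is an involution, so the same macro converts in either direction.
 */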
        .macro  swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
        ushr    \t0\().2d, \b0\().2d, #\n
        ushr    \t1\().2d, \b1\().2d, #\n
        eor     \t0\().16b, \t0\().16b, \a0\().16b
        eor     \t1\().16b, \t1\().16b, \a1\().16b
        and     \t0\().16b, \t0\().16b, \mask\().16b
        and     \t1\().16b, \t1\().16b, \mask\().16b
        eor     \a0\().16b, \a0\().16b, \t0\().16b
        shl     \t0\().2d, \t0\().2d, #\n
        eor     \a1\().16b, \a1\().16b, \t1\().16b
        shl     \t1\().2d, \t1\().2d, #\n
        eor     \b0\().16b, \b0\().16b, \t0\().16b
        eor     \b1\().16b, \b1\().16b, \t1\().16b
        .endm

        .macro  bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
        movi    \t0\().16b, #0x55
        movi    \t1\().16b, #0x33
        swapmove_2x \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
        swapmove_2x \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
        movi    \t0\().16b, #0x0f
        swapmove_2x \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
        swapmove_2x \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
        swapmove_2x \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
        swapmove_2x \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
        .endm


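/*
 * Permutation vectors for tbl: M0 maps the bytes of a block into the
 * ordering used by the bit-sliced representation, SR/ISR apply
 * ShiftRows/InvShiftRows in that ordering, and M0SR/SRM0/M0ISR/ISRM0
 * are compositions that fold the (un)permutation of the data layout
 * into the first or last (Inv)ShiftRows so both are done with a
 * single lookup.
 */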
        .align  6
M0:     .octa   0x0004080c0105090d02060a0e03070b0f

M0SR:   .octa   0x0004080c05090d010a0e02060f03070b
SR:     .octa   0x0f0e0d0c0a09080b0504070600030201
SRM0:   .octa   0x01060b0c0207080d0304090e00050a0f

M0ISR:  .octa   0x0004080c0d0105090a0e0206070b0f03
ISR:    .octa   0x0f0e0d0c080b0a090504070602010003
ISRM0:  .octa   0x0306090c00070a0d01040b0e0205080f

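/*
 * Convert an expanded AES key schedule into the bit-sliced layout used
 * by the routines below: the round 0 key is stored as-is, every inner
 * round key is expanded into eight bit-plane vectors, and the last
 * round key is stored with the S-box constant 0x63 XORed in.
 */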
/*
 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
 */
ENTRY(aesbs_convert_key)
        ld1     {v7.4s}, [x1], #16              // load round 0 key
        ld1     {v17.4s}, [x1], #16             // load round 1 key

        movi    v8.16b, #0x01                   // bit masks
        movi    v9.16b, #0x02
        movi    v10.16b, #0x04
        movi    v11.16b, #0x08
        movi    v12.16b, #0x10
        movi    v13.16b, #0x20
        movi    v14.16b, #0x40
        movi    v15.16b, #0x80
        ldr     q16, M0

        sub     x2, x2, #1
        str     q7, [x0], #16                   // save round 0 key

.Lkey_loop:
        tbl     v7.16b, {v17.16b}, v16.16b
        ld1     {v17.4s}, [x1], #16             // load next round key

        cmtst   v0.16b, v7.16b, v8.16b
        cmtst   v1.16b, v7.16b, v9.16b
        cmtst   v2.16b, v7.16b, v10.16b
        cmtst   v3.16b, v7.16b, v11.16b
        cmtst   v4.16b, v7.16b, v12.16b
        cmtst   v5.16b, v7.16b, v13.16b
        cmtst   v6.16b, v7.16b, v14.16b
        cmtst   v7.16b, v7.16b, v15.16b
        not     v0.16b, v0.16b
        not     v1.16b, v1.16b
        not     v5.16b, v5.16b
        not     v6.16b, v6.16b

        subs    x2, x2, #1
        stp     q0, q1, [x0], #128
        stp     q2, q3, [x0, #-96]
        stp     q4, q5, [x0, #-64]
        stp     q6, q7, [x0, #-32]
        b.ne    .Lkey_loop

        movi    v7.16b, #0x63                   // compose .L63
        eor     v17.16b, v17.16b, v7.16b
        str     q17, [x0]
        ret
ENDPROC(aesbs_convert_key)

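/*
 * aesbs_encrypt8/aesbs_decrypt8: encrypt or decrypt the eight blocks
 * held in v0-v7 in place. bskey must point to a key schedule produced
 * by aesbs_convert_key and rounds must hold the round count; both are
 * consumed (the mode routines below reload them for every chunk), and
 * v8-v24 are used as scratch.
 */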
        .align  4
aesbs_encrypt8:
        ldr     q9, [bskey], #16                // round 0 key
        ldr     q8, M0SR
        ldr     q24, SR

        eor     v10.16b, v0.16b, v9.16b         // xor with round0 key
        eor     v11.16b, v1.16b, v9.16b
        tbl     v0.16b, {v10.16b}, v8.16b
        eor     v12.16b, v2.16b, v9.16b
        tbl     v1.16b, {v11.16b}, v8.16b
        eor     v13.16b, v3.16b, v9.16b
        tbl     v2.16b, {v12.16b}, v8.16b
        eor     v14.16b, v4.16b, v9.16b
        tbl     v3.16b, {v13.16b}, v8.16b
        eor     v15.16b, v5.16b, v9.16b
        tbl     v4.16b, {v14.16b}, v8.16b
        eor     v10.16b, v6.16b, v9.16b
        tbl     v5.16b, {v15.16b}, v8.16b
        eor     v11.16b, v7.16b, v9.16b
        tbl     v6.16b, {v10.16b}, v8.16b
        tbl     v7.16b, {v11.16b}, v8.16b

        bitslice        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

        sub     rounds, rounds, #1
        b       .Lenc_sbox

.Lenc_loop:
        shift_rows      v0, v1, v2, v3, v4, v5, v6, v7, v24
.Lenc_sbox:
        sbox            v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
                        v13, v14, v15
        subs    rounds, rounds, #1
        b.cc    .Lenc_done

        enc_next_rk

        mix_cols        v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
                        v13, v14, v15

        add_round_key   v0, v1, v2, v3, v4, v5, v6, v7

        b.ne    .Lenc_loop
        ldr     q24, SRM0
        b       .Lenc_loop

.Lenc_done:
        ldr     q12, [bskey]                    // last round key

        bitslice        v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

        eor     v0.16b, v0.16b, v12.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v4.16b, v4.16b, v12.16b
        eor     v6.16b, v6.16b, v12.16b
        eor     v3.16b, v3.16b, v12.16b
        eor     v7.16b, v7.16b, v12.16b
        eor     v2.16b, v2.16b, v12.16b
        eor     v5.16b, v5.16b, v12.16b
        ret
ENDPROC(aesbs_encrypt8)

        .align  4
aesbs_decrypt8:
        lsl     x9, rounds, #7
        add     bskey, bskey, x9

        ldr     q9, [bskey, #-112]!             // round 0 key
        ldr     q8, M0ISR
        ldr     q24, ISR

        eor     v10.16b, v0.16b, v9.16b         // xor with round0 key
        eor     v11.16b, v1.16b, v9.16b
        tbl     v0.16b, {v10.16b}, v8.16b
        eor     v12.16b, v2.16b, v9.16b
        tbl     v1.16b, {v11.16b}, v8.16b
        eor     v13.16b, v3.16b, v9.16b
        tbl     v2.16b, {v12.16b}, v8.16b
        eor     v14.16b, v4.16b, v9.16b
        tbl     v3.16b, {v13.16b}, v8.16b
        eor     v15.16b, v5.16b, v9.16b
        tbl     v4.16b, {v14.16b}, v8.16b
        eor     v10.16b, v6.16b, v9.16b
        tbl     v5.16b, {v15.16b}, v8.16b
        eor     v11.16b, v7.16b, v9.16b
        tbl     v6.16b, {v10.16b}, v8.16b
        tbl     v7.16b, {v11.16b}, v8.16b

        bitslice        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

        sub     rounds, rounds, #1
        b       .Ldec_sbox

.Ldec_loop:
        shift_rows      v0, v1, v2, v3, v4, v5, v6, v7, v24
.Ldec_sbox:
        inv_sbox        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
                        v13, v14, v15
        subs    rounds, rounds, #1
        b.cc    .Ldec_done

        dec_next_rk

        add_round_key   v0, v1, v6, v4, v2, v7, v3, v5

        inv_mix_cols    v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
                        v13, v14, v15

        b.ne    .Ldec_loop
        ldr     q24, ISRM0
        b       .Ldec_loop
.Ldec_done:
        ldr     q12, [bskey, #-16]              // last round key

        bitslice        v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

        eor     v0.16b, v0.16b, v12.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v12.16b
        eor     v4.16b, v4.16b, v12.16b
        eor     v2.16b, v2.16b, v12.16b
        eor     v7.16b, v7.16b, v12.16b
        eor     v3.16b, v3.16b, v12.16b
        eor     v5.16b, v5.16b, v12.16b
        ret
ENDPROC(aesbs_decrypt8)

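/*
 * The mode routines below all consume their input in chunks of up to
 * eight blocks. x4 holds the number of blocks remaining; each
 * iteration computes a mask with only bit 'blocks' set (zero if eight
 * or more remain), so the tbnz tests drop out of the load/store
 * sequences once the final partial chunk has been handled. Callers are
 * expected to wrap these routines in kernel_neon_begin() and
 * kernel_neon_end(), e.g. (illustrative only, not the actual glue
 * code; 'ctx', 'dst', 'src' and 'blocks' are made-up names):
 *
 *      kernel_neon_begin();
 *      aesbs_ecb_encrypt(dst, src, ctx->rk, ctx->rounds, blocks);
 *      kernel_neon_end();
 */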
/*
 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *                   int blocks)
 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *                   int blocks)
 */
        .macro  __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
        stp     x29, x30, [sp, #-16]!
        mov     x29, sp

99:     mov     x5, #1
        lsl     x5, x5, x4
        subs    w4, w4, #8
        csel    x4, x4, xzr, pl
        csel    x5, x5, xzr, mi

        ld1     {v0.16b}, [x1], #16
        tbnz    x5, #1, 0f
        ld1     {v1.16b}, [x1], #16
        tbnz    x5, #2, 0f
        ld1     {v2.16b}, [x1], #16
        tbnz    x5, #3, 0f
        ld1     {v3.16b}, [x1], #16
        tbnz    x5, #4, 0f
        ld1     {v4.16b}, [x1], #16
        tbnz    x5, #5, 0f
        ld1     {v5.16b}, [x1], #16
        tbnz    x5, #6, 0f
        ld1     {v6.16b}, [x1], #16
        tbnz    x5, #7, 0f
        ld1     {v7.16b}, [x1], #16

0:      mov     bskey, x2
        mov     rounds, x3
        bl      \do8

        st1     {\o0\().16b}, [x0], #16
        tbnz    x5, #1, 1f
        st1     {\o1\().16b}, [x0], #16
        tbnz    x5, #2, 1f
        st1     {\o2\().16b}, [x0], #16
        tbnz    x5, #3, 1f
        st1     {\o3\().16b}, [x0], #16
        tbnz    x5, #4, 1f
        st1     {\o4\().16b}, [x0], #16
        tbnz    x5, #5, 1f
        st1     {\o5\().16b}, [x0], #16
        tbnz    x5, #6, 1f
        st1     {\o6\().16b}, [x0], #16
        tbnz    x5, #7, 1f
        st1     {\o7\().16b}, [x0], #16

        cbnz    x4, 99b

1:      ldp     x29, x30, [sp], #16
        ret
        .endm

        .align  4
ENTRY(aesbs_ecb_encrypt)
        __ecb_crypt     aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_ecb_encrypt)

        .align  4
ENTRY(aesbs_ecb_decrypt)
        __ecb_crypt     aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_ecb_decrypt)

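/*
 * CBC decryption: up to eight ciphertext blocks are loaded (the first
 * seven are also saved in v25-v31, the eighth is re-read from memory
 * afterwards), decrypted, and each result is XORed with the previous
 * ciphertext block, or with the IV for the first block. The last
 * ciphertext block consumed is written back to iv[] as the chaining
 * value for the next call.
 */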
/*
 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *                   int blocks, u8 iv[])
 */
        .align  4
ENTRY(aesbs_cbc_decrypt)
        stp     x29, x30, [sp, #-16]!
        mov     x29, sp

99:     mov     x6, #1
        lsl     x6, x6, x4
        subs    w4, w4, #8
        csel    x4, x4, xzr, pl
        csel    x6, x6, xzr, mi

        ld1     {v0.16b}, [x1], #16
        mov     v25.16b, v0.16b
        tbnz    x6, #1, 0f
        ld1     {v1.16b}, [x1], #16
        mov     v26.16b, v1.16b
        tbnz    x6, #2, 0f
        ld1     {v2.16b}, [x1], #16
        mov     v27.16b, v2.16b
        tbnz    x6, #3, 0f
        ld1     {v3.16b}, [x1], #16
        mov     v28.16b, v3.16b
        tbnz    x6, #4, 0f
        ld1     {v4.16b}, [x1], #16
        mov     v29.16b, v4.16b
        tbnz    x6, #5, 0f
        ld1     {v5.16b}, [x1], #16
        mov     v30.16b, v5.16b
        tbnz    x6, #6, 0f
        ld1     {v6.16b}, [x1], #16
        mov     v31.16b, v6.16b
        tbnz    x6, #7, 0f
        ld1     {v7.16b}, [x1]

0:      mov     bskey, x2
        mov     rounds, x3
        bl      aesbs_decrypt8

        ld1     {v24.16b}, [x5]                 // load IV

        eor     v1.16b, v1.16b, v25.16b
        eor     v6.16b, v6.16b, v26.16b
        eor     v4.16b, v4.16b, v27.16b
        eor     v2.16b, v2.16b, v28.16b
        eor     v7.16b, v7.16b, v29.16b
        eor     v0.16b, v0.16b, v24.16b
        eor     v3.16b, v3.16b, v30.16b
        eor     v5.16b, v5.16b, v31.16b

        st1     {v0.16b}, [x0], #16
        mov     v24.16b, v25.16b
        tbnz    x6, #1, 1f
        st1     {v1.16b}, [x0], #16
        mov     v24.16b, v26.16b
        tbnz    x6, #2, 1f
        st1     {v6.16b}, [x0], #16
        mov     v24.16b, v27.16b
        tbnz    x6, #3, 1f
        st1     {v4.16b}, [x0], #16
        mov     v24.16b, v28.16b
        tbnz    x6, #4, 1f
        st1     {v2.16b}, [x0], #16
        mov     v24.16b, v29.16b
        tbnz    x6, #5, 1f
        st1     {v7.16b}, [x0], #16
        mov     v24.16b, v30.16b
        tbnz    x6, #6, 1f
        st1     {v3.16b}, [x0], #16
        mov     v24.16b, v31.16b
        tbnz    x6, #7, 1f
        ld1     {v24.16b}, [x1], #16
        st1     {v5.16b}, [x0], #16
1:      st1     {v24.16b}, [x5]                 // store IV

        cbnz    x4, 99b

        ldp     x29, x30, [sp], #16
        ret
ENDPROC(aesbs_cbc_decrypt)

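/*
 * Compute the next XTS tweak: multiply the current tweak by x in
 * GF(2^128) with reduction polynomial 0x87. The sshr turns the top
 * bit of each 64-bit half into an all-ones mask; after the ext swaps
 * the halves, the masked constant supplies both the carry into the
 * upper half and the conditional XOR of the reduction value.
 */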
        .macro  next_tweak, out, in, const, tmp
        sshr    \tmp\().2d, \in\().2d, #63
        and     \tmp\().16b, \tmp\().16b, \const\().16b
        add     \out\().2d, \in\().2d, \in\().2d
        ext     \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        eor     \out\().16b, \out\().16b, \tmp\().16b
        .endm

        .align  4
.Lxts_mul_x:
CPU_LE( .quad   1, 0x87         )
CPU_BE( .quad   0x87, 1         )

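/*
 * __xts_crypt8 loads up to eight blocks and XORs each with its tweak:
 * v25 holds the tweak passed in by the caller and the following ones
 * are generated with next_tweak. The tweaks for blocks 5-8 are stashed
 * on the stack so they survive the core transform, and the routine
 * ends with a tail call to the encrypt8/decrypt8 routine whose address
 * was passed in x7.
 */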
/*
 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *                   int blocks, u8 iv[])
 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *                   int blocks, u8 iv[])
 */
__xts_crypt8:
        mov     x6, #1
        lsl     x6, x6, x4
        subs    w4, w4, #8
        csel    x4, x4, xzr, pl
        csel    x6, x6, xzr, mi

        ld1     {v0.16b}, [x1], #16
        next_tweak      v26, v25, v30, v31
        eor     v0.16b, v0.16b, v25.16b
        tbnz    x6, #1, 0f

        ld1     {v1.16b}, [x1], #16
        next_tweak      v27, v26, v30, v31
        eor     v1.16b, v1.16b, v26.16b
        tbnz    x6, #2, 0f

        ld1     {v2.16b}, [x1], #16
        next_tweak      v28, v27, v30, v31
        eor     v2.16b, v2.16b, v27.16b
        tbnz    x6, #3, 0f

        ld1     {v3.16b}, [x1], #16
        next_tweak      v29, v28, v30, v31
        eor     v3.16b, v3.16b, v28.16b
        tbnz    x6, #4, 0f

        ld1     {v4.16b}, [x1], #16
        str     q29, [sp, #16]
        eor     v4.16b, v4.16b, v29.16b
        next_tweak      v29, v29, v30, v31
        tbnz    x6, #5, 0f

        ld1     {v5.16b}, [x1], #16
        str     q29, [sp, #32]
        eor     v5.16b, v5.16b, v29.16b
        next_tweak      v29, v29, v30, v31
        tbnz    x6, #6, 0f

        ld1     {v6.16b}, [x1], #16
        str     q29, [sp, #48]
        eor     v6.16b, v6.16b, v29.16b
        next_tweak      v29, v29, v30, v31
        tbnz    x6, #7, 0f

        ld1     {v7.16b}, [x1], #16
        str     q29, [sp, #64]
        eor     v7.16b, v7.16b, v29.16b
        next_tweak      v29, v29, v30, v31

0:      mov     bskey, x2
        mov     rounds, x3
        br      x7
ENDPROC(__xts_crypt8)

        .macro  __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
        stp     x29, x30, [sp, #-80]!
        mov     x29, sp

        ldr     q30, .Lxts_mul_x
        ld1     {v25.16b}, [x5]

99:     adr     x7, \do8
        bl      __xts_crypt8

        ldp     q16, q17, [sp, #16]
        ldp     q18, q19, [sp, #48]

        eor     \o0\().16b, \o0\().16b, v25.16b
        eor     \o1\().16b, \o1\().16b, v26.16b
        eor     \o2\().16b, \o2\().16b, v27.16b
        eor     \o3\().16b, \o3\().16b, v28.16b

        st1     {\o0\().16b}, [x0], #16
        mov     v25.16b, v26.16b
        tbnz    x6, #1, 1f
        st1     {\o1\().16b}, [x0], #16
        mov     v25.16b, v27.16b
        tbnz    x6, #2, 1f
        st1     {\o2\().16b}, [x0], #16
        mov     v25.16b, v28.16b
        tbnz    x6, #3, 1f
        st1     {\o3\().16b}, [x0], #16
        mov     v25.16b, v29.16b
        tbnz    x6, #4, 1f

        eor     \o4\().16b, \o4\().16b, v16.16b
        eor     \o5\().16b, \o5\().16b, v17.16b
        eor     \o6\().16b, \o6\().16b, v18.16b
        eor     \o7\().16b, \o7\().16b, v19.16b

        st1     {\o4\().16b}, [x0], #16
        tbnz    x6, #5, 1f
        st1     {\o5\().16b}, [x0], #16
        tbnz    x6, #6, 1f
        st1     {\o6\().16b}, [x0], #16
        tbnz    x6, #7, 1f
        st1     {\o7\().16b}, [x0], #16

        cbnz    x4, 99b

1:      st1     {v25.16b}, [x5]
        ldp     x29, x30, [sp], #80
        ret
        .endm

ENTRY(aesbs_xts_encrypt)
        __xts_crypt     aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_xts_encrypt)

ENTRY(aesbs_xts_decrypt)
        __xts_crypt     aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_xts_decrypt)

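/*
 * Generate the next counter block: x7:x8 hold the 128-bit counter as
 * two host-endian words (x7 most significant). They are moved into the
 * two 64-bit lanes and byte-reversed with rev64 to produce the
 * big-endian block, and the counter itself is then incremented with
 * carry for the next call.
 */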
        .macro  next_ctr, v
        mov     \v\().d[1], x8
        adds    x8, x8, #1
        mov     \v\().d[0], x7
        adc     x7, x7, xzr
        rev64   \v\().16b, \v\().16b
        .endm

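/*
 * CTR encryption: when final[] is non-NULL the routine processes one
 * extra counter block, and the keystream block that corresponds to the
 * final (partial) block of input is written to final[] instead of
 * being XORed into the data, so the caller can handle the tail itself.
 */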
/*
 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
 *                   int rounds, int blocks, u8 iv[], u8 final[])
 */
ENTRY(aesbs_ctr_encrypt)
        stp     x29, x30, [sp, #-16]!
        mov     x29, sp

        cmp     x6, #0
        cset    x10, ne
        add     x4, x4, x10                     // do one extra block if final

        ldp     x7, x8, [x5]
        ld1     {v0.16b}, [x5]
CPU_LE( rev     x7, x7          )
CPU_LE( rev     x8, x8          )
        adds    x8, x8, #1
        adc     x7, x7, xzr

99:     mov     x9, #1
        lsl     x9, x9, x4
        subs    w4, w4, #8
        csel    x4, x4, xzr, pl
        csel    x9, x9, xzr, le

        tbnz    x9, #1, 0f
        next_ctr        v1
        tbnz    x9, #2, 0f
        next_ctr        v2
        tbnz    x9, #3, 0f
        next_ctr        v3
        tbnz    x9, #4, 0f
        next_ctr        v4
        tbnz    x9, #5, 0f
        next_ctr        v5
        tbnz    x9, #6, 0f
        next_ctr        v6
        tbnz    x9, #7, 0f
        next_ctr        v7

0:      mov     bskey, x2
        mov     rounds, x3
        bl      aesbs_encrypt8

        lsr     x9, x9, x10                     // disregard the extra block
        tbnz    x9, #0, 0f

        ld1     {v8.16b}, [x1], #16
        eor     v0.16b, v0.16b, v8.16b
        st1     {v0.16b}, [x0], #16
        tbnz    x9, #1, 1f

        ld1     {v9.16b}, [x1], #16
        eor     v1.16b, v1.16b, v9.16b
        st1     {v1.16b}, [x0], #16
        tbnz    x9, #2, 2f

        ld1     {v10.16b}, [x1], #16
        eor     v4.16b, v4.16b, v10.16b
        st1     {v4.16b}, [x0], #16
        tbnz    x9, #3, 3f

        ld1     {v11.16b}, [x1], #16
        eor     v6.16b, v6.16b, v11.16b
        st1     {v6.16b}, [x0], #16
        tbnz    x9, #4, 4f

        ld1     {v12.16b}, [x1], #16
        eor     v3.16b, v3.16b, v12.16b
        st1     {v3.16b}, [x0], #16
        tbnz    x9, #5, 5f

        ld1     {v13.16b}, [x1], #16
        eor     v7.16b, v7.16b, v13.16b
        st1     {v7.16b}, [x0], #16
        tbnz    x9, #6, 6f

        ld1     {v14.16b}, [x1], #16
        eor     v2.16b, v2.16b, v14.16b
        st1     {v2.16b}, [x0], #16
        tbnz    x9, #7, 7f

        ld1     {v15.16b}, [x1], #16
        eor     v5.16b, v5.16b, v15.16b
        st1     {v5.16b}, [x0], #16

8:      next_ctr        v0
        cbnz    x4, 99b

0:      st1     {v0.16b}, [x5]
        ldp     x29, x30, [sp], #16
        ret

        /*
         * If we are handling the tail of the input (x6 != NULL), return the
         * final keystream block back to the caller.
         */
1:      cbz     x6, 8b
        st1     {v1.16b}, [x6]
        b       8b
2:      cbz     x6, 8b
        st1     {v4.16b}, [x6]
        b       8b
3:      cbz     x6, 8b
        st1     {v6.16b}, [x6]
        b       8b
4:      cbz     x6, 8b
        st1     {v3.16b}, [x6]
        b       8b
5:      cbz     x6, 8b
        st1     {v7.16b}, [x6]
        b       8b
6:      cbz     x6, 8b
        st1     {v2.16b}, [x6]
        b       8b
7:      cbz     x6, 8b
        st1     {v5.16b}, [x6]
        b       8b
ENDPROC(aesbs_ctr_encrypt)