// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif
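// MemorySanitizer cannot see stores made by hand-written assembly, so any
// output produced here would look uninitialized to it. Forcing
// OPENSSL_NO_ASM under MSan makes the library fall back to the C
// implementations, which MSan can instrument.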

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.section __TEXT,__const


.align 7  // totally strategic alignment
_vpaes_consts:
Lk_mc_forward:  // mc_forward
.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
.quad 0x080B0A0904070605, 0x000302010C0F0E0D
.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
.quad 0x000302010C0F0E0D, 0x080B0A0904070605
Lk_mc_backward:  // mc_backward
.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
.quad 0x020100030E0D0C0F, 0x0A09080B06050407
.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
.quad 0x0A09080B06050407, 0x020100030E0D0C0F
Lk_sr:  // sr
.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad 0x030E09040F0A0500, 0x0B06010C07020D08
.quad 0x0F060D040B020900, 0x070E050C030A0108
.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
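// These tables are permutation masks for the NEON tbl instruction, which
// does a parallel byte lookup: result[i] = table[index[i]], or 0 when the
// index is out of range. A rough C model (the name "tbl16" is ad hoc):
//
//     void tbl16(uint8_t out[16], const uint8_t tbl[16],
//                const uint8_t idx[16]) {
//         for (int i = 0; i < 16; i++)
//             out[i] = (idx[i] < 16) ? tbl[idx[i]] : 0;
//     }
//
// Lk_mc_forward/Lk_mc_backward rotate the bytes within each 4-byte column
// by one position, in opposite directions; they are the building blocks of
// the MixColumns shuffles. The four rows of Lk_sr are, roughly, ShiftRows
// composed with the extra rotation left over from those shuffles, selected
// by the round count mod 4 (row 0 is the identity permutation).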

//
// "Hot" constants
//
Lk_inv:  // inv, inva
.quad 0x0E05060F0D080180, 0x040703090A0B0C02
.quad 0x01040A060F0B0780, 0x030D0E0C02050809
Lk_ipt:  // input transform (lo, hi)
.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
Lk_sbo:  // sbou, sbot
.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
Lk_sb1:  // sb1u, sb1t
.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
Lk_sb2:  // sb2u, sb2t
.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
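// A 16-byte tbl table can only index 16 entries, so every 8-bit lookup
// below is split into a low-nibble and a high-nibble lookup whose results
// are XORed. A rough C model of the pattern (names are ad hoc):
//
//     uint8_t split_lookup(uint8_t x, const uint8_t lo[16],
//                          const uint8_t hi[16]) {
//         return lo[x & 0x0f] ^ hi[x >> 4];
//     }
//
// Lk_inv drives the GF(2^4) inversion steps, Lk_ipt maps the input into
// the basis the tables are built in, and Lk_sb1/Lk_sb2/Lk_sbo are the
// split S-box output tables for the middle and last rounds.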

//
// Decryption stuff
//
Lk_dipt:  // decryption input transform
.quad 0x0F505B040B545F00, 0x154A411E114E451A
.quad 0x86E383E660056500, 0x12771772F491F194
Lk_dsbo:  // decryption sbox final output
.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
Lk_dsb9:  // decryption sbox output *9*u, *9*t
.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
Lk_dsbd:  // decryption sbox output *D*u, *D*t
.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
Lk_dsbb:  // decryption sbox output *B*u, *B*t
.quad 0xD022649296B44200, 0x602646F6B0F2D404
.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
Lk_dsbe:  // decryption sbox output *E*u, *E*t
.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32

//
// Key schedule constants
//
Lk_dksd:  // decryption key schedule: invskew x*D
.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
Lk_dksb:  // decryption key schedule: invskew x*B
.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
Lk_dkse:  // decryption key schedule: invskew x*E + 0x63
.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
Lk_dks9:  // decryption key schedule: invskew x*9
.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE

Lk_rcon:  // rcon
.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

Lk_opt:  // output transform
.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
Lk_deskew:  // deskew tables: inverts the sbox's "skew"
.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align 2

.align 6

.text
##
##  _vpaes_encrypt_preheat
##
##  Fills x10 with the address of the constant tables (adrp/add keeps
##  the code position-independent) and v17-v27 as loaded below.
##

.align 4
_vpaes_encrypt_preheat:
    adrp x10, Lk_inv@PAGE
    add x10, x10, Lk_inv@PAGEOFF
    movi v17.16b, #0x0f
    ld1 {v18.2d,v19.2d}, [x10],#32  // Lk_inv
    ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64  // Lk_ipt, Lk_sbo
    ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10]  // Lk_sb1, Lk_sb2
    ret
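// After _vpaes_encrypt_preheat:
//   v17 = 0x0f..0f nibble mask   v18,v19 = Lk_inv
//   v20,v21 = Lk_ipt             v22,v23 = Lk_sbo
//   v24,v25 = Lk_sb1             v26,v27 = Lk_sb2
// This mirrors the %xmm9-%xmm15 preload that the x86-64 comments
// below describe.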


##
##  _vpaes_encrypt_core
##
##  AES-encrypt one block.
##
##  Inputs:
##     v7 = input
##     v17-v27 as filled by _vpaes_encrypt_preheat
##     x2 = pointer to the scheduled keys
##
##  Output in v0
##  Clobbers v1-v5, v16, w8, x9-x11
##  Preserves v6 and v8-v15 (v7 is only read), so callers keep some
##  local vectors
##

.align 4
_vpaes_encrypt_core:
    mov x9, x2
    ldr w8, [x2,#240]  // pull rounds
    adrp x11, Lk_mc_forward@PAGE+16
    add x11, x11, Lk_mc_forward@PAGEOFF+16
    // vmovdqa .Lk_ipt(%rip), %xmm2  # iptlo
    ld1 {v16.2d}, [x9], #16  // vmovdqu (%r9), %xmm5  # round0 key
    and v1.16b, v7.16b, v17.16b  // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v7.16b, #4  // vpsrlb $4, %xmm0, %xmm0
    tbl v1.16b, {v20.16b}, v1.16b  // vpshufb %xmm1, %xmm2, %xmm1
    // vmovdqa .Lk_ipt+16(%rip), %xmm3  # ipthi
    tbl v2.16b, {v21.16b}, v0.16b  // vpshufb %xmm0, %xmm3, %xmm2
    eor v0.16b, v1.16b, v16.16b  // vpxor %xmm5, %xmm1, %xmm0
    eor v0.16b, v0.16b, v2.16b  // vpxor %xmm2, %xmm0, %xmm0
    b Lenc_entry

.align 4
Lenc_loop:
    // middle of middle round
    add x10, x11, #0x40
    tbl v4.16b, {v25.16b}, v2.16b  // vpshufb %xmm2, %xmm13, %xmm4  # 4 = sb1u
    ld1 {v1.2d}, [x11], #16  // vmovdqa -0x40(%r11,%r10), %xmm1  # Lk_mc_forward[]
    tbl v0.16b, {v24.16b}, v3.16b  // vpshufb %xmm3, %xmm12, %xmm0  # 0 = sb1t
    eor v4.16b, v4.16b, v16.16b  // vpxor %xmm5, %xmm4, %xmm4  # 4 = sb1u + k
    tbl v5.16b, {v27.16b}, v2.16b  // vpshufb %xmm2, %xmm15, %xmm5  # 4 = sb2u
    eor v0.16b, v0.16b, v4.16b  // vpxor %xmm4, %xmm0, %xmm0  # 0 = A
    tbl v2.16b, {v26.16b}, v3.16b  // vpshufb %xmm3, %xmm14, %xmm2  # 2 = sb2t
    ld1 {v4.2d}, [x10]  // vmovdqa (%r11,%r10), %xmm4  # Lk_mc_backward[]
    tbl v3.16b, {v0.16b}, v1.16b  // vpshufb %xmm1, %xmm0, %xmm3  # 0 = B
    eor v2.16b, v2.16b, v5.16b  // vpxor %xmm5, %xmm2, %xmm2  # 2 = 2A
    tbl v0.16b, {v0.16b}, v4.16b  // vpshufb %xmm4, %xmm0, %xmm0  # 3 = D
    eor v3.16b, v3.16b, v2.16b  // vpxor %xmm2, %xmm3, %xmm3  # 0 = 2A+B
    tbl v4.16b, {v3.16b}, v1.16b  // vpshufb %xmm1, %xmm3, %xmm4  # 0 = 2B+C
    eor v0.16b, v0.16b, v3.16b  // vpxor %xmm3, %xmm0, %xmm0  # 3 = 2A+B+D
    and x11, x11, #~(1<<6)  // and $0x30, %r11  # ... mod 4
    eor v0.16b, v0.16b, v4.16b  // vpxor %xmm4, %xmm0, %xmm0  # 0 = 2A+3B+C+D
    sub w8, w8, #1  // nr--

Lenc_entry:
    // top of round
    and v1.16b, v0.16b, v17.16b  // vpand %xmm0, %xmm9, %xmm1  # 0 = k
    ushr v0.16b, v0.16b, #4  // vpsrlb $4, %xmm0, %xmm0  # 1 = i
    tbl v5.16b, {v19.16b}, v1.16b  // vpshufb %xmm1, %xmm11, %xmm5  # 2 = a/k
    eor v1.16b, v1.16b, v0.16b  // vpxor %xmm0, %xmm1, %xmm1  # 0 = j
    tbl v3.16b, {v18.16b}, v0.16b  // vpshufb %xmm0, %xmm10, %xmm3  # 3 = 1/i
    tbl v4.16b, {v18.16b}, v1.16b  // vpshufb %xmm1, %xmm10, %xmm4  # 4 = 1/j
    eor v3.16b, v3.16b, v5.16b  // vpxor %xmm5, %xmm3, %xmm3  # 3 = iak = 1/i + a/k
    eor v4.16b, v4.16b, v5.16b  // vpxor %xmm5, %xmm4, %xmm4  # 4 = jak = 1/j + a/k
    tbl v2.16b, {v18.16b}, v3.16b  // vpshufb %xmm3, %xmm10, %xmm2  # 2 = 1/iak
    tbl v3.16b, {v18.16b}, v4.16b  // vpshufb %xmm4, %xmm10, %xmm3  # 3 = 1/jak
    eor v2.16b, v2.16b, v1.16b  // vpxor %xmm1, %xmm2, %xmm2  # 2 = io
    eor v3.16b, v3.16b, v0.16b  // vpxor %xmm0, %xmm3, %xmm3  # 3 = jo
    ld1 {v16.2d}, [x9],#16  // vmovdqu (%r9), %xmm5
    cbnz w8, Lenc_loop

    // middle of last round
    add x10, x11, #0x80
    // vmovdqa -0x60(%r10), %xmm4  # 3 : sbou  .Lk_sbo
    // vmovdqa -0x50(%r10), %xmm0  # 0 : sbot  .Lk_sbo+16
    tbl v4.16b, {v22.16b}, v2.16b  // vpshufb %xmm2, %xmm4, %xmm4  # 4 = sbou
    ld1 {v1.2d}, [x10]  // vmovdqa 0x40(%r11,%r10), %xmm1  # Lk_sr[]
    tbl v0.16b, {v23.16b}, v3.16b  // vpshufb %xmm3, %xmm0, %xmm0  # 0 = sb1t
    eor v4.16b, v4.16b, v16.16b  // vpxor %xmm5, %xmm4, %xmm4  # 4 = sb1u + k
    eor v0.16b, v0.16b, v4.16b  // vpxor %xmm4, %xmm0, %xmm0  # 0 = A
    tbl v0.16b, {v0.16b}, v1.16b  // vpshufb %xmm1, %xmm0, %xmm0
    ret
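// The Lenc_entry block is the front half of the tower-field S-box: it
// performs the GF(2^4) inversion steps named in the comments (k, i, j,
// iak, jak, io, jo), and the next round's sb1/sb2 lookups finish the
// S-box. A byte-at-a-time C model of what the comments describe
// (illustrative sketch; INV and INVA stand for the two halves of Lk_inv):
//
//     static uint8_t lut(const uint8_t t[16], uint8_t i) {
//         return i < 16 ? t[i] : 0;             // tbl semantics
//     }
//     void enc_entry(uint8_t io[16], uint8_t jo[16], const uint8_t x[16]) {
//         for (int n = 0; n < 16; n++) {
//             uint8_t k = x[n] & 0x0f, i = x[n] >> 4, j = k ^ i;
//             uint8_t ak  = lut(INVA, k);       // a/k
//             uint8_t iak = lut(INV, i) ^ ak;   // 1/i + a/k
//             uint8_t jak = lut(INV, j) ^ ak;   // 1/j + a/k
//             io[n] = lut(INV, iak) ^ j;
//             jo[n] = lut(INV, jak) ^ i;
//         }
//     }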


.globl _vpaes_encrypt
.private_extern _vpaes_encrypt

.align 4
_vpaes_encrypt:
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0

    ld1 {v7.16b}, [x0]
    bl _vpaes_encrypt_preheat
    bl _vpaes_encrypt_core
    st1 {v0.16b}, [x1]

    ldp x29,x30,[sp],#16
    ret
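// The C-level entry points pass arguments in x0-x2 per AAPCS64. The
// prototypes below are paraphrased from BoringSSL's internal AES header
// (an illustrative sketch, not a normative declaration):
//
//     int vpaes_set_encrypt_key(const uint8_t *key, unsigned bits,
//                               AES_KEY *schedule);
//     void vpaes_encrypt(const uint8_t in[16], uint8_t out[16],
//                        const AES_KEY *schedule);
//
//     AES_KEY ks;
//     vpaes_set_encrypt_key(key, 128, &ks);  // x0 = key, w1 = bits, x2 = &ks
//     vpaes_encrypt(in, out, &ks);           // x0 = in,  x1 = out,  x2 = &ks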



.align 4
_vpaes_encrypt_2x:
    mov x9, x2
    ldr w8, [x2,#240]  // pull rounds
    adrp x11, Lk_mc_forward@PAGE+16
    add x11, x11, Lk_mc_forward@PAGEOFF+16
    // vmovdqa .Lk_ipt(%rip), %xmm2  # iptlo
    ld1 {v16.2d}, [x9], #16  // vmovdqu (%r9), %xmm5  # round0 key
    and v1.16b, v14.16b, v17.16b  // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v14.16b, #4  // vpsrlb $4, %xmm0, %xmm0
    and v9.16b, v15.16b, v17.16b
    ushr v8.16b, v15.16b, #4
    tbl v1.16b, {v20.16b}, v1.16b  // vpshufb %xmm1, %xmm2, %xmm1
    tbl v9.16b, {v20.16b}, v9.16b
    // vmovdqa .Lk_ipt+16(%rip), %xmm3  # ipthi
    tbl v2.16b, {v21.16b}, v0.16b  // vpshufb %xmm0, %xmm3, %xmm2
    tbl v10.16b, {v21.16b}, v8.16b
    eor v0.16b, v1.16b, v16.16b  // vpxor %xmm5, %xmm1, %xmm0
    eor v8.16b, v9.16b, v16.16b
    eor v0.16b, v0.16b, v2.16b  // vpxor %xmm2, %xmm0, %xmm0
    eor v8.16b, v8.16b, v10.16b
    b Lenc_2x_entry

.align 4
Lenc_2x_loop:
    // middle of middle round
    add x10, x11, #0x40
    tbl v4.16b, {v25.16b}, v2.16b  // vpshufb %xmm2, %xmm13, %xmm4  # 4 = sb1u
    tbl v12.16b, {v25.16b}, v10.16b
    ld1 {v1.2d}, [x11], #16  // vmovdqa -0x40(%r11,%r10), %xmm1  # Lk_mc_forward[]
    tbl v0.16b, {v24.16b}, v3.16b  // vpshufb %xmm3, %xmm12, %xmm0  # 0 = sb1t
    tbl v8.16b, {v24.16b}, v11.16b
    eor v4.16b, v4.16b, v16.16b  // vpxor %xmm5, %xmm4, %xmm4  # 4 = sb1u + k
    eor v12.16b, v12.16b, v16.16b
    tbl v5.16b, {v27.16b}, v2.16b  // vpshufb %xmm2, %xmm15, %xmm5  # 4 = sb2u
    tbl v13.16b, {v27.16b}, v10.16b
    eor v0.16b, v0.16b, v4.16b  // vpxor %xmm4, %xmm0, %xmm0  # 0 = A
    eor v8.16b, v8.16b, v12.16b
    tbl v2.16b, {v26.16b}, v3.16b  // vpshufb %xmm3, %xmm14, %xmm2  # 2 = sb2t
    tbl v10.16b, {v26.16b}, v11.16b
    ld1 {v4.2d}, [x10]  // vmovdqa (%r11,%r10), %xmm4  # Lk_mc_backward[]
    tbl v3.16b, {v0.16b}, v1.16b  // vpshufb %xmm1, %xmm0, %xmm3  # 0 = B
    tbl v11.16b, {v8.16b}, v1.16b
    eor v2.16b, v2.16b, v5.16b  // vpxor %xmm5, %xmm2, %xmm2  # 2 = 2A
    eor v10.16b, v10.16b, v13.16b
    tbl v0.16b, {v0.16b}, v4.16b  // vpshufb %xmm4, %xmm0, %xmm0  # 3 = D
    tbl v8.16b, {v8.16b}, v4.16b
    eor v3.16b, v3.16b, v2.16b  // vpxor %xmm2, %xmm3, %xmm3  # 0 = 2A+B
    eor v11.16b, v11.16b, v10.16b
    tbl v4.16b, {v3.16b}, v1.16b  // vpshufb %xmm1, %xmm3, %xmm4  # 0 = 2B+C
    tbl v12.16b, {v11.16b}, v1.16b
    eor v0.16b, v0.16b, v3.16b  // vpxor %xmm3, %xmm0, %xmm0  # 3 = 2A+B+D
    eor v8.16b, v8.16b, v11.16b
    and x11, x11, #~(1<<6)  // and $0x30, %r11  # ... mod 4
    eor v0.16b, v0.16b, v4.16b  // vpxor %xmm4, %xmm0, %xmm0  # 0 = 2A+3B+C+D
    eor v8.16b, v8.16b, v12.16b
    sub w8, w8, #1  // nr--

Lenc_2x_entry:
    // top of round
    and v1.16b, v0.16b, v17.16b  // vpand %xmm0, %xmm9, %xmm1  # 0 = k
    ushr v0.16b, v0.16b, #4  // vpsrlb $4, %xmm0, %xmm0  # 1 = i
    and v9.16b, v8.16b, v17.16b
    ushr v8.16b, v8.16b, #4
    tbl v5.16b, {v19.16b}, v1.16b  // vpshufb %xmm1, %xmm11, %xmm5  # 2 = a/k
    tbl v13.16b, {v19.16b}, v9.16b
    eor v1.16b, v1.16b, v0.16b  // vpxor %xmm0, %xmm1, %xmm1  # 0 = j
    eor v9.16b, v9.16b, v8.16b
    tbl v3.16b, {v18.16b}, v0.16b  // vpshufb %xmm0, %xmm10, %xmm3  # 3 = 1/i
    tbl v11.16b, {v18.16b}, v8.16b
    tbl v4.16b, {v18.16b}, v1.16b  // vpshufb %xmm1, %xmm10, %xmm4  # 4 = 1/j
    tbl v12.16b, {v18.16b}, v9.16b
    eor v3.16b, v3.16b, v5.16b  // vpxor %xmm5, %xmm3, %xmm3  # 3 = iak = 1/i + a/k
    eor v11.16b, v11.16b, v13.16b
    eor v4.16b, v4.16b, v5.16b  // vpxor %xmm5, %xmm4, %xmm4  # 4 = jak = 1/j + a/k
    eor v12.16b, v12.16b, v13.16b
    tbl v2.16b, {v18.16b}, v3.16b  // vpshufb %xmm3, %xmm10, %xmm2  # 2 = 1/iak
    tbl v10.16b, {v18.16b}, v11.16b
    tbl v3.16b, {v18.16b}, v4.16b  // vpshufb %xmm4, %xmm10, %xmm3  # 3 = 1/jak
    tbl v11.16b, {v18.16b}, v12.16b
    eor v2.16b, v2.16b, v1.16b  // vpxor %xmm1, %xmm2, %xmm2  # 2 = io
    eor v10.16b, v10.16b, v9.16b
    eor v3.16b, v3.16b, v0.16b  // vpxor %xmm0, %xmm3, %xmm3  # 3 = jo
    eor v11.16b, v11.16b, v8.16b
    ld1 {v16.2d}, [x9],#16  // vmovdqu (%r9), %xmm5
    cbnz w8, Lenc_2x_loop

    // middle of last round
    add x10, x11, #0x80
    // vmovdqa -0x60(%r10), %xmm4  # 3 : sbou  .Lk_sbo
    // vmovdqa -0x50(%r10), %xmm0  # 0 : sbot  .Lk_sbo+16
    tbl v4.16b, {v22.16b}, v2.16b  // vpshufb %xmm2, %xmm4, %xmm4  # 4 = sbou
    tbl v12.16b, {v22.16b}, v10.16b
    ld1 {v1.2d}, [x10]  // vmovdqa 0x40(%r11,%r10), %xmm1  # Lk_sr[]
    tbl v0.16b, {v23.16b}, v3.16b  // vpshufb %xmm3, %xmm0, %xmm0  # 0 = sb1t
    tbl v8.16b, {v23.16b}, v11.16b
    eor v4.16b, v4.16b, v16.16b  // vpxor %xmm5, %xmm4, %xmm4  # 4 = sb1u + k
    eor v12.16b, v12.16b, v16.16b
    eor v0.16b, v0.16b, v4.16b  // vpxor %xmm4, %xmm0, %xmm0  # 0 = A
    eor v8.16b, v8.16b, v12.16b
    tbl v0.16b, {v0.16b}, v1.16b  // vpshufb %xmm1, %xmm0, %xmm0
    tbl v1.16b, {v8.16b}, v1.16b
    ret



.align 4
_vpaes_decrypt_preheat:
    adrp x10, Lk_inv@PAGE
    add x10, x10, Lk_inv@PAGEOFF
    movi v17.16b, #0x0f
    adrp x11, Lk_dipt@PAGE
    add x11, x11, Lk_dipt@PAGEOFF
    ld1 {v18.2d,v19.2d}, [x10],#32  // Lk_inv
    ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64  // Lk_dipt, Lk_dsbo
    ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64  // Lk_dsb9, Lk_dsbd
    ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11]  // Lk_dsbb, Lk_dsbe
    ret
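// After _vpaes_decrypt_preheat:
//   v17 = 0x0f..0f nibble mask   v18,v19 = Lk_inv
//   v20,v21 = Lk_dipt            v22,v23 = Lk_dsbo
//   v24,v25 = Lk_dsb9            v26,v27 = Lk_dsbd
//   v28,v29 = Lk_dsbb            v30,v31 = Lk_dsbe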


##
##  Decryption core
##
##  Same API as encryption core.
##

.align 4
_vpaes_decrypt_core:
    mov x9, x2
    ldr w8, [x2,#240]  // pull rounds

    // vmovdqa .Lk_dipt(%rip), %xmm2  # iptlo
    lsl x11, x8, #4  // mov %rax, %r11; shl $4, %r11
    eor x11, x11, #0x30  // xor $0x30, %r11
    adrp x10, Lk_sr@PAGE
    add x10, x10, Lk_sr@PAGEOFF
    and x11, x11, #0x30  // and $0x30, %r11
    add x11, x11, x10
    adrp x10, Lk_mc_forward@PAGE+48
    add x10, x10, Lk_mc_forward@PAGEOFF+48

    ld1 {v16.2d}, [x9],#16  // vmovdqu (%r9), %xmm4  # round0 key
    and v1.16b, v7.16b, v17.16b  // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v7.16b, #4  // vpsrlb $4, %xmm0, %xmm0
    tbl v2.16b, {v20.16b}, v1.16b  // vpshufb %xmm1, %xmm2, %xmm2
    ld1 {v5.2d}, [x10]  // vmovdqa Lk_mc_forward+48(%rip), %xmm5
    // vmovdqa .Lk_dipt+16(%rip), %xmm1  # ipthi
    tbl v0.16b, {v21.16b}, v0.16b  // vpshufb %xmm0, %xmm1, %xmm0
    eor v2.16b, v2.16b, v16.16b  // vpxor %xmm4, %xmm2, %xmm2
    eor v0.16b, v0.16b, v2.16b  // vpxor %xmm2, %xmm0, %xmm0
    b Ldec_entry

.align 4
Ldec_loop:
    //
    // Inverse mix columns
    //
    // vmovdqa -0x20(%r10),%xmm4  # 4 : sb9u
    // vmovdqa -0x10(%r10),%xmm1  # 0 : sb9t
    tbl v4.16b, {v24.16b}, v2.16b  // vpshufb %xmm2, %xmm4, %xmm4  # 4 = sb9u
    tbl v1.16b, {v25.16b}, v3.16b  // vpshufb %xmm3, %xmm1, %xmm1  # 0 = sb9t
    eor v0.16b, v4.16b, v16.16b  // vpxor %xmm4, %xmm0, %xmm0
    // vmovdqa 0x00(%r10),%xmm4  # 4 : sbdu
    eor v0.16b, v0.16b, v1.16b  // vpxor %xmm1, %xmm0, %xmm0  # 0 = ch
    // vmovdqa 0x10(%r10),%xmm1  # 0 : sbdt

    tbl v4.16b, {v26.16b}, v2.16b  // vpshufb %xmm2, %xmm4, %xmm4  # 4 = sbdu
    tbl v0.16b, {v0.16b}, v5.16b  // vpshufb %xmm5, %xmm0, %xmm0  # MC ch
    tbl v1.16b, {v27.16b}, v3.16b  // vpshufb %xmm3, %xmm1, %xmm1  # 0 = sbdt
    eor v0.16b, v0.16b, v4.16b  // vpxor %xmm4, %xmm0, %xmm0  # 4 = ch
    // vmovdqa 0x20(%r10), %xmm4  # 4 : sbbu
    eor v0.16b, v0.16b, v1.16b  // vpxor %xmm1, %xmm0, %xmm0  # 0 = ch
    // vmovdqa 0x30(%r10), %xmm1  # 0 : sbbt

    tbl v4.16b, {v28.16b}, v2.16b  // vpshufb %xmm2, %xmm4, %xmm4  # 4 = sbbu
    tbl v0.16b, {v0.16b}, v5.16b  // vpshufb %xmm5, %xmm0, %xmm0  # MC ch
    tbl v1.16b, {v29.16b}, v3.16b  // vpshufb %xmm3, %xmm1, %xmm1  # 0 = sbbt
    eor v0.16b, v0.16b, v4.16b  // vpxor %xmm4, %xmm0, %xmm0  # 4 = ch
    // vmovdqa 0x40(%r10), %xmm4  # 4 : sbeu
    eor v0.16b, v0.16b, v1.16b  // vpxor %xmm1, %xmm0, %xmm0  # 0 = ch
    // vmovdqa 0x50(%r10), %xmm1  # 0 : sbet

    tbl v4.16b, {v30.16b}, v2.16b  // vpshufb %xmm2, %xmm4, %xmm4  # 4 = sbeu
    tbl v0.16b, {v0.16b}, v5.16b  // vpshufb %xmm5, %xmm0, %xmm0  # MC ch
    tbl v1.16b, {v31.16b}, v3.16b  // vpshufb %xmm3, %xmm1, %xmm1  # 0 = sbet
    eor v0.16b, v0.16b, v4.16b  // vpxor %xmm4, %xmm0, %xmm0  # 4 = ch
    ext v5.16b, v5.16b, v5.16b, #12  // vpalignr $12, %xmm5, %xmm5, %xmm5
    eor v0.16b, v0.16b, v1.16b  // vpxor %xmm1, %xmm0, %xmm0  # 0 = ch
    sub w8, w8, #1  // sub $1,%rax  # nr--

Ldec_entry:
    // top of round
    and v1.16b, v0.16b, v17.16b  // vpand %xmm9, %xmm0, %xmm1  # 0 = k
    ushr v0.16b, v0.16b, #4  // vpsrlb $4, %xmm0, %xmm0  # 1 = i
    tbl v2.16b, {v19.16b}, v1.16b  // vpshufb %xmm1, %xmm11, %xmm2  # 2 = a/k
    eor v1.16b, v1.16b, v0.16b  // vpxor %xmm0, %xmm1, %xmm1  # 0 = j
    tbl v3.16b, {v18.16b}, v0.16b  // vpshufb %xmm0, %xmm10, %xmm3  # 3 = 1/i
    tbl v4.16b, {v18.16b}, v1.16b  // vpshufb %xmm1, %xmm10, %xmm4  # 4 = 1/j
    eor v3.16b, v3.16b, v2.16b  // vpxor %xmm2, %xmm3, %xmm3  # 3 = iak = 1/i + a/k
    eor v4.16b, v4.16b, v2.16b  // vpxor %xmm2, %xmm4, %xmm4  # 4 = jak = 1/j + a/k
    tbl v2.16b, {v18.16b}, v3.16b  // vpshufb %xmm3, %xmm10, %xmm2  # 2 = 1/iak
    tbl v3.16b, {v18.16b}, v4.16b  // vpshufb %xmm4, %xmm10, %xmm3  # 3 = 1/jak
    eor v2.16b, v2.16b, v1.16b  // vpxor %xmm1, %xmm2, %xmm2  # 2 = io
    eor v3.16b, v3.16b, v0.16b  // vpxor %xmm0, %xmm3, %xmm3  # 3 = jo
    ld1 {v16.2d}, [x9],#16  // vmovdqu (%r9), %xmm0
    cbnz w8, Ldec_loop

    // middle of last round
    // vmovdqa 0x60(%r10), %xmm4  # 3 : sbou
    tbl v4.16b, {v22.16b}, v2.16b  // vpshufb %xmm2, %xmm4, %xmm4  # 4 = sbou
    // vmovdqa 0x70(%r10), %xmm1  # 0 : sbot
    ld1 {v2.2d}, [x11]  // vmovdqa -0x160(%r11), %xmm2  # Lk_sr-Lk_dsbd=-0x160
    tbl v1.16b, {v23.16b}, v3.16b  // vpshufb %xmm3, %xmm1, %xmm1  # 0 = sb1t
    eor v4.16b, v4.16b, v16.16b  // vpxor %xmm0, %xmm4, %xmm4  # 4 = sb1u + k
    eor v0.16b, v1.16b, v4.16b  // vpxor %xmm4, %xmm1, %xmm0  # 0 = A
    tbl v0.16b, {v0.16b}, v2.16b  // vpshufb %xmm2, %xmm0, %xmm0
    ret
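// Each pass of Ldec_loop folds one term of the inverse-MixColumns
// decomposition into the accumulator:
//     ch = tbl(ch, mc) ^ dsbXu[io] ^ dsbXt[jo]   for X = 9, D, B, E
// where io/jo come from the shared Ldec_entry inversion and the mc
// shuffle in v5 is rotated (ext #12) once per round.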


.globl _vpaes_decrypt
.private_extern _vpaes_decrypt

.align 4
_vpaes_decrypt:
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0

    ld1 {v7.16b}, [x0]
    bl _vpaes_decrypt_preheat
    bl _vpaes_decrypt_core
    st1 {v0.16b}, [x1]

    ldp x29,x30,[sp],#16
    ret


// v14-v15 input, v0-v1 output

.align 4
_vpaes_decrypt_2x:
    mov x9, x2
    ldr w8, [x2,#240]  // pull rounds

    // vmovdqa .Lk_dipt(%rip), %xmm2  # iptlo
    lsl x11, x8, #4  // mov %rax, %r11; shl $4, %r11
    eor x11, x11, #0x30  // xor $0x30, %r11
    adrp x10, Lk_sr@PAGE
    add x10, x10, Lk_sr@PAGEOFF
    and x11, x11, #0x30  // and $0x30, %r11
    add x11, x11, x10
    adrp x10, Lk_mc_forward@PAGE+48
    add x10, x10, Lk_mc_forward@PAGEOFF+48

    ld1 {v16.2d}, [x9],#16  // vmovdqu (%r9), %xmm4  # round0 key
    and v1.16b, v14.16b, v17.16b  // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v14.16b, #4  // vpsrlb $4, %xmm0, %xmm0
    and v9.16b, v15.16b, v17.16b
    ushr v8.16b, v15.16b, #4
    tbl v2.16b, {v20.16b}, v1.16b  // vpshufb %xmm1, %xmm2, %xmm2
    tbl v10.16b, {v20.16b}, v9.16b
    ld1 {v5.2d}, [x10]  // vmovdqa Lk_mc_forward+48(%rip), %xmm5
    // vmovdqa .Lk_dipt+16(%rip), %xmm1  # ipthi
    tbl v0.16b, {v21.16b}, v0.16b  // vpshufb %xmm0, %xmm1, %xmm0
    tbl v8.16b, {v21.16b}, v8.16b
    eor v2.16b, v2.16b, v16.16b  // vpxor %xmm4, %xmm2, %xmm2
    eor v10.16b, v10.16b, v16.16b
    eor v0.16b, v0.16b, v2.16b  // vpxor %xmm2, %xmm0, %xmm0
    eor v8.16b, v8.16b, v10.16b
    b Ldec_2x_entry

.align 4
Ldec_2x_loop:
    //
    // Inverse mix columns
    //
    // vmovdqa -0x20(%r10),%xmm4  # 4 : sb9u
    // vmovdqa -0x10(%r10),%xmm1  # 0 : sb9t
    tbl v4.16b, {v24.16b}, v2.16b  // vpshufb %xmm2, %xmm4, %xmm4  # 4 = sb9u
    tbl v12.16b, {v24.16b}, v10.16b
    tbl v1.16b, {v25.16b}, v3.16b  // vpshufb %xmm3, %xmm1, %xmm1  # 0 = sb9t
    tbl v9.16b, {v25.16b}, v11.16b
    eor v0.16b, v4.16b, v16.16b  // vpxor %xmm4, %xmm0, %xmm0
    eor v8.16b, v12.16b, v16.16b
    // vmovdqa 0x00(%r10),%xmm4  # 4 : sbdu
    eor v0.16b, v0.16b, v1.16b  // vpxor %xmm1, %xmm0, %xmm0  # 0 = ch
    eor v8.16b, v8.16b, v9.16b  // vpxor %xmm1, %xmm0, %xmm0  # 0 = ch
    // vmovdqa 0x10(%r10),%xmm1  # 0 : sbdt

    tbl v4.16b, {v26.16b}, v2.16b  // vpshufb %xmm2, %xmm4, %xmm4  # 4 = sbdu
    tbl v12.16b, {v26.16b}, v10.16b
    tbl v0.16b, {v0.16b}, v5.16b  // vpshufb %xmm5, %xmm0, %xmm0  # MC ch
    tbl v8.16b, {v8.16b}, v5.16b
    tbl v1.16b, {v27.16b}, v3.16b  // vpshufb %xmm3, %xmm1, %xmm1  # 0 = sbdt
    tbl v9.16b, {v27.16b}, v11.16b
    eor v0.16b, v0.16b, v4.16b  // vpxor %xmm4, %xmm0, %xmm0  # 4 = ch
    eor v8.16b, v8.16b, v12.16b
    // vmovdqa 0x20(%r10), %xmm4  # 4 : sbbu
    eor v0.16b, v0.16b, v1.16b  // vpxor %xmm1, %xmm0, %xmm0  # 0 = ch
    eor v8.16b, v8.16b, v9.16b
    // vmovdqa 0x30(%r10), %xmm1  # 0 : sbbt

    tbl v4.16b, {v28.16b}, v2.16b  // vpshufb %xmm2, %xmm4, %xmm4  # 4 = sbbu
    tbl v12.16b, {v28.16b}, v10.16b
    tbl v0.16b, {v0.16b}, v5.16b  // vpshufb %xmm5, %xmm0, %xmm0  # MC ch
    tbl v8.16b, {v8.16b}, v5.16b
    tbl v1.16b, {v29.16b}, v3.16b  // vpshufb %xmm3, %xmm1, %xmm1  # 0 = sbbt
    tbl v9.16b, {v29.16b}, v11.16b
    eor v0.16b, v0.16b, v4.16b  // vpxor %xmm4, %xmm0, %xmm0  # 4 = ch
    eor v8.16b, v8.16b, v12.16b
    // vmovdqa 0x40(%r10), %xmm4  # 4 : sbeu
    eor v0.16b, v0.16b, v1.16b  // vpxor %xmm1, %xmm0, %xmm0  # 0 = ch
    eor v8.16b, v8.16b, v9.16b
    // vmovdqa 0x50(%r10), %xmm1  # 0 : sbet

    tbl v4.16b, {v30.16b}, v2.16b  // vpshufb %xmm2, %xmm4, %xmm4  # 4 = sbeu
    tbl v12.16b, {v30.16b}, v10.16b
    tbl v0.16b, {v0.16b}, v5.16b  // vpshufb %xmm5, %xmm0, %xmm0  # MC ch
    tbl v8.16b, {v8.16b}, v5.16b
    tbl v1.16b, {v31.16b}, v3.16b  // vpshufb %xmm3, %xmm1, %xmm1  # 0 = sbet
    tbl v9.16b, {v31.16b}, v11.16b
    eor v0.16b, v0.16b, v4.16b  // vpxor %xmm4, %xmm0, %xmm0  # 4 = ch
    eor v8.16b, v8.16b, v12.16b
    ext v5.16b, v5.16b, v5.16b, #12  // vpalignr $12, %xmm5, %xmm5, %xmm5
    eor v0.16b, v0.16b, v1.16b  // vpxor %xmm1, %xmm0, %xmm0  # 0 = ch
    eor v8.16b, v8.16b, v9.16b
    sub w8, w8, #1  // sub $1,%rax  # nr--

Ldec_2x_entry:
    // top of round
    and v1.16b, v0.16b, v17.16b  // vpand %xmm9, %xmm0, %xmm1  # 0 = k
    ushr v0.16b, v0.16b, #4  // vpsrlb $4, %xmm0, %xmm0  # 1 = i
    and v9.16b, v8.16b, v17.16b
    ushr v8.16b, v8.16b, #4
    tbl v2.16b, {v19.16b}, v1.16b  // vpshufb %xmm1, %xmm11, %xmm2  # 2 = a/k
    tbl v10.16b, {v19.16b}, v9.16b
    eor v1.16b, v1.16b, v0.16b  // vpxor %xmm0, %xmm1, %xmm1  # 0 = j
    eor v9.16b, v9.16b, v8.16b
    tbl v3.16b, {v18.16b}, v0.16b  // vpshufb %xmm0, %xmm10, %xmm3  # 3 = 1/i
    tbl v11.16b, {v18.16b}, v8.16b
    tbl v4.16b, {v18.16b}, v1.16b  // vpshufb %xmm1, %xmm10, %xmm4  # 4 = 1/j
    tbl v12.16b, {v18.16b}, v9.16b
    eor v3.16b, v3.16b, v2.16b  // vpxor %xmm2, %xmm3, %xmm3  # 3 = iak = 1/i + a/k
    eor v11.16b, v11.16b, v10.16b
    eor v4.16b, v4.16b, v2.16b  // vpxor %xmm2, %xmm4, %xmm4  # 4 = jak = 1/j + a/k
    eor v12.16b, v12.16b, v10.16b
    tbl v2.16b, {v18.16b}, v3.16b  // vpshufb %xmm3, %xmm10, %xmm2  # 2 = 1/iak
    tbl v10.16b, {v18.16b}, v11.16b
    tbl v3.16b, {v18.16b}, v4.16b  // vpshufb %xmm4, %xmm10, %xmm3  # 3 = 1/jak
    tbl v11.16b, {v18.16b}, v12.16b
    eor v2.16b, v2.16b, v1.16b  // vpxor %xmm1, %xmm2, %xmm2  # 2 = io
    eor v10.16b, v10.16b, v9.16b
    eor v3.16b, v3.16b, v0.16b  // vpxor %xmm0, %xmm3, %xmm3  # 3 = jo
    eor v11.16b, v11.16b, v8.16b
    ld1 {v16.2d}, [x9],#16  // vmovdqu (%r9), %xmm0
    cbnz w8, Ldec_2x_loop

    // middle of last round
    // vmovdqa 0x60(%r10), %xmm4  # 3 : sbou
    tbl v4.16b, {v22.16b}, v2.16b  // vpshufb %xmm2, %xmm4, %xmm4  # 4 = sbou
    tbl v12.16b, {v22.16b}, v10.16b
    // vmovdqa 0x70(%r10), %xmm1  # 0 : sbot
    tbl v1.16b, {v23.16b}, v3.16b  // vpshufb %xmm3, %xmm1, %xmm1  # 0 = sb1t
    tbl v9.16b, {v23.16b}, v11.16b
    ld1 {v2.2d}, [x11]  // vmovdqa -0x160(%r11), %xmm2  # Lk_sr-Lk_dsbd=-0x160
    eor v4.16b, v4.16b, v16.16b  // vpxor %xmm0, %xmm4, %xmm4  # 4 = sb1u + k
    eor v12.16b, v12.16b, v16.16b
    eor v0.16b, v1.16b, v4.16b  // vpxor %xmm4, %xmm1, %xmm0  # 0 = A
    eor v8.16b, v9.16b, v12.16b
    tbl v0.16b, {v0.16b}, v2.16b  // vpshufb %xmm2, %xmm0, %xmm0
    tbl v1.16b, {v8.16b}, v2.16b
    ret

########################################################
##                                                    ##
##                 AES key schedule                   ##
##                                                    ##
########################################################

.align 4
_vpaes_key_preheat:
    adrp x10, Lk_inv@PAGE
    add x10, x10, Lk_inv@PAGEOFF
    movi v16.16b, #0x5b  // Lk_s63
    adrp x11, Lk_sb1@PAGE
    add x11, x11, Lk_sb1@PAGEOFF
    movi v17.16b, #0x0f  // Lk_s0F
    ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10]  // Lk_inv, Lk_ipt
    adrp x10, Lk_dksd@PAGE
    add x10, x10, Lk_dksd@PAGEOFF
    ld1 {v22.2d,v23.2d}, [x11]  // Lk_sb1
    adrp x11, Lk_mc_forward@PAGE
    add x11, x11, Lk_mc_forward@PAGEOFF
    ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64  // Lk_dksd, Lk_dksb
    ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64  // Lk_dkse, Lk_dks9
    ld1 {v8.2d}, [x10]  // Lk_rcon
    ld1 {v9.2d}, [x11]  // Lk_mc_forward[0]
    ret
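// After _vpaes_key_preheat:
//   v16 = 0x5b..5b (Lk_s63)   v17 = 0x0f..0f (Lk_s0F)
//   v18,v19 = Lk_inv          v20,v21 = Lk_ipt
//   v22,v23 = Lk_sb1          v24,v25 = Lk_dksd
//   v26,v27 = Lk_dksb         v28,v29 = Lk_dkse
//   v30,v31 = Lk_dks9         v8 = Lk_rcon,  v9 = Lk_mc_forward[0]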



.align 4
_vpaes_schedule_core:
    stp x29, x30, [sp,#-16]!
    add x29,sp,#0

    bl _vpaes_key_preheat  // load the tables

    ld1 {v0.16b}, [x0],#16  // vmovdqu (%rdi), %xmm0  # load key (unaligned)

    // input transform
    mov v3.16b, v0.16b  // vmovdqa %xmm0, %xmm3
    bl _vpaes_schedule_transform
    mov v7.16b, v0.16b  // vmovdqa %xmm0, %xmm7

    adrp x10, Lk_sr@PAGE  // lea Lk_sr(%rip),%r10
    add x10, x10, Lk_sr@PAGEOFF

    add x8, x8, x10
    cbnz w3, Lschedule_am_decrypting

    // encrypting, output zeroth round key after transform
    st1 {v0.2d}, [x2]  // vmovdqu %xmm0, (%rdx)
    b Lschedule_go

Lschedule_am_decrypting:
    // decrypting, output zeroth round key after shiftrows
    ld1 {v1.2d}, [x8]  // vmovdqa (%r8,%r10), %xmm1
    tbl v3.16b, {v3.16b}, v1.16b  // vpshufb %xmm1, %xmm3, %xmm3
    st1 {v3.2d}, [x2]  // vmovdqu %xmm3, (%rdx)
    eor x8, x8, #0x30  // xor $0x30, %r8

Lschedule_go:
    cmp w1, #192  // cmp $192, %esi
    b.hi Lschedule_256
    b.eq Lschedule_192
    // 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
Lschedule_128:
    mov x0, #10  // mov $10, %esi

Loop_schedule_128:
    sub x0, x0, #1  // dec %esi
    bl _vpaes_schedule_round
    cbz x0, Lschedule_mangle_last
    bl _vpaes_schedule_mangle  // write output
    b Loop_schedule_128

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing. The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
.align 4
Lschedule_192:
    sub x0, x0, #8
    ld1 {v0.16b}, [x0]  // vmovdqu 8(%rdi),%xmm0  # load key part 2 (very unaligned)
    bl _vpaes_schedule_transform  // input transform
    mov v6.16b, v0.16b  // vmovdqa %xmm0, %xmm6  # save short part
    eor v4.16b, v4.16b, v4.16b  // vpxor %xmm4, %xmm4, %xmm4  # clear 4
    ins v6.d[0], v4.d[0]  // vmovhlps %xmm4, %xmm6, %xmm6  # clobber low side with zeros
    mov x0, #4  // mov $4, %esi

Loop_schedule_192:
    sub x0, x0, #1  // dec %esi
    bl _vpaes_schedule_round
    ext v0.16b, v6.16b, v0.16b, #8  // vpalignr $8,%xmm6,%xmm0,%xmm0
    bl _vpaes_schedule_mangle  // save key n
    bl _vpaes_schedule_192_smear
    bl _vpaes_schedule_mangle  // save key n+1
    bl _vpaes_schedule_round
    cbz x0, Lschedule_mangle_last
    bl _vpaes_schedule_mangle  // save key n+2
    bl _vpaes_schedule_192_smear
    b Loop_schedule_192

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6. The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
.align 4
Lschedule_256:
    ld1 {v0.16b}, [x0]  // vmovdqu 16(%rdi),%xmm0  # load key part 2 (unaligned)
    bl _vpaes_schedule_transform  // input transform
    mov x0, #7  // mov $7, %esi

Loop_schedule_256:
    sub x0, x0, #1  // dec %esi
    bl _vpaes_schedule_mangle  // output low result
    mov v6.16b, v0.16b  // vmovdqa %xmm0, %xmm6  # save cur_lo in xmm6

    // high round
    bl _vpaes_schedule_round
    cbz x0, Lschedule_mangle_last
    bl _vpaes_schedule_mangle

    // low round. swap xmm7 and xmm6
    dup v0.4s, v0.s[3]  // vpshufd $0xFF, %xmm0, %xmm0
    movi v4.16b, #0
    mov v5.16b, v7.16b  // vmovdqa %xmm7, %xmm5
    mov v7.16b, v6.16b  // vmovdqa %xmm6, %xmm7
    bl _vpaes_schedule_low_round
    mov v7.16b, v5.16b  // vmovdqa %xmm5, %xmm7

    b Loop_schedule_256

##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
.align 4
Lschedule_mangle_last:
    // schedule last round key from xmm0
    adrp x11, Lk_deskew@PAGE  // lea Lk_deskew(%rip),%r11  # prepare to deskew
    add x11, x11, Lk_deskew@PAGEOFF

    cbnz w3, Lschedule_mangle_last_dec

    // encrypting
    ld1 {v1.2d}, [x8]  // vmovdqa (%r8,%r10),%xmm1
    adrp x11, Lk_opt@PAGE  // lea Lk_opt(%rip), %r11  # prepare to output transform
    add x11, x11, Lk_opt@PAGEOFF
    add x2, x2, #32  // add $32, %rdx
    tbl v0.16b, {v0.16b}, v1.16b  // vpshufb %xmm1, %xmm0, %xmm0  # output permute

Lschedule_mangle_last_dec:
    ld1 {v20.2d,v21.2d}, [x11]  // reload constants
    sub x2, x2, #16  // add $-16, %rdx
    eor v0.16b, v0.16b, v16.16b  // vpxor Lk_s63(%rip), %xmm0, %xmm0
    bl _vpaes_schedule_transform  // output transform
    st1 {v0.2d}, [x2]  // vmovdqu %xmm0, (%rdx)  # save last key

    // cleanup
    eor v0.16b, v0.16b, v0.16b  // vpxor %xmm0, %xmm0, %xmm0
    eor v1.16b, v1.16b, v1.16b  // vpxor %xmm1, %xmm1, %xmm1
    eor v2.16b, v2.16b, v2.16b  // vpxor %xmm2, %xmm2, %xmm2
    eor v3.16b, v3.16b, v3.16b  // vpxor %xmm3, %xmm3, %xmm3
    eor v4.16b, v4.16b, v4.16b  // vpxor %xmm4, %xmm4, %xmm4
    eor v5.16b, v5.16b, v5.16b  // vpxor %xmm5, %xmm5, %xmm5
    eor v6.16b, v6.16b, v6.16b  // vpxor %xmm6, %xmm6, %xmm6
    eor v7.16b, v7.16b, v7.16b  // vpxor %xmm7, %xmm7, %xmm7
    ldp x29, x30, [sp],#16
    ret


##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b a x y
##    %xmm6: low side, d c 0 0
##    %xmm13: 0
##
##  Outputs:
##    %xmm6: b+c+d b+c 0 0
##    %xmm0: b+c+d b+c b a
##
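// A C model of the smear with s[3] the highest dword, so "b a x y" means
// s[3]=b, s[2]=a, s[1]=x, s[0]=y (an illustrative sketch only):
//
//     void smear192(uint32_t v6[4], const uint32_t v7[4], uint32_t v0[4]) {
//         uint32_t b = v7[3], a = v7[2], c = v6[2], d = v6[3];
//         v6[0] = 0;      v6[1] = 0;
//         v6[2] = b ^ c;  v6[3] = b ^ c ^ d;    // b+c+d b+c 0 0
//         v0[0] = a;      v0[1] = b;
//         v0[2] = b ^ c;  v0[3] = b ^ c ^ d;    // b+c+d b+c b a
//     }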

.align 4
_vpaes_schedule_192_smear:
    movi v1.16b, #0
    dup v0.4s, v7.s[3]
    ins v1.s[3], v6.s[2]  // vpshufd $0x80, %xmm6, %xmm1  # d c 0 0 -> c 0 0 0
    ins v0.s[0], v7.s[2]  // vpshufd $0xFE, %xmm7, %xmm0  # b a _ _ -> b b b a
    eor v6.16b, v6.16b, v1.16b  // vpxor %xmm1, %xmm6, %xmm6  # -> c+d c 0 0
    eor v1.16b, v1.16b, v1.16b  // vpxor %xmm1, %xmm1, %xmm1
    eor v6.16b, v6.16b, v0.16b  // vpxor %xmm0, %xmm6, %xmm6  # -> b+c+d b+c b a
    mov v0.16b, v6.16b  // vmovdqa %xmm6, %xmm0
    ins v6.d[0], v1.d[0]  // vmovhlps %xmm1, %xmm6, %xmm6  # clobber low side with zeros
    ret


##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm4, %r11.
##
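// Setting aside the vpaes basis change and the Lk_s63 bias bookkeeping,
// the net effect is the textbook AES key-expansion step. A rough C model
// (illustrative; SBOX is the standard AES S-box):
//
//     void expand_round(uint8_t k[16], uint8_t rcon) {
//         uint8_t t[4];
//         for (int i = 0; i < 4; i++)            // RotWord + SubWord
//             t[i] = SBOX[k[12 + ((i + 1) & 3)]];
//         t[0] ^= rcon;                          // add round constant
//         for (int i = 0; i < 16; i++)           // xor in, then smear
//             k[i] ^= (i < 4) ? t[i] : k[i - 4];
//     }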

.align 4
_vpaes_schedule_round:
    // extract rcon from xmm8
    movi v4.16b, #0  // vpxor %xmm4, %xmm4, %xmm4
    ext v1.16b, v8.16b, v4.16b, #15  // vpalignr $15, %xmm8, %xmm4, %xmm1
    ext v8.16b, v8.16b, v8.16b, #15  // vpalignr $15, %xmm8, %xmm8, %xmm8
    eor v7.16b, v7.16b, v1.16b  // vpxor %xmm1, %xmm7, %xmm7

    // rotate
    dup v0.4s, v0.s[3]  // vpshufd $0xFF, %xmm0, %xmm0
    ext v0.16b, v0.16b, v0.16b, #1  // vpalignr $1, %xmm0, %xmm0, %xmm0

    // fall through...

    // low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
    // smear xmm7
    ext v1.16b, v4.16b, v7.16b, #12  // vpslldq $4, %xmm7, %xmm1
    eor v7.16b, v7.16b, v1.16b  // vpxor %xmm1, %xmm7, %xmm7
    ext v4.16b, v4.16b, v7.16b, #8  // vpslldq $8, %xmm7, %xmm4

    // subbytes
    and v1.16b, v0.16b, v17.16b  // vpand %xmm9, %xmm0, %xmm1  # 0 = k
    ushr v0.16b, v0.16b, #4  // vpsrlb $4, %xmm0, %xmm0  # 1 = i
    eor v7.16b, v7.16b, v4.16b  // vpxor %xmm4, %xmm7, %xmm7
    tbl v2.16b, {v19.16b}, v1.16b  // vpshufb %xmm1, %xmm11, %xmm2  # 2 = a/k
    eor v1.16b, v1.16b, v0.16b  // vpxor %xmm0, %xmm1, %xmm1  # 0 = j
    tbl v3.16b, {v18.16b}, v0.16b  // vpshufb %xmm0, %xmm10, %xmm3  # 3 = 1/i
    eor v3.16b, v3.16b, v2.16b  // vpxor %xmm2, %xmm3, %xmm3  # 3 = iak = 1/i + a/k
    tbl v4.16b, {v18.16b}, v1.16b  // vpshufb %xmm1, %xmm10, %xmm4  # 4 = 1/j
    eor v7.16b, v7.16b, v16.16b  // vpxor Lk_s63(%rip), %xmm7, %xmm7
    tbl v3.16b, {v18.16b}, v3.16b  // vpshufb %xmm3, %xmm10, %xmm3  # 2 = 1/iak
    eor v4.16b, v4.16b, v2.16b  // vpxor %xmm2, %xmm4, %xmm4  # 4 = jak = 1/j + a/k
    tbl v2.16b, {v18.16b}, v4.16b  // vpshufb %xmm4, %xmm10, %xmm2  # 3 = 1/jak
    eor v3.16b, v3.16b, v1.16b  // vpxor %xmm1, %xmm3, %xmm3  # 2 = io
    eor v2.16b, v2.16b, v0.16b  // vpxor %xmm0, %xmm2, %xmm2  # 3 = jo
    tbl v4.16b, {v23.16b}, v3.16b  // vpshufb %xmm3, %xmm13, %xmm4  # 4 = sbou
    tbl v1.16b, {v22.16b}, v2.16b  // vpshufb %xmm2, %xmm12, %xmm1  # 0 = sb1t
    eor v1.16b, v1.16b, v4.16b  // vpxor %xmm4, %xmm1, %xmm1  # 0 = sbox output

    // add in smeared stuff
    eor v0.16b, v1.16b, v7.16b  // vpxor %xmm7, %xmm1, %xmm0
    eor v7.16b, v1.16b, v7.16b  // vmovdqa %xmm0, %xmm7
    ret


##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%r11)
##
##  Requires that %xmm9 = 0x0F0F... as in preheat
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##

.align 4
_vpaes_schedule_transform:
    and v1.16b, v0.16b, v17.16b  // vpand %xmm9, %xmm0, %xmm1
    ushr v0.16b, v0.16b, #4  // vpsrlb $4, %xmm0, %xmm0
    // vmovdqa (%r11), %xmm2  # lo
    tbl v2.16b, {v20.16b}, v1.16b  // vpshufb %xmm1, %xmm2, %xmm2
    // vmovdqa 16(%r11), %xmm1  # hi
    tbl v0.16b, {v21.16b}, v0.16b  // vpshufb %xmm0, %xmm1, %xmm0
    eor v0.16b, v0.16b, v2.16b  // vpxor %xmm2, %xmm0, %xmm0
    ret
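// _vpaes_schedule_transform is exactly the split-nibble lookup pattern
// sketched after Lk_sb2 above, applied with whatever table pair is
// currently in v20/v21 (Lk_ipt from preheat, or Lk_opt/Lk_deskew after
// the reload in Lschedule_mangle_last).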


##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%rdx), and increments or decrements it
##  Keeps track of round number mod 4 in %r8
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##

.align 4
_vpaes_schedule_mangle:
    mov v4.16b, v0.16b  // vmovdqa %xmm0, %xmm4  # save xmm0 for later
    // vmovdqa .Lk_mc_forward(%rip),%xmm5
    cbnz w3, Lschedule_mangle_dec

    // encrypting
    eor v4.16b, v0.16b, v16.16b  // vpxor Lk_s63(%rip), %xmm0, %xmm4
    add x2, x2, #16  // add $16, %rdx
    tbl v4.16b, {v4.16b}, v9.16b  // vpshufb %xmm5, %xmm4, %xmm4
    tbl v1.16b, {v4.16b}, v9.16b  // vpshufb %xmm5, %xmm4, %xmm1
    tbl v3.16b, {v1.16b}, v9.16b  // vpshufb %xmm5, %xmm1, %xmm3
    eor v4.16b, v4.16b, v1.16b  // vpxor %xmm1, %xmm4, %xmm4
    ld1 {v1.2d}, [x8]  // vmovdqa (%r8,%r10), %xmm1
    eor v3.16b, v3.16b, v4.16b  // vpxor %xmm4, %xmm3, %xmm3

    b Lschedule_mangle_both
.align 4
Lschedule_mangle_dec:
    // inverse mix columns
    // lea .Lk_dksd(%rip),%r11
    ushr v1.16b, v4.16b, #4  // vpsrlb $4, %xmm4, %xmm1  # 1 = hi
    and v4.16b, v4.16b, v17.16b  // vpand %xmm9, %xmm4, %xmm4  # 4 = lo

    // vmovdqa 0x00(%r11), %xmm2
    tbl v2.16b, {v24.16b}, v4.16b  // vpshufb %xmm4, %xmm2, %xmm2
    // vmovdqa 0x10(%r11), %xmm3
    tbl v3.16b, {v25.16b}, v1.16b  // vpshufb %xmm1, %xmm3, %xmm3
    eor v3.16b, v3.16b, v2.16b  // vpxor %xmm2, %xmm3, %xmm3
    tbl v3.16b, {v3.16b}, v9.16b  // vpshufb %xmm5, %xmm3, %xmm3

    // vmovdqa 0x20(%r11), %xmm2
    tbl v2.16b, {v26.16b}, v4.16b  // vpshufb %xmm4, %xmm2, %xmm2
    eor v2.16b, v2.16b, v3.16b  // vpxor %xmm3, %xmm2, %xmm2
    // vmovdqa 0x30(%r11), %xmm3
    tbl v3.16b, {v27.16b}, v1.16b  // vpshufb %xmm1, %xmm3, %xmm3
    eor v3.16b, v3.16b, v2.16b  // vpxor %xmm2, %xmm3, %xmm3
    tbl v3.16b, {v3.16b}, v9.16b  // vpshufb %xmm5, %xmm3, %xmm3

    // vmovdqa 0x40(%r11), %xmm2
    tbl v2.16b, {v28.16b}, v4.16b  // vpshufb %xmm4, %xmm2, %xmm2
    eor v2.16b, v2.16b, v3.16b  // vpxor %xmm3, %xmm2, %xmm2
    // vmovdqa 0x50(%r11), %xmm3
    tbl v3.16b, {v29.16b}, v1.16b  // vpshufb %xmm1, %xmm3, %xmm3
    eor v3.16b, v3.16b, v2.16b  // vpxor %xmm2, %xmm3, %xmm3

    // vmovdqa 0x60(%r11), %xmm2
    tbl v2.16b, {v30.16b}, v4.16b  // vpshufb %xmm4, %xmm2, %xmm2
    tbl v3.16b, {v3.16b}, v9.16b  // vpshufb %xmm5, %xmm3, %xmm3
    // vmovdqa 0x70(%r11), %xmm4
    tbl v4.16b, {v31.16b}, v1.16b  // vpshufb %xmm1, %xmm4, %xmm4
    ld1 {v1.2d}, [x8]  // vmovdqa (%r8,%r10), %xmm1
    eor v2.16b, v2.16b, v3.16b  // vpxor %xmm3, %xmm2, %xmm2
    eor v3.16b, v4.16b, v2.16b  // vpxor %xmm2, %xmm4, %xmm3

    sub x2, x2, #16  // add $-16, %rdx

Lschedule_mangle_both:
    tbl v3.16b, {v3.16b}, v1.16b  // vpshufb %xmm1, %xmm3, %xmm3
    add x8, x8, #64-16  // add $-16, %r8
    and x8, x8, #~(1<<6)  // and $0x30, %r8
    st1 {v3.2d}, [x2]  // vmovdqu %xmm3, (%rdx)
    ret
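// Note the add/and pair at Lschedule_mangle_both: adding 48 and then
// clearing bit 6 decrements the Lk_sr row offset by 16 mod 64, stepping
// x8 through the four ShiftRows permutations. This relies on Lk_sr
// sitting 128 bytes into the 128-byte-aligned _vpaes_consts, so the row
// offset occupies exactly bits 4-5 of the address.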


.globl _vpaes_set_encrypt_key
.private_extern _vpaes_set_encrypt_key

.align 4
_vpaes_set_encrypt_key:
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]!  // ABI spec says so

    lsr w9, w1, #5  // shr $5,%eax
    add w9, w9, #5  // add $5,%eax
    str w9, [x2,#240]  // mov %eax,240(%rdx)  # AES_KEY->rounds = nbits/32+5;

    mov w3, #0  // mov $0,%ecx
    mov x8, #0x30  // mov $0x30,%r8d
    bl _vpaes_schedule_core
    eor x0, x0, x0

    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    ret


.globl _vpaes_set_decrypt_key
.private_extern _vpaes_set_decrypt_key

.align 4
_vpaes_set_decrypt_key:
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]!  // ABI spec says so

    lsr w9, w1, #5  // shr $5,%eax
    add w9, w9, #5  // add $5,%eax
    str w9, [x2,#240]  // mov %eax,240(%rdx)  # AES_KEY->rounds = nbits/32+5;
    lsl w9, w9, #4  // shl $4,%eax
    add x2, x2, #16  // lea 16(%rdx,%rax),%rdx
    add x2, x2, x9

    mov w3, #1  // mov $1,%ecx
    lsr w8, w1, #1  // shr $1,%r8d
    and x8, x8, #32  // and $32,%r8d
    eor x8, x8, #32  // xor $32,%r8d  # nbits==192?0:32
    bl _vpaes_schedule_core

    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    ret

.globl _vpaes_cbc_encrypt
.private_extern _vpaes_cbc_encrypt

.align 4
_vpaes_cbc_encrypt:
    cbz x2, Lcbc_abort
    cmp w5, #0  // check direction
    b.eq vpaes_cbc_decrypt

    stp x29,x30,[sp,#-16]!
    add x29,sp,#0

    mov x17, x2  // reassign
    mov x2, x3  // reassign

    ld1 {v0.16b}, [x4]  // load ivec
    bl _vpaes_encrypt_preheat
    b Lcbc_enc_loop

.align 4
Lcbc_enc_loop:
    ld1 {v7.16b}, [x0],#16  // load input
    eor v7.16b, v7.16b, v0.16b  // xor with ivec
    bl _vpaes_encrypt_core
    st1 {v0.16b}, [x1],#16  // save output
    subs x17, x17, #16
    b.hi Lcbc_enc_loop

    st1 {v0.16b}, [x4]  // write ivec

    ldp x29,x30,[sp],#16
Lcbc_abort:
    ret
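// CBC encryption is inherently serial: each plaintext block is XORed with
// the previous ciphertext (which the loop keeps in v0) before it is
// encrypted, so there is no 2x variant on this path. A rough C model of
// the chaining (illustrative; helper names are ad hoc):
//
//     for (size_t off = 0; off < len; off += 16) {
//         xor16(blk, in + off, iv);    // v7 = plaintext ^ ivec
//         encrypt16(blk, key);         // v0 = AES(v7)
//         memcpy(out + off, blk, 16);
//         memcpy(iv, blk, 16);         // chains into the next block
//     }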



.align 4
vpaes_cbc_decrypt:
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]!  // ABI spec says so
    stp d10,d11,[sp,#-16]!
    stp d12,d13,[sp,#-16]!
    stp d14,d15,[sp,#-16]!

    mov x17, x2  // reassign
    mov x2, x3  // reassign
    ld1 {v6.16b}, [x4]  // load ivec
    bl _vpaes_decrypt_preheat
    tst x17, #16
    b.eq Lcbc_dec_loop2x

    ld1 {v7.16b}, [x0], #16  // load input
    bl _vpaes_decrypt_core
    eor v0.16b, v0.16b, v6.16b  // xor with ivec
    orr v6.16b, v7.16b, v7.16b  // next ivec value
    st1 {v0.16b}, [x1], #16
    subs x17, x17, #16
    b.ls Lcbc_dec_done

.align 4
Lcbc_dec_loop2x:
    ld1 {v14.16b,v15.16b}, [x0], #32
    bl _vpaes_decrypt_2x
    eor v0.16b, v0.16b, v6.16b  // xor with ivec
    eor v1.16b, v1.16b, v14.16b
    orr v6.16b, v15.16b, v15.16b
    st1 {v0.16b,v1.16b}, [x1], #32
    subs x17, x17, #32
    b.hi Lcbc_dec_loop2x

Lcbc_dec_done:
    st1 {v6.16b}, [x4]

    ldp d14,d15,[sp],#16
    ldp d12,d13,[sp],#16
    ldp d10,d11,[sp],#16
    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    ret

.globl _vpaes_ctr32_encrypt_blocks
.private_extern _vpaes_ctr32_encrypt_blocks

.align 4
_vpaes_ctr32_encrypt_blocks:
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0
    stp d8,d9,[sp,#-16]!  // ABI spec says so
    stp d10,d11,[sp,#-16]!
    stp d12,d13,[sp,#-16]!
    stp d14,d15,[sp,#-16]!

    cbz x2, Lctr32_done

    // Note, unlike the other functions, x2 here is measured in blocks,
    // not bytes.
    mov x17, x2
    mov x2, x3

    // Load the IV and counter portion.
    ldr w6, [x4, #12]
    ld1 {v7.16b}, [x4]

    bl _vpaes_encrypt_preheat
    tst x17, #1
    rev w6, w6  // The counter is big-endian.
    b.eq Lctr32_prep_loop

    // Handle one block so the remaining block count is even for
    // _vpaes_encrypt_2x.
    ld1 {v6.16b}, [x0], #16  // Load input ahead of time
    bl _vpaes_encrypt_core
    eor v0.16b, v0.16b, v6.16b  // XOR input and result
    st1 {v0.16b}, [x1], #16
    subs x17, x17, #1
    // Update the counter.
    add w6, w6, #1
    rev w7, w6
    mov v7.s[3], w7
    b.ls Lctr32_done

Lctr32_prep_loop:
    // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
    // uses v14 and v15.
    mov v15.16b, v7.16b
    mov v14.16b, v7.16b
    add w6, w6, #1
    rev w7, w6
    mov v15.s[3], w7

Lctr32_loop:
    ld1 {v6.16b,v7.16b}, [x0], #32  // Load input ahead of time
    bl _vpaes_encrypt_2x
    eor v0.16b, v0.16b, v6.16b  // XOR input and result
    eor v1.16b, v1.16b, v7.16b  // XOR input and result (#2)
    st1 {v0.16b,v1.16b}, [x1], #32
    subs x17, x17, #2
    // Update the counter.
    add w7, w6, #1
    add w6, w6, #2
    rev w7, w7
    mov v14.s[3], w7
    rev w7, w6
    mov v15.s[3], w7
    b.hi Lctr32_loop

Lctr32_done:
    ldp d14,d15,[sp],#16
    ldp d12,d13,[sp],#16
    ldp d10,d11,[sp],#16
    ldp d8,d9,[sp],#16
    ldp x29,x30,[sp],#16
    ret
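// A rough C model of one pass of Lctr32_loop (illustrative sketch; only
// the last 32-bit word of the IV is a counter, kept big-endian in memory
// and mirrored as a host-order integer in w6):
//
//     uint32_t c = load_be32(iv + 12);
//     store_be32(blk_a + 12, c);        // v14: counter value c
//     store_be32(blk_b + 12, c + 1);    // v15: counter value c + 1
//     xor16(out_a, encrypt16(blk_a), in_a);
//     xor16(out_b, encrypt16(blk_b), in_b);
//     c += 2;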

#endif  // !OPENSSL_NO_ASM