blob: 787cce2baed2f0faae1a642e47e28621d2a8fed7 [file] [log] [blame]
Robert Sloan8ff03552017-06-14 12:40:58 -07001#include <openssl/arm_arch.h>
2
3#if __ARM_MAX_ARCH__>=7
4.text
Robert Sloan8ff03552017-06-14 12:40:58 -07005
Robert Sloan8ff03552017-06-14 12:40:58 -07006.align 5
// Constant table used by the key-expansion code (addressed with "adr x3,Lrcon").
//   row 0: round constant 0x01 in every 32-bit lane; the code doubles it with
//          "shl ...,#1" after each use.
//   row 1: byte-index mask for tbl.  Little-endian bytes 0d,0e,0f,0c select
//          source bytes 13,14,15,12, i.e. the last 32-bit word rotated by one
//          byte, repeated in all four lanes.
//   row 2: round constant 0x1b, loaded once the doubled row-0 value can no
//          longer be used (last two AES-128 round keys; shl then yields 0x36).
7Lrcon:
8.long 0x01,0x01,0x01,0x01
9.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
10.long 0x1b,0x1b,0x1b,0x1b
11
// _aes_hw_set_encrypt_key -- expand a raw AES key into an encryption key
// schedule using the AESE instruction for the S-box step.
//
// Register use, as read from the code below:
//   x0  in   pointer to the raw key bytes (16, 24 or 32 bytes are loaded)
//   w1  in   key size in bits; accepted values are 128, 192 and 256
//   x2  in   pointer to the output schedule; round keys are stored from
//            offset 0 and the round count is stored as a word at offset 240
//   x0  out  0 on success, -1 if x0 or x2 is zero, -2 if w1 is rejected
// Scratch: w1, x3, w12, v0-v6 and the condition flags.
// On success x2 is left at schedule+240 and w12 holds the round count;
// _aes_hw_set_decrypt_key enters at Lenc_key and depends on both.
// NOTE(review): presumably matches a C prototype of the form
// int aes_hw_set_encrypt_key(const uint8_t *key, int bits, AES_KEY *out)
// -- confirm against the declaring header.
12.globl _aes_hw_set_encrypt_key
13.private_extern _aes_hw_set_encrypt_key
14
15.align 5
16_aes_hw_set_encrypt_key:
// Lenc_key is the entry used by "bl Lenc_key" in _aes_hw_set_decrypt_key.
17Lenc_key:
18 stp x29,x30,[sp,#-16]!
19 add x29,sp,#0
// x3 carries the return code on the abort paths: -1 for a null pointer.
20 mov x3,#-1
21 cmp x0,#0
22 b.eq Lenc_key_abort
23 cmp x2,#0
24 b.eq Lenc_key_abort
// -2 for a key size below 128, above 256, or not a multiple of 64 bits.
25 mov x3,#-2
26 cmp w1,#128
27 b.lt Lenc_key_abort
28 cmp w1,#256
29 b.gt Lenc_key_abort
30 tst w1,#0x3f
31 b.ne Lenc_key_abort
32
// From here x3 is a pointer into Lrcon; it becomes a return code again at Ldone.
33 adr x3,Lrcon
// The flags set here are consumed by the b.lt/b.eq dispatch below; none of the
// instructions in between writes the flags.
34 cmp w1,#192
35
// v0 = 0: used as the zero source for ext and as an all-zero key for aese.
36 eor v0.16b,v0.16b,v0.16b
// v3 = first 16 key bytes (round key 0).
37 ld1 {v3.16b},[x0],#16
38 mov w1,#8 // reuse w1
// v1 = rcon row (0x01...), v2 = rotate-n-splat mask; x3 now points at the 0x1b row.
39 ld1 {v1.4s,v2.4s},[x3],#32
40
41 b.lt Loop128
42 b.eq L192
43 b L256
44
// ---- 128-bit keys ----------------------------------------------------------
// Each pass stores the current round key (v3) and derives the next one:
//   v6 = last word of v3, rotated and splatted by tbl, then passed through aese
//        with a zero key.  All four columns are identical, so ShiftRows has no
//        net effect and the result is the S-box of the rotated word.
//   v5/ext/eor chain: v3 is XORed with itself shifted up by one, two and three
//        words (zero fill from v0), giving the running XOR of its words.
//   v6 ^= rcon (v1), rcon is doubled, and v6 is XORed into every word of v3.
// The loop runs 8 times (w1 = 8).
45.align 4
46Loop128:
47 tbl v6.16b,{v3.16b},v2.16b
48 ext v5.16b,v0.16b,v3.16b,#12
49 st1 {v3.4s},[x2],#16
50 aese v6.16b,v0.16b
51 subs w1,w1,#1
52
53 eor v3.16b,v3.16b,v5.16b
54 ext v5.16b,v0.16b,v5.16b,#12
55 eor v3.16b,v3.16b,v5.16b
56 ext v5.16b,v0.16b,v5.16b,#12
57 eor v6.16b,v6.16b,v1.16b
58 eor v3.16b,v3.16b,v5.16b
59 shl v1.16b,v1.16b,#1
60 eor v3.16b,v3.16b,v6.16b
// Flags still come from the subs above; the vector instructions do not touch them.
61 b.ne Loop128
62
// Switch the round constant to the 0x1b row for the remaining two expansions.
63 ld1 {v1.4s},[x3]
64
// Unrolled expansion step (same sequence as the loop body).
65 tbl v6.16b,{v3.16b},v2.16b
66 ext v5.16b,v0.16b,v3.16b,#12
67 st1 {v3.4s},[x2],#16
68 aese v6.16b,v0.16b
69
70 eor v3.16b,v3.16b,v5.16b
71 ext v5.16b,v0.16b,v5.16b,#12
72 eor v3.16b,v3.16b,v5.16b
73 ext v5.16b,v0.16b,v5.16b,#12
74 eor v6.16b,v6.16b,v1.16b
75 eor v3.16b,v3.16b,v5.16b
76 shl v1.16b,v1.16b,#1
77 eor v3.16b,v3.16b,v6.16b
78
// Final expansion step; rcon is not doubled again because it is not needed.
79 tbl v6.16b,{v3.16b},v2.16b
80 ext v5.16b,v0.16b,v3.16b,#12
81 st1 {v3.4s},[x2],#16
82 aese v6.16b,v0.16b
83
84 eor v3.16b,v3.16b,v5.16b
85 ext v5.16b,v0.16b,v5.16b,#12
86 eor v3.16b,v3.16b,v5.16b
87 ext v5.16b,v0.16b,v5.16b,#12
88 eor v6.16b,v6.16b,v1.16b
89 eor v3.16b,v3.16b,v5.16b
90 eor v3.16b,v3.16b,v6.16b
// Last round key is stored without post-increment (x2 is at offset 160 here),
// then x2 is moved on by 0x50 so that it ends at offset 240.
91 st1 {v3.4s},[x2]
92 add x2,x2,#0x50
93
// Round count for this key size.
94 mov w12,#10
95 b Ldone
96
// ---- 192-bit keys ----------------------------------------------------------
97.align 4
98L192:
// v4 (low 8 bytes) = key bytes 16..23.
99 ld1 {v4.8b},[x0],#8
100 movi v6.16b,#8 // borrow v6.16b
101 st1 {v3.4s},[x2],#16
// Subtracting 8 from every mask byte makes tbl pick bytes 5,6,7,4 of v4,
// i.e. the second word held in v4, instead of bytes 13,14,15,12.
102 sub v2.16b,v2.16b,v6.16b // adjust the mask
103
// Each pass stores the 8 bytes held in v4, derives the next six key words
// (four in v3, two in the low half of v4) and stores the new v3.  Runs 8 times.
104Loop192:
105 tbl v6.16b,{v4.16b},v2.16b
106 ext v5.16b,v0.16b,v3.16b,#12
107 st1 {v4.8b},[x2],#8
108 aese v6.16b,v0.16b
109 subs w1,w1,#1
110
111 eor v3.16b,v3.16b,v5.16b
112 ext v5.16b,v0.16b,v5.16b,#12
113 eor v3.16b,v3.16b,v5.16b
114 ext v5.16b,v0.16b,v5.16b,#12
115 eor v3.16b,v3.16b,v5.16b
116
// v5 = top word of the running XOR in v3, splatted, then combined with the old
// v4 words so that the two new v4 words continue the same XOR chain.
117 dup v5.4s,v3.s[3]
118 eor v5.16b,v5.16b,v4.16b
119 eor v6.16b,v6.16b,v1.16b
120 ext v4.16b,v0.16b,v4.16b,#12
121 shl v1.16b,v1.16b,#1
122 eor v4.16b,v4.16b,v5.16b
123 eor v3.16b,v3.16b,v6.16b
124 eor v4.16b,v4.16b,v6.16b
125 st1 {v3.4s},[x2],#16
126 b.ne Loop192
127
128 mov w12,#12
// x2 is at offset 208 after the loop (16 + 8*24); advance it to offset 240.
129 add x2,x2,#0x20
130 b Ldone
131
// ---- 256-bit keys ----------------------------------------------------------
132.align 4
133L256:
// v4 = key bytes 16..31 (x0 was already advanced by 16).
134 ld1 {v4.16b},[x0]
135 mov w1,#7
136 mov w12,#14
137 st1 {v3.4s},[x2],#16
138
// Each pass stores v4, derives the next v3 from the last word of v4 (rotate,
// S-box, rcon) and stores it; unless that was the final pass it then derives
// the next v4 from the last word of the new v3 (S-box only, no rotate or rcon).
139Loop256:
140 tbl v6.16b,{v4.16b},v2.16b
141 ext v5.16b,v0.16b,v3.16b,#12
142 st1 {v4.4s},[x2],#16
143 aese v6.16b,v0.16b
144 subs w1,w1,#1
145
146 eor v3.16b,v3.16b,v5.16b
147 ext v5.16b,v0.16b,v5.16b,#12
148 eor v3.16b,v3.16b,v5.16b
149 ext v5.16b,v0.16b,v5.16b,#12
150 eor v6.16b,v6.16b,v1.16b
151 eor v3.16b,v3.16b,v5.16b
152 shl v1.16b,v1.16b,#1
153 eor v3.16b,v3.16b,v6.16b
154 st1 {v3.4s},[x2],#16
// Exit when the subs above reached zero; x2 is then at offset 240 (16 + 7*32).
155 b.eq Ldone
156
157 dup v6.4s,v3.s[3] // just splat
158 ext v5.16b,v0.16b,v4.16b,#12
159 aese v6.16b,v0.16b
160
161 eor v4.16b,v4.16b,v5.16b
162 ext v5.16b,v0.16b,v5.16b,#12
163 eor v4.16b,v4.16b,v5.16b
164 ext v5.16b,v0.16b,v5.16b,#12
165 eor v4.16b,v4.16b,v5.16b
166
167 eor v4.16b,v4.16b,v6.16b
168 b Loop256
169
// Common success exit: all three paths arrive with x2 = schedule+240, where
// the round count is written.
170Ldone:
171 str w12,[x2]
172 mov x3,#0
173
174Lenc_key_abort:
175 mov x0,x3 // return value
// Only x29 is reloaded: x30 is never modified in this routine (no calls), so
// the saved copy is simply discarded along with the 16-byte frame.
176 ldr x29,[sp],#16
177 ret
178
179
// _aes_hw_set_decrypt_key -- build a decryption key schedule.
//
// Takes the same x0/w1/x2 inputs as _aes_hw_set_encrypt_key (they are passed
// through unchanged to Lenc_key).  The encryption schedule is generated first
// and then converted in place: the order of the round keys is reversed and
// aesimc is applied to every round key except the first and the last.
//   x0  out  0 on success, otherwise the non-zero code returned by Lenc_key
// Scratch beyond what Lenc_key uses: x4, v0, v1.
180.globl _aes_hw_set_decrypt_key
181.private_extern _aes_hw_set_decrypt_key
182
183.align 5
184_aes_hw_set_decrypt_key:
185 stp x29,x30,[sp,#-16]!
186 add x29,sp,#0
// Enters _aes_hw_set_encrypt_key at its first instruction (it pushes and pops
// its own frame).  x30 is overwritten by bl, hence the full ldp on exit below.
187 bl Lenc_key
188
189 cmp x0,#0
190 b.ne Ldec_key_abort
191
// Lenc_key leaves x2 at schedule+240 and the round count in w12.
192 sub x2,x2,#240 // restore original x2
// x4 = -16: post-index step for walking x0 backwards one round key at a time.
193 mov x4,#-16
194 add x0,x2,x12,lsl#4 // end of key schedule
195
// Swap the first and last round keys as they are (no aesimc on these two).
196 ld1 {v0.4s},[x2]
197 ld1 {v1.4s},[x0]
198 st1 {v0.4s},[x0],x4
199 st1 {v1.4s},[x2],#16
200
// Walk inwards from both ends, swapping each pair and applying aesimc to both,
// for as long as the upper pointer is above the lower one (unsigned compare).
201Loop_imc:
202 ld1 {v0.4s},[x2]
203 ld1 {v1.4s},[x0]
204 aesimc v0.16b,v0.16b
205 aesimc v1.16b,v1.16b
206 st1 {v0.4s},[x0],x4
207 st1 {v1.4s},[x2],#16
208 cmp x0,x2
209 b.hi Loop_imc
210
// Middle round key: transformed and written back.  With the round counts that
// Lenc_key produces (10/12/14) the number of round keys is odd, so x0 and x2
// meet on this key.
211 ld1 {v0.4s},[x2]
212 aesimc v0.16b,v0.16b
213 st1 {v0.4s},[x0]
214
215 eor x0,x0,x0 // return value
216Ldec_key_abort:
217 ldp x29,x30,[sp],#16
218 ret
219
// _aes_hw_encrypt -- encrypt a single 16-byte block.
//
//   x0  in   pointer to the 16 input bytes
//   x1  in   pointer to the 16 output bytes
//   x2  in   pointer to the key schedule; the round count is read from
//            offset 240 and round keys are read sequentially from offset 0
// Scratch: x2 (advanced through the schedule), w3, v0-v2, flags.
// Leaf routine: no stack frame is created.
// NOTE(review): the loop structure consumes two round keys per pass, so it
// assumes an even round count of at least 4 -- true for the 10/12/14 written
// by the key-setup code in this file.
220.globl _aes_hw_encrypt
221.private_extern _aes_hw_encrypt
222
223.align 5
224_aes_hw_encrypt:
225 ldr w3,[x2,#240]
226 ld1 {v0.4s},[x2],#16
227 ld1 {v2.16b},[x0]
// w3 = rounds - 2: the last two rounds are handled after the loop.
228 sub w3,w3,#2
229 ld1 {v1.4s},[x2],#16
230
// Two full rounds per pass; v0/v1 alternate as the current round key while the
// next one is being loaded.
231Loop_enc:
232 aese v2.16b,v0.16b
233 aesmc v2.16b,v2.16b
234 ld1 {v0.4s},[x2],#16
235 subs w3,w3,#2
236 aese v2.16b,v1.16b
237 aesmc v2.16b,v2.16b
238 ld1 {v1.4s},[x2],#16
239 b.gt Loop_enc
240
// Second-to-last round (with aesmc), then the last round: aese without aesmc
// followed by an XOR with the final round key.
241 aese v2.16b,v0.16b
242 aesmc v2.16b,v2.16b
243 ld1 {v0.4s},[x2]
244 aese v2.16b,v1.16b
245 eor v2.16b,v2.16b,v0.16b
246
247 st1 {v2.16b},[x1]
248 ret
249
// _aes_hw_decrypt -- decrypt a single 16-byte block.
//
//   x0  in   pointer to the 16 input bytes
//   x1  in   pointer to the 16 output bytes
//   x2  in   pointer to the key schedule; the round count is read from
//            offset 240 and round keys are read sequentially from offset 0
// Scratch: x2 (advanced through the schedule), w3, v0-v2, flags.
// Leaf routine: no stack frame is created.
// NOTE(review): the round keys are consumed in forward order with aesd/aesimc,
// so the schedule is presumably one prepared by _aes_hw_set_decrypt_key
// (reversed, aesimc applied to the inner keys) -- confirm with callers.
250.globl _aes_hw_decrypt
251.private_extern _aes_hw_decrypt
252
253.align 5
254_aes_hw_decrypt:
255 ldr w3,[x2,#240]
256 ld1 {v0.4s},[x2],#16
257 ld1 {v2.16b},[x0]
// w3 = rounds - 2: the last two rounds are handled after the loop.
258 sub w3,w3,#2
259 ld1 {v1.4s},[x2],#16
260
// Two full inverse rounds per pass; v0/v1 alternate as the current round key.
261Loop_dec:
262 aesd v2.16b,v0.16b
263 aesimc v2.16b,v2.16b
264 ld1 {v0.4s},[x2],#16
265 subs w3,w3,#2
266 aesd v2.16b,v1.16b
267 aesimc v2.16b,v2.16b
268 ld1 {v1.4s},[x2],#16
269 b.gt Loop_dec
270
// Second-to-last round (with aesimc), then the last round: aesd without aesimc
// followed by an XOR with the final round key.
271 aesd v2.16b,v0.16b
272 aesimc v2.16b,v2.16b
273 ld1 {v0.4s},[x2]
274 aesd v2.16b,v1.16b
275 eor v2.16b,v2.16b,v0.16b
276
277 st1 {v2.16b},[x1]
278 ret
279
// _aes_hw_cbc_encrypt -- CBC-mode encryption or decryption of whole blocks.
//
// Register use, as read from the code below:
//   x0  in   input pointer
//   x1  in   output pointer
//   x2  in   length in bytes; fewer than 16 returns immediately, otherwise the
//            length is rounded down to a multiple of 16
//   x3  in   key schedule (round count read from offset 240)
//   x4  in   pointer to the 16-byte IV; updated on exit with the last
//            ciphertext block (not touched on the early-abort path)
//   w5  in   zero selects decryption, non-zero selects encryption
// Scratch: x2, x3, x5-x8, x12, x14, v0-v7, v16-v23, flags.
// NOTE(review): the decryption path consumes the schedule in forward order with
// aesd/aesimc, so it presumably expects a schedule from
// _aes_hw_set_decrypt_key -- confirm with callers.
280.globl _aes_hw_cbc_encrypt
281.private_extern _aes_hw_cbc_encrypt
282
283.align 5
284_aes_hw_cbc_encrypt:
285 stp x29,x30,[sp,#-16]!
286 add x29,sp,#0
287 subs x2,x2,#16
// x8 is the post-index step for input loads: 16 normally, 0 when exactly one
// block is left, so that the look-ahead load re-reads the last block instead of
// running past the end of the input.
288 mov x8,#16
289 b.lo Lcbc_abort
290 csel x8,xzr,x8,eq
291
// The flags set by this cmp are consumed by "b.eq Lcbc_dec" further down; no
// instruction in between writes the flags.  w5 is then reused for the round count.
292 cmp w5,#0 // en- or decrypting?
293 ldr w5,[x3,#240]
294 and x2,x2,#-16
// v6 = IV, v0 = first input block.
295 ld1 {v6.16b},[x4]
296 ld1 {v0.16b},[x0],x8
297
298 ld1 {v16.4s,v17.4s},[x3] // load key schedule...
299 sub w5,w5,#6
300 add x7,x3,x5,lsl#4 // pointer to last 7 round keys
// w5 = rounds - 8, i.e. 2/4/6 for 10/12/14 rounds.
301 sub w5,w5,#2
// v18-v23 and v7 = the last seven round keys; v7 is the final one.
302 ld1 {v18.4s,v19.4s},[x7],#32
303 ld1 {v20.4s,v21.4s},[x7],#32
304 ld1 {v22.4s,v23.4s},[x7],#32
305 ld1 {v7.4s},[x7]
306
// x7 -> round key 2.
307 add x7,x3,#32
308 mov w6,w5
309 b.eq Lcbc_dec
310
// ---- encryption ------------------------------------------------------------
311 cmp w5,#2
// First block: XOR the IV into the plaintext.
312 eor v0.16b,v0.16b,v6.16b
// v5 = rndkey[0] ^ rndkey[last].  The next plaintext block is XORed with v5 and
// then used as the key operand of the first aese of the next block, applied to
// the state that has not yet had rndkey[last] added.  The two rndkey[last]
// terms cancel, leaving ciphertext ^ plaintext ^ rndkey[0]: the CBC chaining
// XOR and the first AddRoundKey folded into a single instruction.
313 eor v5.16b,v16.16b,v7.16b
// 10-round keys take the dedicated loop at Lcbc_enc128.
314 b.eq Lcbc_enc128
315
// 192/256-bit keys: v2,v3 = round keys 2,3; x7/x6/x12/x14/x3 are set to the
// addresses of round keys 1/4/5/6/7 for reloading inside the loop (v16 and v17
// are reused for the next input block and for those round keys).
316 ld1 {v2.4s,v3.4s},[x7]
317 add x7,x3,#16
318 add x6,x3,#16*4
319 add x12,x3,#16*5
320 aese v0.16b,v16.16b
321 aesmc v0.16b,v0.16b
322 add x14,x3,#16*6
323 add x3,x3,#16*7
324 b Lenter_cbc_enc
325
326.align 4
327Loop_cbc_enc:
// v16 here is (next plaintext ^ v5), see the note at the v5 computation.
328 aese v0.16b,v16.16b
329 aesmc v0.16b,v0.16b
// Store the ciphertext of the previous block.
330 st1 {v6.16b},[x1],#16
331Lenter_cbc_enc:
332 aese v0.16b,v17.16b
333 aesmc v0.16b,v0.16b
334 aese v0.16b,v2.16b
335 aesmc v0.16b,v0.16b
336 ld1 {v16.4s},[x6]
// w5 == 4 means 12 rounds: skip the two extra rounds only needed for 14.
337 cmp w5,#4
338 aese v0.16b,v3.16b
339 aesmc v0.16b,v0.16b
340 ld1 {v17.4s},[x12]
341 b.eq Lcbc_enc192
342
343 aese v0.16b,v16.16b
344 aesmc v0.16b,v0.16b
345 ld1 {v16.4s},[x14]
346 aese v0.16b,v17.16b
347 aesmc v0.16b,v0.16b
348 ld1 {v17.4s},[x3]
349 nop
350
351Lcbc_enc192:
352 aese v0.16b,v16.16b
353 aesmc v0.16b,v0.16b
// Count down the remaining length; the resulting flags drive both the csel
// below and the b.hs at the bottom of the loop.
354 subs x2,x2,#16
355 aese v0.16b,v17.16b
356 aesmc v0.16b,v0.16b
357 csel x8,xzr,x8,eq
358 aese v0.16b,v18.16b
359 aesmc v0.16b,v0.16b
360 aese v0.16b,v19.16b
361 aesmc v0.16b,v0.16b
// Look-ahead load of the next plaintext block (re-reads the current one when
// x8 is 0).
362 ld1 {v16.16b},[x0],x8
363 aese v0.16b,v20.16b
364 aesmc v0.16b,v0.16b
365 eor v16.16b,v16.16b,v5.16b
366 aese v0.16b,v21.16b
367 aesmc v0.16b,v0.16b
368 ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
369 aese v0.16b,v22.16b
370 aesmc v0.16b,v0.16b
371 aese v0.16b,v23.16b
// v6 = finished ciphertext block (also the chaining value / next IV).
372 eor v6.16b,v0.16b,v7.16b
373 b.hs Loop_cbc_enc
374
375 st1 {v6.16b},[x1],#16
376 b Lcbc_done
377
// 128-bit keys: all round keys stay resident (v16 is only reused for the next
// input block), so no reloads from the schedule are needed inside the loop.
378.align 5
379Lcbc_enc128:
380 ld1 {v2.4s,v3.4s},[x7]
381 aese v0.16b,v16.16b
382 aesmc v0.16b,v0.16b
383 b Lenter_cbc_enc128
384Loop_cbc_enc128:
385 aese v0.16b,v16.16b
386 aesmc v0.16b,v0.16b
387 st1 {v6.16b},[x1],#16
388Lenter_cbc_enc128:
389 aese v0.16b,v17.16b
390 aesmc v0.16b,v0.16b
391 subs x2,x2,#16
392 aese v0.16b,v2.16b
393 aesmc v0.16b,v0.16b
394 csel x8,xzr,x8,eq
395 aese v0.16b,v3.16b
396 aesmc v0.16b,v0.16b
397 aese v0.16b,v18.16b
398 aesmc v0.16b,v0.16b
399 aese v0.16b,v19.16b
400 aesmc v0.16b,v0.16b
401 ld1 {v16.16b},[x0],x8
402 aese v0.16b,v20.16b
403 aesmc v0.16b,v0.16b
404 aese v0.16b,v21.16b
405 aesmc v0.16b,v0.16b
406 aese v0.16b,v22.16b
407 aesmc v0.16b,v0.16b
408 eor v16.16b,v16.16b,v5.16b
409 aese v0.16b,v23.16b
410 eor v6.16b,v0.16b,v7.16b
411 b.hs Loop_cbc_enc128
412
413 st1 {v6.16b},[x1],#16
414 b Lcbc_done
// ---- decryption ------------------------------------------------------------
// Blocks are processed three at a time in v0, v1 and v18.  v18/v19 no longer
// hold round keys on this path.  Copies of the ciphertext blocks are kept
// (v2, v3, v19) because each one is needed as the XOR mask for the following
// block's plaintext, and the last one becomes the new IV.
415.align 5
416Lcbc_dec:
417 ld1 {v18.16b},[x0],#16
// x2 is biased so that "lo" after this subs means fewer than three blocks.
418 subs x2,x2,#32 // bias
// w6 = rounds - 6: counter for the two-rounds-per-pass inner loops.
419 add w6,w5,#2
420 orr v3.16b,v0.16b,v0.16b
421 orr v1.16b,v0.16b,v0.16b
422 orr v19.16b,v18.16b,v18.16b
423 b.lo Lcbc_dec_tail
424
425 orr v1.16b,v18.16b,v18.16b
426 ld1 {v18.16b},[x0],#16
427 orr v2.16b,v0.16b,v0.16b
428 orr v3.16b,v1.16b,v1.16b
429 orr v19.16b,v18.16b,v18.16b
430
431Loop3x_cbc_dec:
432 aesd v0.16b,v16.16b
433 aesimc v0.16b,v0.16b
434 aesd v1.16b,v16.16b
435 aesimc v1.16b,v1.16b
436 aesd v18.16b,v16.16b
437 aesimc v18.16b,v18.16b
438 ld1 {v16.4s},[x7],#16
439 subs w6,w6,#2
440 aesd v0.16b,v17.16b
441 aesimc v0.16b,v0.16b
442 aesd v1.16b,v17.16b
443 aesimc v1.16b,v1.16b
444 aesd v18.16b,v17.16b
445 aesimc v18.16b,v18.16b
446 ld1 {v17.4s},[x7],#16
447 b.gt Loop3x_cbc_dec
448
449 aesd v0.16b,v16.16b
450 aesimc v0.16b,v0.16b
451 aesd v1.16b,v16.16b
452 aesimc v1.16b,v1.16b
453 aesd v18.16b,v16.16b
454 aesimc v18.16b,v18.16b
// v4/v5/v17 = (previous ciphertext or IV) ^ last round key for the three
// blocks, so a single XOR later finishes both the last round and the chaining.
455 eor v4.16b,v6.16b,v7.16b
// The flags from this subs are used by the csel below and by the b.hs that
// closes the loop; nothing in between writes the flags.
456 subs x2,x2,#0x30
457 eor v5.16b,v2.16b,v7.16b
458 csel x6,x2,x6,lo // x6, w6, is zero at this point
459 aesd v0.16b,v17.16b
460 aesimc v0.16b,v0.16b
461 aesd v1.16b,v17.16b
462 aesimc v1.16b,v1.16b
463 aesd v18.16b,v17.16b
464 aesimc v18.16b,v18.16b
465 eor v17.16b,v3.16b,v7.16b
466 add x0,x0,x6 // x0 is adjusted in such way that
467 // at exit from the loop v1.16b-v18.16b
468 // are loaded with last "words"
// New chaining value: the last ciphertext block of this group.
469 orr v6.16b,v19.16b,v19.16b
// Rewind x7 to the start of the schedule for the re-pre-loads below.
470 mov x7,x3
471 aesd v0.16b,v20.16b
472 aesimc v0.16b,v0.16b
473 aesd v1.16b,v20.16b
474 aesimc v1.16b,v1.16b
475 aesd v18.16b,v20.16b
476 aesimc v18.16b,v18.16b
// Next three ciphertext blocks are loaded while the current ones finish.
477 ld1 {v2.16b},[x0],#16
478 aesd v0.16b,v21.16b
479 aesimc v0.16b,v0.16b
480 aesd v1.16b,v21.16b
481 aesimc v1.16b,v1.16b
482 aesd v18.16b,v21.16b
483 aesimc v18.16b,v18.16b
484 ld1 {v3.16b},[x0],#16
485 aesd v0.16b,v22.16b
486 aesimc v0.16b,v0.16b
487 aesd v1.16b,v22.16b
488 aesimc v1.16b,v1.16b
489 aesd v18.16b,v22.16b
490 aesimc v18.16b,v18.16b
491 ld1 {v19.16b},[x0],#16
492 aesd v0.16b,v23.16b
493 aesd v1.16b,v23.16b
494 aesd v18.16b,v23.16b
495 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
// Reset the inner-loop round counter.
496 add w6,w5,#2
497 eor v4.16b,v4.16b,v0.16b
498 eor v5.16b,v5.16b,v1.16b
499 eor v18.16b,v18.16b,v17.16b
500 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
501 st1 {v4.16b},[x1],#16
502 orr v0.16b,v2.16b,v2.16b
503 st1 {v5.16b},[x1],#16
504 orr v1.16b,v3.16b,v3.16b
505 st1 {v18.16b},[x1],#16
506 orr v18.16b,v19.16b,v19.16b
507 b.hs Loop3x_cbc_dec
508
// x2 == -0x30 means the length was an exact multiple of three blocks.
509 cmn x2,#0x30
510 b.eq Lcbc_done
511 nop
512
// One or two blocks remain, held in v1 and v18 (v18 only, when a single block
// is left; v1 is processed as well but its result is then unused).
513Lcbc_dec_tail:
514 aesd v1.16b,v16.16b
515 aesimc v1.16b,v1.16b
516 aesd v18.16b,v16.16b
517 aesimc v18.16b,v18.16b
518 ld1 {v16.4s},[x7],#16
519 subs w6,w6,#2
520 aesd v1.16b,v17.16b
521 aesimc v1.16b,v1.16b
522 aesd v18.16b,v17.16b
523 aesimc v18.16b,v18.16b
524 ld1 {v17.4s},[x7],#16
525 b.gt Lcbc_dec_tail
526
527 aesd v1.16b,v16.16b
528 aesimc v1.16b,v1.16b
529 aesd v18.16b,v16.16b
530 aesimc v18.16b,v18.16b
531 aesd v1.16b,v17.16b
532 aesimc v1.16b,v1.16b
533 aesd v18.16b,v17.16b
534 aesimc v18.16b,v18.16b
535 aesd v1.16b,v20.16b
536 aesimc v1.16b,v1.16b
537 aesd v18.16b,v20.16b
538 aesimc v18.16b,v18.16b
// x2 == -0x20 means exactly one block is left; tested by the b.eq below (the
// vector instructions in between leave the flags alone).
539 cmn x2,#0x20
540 aesd v1.16b,v21.16b
541 aesimc v1.16b,v1.16b
542 aesd v18.16b,v21.16b
543 aesimc v18.16b,v18.16b
544 eor v5.16b,v6.16b,v7.16b
545 aesd v1.16b,v22.16b
546 aesimc v1.16b,v1.16b
547 aesd v18.16b,v22.16b
548 aesimc v18.16b,v18.16b
549 eor v17.16b,v3.16b,v7.16b
550 aesd v1.16b,v23.16b
551 aesd v18.16b,v23.16b
552 b.eq Lcbc_dec_one
// Two blocks: v5 finishes the first, v17 the second; v6 = last ciphertext.
553 eor v5.16b,v5.16b,v1.16b
554 eor v17.16b,v17.16b,v18.16b
555 orr v6.16b,v19.16b,v19.16b
556 st1 {v5.16b},[x1],#16
557 st1 {v17.16b},[x1],#16
558 b Lcbc_done
559
// One block: only v18 carries a wanted result.
560Lcbc_dec_one:
561 eor v5.16b,v5.16b,v18.16b
562 orr v6.16b,v19.16b,v19.16b
563 st1 {v5.16b},[x1],#16
564
// Write the chaining value back through the IV pointer.
565Lcbc_done:
566 st1 {v6.16b},[x4]
567Lcbc_abort:
// Only x29 is reloaded: this routine makes no calls, so x30 is unchanged.
568 ldr x29,[sp],#16
569 ret
570
// _aes_hw_ctr32_encrypt_blocks -- CTR-mode keystream XOR with a 32-bit counter.
//
// Register use, as read from the code below:
//   x0  in   input pointer
//   x1  in   output pointer
//   x2  in   number of 16-byte blocks
//   x3  in   key schedule (round count read from offset 240)
//   x4  in   pointer to the 16-byte counter block; only read, never written.
//            The 32-bit word at offset 12 is the counter that gets incremented
//            (wrapping at 32 bits, since all arithmetic is on w registers).
// Scratch: x2, x5-x10, x12, v0-v7, v16-v23, flags.
// NOTE(review): a block count of 0 satisfies the "b.ls Lctr32_tail" test and
// the tail then reads 16 input bytes and stores 32 output bytes; callers
// presumably never pass 0 -- confirm.
// NOTE(review): the first "rev w8" is skipped when __ARMEB__ is defined but the
// later rev instructions are unconditional; big-endian behaviour should be
// checked if that configuration is ever built.
571.globl _aes_hw_ctr32_encrypt_blocks
572.private_extern _aes_hw_ctr32_encrypt_blocks
573
574.align 5
575_aes_hw_ctr32_encrypt_blocks:
576 stp x29,x30,[sp,#-16]!
577 add x29,sp,#0
578 ldr w5,[x3,#240]
579
// w8 = counter word as stored in memory; v0 = whole counter block.
580 ldr w8, [x4, #12]
581 ld1 {v0.4s},[x4]
582
583 ld1 {v16.4s,v17.4s},[x3] // load key schedule...
584 sub w5,w5,#4
// x12 = input post-index step used by the tail: 16, or 0 when fewer than two
// blocks are requested (see the csel below).
585 mov x12,#16
// The flags from this cmp feed the csel and the "b.ls Lctr32_tail" below; no
// instruction in between writes the flags.
586 cmp x2,#2
587 add x7,x3,x5,lsl#4 // pointer to last 5 round keys
// w5 = rounds - 6: counter for the two-rounds-per-pass inner loops.
588 sub w5,w5,#2
// v20-v23 and v7 = the last five round keys; v7 is the final one.
589 ld1 {v20.4s,v21.4s},[x7],#32
590 ld1 {v22.4s,v23.4s},[x7],#32
591 ld1 {v7.4s},[x7]
// x7 -> round key 2.
592 add x7,x3,#32
593 mov w6,w5
594 csel x12,xzr,x12,lo
// Byte-swap the counter so it can be incremented with plain adds.
595#ifndef __ARMEB__
596 rev w8, w8
597#endif
// v0 = counter block n, v1 = n+1, v18 = n+2; v6 keeps an untouched template of
// the counter block.  w8 ends up holding n+2.
598 orr v1.16b,v0.16b,v0.16b
599 add w10, w8, #1
600 orr v18.16b,v0.16b,v0.16b
601 add w8, w8, #2
602 orr v6.16b,v0.16b,v0.16b
603 rev w10, w10
604 mov v1.s[3],w10
// Two blocks or fewer: go straight to the two-block tail.
605 b.ls Lctr32_tail
// w12 is borrowed as scratch here; x12 is set up again before the tail is
// reached from the loop.
606 rev w12, w8
607 sub x2,x2,#3 // bias
608 mov v18.s[3],w12
609 b Loop3x_ctr32
610
611.align 4
// Three counter blocks are encrypted in parallel in v0, v1 and v18.
612Loop3x_ctr32:
613 aese v0.16b,v16.16b
614 aesmc v0.16b,v0.16b
615 aese v1.16b,v16.16b
616 aesmc v1.16b,v1.16b
617 aese v18.16b,v16.16b
618 aesmc v18.16b,v18.16b
619 ld1 {v16.4s},[x7],#16
620 subs w6,w6,#2
621 aese v0.16b,v17.16b
622 aesmc v0.16b,v0.16b
623 aese v1.16b,v17.16b
624 aesmc v1.16b,v1.16b
625 aese v18.16b,v17.16b
626 aesmc v18.16b,v18.16b
627 ld1 {v17.4s},[x7],#16
628 b.gt Loop3x_ctr32
629
// From here the in-flight states move to v4, v5 and v17 so that v0, v1 and v18
// can be refilled from the template (v6) with the next three counter values.
630 aese v0.16b,v16.16b
631 aesmc v4.16b,v0.16b
632 aese v1.16b,v16.16b
633 aesmc v5.16b,v1.16b
634 ld1 {v2.16b},[x0],#16
635 orr v0.16b,v6.16b,v6.16b
636 aese v18.16b,v16.16b
637 aesmc v18.16b,v18.16b
638 ld1 {v3.16b},[x0],#16
639 orr v1.16b,v6.16b,v6.16b
640 aese v4.16b,v17.16b
641 aesmc v4.16b,v4.16b
642 aese v5.16b,v17.16b
643 aesmc v5.16b,v5.16b
644 ld1 {v19.16b},[x0],#16
// Rewind x7 to the start of the schedule for the re-pre-loads below.
645 mov x7,x3
646 aese v18.16b,v17.16b
647 aesmc v17.16b,v18.16b
648 orr v18.16b,v6.16b,v6.16b
649 add w9,w8,#1
650 aese v4.16b,v20.16b
651 aesmc v4.16b,v4.16b
652 aese v5.16b,v20.16b
653 aesmc v5.16b,v5.16b
// The input blocks are pre-XORed with the last round key, so one XOR with the
// aese result later yields the output.
654 eor v2.16b,v2.16b,v7.16b
655 add w10,w8,#2
656 aese v17.16b,v20.16b
657 aesmc v17.16b,v17.16b
658 eor v3.16b,v3.16b,v7.16b
659 add w8,w8,#3
660 aese v4.16b,v21.16b
661 aesmc v4.16b,v4.16b
662 aese v5.16b,v21.16b
663 aesmc v5.16b,v5.16b
664 eor v19.16b,v19.16b,v7.16b
665 rev w9,w9
666 aese v17.16b,v21.16b
667 aesmc v17.16b,v17.16b
668 mov v0.s[3], w9
669 rev w10,w10
670 aese v4.16b,v22.16b
671 aesmc v4.16b,v4.16b
672 aese v5.16b,v22.16b
673 aesmc v5.16b,v5.16b
674 mov v1.s[3], w10
675 rev w12,w8
676 aese v17.16b,v22.16b
677 aesmc v17.16b,v17.16b
678 mov v18.s[3], w12
// The flags from this subs are consumed by the b.hs that closes the loop.
679 subs x2,x2,#3
680 aese v4.16b,v23.16b
681 aese v5.16b,v23.16b
682 aese v17.16b,v23.16b
683
684 eor v2.16b,v2.16b,v4.16b
685 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
686 st1 {v2.16b},[x1],#16
687 eor v3.16b,v3.16b,v5.16b
// Reset the inner-loop round counter.
688 mov w6,w5
689 st1 {v3.16b},[x1],#16
690 eor v19.16b,v19.16b,v17.16b
691 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
692 st1 {v19.16b},[x1],#16
693 b.hs Loop3x_ctr32
694
// Undo the bias: x2 = number of blocks still to do (0, 1 or 2).
695 adds x2,x2,#3
696 b.eq Lctr32_done
// Re-establish the tail's input step: 0 if exactly one block is left (so the
// second load re-reads it rather than reading past the input), otherwise 16.
697 cmp x2,#1
698 mov x12,#16
699 csel x12,xzr,x12,eq
700
// Tail: encrypt the counter blocks in v0 and v1 and emit one or two blocks.
701Lctr32_tail:
702 aese v0.16b,v16.16b
703 aesmc v0.16b,v0.16b
704 aese v1.16b,v16.16b
705 aesmc v1.16b,v1.16b
706 ld1 {v16.4s},[x7],#16
707 subs w6,w6,#2
708 aese v0.16b,v17.16b
709 aesmc v0.16b,v0.16b
710 aese v1.16b,v17.16b
711 aesmc v1.16b,v1.16b
712 ld1 {v17.4s},[x7],#16
713 b.gt Lctr32_tail
714
715 aese v0.16b,v16.16b
716 aesmc v0.16b,v0.16b
717 aese v1.16b,v16.16b
718 aesmc v1.16b,v1.16b
719 aese v0.16b,v17.16b
720 aesmc v0.16b,v0.16b
721 aese v1.16b,v17.16b
722 aesmc v1.16b,v1.16b
723 ld1 {v2.16b},[x0],x12
724 aese v0.16b,v20.16b
725 aesmc v0.16b,v0.16b
726 aese v1.16b,v20.16b
727 aesmc v1.16b,v1.16b
728 ld1 {v3.16b},[x0]
729 aese v0.16b,v21.16b
730 aesmc v0.16b,v0.16b
731 aese v1.16b,v21.16b
732 aesmc v1.16b,v1.16b
733 eor v2.16b,v2.16b,v7.16b
734 aese v0.16b,v22.16b
735 aesmc v0.16b,v0.16b
736 aese v1.16b,v22.16b
737 aesmc v1.16b,v1.16b
738 eor v3.16b,v3.16b,v7.16b
739 aese v0.16b,v23.16b
740 aese v1.16b,v23.16b
741
// Exactly one block: store the first result only.
742 cmp x2,#1
743 eor v2.16b,v2.16b,v0.16b
744 eor v3.16b,v3.16b,v1.16b
745 st1 {v2.16b},[x1],#16
746 b.eq Lctr32_done
747 st1 {v3.16b},[x1]
748
749Lctr32_done:
// Only x29 is reloaded: this routine makes no calls, so x30 is unchanged.
750 ldr x29,[sp],#16
751 ret
752
753#endif