// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
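//
// GHASH for ARMv8 Crypto Extensions, 32-bit ARM build ("GHASH for ARMv8,
// CRYPTOGAMS by <appro@openssl.org>", per the string at the end of the
// file).  The three routines below implement the GHASH step of AES-GCM:
// multiplication in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1.  Each
// 128x128-bit carry-less product is split Karatsuba-style into three
// 64x64-bit polynomial multiplications; those instructions (annotated
// "pmull"/"pmull2" in the comments) are emitted as raw .byte sequences so
// the file assembles even with toolchains that lack the Crypto Extension
// mnemonics in 32-bit mode.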

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__arm__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text
.fpu	neon
.code	32
#undef	__thumb2__
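@ gcm_init_v8(Htable, H)
@
@ On entry r0 points to the Htable to be filled in and r1 to the raw hash
@ key H (one 16-byte block).  The routine derives the "twisted" copy of H
@ (H shifted left by one bit with the 0xc2 reduction constant folded in),
@ H^2, and the packed Karatsuba pre-processed terms for both, and stores
@ them as Htable[0..2] for use by gcm_gmult_v8/gcm_ghash_v8 below.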
.globl	gcm_init_v8
.hidden	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	vld1.64	{q9},[r1]		@ load input H
	vmov.i8	q11,#0xe1
	vshl.i64	q11,q11,#57		@ 0xc2.0
	vext.8	q3,q9,q9,#8
	vshr.u64	q10,q11,#63
	vdup.32	q9,d18[1]
	vext.8	q8,q10,q11,#8		@ t0=0xc2....01
	vshr.u64	q10,q3,#63
	vshr.s32	q9,q9,#31		@ broadcast carry bit
	vand	q10,q10,q8
	vshl.i64	q3,q3,#1
	vext.8	q10,q10,q10,#8
	vand	q8,q8,q9
	vorr	q3,q3,q10		@ H<<<=1
	veor	q12,q3,q8		@ twisted H
	vst1.64	{q12},[r0]!		@ store Htable[0]

	@ calculate H^2
	vext.8	q8,q12,q12,#8		@ Karatsuba pre-processing
.byte	0xa8,0x0e,0xa8,0xf2	@ pmull q0,q12,q12
	veor	q8,q8,q12
.byte	0xa9,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q12
.byte	0xa0,0x2e,0xa0,0xf2	@ pmull q1,q8,q8

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	veor	q1,q1,q10
.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase

	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	veor	q0,q1,q10

	vext.8	q10,q0,q0,#8		@ 2nd phase
.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor	q10,q10,q2
	veor	q14,q0,q10

	vext.8	q9,q14,q14,#8		@ Karatsuba pre-processing
	veor	q9,q9,q14
	vext.8	q13,q8,q9,#8		@ pack Karatsuba pre-processed
	vst1.64	{q13,q14},[r0]		@ store Htable[1..2]

	bx	lr
.size	gcm_init_v8,.-gcm_init_v8
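@ gcm_gmult_v8(Xi, Htable)
@
@ Multiplies the hash value Xi (pointed to by r0) by H in GF(2^128),
@ using the twisted H and packed Karatsuba terms that gcm_init_v8 left in
@ Htable (r1): one Karatsuba multiplication (three pmull/pmull2 products)
@ followed by the two-phase reduction with the 0xc2 constant, after which
@ Xi is written back in place.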
.globl	gcm_gmult_v8
.hidden	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	vld1.64	{q9},[r0]		@ load Xi
	vmov.i8	q11,#0xe1
	vld1.64	{q12,q13},[r1]	@ load twisted H, ...
	vshl.u64	q11,q11,#57
#ifndef __ARMEB__
	vrev64.8	q9,q9
#endif
	vext.8	q3,q9,q9,#8

.byte	0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3		@ H.lo·Xi.lo
	veor	q9,q9,q3		@ Karatsuba pre-processing
.byte	0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3		@ H.hi·Xi.hi
.byte	0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	veor	q1,q1,q10
.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction

	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	veor	q0,q1,q10

	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor	q10,q10,q2
	veor	q0,q0,q10

#ifndef __ARMEB__
	vrev64.8	q0,q0
#endif
	vext.8	q0,q0,q0,#8
	vst1.64	{q0},[r0]		@ write out Xi

	bx	lr
.size	gcm_gmult_v8,.-gcm_gmult_v8
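@ gcm_ghash_v8(Xi, Htable, inp, len)
@
@ Hashes len bytes (a multiple of the 16-byte block size) at inp (r2)
@ into Xi (r0): conceptually Xi = (Xi ^ I[i])·H for each block I[i].
@ The main loop processes two blocks per iteration with the stored H^2
@ and H from Htable (r1), as (Xi ^ I[i])·H^2 ^ I[i+1]·H; a single
@ leftover block is handled at .Lodd_tail_v8.  r3 carries the remaining
@ byte count.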
.globl	gcm_ghash_v8
.hidden	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ 32-bit ABI says so
	vld1.64	{q0},[r0]		@ load [rotated] Xi
					@ "[rotated]" means that
					@ loaded value would have
					@ to be rotated in order to
					@ make it appear as in
					@ algorithm specification
	subs	r3,r3,#32		@ see if r3 is 32 or larger
	mov	r12,#16		@ r12 is used as post-
					@ increment for input pointer;
					@ as loop is modulo-scheduled
					@ r12 is zeroed just in time
					@ to preclude overstepping
					@ inp[len], which means that
					@ last block[s] are actually
					@ loaded twice, but last
					@ copy is not processed
	vld1.64	{q12,q13},[r1]!	@ load twisted H, ..., H^2
	vmov.i8	q11,#0xe1
	vld1.64	{q14},[r1]
	moveq	r12,#0		@ is it time to zero r12?
	vext.8	q0,q0,q0,#8		@ rotate Xi
	vld1.64	{q8},[r2]!	@ load [rotated] I[0]
	vshl.u64	q11,q11,#57		@ compose 0xc2.0 constant
#ifndef __ARMEB__
	vrev64.8	q8,q8
	vrev64.8	q0,q0
#endif
	vext.8	q3,q8,q8,#8		@ rotate I[0]
	blo	.Lodd_tail_v8		@ r3 was less than 32
	vld1.64	{q9},[r2],r12	@ load [rotated] I[1]
#ifndef __ARMEB__
	vrev64.8	q9,q9
#endif
	vext.8	q7,q9,q9,#8
	veor	q3,q3,q0		@ I[i]^=Xi
.byte	0x8e,0x8e,0xa8,0xf2	@ pmull q4,q12,q7		@ H·Ii+1
	veor	q9,q9,q7		@ Karatsuba pre-processing
.byte	0x8f,0xce,0xa9,0xf2	@ pmull2 q6,q12,q7
	b	.Loop_mod2x_v8

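@ Modulo-scheduled main loop: each iteration folds two input blocks as
@ Xi = (Xi ^ I[i])·H^2 ^ I[i+1]·H, while the loads of the next two blocks
@ and the H·I[i+1] product for the following pair are overlapped with the
@ current reduction.  r12 (the load post-increment) is zeroed just in time
@ so the input pointer never steps past inp[len].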
.align	4
.Loop_mod2x_v8:
	vext.8	q10,q3,q3,#8
	subs	r3,r3,#32		@ is there more data?
.byte	0x86,0x0e,0xac,0xf2	@ pmull q0,q14,q3		@ H^2.lo·Xi.lo
	movlo	r12,#0		@ is it time to zero r12?

.byte	0xa2,0xae,0xaa,0xf2	@ pmull q5,q13,q9
	veor	q10,q10,q3		@ Karatsuba pre-processing
.byte	0x87,0x4e,0xad,0xf2	@ pmull2 q2,q14,q3		@ H^2.hi·Xi.hi
	veor	q0,q0,q4		@ accumulate
.byte	0xa5,0x2e,0xab,0xf2	@ pmull2 q1,q13,q10		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	vld1.64	{q8},[r2],r12	@ load [rotated] I[i+2]

	veor	q2,q2,q6
	moveq	r12,#0		@ is it time to zero r12?
	veor	q1,q1,q5

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	vld1.64	{q9},[r2],r12	@ load [rotated] I[i+3]
#ifndef __ARMEB__
	vrev64.8	q8,q8
#endif
	veor	q1,q1,q10
.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction

#ifndef __ARMEB__
	vrev64.8	q9,q9
#endif
	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	vext.8	q7,q9,q9,#8
	vext.8	q3,q8,q8,#8
	veor	q0,q1,q10
.byte	0x8e,0x8e,0xa8,0xf2	@ pmull q4,q12,q7		@ H·Ii+1
	veor	q3,q3,q2		@ accumulate q3 early

	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor	q3,q3,q10
	veor	q9,q9,q7		@ Karatsuba pre-processing
	veor	q3,q3,q0
.byte	0x8f,0xce,0xa9,0xf2	@ pmull2 q6,q12,q7
	bhs	.Loop_mod2x_v8		@ there were at least 32 more bytes

	veor	q2,q2,q10
	vext.8	q3,q8,q8,#8		@ re-construct q3
	adds	r3,r3,#32		@ re-construct r3
	veor	q0,q0,q2		@ re-construct q0
	beq	.Ldone_v8		@ is r3 zero?
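@ One 16-byte block remains: finish with a single Xi = (Xi ^ I[last])·H
@ multiplication and reduction.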
.Lodd_tail_v8:
	vext.8	q10,q0,q0,#8
	veor	q3,q3,q0		@ inp^=Xi
	veor	q9,q8,q10		@ q9 is rotated inp^Xi

.byte	0x86,0x0e,0xa8,0xf2	@ pmull q0,q12,q3		@ H.lo·Xi.lo
	veor	q9,q9,q3		@ Karatsuba pre-processing
.byte	0x87,0x4e,0xa9,0xf2	@ pmull2 q2,q12,q3		@ H.hi·Xi.hi
.byte	0xa2,0x2e,0xaa,0xf2	@ pmull q1,q13,q9		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	veor	q1,q1,q10
.byte	0x26,0x4e,0xe0,0xf2	@ pmull q10,q0,q11		@ 1st phase of reduction

	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	veor	q0,q1,q10

	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
.byte	0x26,0x0e,0xa0,0xf2	@ pmull q0,q0,q11
	veor	q10,q10,q2
	veor	q0,q0,q10

.Ldone_v8:
#ifndef __ARMEB__
	vrev64.8	q0,q0
#endif
	vext.8	q0,q0,q0,#8
	vst1.64	{q0},[r0]		@ write out Xi

	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ 32-bit ABI says so
	bx	lr
.size	gcm_ghash_v8,.-gcm_ghash_v8
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
#endif  // !OPENSSL_NO_ASM