// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
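// MemorySanitizer cannot see into hand-written assembly, so MSan builds fall
// back to the plain C implementations by forcing OPENSSL_NO_ASM above.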

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.globl	_gcm_init_neon
.private_extern	_gcm_init_neon

.align	4
_gcm_init_neon:
	// This function is adapted from gcm_init_v8. xC2 is t3.
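	// In outline: the two 64-bit halves of H are swapped, H is shifted
	// left by one bit across the full 128 bits, and, when the bit shifted
	// out of the top is set, the 0xc2...01 reduction constant is xor-ed
	// in, yielding the "twisted H" below. Only this one Htable entry is
	// stored; _gcm_gmult_neon and _gcm_ghash_neon reload just these
	// 16 bytes and derive the Karatsuba operand from them on the fly.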
	ld1	{v17.2d}, [x1]			// load H
	movi	v19.16b, #0xe1
	shl	v19.2d, v19.2d, #57		// 0xc2.0
	ext	v3.16b, v17.16b, v17.16b, #8
	ushr	v18.2d, v19.2d, #63
	dup	v17.4s, v17.s[1]
	ext	v16.16b, v18.16b, v19.16b, #8	// t0=0xc2....01
	ushr	v18.2d, v3.2d, #63
	sshr	v17.4s, v17.4s, #31		// broadcast carry bit
	and	v18.16b, v18.16b, v16.16b
	shl	v3.2d, v3.2d, #1
	ext	v18.16b, v18.16b, v18.16b, #8
	and	v16.16b, v16.16b, v17.16b
	orr	v3.16b, v3.16b, v18.16b		// H<<<=1
	eor	v5.16b, v3.16b, v16.16b		// twisted H
	st1	{v5.2d}, [x0]			// store Htable[0]
	ret


.globl	_gcm_gmult_neon
.private_extern	_gcm_gmult_neon

.align	4
_gcm_gmult_neon:
	ld1	{v3.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8	// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks@PAGE		// load constants
	add	x9, x9, Lmasks@PAGEOFF
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v3.16b, v3.16b		// byteswap Xi
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

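	// x3 is set to a single 16-byte block so that the shared Lgmult_neon
	// body below runs exactly once: the subs/bne at the end of Loop_neon
	// falls through and the result is byteswapped and stored to Xi.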
	mov	x3, #16
	b	Lgmult_neon


.globl	_gcm_ghash_neon
.private_extern	_gcm_ghash_neon

.align	4
_gcm_ghash_neon:
	ld1	{v0.16b}, [x0]		// load Xi
	ld1	{v5.1d}, [x1], #8	// load twisted H
	ld1	{v6.1d}, [x1]
	adrp	x9, Lmasks@PAGE		// load constants
	add	x9, x9, Lmasks@PAGEOFF
	ld1	{v24.2d, v25.2d}, [x9]
	rev64	v0.16b, v0.16b		// byteswap Xi
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v7.8b, v5.8b, v6.8b	// Karatsuba pre-processing

Loop_neon:
	ld1	{v3.16b}, [x2], #16	// load inp
	rev64	v3.16b, v3.16b		// byteswap inp
	ext	v3.16b, v3.16b, v3.16b, #8
	eor	v3.16b, v3.16b, v0.16b	// inp ^= Xi

Lgmult_neon:
	// Split the input into v3 and v4. (The upper halves are unused,
	// so it is okay to leave them alone.)
	ins	v4.d[0], v3.d[1]
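	// Each 64x64-bit carry-less multiply below is built out of 8-bit
	// polynomial multiplies (pmull on .8b lanes): byte-rotated copies of
	// the operands (A1..A4, B1..B4) produce the partial products named
	// E..K in the comments, which are then masked and shifted into place.
	// This follows the ARMv4 NEON scheme this file notes it derives from.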
	ext	v16.8b, v5.8b, v5.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v0.8b, v3.8b, v3.8b, #1		// B1
	pmull	v0.8h, v5.8b, v0.8b		// E = A*B1
	ext	v17.8b, v5.8b, v5.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v5.8b, v19.8b		// G = A*B2
	ext	v18.8b, v5.8b, v5.8b, #3	// A3
	eor	v16.16b, v16.16b, v0.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v0.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v0.8h, v5.8b, v0.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	// vand	$t0#hi, $t0#hi, $k48
	// veor	$t0#lo, $t0#lo, $t0#hi
	//
	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	// vand	$t1#hi, $t1#hi, $k32
	// veor	$t1#lo, $t1#lo, $t1#hi
	//
	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	// vand	$t2#hi, $t2#hi, $k16
	// veor	$t2#lo, $t2#lo, $t2#hi
	//
	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	// vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v0.16b	// N = I + J
	pmull	v19.8h, v5.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
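	// Pairing layout: zip1/zip2 gather {t0.lo,t1.lo} into v20 and
	// {t0.hi,t1.hi} into v21 (likewise v22/v23 for t2/t3), while v24 and
	// v25 hold {k48,k32} and {k16,0} from Lmasks, so each eor/and/eor
	// below performs two of the 32-bit version's mask-and-fold steps.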
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v0.8h, v5.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v0.16b, v0.16b, v18.16b
	eor	v3.8b, v3.8b, v4.8b		// Karatsuba pre-processing
	ext	v16.8b, v7.8b, v7.8b, #1	// A1
	pmull	v16.8h, v16.8b, v3.8b		// F = A1*B
	ext	v1.8b, v3.8b, v3.8b, #1		// B1
	pmull	v1.8h, v7.8b, v1.8b		// E = A*B1
	ext	v17.8b, v7.8b, v7.8b, #2	// A2
	pmull	v17.8h, v17.8b, v3.8b		// H = A2*B
	ext	v19.8b, v3.8b, v3.8b, #2	// B2
	pmull	v19.8h, v7.8b, v19.8b		// G = A*B2
	ext	v18.8b, v7.8b, v7.8b, #3	// A3
	eor	v16.16b, v16.16b, v1.16b	// L = E + F
	pmull	v18.8h, v18.8b, v3.8b		// J = A3*B
	ext	v1.8b, v3.8b, v3.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v1.8h, v7.8b, v1.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	// vand	$t0#hi, $t0#hi, $k48
	// veor	$t0#lo, $t0#lo, $t0#hi
	//
	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	// vand	$t1#hi, $t1#hi, $k32
	// veor	$t1#lo, $t1#lo, $t1#hi
	//
	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	// vand	$t2#hi, $t2#hi, $k16
	// veor	$t2#lo, $t2#lo, $t2#hi
	//
	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	// vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v3.8b, v3.8b, #4	// B4
	eor	v18.16b, v18.16b, v1.16b	// N = I + J
	pmull	v19.8h, v7.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v1.8h, v7.8b, v3.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v1.16b, v1.16b, v18.16b
	ext	v16.8b, v6.8b, v6.8b, #1	// A1
	pmull	v16.8h, v16.8b, v4.8b		// F = A1*B
	ext	v2.8b, v4.8b, v4.8b, #1		// B1
	pmull	v2.8h, v6.8b, v2.8b		// E = A*B1
	ext	v17.8b, v6.8b, v6.8b, #2	// A2
	pmull	v17.8h, v17.8b, v4.8b		// H = A2*B
	ext	v19.8b, v4.8b, v4.8b, #2	// B2
	pmull	v19.8h, v6.8b, v19.8b		// G = A*B2
	ext	v18.8b, v6.8b, v6.8b, #3	// A3
	eor	v16.16b, v16.16b, v2.16b	// L = E + F
	pmull	v18.8h, v18.8b, v4.8b		// J = A3*B
	ext	v2.8b, v4.8b, v4.8b, #3		// B3
	eor	v17.16b, v17.16b, v19.16b	// M = G + H
	pmull	v2.8h, v6.8b, v2.8b		// I = A*B3

	// Here we diverge from the 32-bit version. It computes the following
	// (instructions reordered for clarity):
	//
	// veor	$t0#lo, $t0#lo, $t0#hi	@ t0 = P0 + P1 (L)
	// vand	$t0#hi, $t0#hi, $k48
	// veor	$t0#lo, $t0#lo, $t0#hi
	//
	// veor	$t1#lo, $t1#lo, $t1#hi	@ t1 = P2 + P3 (M)
	// vand	$t1#hi, $t1#hi, $k32
	// veor	$t1#lo, $t1#lo, $t1#hi
	//
	// veor	$t2#lo, $t2#lo, $t2#hi	@ t2 = P4 + P5 (N)
	// vand	$t2#hi, $t2#hi, $k16
	// veor	$t2#lo, $t2#lo, $t2#hi
	//
	// veor	$t3#lo, $t3#lo, $t3#hi	@ t3 = P6 + P7 (K)
	// vmov.i64	$t3#hi, #0
	//
	// $kN is a mask with the bottom N bits set. AArch64 cannot compute on
	// upper halves of SIMD registers, so we must split each half into
	// separate registers. To compensate, we pair computations up and
	// parallelize.

	ext	v19.8b, v4.8b, v4.8b, #4	// B4
	eor	v18.16b, v18.16b, v2.16b	// N = I + J
	pmull	v19.8h, v6.8b, v19.8b		// K = A*B4

	// This can probably be scheduled more efficiently. For now, we just
	// pair up independent instructions.
	zip1	v20.2d, v16.2d, v17.2d
	zip1	v22.2d, v18.2d, v19.2d
	zip2	v21.2d, v16.2d, v17.2d
	zip2	v23.2d, v18.2d, v19.2d
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	and	v21.16b, v21.16b, v24.16b
	and	v23.16b, v23.16b, v25.16b
	eor	v20.16b, v20.16b, v21.16b
	eor	v22.16b, v22.16b, v23.16b
	zip1	v16.2d, v20.2d, v21.2d
	zip1	v18.2d, v22.2d, v23.2d
	zip2	v17.2d, v20.2d, v21.2d
	zip2	v19.2d, v22.2d, v23.2d

	ext	v16.16b, v16.16b, v16.16b, #15	// t0 = t0 << 8
	ext	v17.16b, v17.16b, v17.16b, #14	// t1 = t1 << 16
	pmull	v2.8h, v6.8b, v4.8b		// D = A*B
	ext	v19.16b, v19.16b, v19.16b, #12	// t3 = t3 << 32
	ext	v18.16b, v18.16b, v18.16b, #13	// t2 = t2 << 24
	eor	v16.16b, v16.16b, v17.16b
	eor	v18.16b, v18.16b, v19.16b
	eor	v2.16b, v2.16b, v16.16b
	eor	v2.16b, v2.16b, v18.16b
	ext	v16.16b, v0.16b, v2.16b, #8
	eor	v1.16b, v1.16b, v0.16b		// Karatsuba post-processing
	eor	v1.16b, v1.16b, v2.16b
	eor	v1.16b, v1.16b, v16.16b		// Xm overlaps Xh.lo and Xl.hi
	ins	v0.d[1], v1.d[0]		// Xh|Xl - 256-bit result
	// This is a no-op due to the ins instruction below.
	// ins	v2.d[0], v1.d[1]

	// equivalent of reduction_avx from ghash-x86_64.pl
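	// The shift counts come from the GHASH polynomial
	// x^128 + x^7 + x^2 + x + 1 in this bit-reflected representation:
	// 57, 62 and 63 are 64-7, 64-2 and 64-1, and the second phase applies
	// the corresponding right shifts by 1, 2 (via 1+1) and 7 (via 1+6).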
	shl	v17.2d, v0.2d, #57	// 1st phase
	shl	v18.2d, v0.2d, #62
	eor	v18.16b, v18.16b, v17.16b	//
	shl	v17.2d, v0.2d, #63
	eor	v18.16b, v18.16b, v17.16b	//
	// Note Xm contains {Xl.d[1], Xh.d[0]}.
	eor	v18.16b, v18.16b, v1.16b
	ins	v0.d[1], v18.d[0]	// Xl.d[1] ^= t2.d[0]
	ins	v2.d[0], v18.d[1]	// Xh.d[0] ^= t2.d[1]

	ushr	v18.2d, v0.2d, #1	// 2nd phase
	eor	v2.16b, v2.16b, v0.16b
	eor	v0.16b, v0.16b, v18.16b	//
	ushr	v18.2d, v18.2d, #6
	ushr	v0.2d, v0.2d, #1	//
	eor	v0.16b, v0.16b, v2.16b	//
	eor	v0.16b, v0.16b, v18.16b	//

	subs	x3, x3, #16
	bne	Loop_neon

	rev64	v0.16b, v0.16b		// byteswap Xi and write
	ext	v0.16b, v0.16b, v0.16b, #8
	st1	{v0.16b}, [x0]

	ret


.section	__TEXT,__const
.align	4
Lmasks:
.quad	0x0000ffffffffffff	// k48
.quad	0x00000000ffffffff	// k32
.quad	0x000000000000ffff	// k16
.quad	0x0000000000000000	// k0
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM