// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.globl _gcm_init_neon
.private_extern _gcm_init_neon

.align 4
_gcm_init_neon:
    // This function is adapted from gcm_init_v8. xC2 is t3.
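    // x1 is assumed to point at the raw hash key H and x0 at Htable,
    // matching the load from [x1] and the store of Htable[0] to [x0] below.
    // H is shifted left one bit (with carry across the two halves), the
    // 0xc2... reduction constant is folded in, and only the resulting
    // "twisted H" is stored.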
    ld1 {v17.2d}, [x1]              // load H
    movi v19.16b, #0xe1
    shl v19.2d, v19.2d, #57         // 0xc2.0
    ext v3.16b, v17.16b, v17.16b, #8
    ushr v18.2d, v19.2d, #63
    dup v17.4s, v17.s[1]
    ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
    ushr v18.2d, v3.2d, #63
    sshr v17.4s, v17.4s, #31        // broadcast carry bit
    and v18.16b, v18.16b, v16.16b
    shl v3.2d, v3.2d, #1
    ext v18.16b, v18.16b, v18.16b, #8
    and v16.16b, v16.16b, v17.16b
    orr v3.16b, v3.16b, v18.16b     // H<<<=1
    eor v5.16b, v3.16b, v16.16b     // twisted H
    st1 {v5.2d}, [x0]               // store Htable[0]
    ret


.globl _gcm_gmult_neon
.private_extern _gcm_gmult_neon

.align 4
_gcm_gmult_neon:
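    // Single-block multiply. x0 is assumed to point at Xi and x1 at Htable
    // (the twisted H written by gcm_init_neon); setting x3 = 16 and
    // branching into Lgmult_neon runs the shared GHASH loop body exactly
    // once.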
    ld1 {v3.16b}, [x0]              // load Xi
    ld1 {v5.1d}, [x1], #8           // load twisted H
    ld1 {v6.1d}, [x1]
    adrp x9, Lmasks@PAGE            // load constants
    add x9, x9, Lmasks@PAGEOFF
    ld1 {v24.2d, v25.2d}, [x9]
    rev64 v3.16b, v3.16b            // byteswap Xi
    ext v3.16b, v3.16b, v3.16b, #8
    eor v7.8b, v5.8b, v6.8b         // Karatsuba pre-processing

    mov x3, #16
    b Lgmult_neon


.globl _gcm_ghash_neon
.private_extern _gcm_ghash_neon

.align 4
_gcm_ghash_neon:
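    // Bulk GHASH. x0 is assumed to point at Xi, x1 at Htable (twisted H),
    // x2 at the input and x3 to hold the byte length; Loop_neon below
    // consumes 16 bytes per iteration until x3 reaches zero.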
    ld1 {v0.16b}, [x0]              // load Xi
    ld1 {v5.1d}, [x1], #8           // load twisted H
    ld1 {v6.1d}, [x1]
    adrp x9, Lmasks@PAGE            // load constants
    add x9, x9, Lmasks@PAGEOFF
    ld1 {v24.2d, v25.2d}, [x9]
    rev64 v0.16b, v0.16b            // byteswap Xi
    ext v0.16b, v0.16b, v0.16b, #8
    eor v7.8b, v5.8b, v6.8b         // Karatsuba pre-processing

Loop_neon:
    ld1 {v3.16b}, [x2], #16         // load inp
    rev64 v3.16b, v3.16b            // byteswap inp
    ext v3.16b, v3.16b, v3.16b, #8
    eor v3.16b, v3.16b, v0.16b      // inp ^= Xi

Lgmult_neon:
    // Split the input into v3 and v4. (The upper halves are unused,
    // so it is okay to leave them alone.)
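    // Only the baseline 8-bit polynomial multiply (pmull on .8b lanes) is
    // used here, so each 64x64-bit carry-less product is assembled from
    // byte-wise partial products (F = A1*B, E = A*B1, ...), following the
    // ARMv4 scheme this file is derived from (see the .byte credits string
    // at the end of the file).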
    ins v4.d[0], v3.d[1]
    ext v16.8b, v5.8b, v5.8b, #1    // A1
    pmull v16.8h, v16.8b, v3.8b     // F = A1*B
    ext v0.8b, v3.8b, v3.8b, #1     // B1
    pmull v0.8h, v5.8b, v0.8b       // E = A*B1
    ext v17.8b, v5.8b, v5.8b, #2    // A2
    pmull v17.8h, v17.8b, v3.8b     // H = A2*B
    ext v19.8b, v3.8b, v3.8b, #2    // B2
    pmull v19.8h, v5.8b, v19.8b     // G = A*B2
    ext v18.8b, v5.8b, v5.8b, #3    // A3
    eor v16.16b, v16.16b, v0.16b    // L = E + F
    pmull v18.8h, v18.8b, v3.8b     // J = A3*B
    ext v0.8b, v3.8b, v3.8b, #3     // B3
    eor v17.16b, v17.16b, v19.16b   // M = G + H
    pmull v0.8h, v5.8b, v0.8b       // I = A*B3

    // Here we diverge from the 32-bit version. It computes the following
    // (instructions reordered for clarity):
    //
    // veor $t0#lo, $t0#lo, $t0#hi  @ t0 = P0 + P1 (L)
    // vand $t0#hi, $t0#hi, $k48
    // veor $t0#lo, $t0#lo, $t0#hi
    //
    // veor $t1#lo, $t1#lo, $t1#hi  @ t1 = P2 + P3 (M)
    // vand $t1#hi, $t1#hi, $k32
    // veor $t1#lo, $t1#lo, $t1#hi
    //
    // veor $t2#lo, $t2#lo, $t2#hi  @ t2 = P4 + P5 (N)
    // vand $t2#hi, $t2#hi, $k16
    // veor $t2#lo, $t2#lo, $t2#hi
    //
    // veor $t3#lo, $t3#lo, $t3#hi  @ t3 = P6 + P7 (K)
    // vmov.i64 $t3#hi, #0
    //
    // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
    // upper halves of SIMD registers, so we must split each half into
    // separate registers. To compensate, we pair computations up and
    // parallelize.
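    // Concretely, v24 was loaded from Lmasks as {k48, k32} and v25 as
    // {k16, k0}, so after the zip1/zip2 pairing below a single "and" masks
    // two of the high halves at once.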

    ext v19.8b, v3.8b, v3.8b, #4    // B4
    eor v18.16b, v18.16b, v0.16b    // N = I + J
    pmull v19.8h, v5.8b, v19.8b     // K = A*B4

    // This can probably be scheduled more efficiently. For now, we just
    // pair up independent instructions.
    zip1 v20.2d, v16.2d, v17.2d
    zip1 v22.2d, v18.2d, v19.2d
    zip2 v21.2d, v16.2d, v17.2d
    zip2 v23.2d, v18.2d, v19.2d
    eor v20.16b, v20.16b, v21.16b
    eor v22.16b, v22.16b, v23.16b
    and v21.16b, v21.16b, v24.16b
    and v23.16b, v23.16b, v25.16b
    eor v20.16b, v20.16b, v21.16b
    eor v22.16b, v22.16b, v23.16b
    zip1 v16.2d, v20.2d, v21.2d
    zip1 v18.2d, v22.2d, v23.2d
    zip2 v17.2d, v20.2d, v21.2d
    zip2 v19.2d, v22.2d, v23.2d

    ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
    ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
    pmull v0.8h, v5.8b, v3.8b       // D = A*B
    ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
    ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
    eor v16.16b, v16.16b, v17.16b
    eor v18.16b, v18.16b, v19.16b
    eor v0.16b, v0.16b, v16.16b
    eor v0.16b, v0.16b, v18.16b
    eor v3.8b, v3.8b, v4.8b         // Karatsuba pre-processing
    ext v16.8b, v7.8b, v7.8b, #1    // A1
    pmull v16.8h, v16.8b, v3.8b     // F = A1*B
    ext v1.8b, v3.8b, v3.8b, #1     // B1
    pmull v1.8h, v7.8b, v1.8b       // E = A*B1
    ext v17.8b, v7.8b, v7.8b, #2    // A2
    pmull v17.8h, v17.8b, v3.8b     // H = A2*B
    ext v19.8b, v3.8b, v3.8b, #2    // B2
    pmull v19.8h, v7.8b, v19.8b     // G = A*B2
    ext v18.8b, v7.8b, v7.8b, #3    // A3
    eor v16.16b, v16.16b, v1.16b    // L = E + F
    pmull v18.8h, v18.8b, v3.8b     // J = A3*B
    ext v1.8b, v3.8b, v3.8b, #3     // B3
    eor v17.16b, v17.16b, v19.16b   // M = G + H
    pmull v1.8h, v7.8b, v1.8b       // I = A*B3

    // Here we diverge from the 32-bit version. It computes the following
    // (instructions reordered for clarity):
    //
    // veor $t0#lo, $t0#lo, $t0#hi  @ t0 = P0 + P1 (L)
    // vand $t0#hi, $t0#hi, $k48
    // veor $t0#lo, $t0#lo, $t0#hi
    //
    // veor $t1#lo, $t1#lo, $t1#hi  @ t1 = P2 + P3 (M)
    // vand $t1#hi, $t1#hi, $k32
    // veor $t1#lo, $t1#lo, $t1#hi
    //
    // veor $t2#lo, $t2#lo, $t2#hi  @ t2 = P4 + P5 (N)
    // vand $t2#hi, $t2#hi, $k16
    // veor $t2#lo, $t2#lo, $t2#hi
    //
    // veor $t3#lo, $t3#lo, $t3#hi  @ t3 = P6 + P7 (K)
    // vmov.i64 $t3#hi, #0
    //
    // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
    // upper halves of SIMD registers, so we must split each half into
    // separate registers. To compensate, we pair computations up and
    // parallelize.

    ext v19.8b, v3.8b, v3.8b, #4    // B4
    eor v18.16b, v18.16b, v1.16b    // N = I + J
    pmull v19.8h, v7.8b, v19.8b     // K = A*B4

    // This can probably be scheduled more efficiently. For now, we just
    // pair up independent instructions.
    zip1 v20.2d, v16.2d, v17.2d
    zip1 v22.2d, v18.2d, v19.2d
    zip2 v21.2d, v16.2d, v17.2d
    zip2 v23.2d, v18.2d, v19.2d
    eor v20.16b, v20.16b, v21.16b
    eor v22.16b, v22.16b, v23.16b
    and v21.16b, v21.16b, v24.16b
    and v23.16b, v23.16b, v25.16b
    eor v20.16b, v20.16b, v21.16b
    eor v22.16b, v22.16b, v23.16b
    zip1 v16.2d, v20.2d, v21.2d
    zip1 v18.2d, v22.2d, v23.2d
    zip2 v17.2d, v20.2d, v21.2d
    zip2 v19.2d, v22.2d, v23.2d

    ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
    ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
    pmull v1.8h, v7.8b, v3.8b       // D = A*B
    ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
    ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
    eor v16.16b, v16.16b, v17.16b
    eor v18.16b, v18.16b, v19.16b
    eor v1.16b, v1.16b, v16.16b
    eor v1.16b, v1.16b, v18.16b
    ext v16.8b, v6.8b, v6.8b, #1    // A1
    pmull v16.8h, v16.8b, v4.8b     // F = A1*B
    ext v2.8b, v4.8b, v4.8b, #1     // B1
    pmull v2.8h, v6.8b, v2.8b       // E = A*B1
    ext v17.8b, v6.8b, v6.8b, #2    // A2
    pmull v17.8h, v17.8b, v4.8b     // H = A2*B
    ext v19.8b, v4.8b, v4.8b, #2    // B2
    pmull v19.8h, v6.8b, v19.8b     // G = A*B2
    ext v18.8b, v6.8b, v6.8b, #3    // A3
    eor v16.16b, v16.16b, v2.16b    // L = E + F
    pmull v18.8h, v18.8b, v4.8b     // J = A3*B
    ext v2.8b, v4.8b, v4.8b, #3     // B3
    eor v17.16b, v17.16b, v19.16b   // M = G + H
    pmull v2.8h, v6.8b, v2.8b       // I = A*B3

    // Here we diverge from the 32-bit version. It computes the following
    // (instructions reordered for clarity):
    //
    // veor $t0#lo, $t0#lo, $t0#hi  @ t0 = P0 + P1 (L)
    // vand $t0#hi, $t0#hi, $k48
    // veor $t0#lo, $t0#lo, $t0#hi
    //
    // veor $t1#lo, $t1#lo, $t1#hi  @ t1 = P2 + P3 (M)
    // vand $t1#hi, $t1#hi, $k32
    // veor $t1#lo, $t1#lo, $t1#hi
    //
    // veor $t2#lo, $t2#lo, $t2#hi  @ t2 = P4 + P5 (N)
    // vand $t2#hi, $t2#hi, $k16
    // veor $t2#lo, $t2#lo, $t2#hi
    //
    // veor $t3#lo, $t3#lo, $t3#hi  @ t3 = P6 + P7 (K)
    // vmov.i64 $t3#hi, #0
    //
    // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
    // upper halves of SIMD registers, so we must split each half into
    // separate registers. To compensate, we pair computations up and
    // parallelize.

    ext v19.8b, v4.8b, v4.8b, #4    // B4
    eor v18.16b, v18.16b, v2.16b    // N = I + J
    pmull v19.8h, v6.8b, v19.8b     // K = A*B4

    // This can probably be scheduled more efficiently. For now, we just
    // pair up independent instructions.
    zip1 v20.2d, v16.2d, v17.2d
    zip1 v22.2d, v18.2d, v19.2d
    zip2 v21.2d, v16.2d, v17.2d
    zip2 v23.2d, v18.2d, v19.2d
    eor v20.16b, v20.16b, v21.16b
    eor v22.16b, v22.16b, v23.16b
    and v21.16b, v21.16b, v24.16b
    and v23.16b, v23.16b, v25.16b
    eor v20.16b, v20.16b, v21.16b
    eor v22.16b, v22.16b, v23.16b
    zip1 v16.2d, v20.2d, v21.2d
    zip1 v18.2d, v22.2d, v23.2d
    zip2 v17.2d, v20.2d, v21.2d
    zip2 v19.2d, v22.2d, v23.2d

    ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
    ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
    pmull v2.8h, v6.8b, v4.8b       // D = A*B
    ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
    ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
    eor v16.16b, v16.16b, v17.16b
    eor v18.16b, v18.16b, v19.16b
    eor v2.16b, v2.16b, v16.16b
    eor v2.16b, v2.16b, v18.16b
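    // v0, v1 and v2 now hold the three Karatsuba partial products
    // (lo*lo, (lo^hi)*(lo^hi) and hi*hi of the twisted H and the block);
    // the post-processing below folds them into the 256-bit result Xh|Xl.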
    ext v16.16b, v0.16b, v2.16b, #8
    eor v1.16b, v1.16b, v0.16b      // Karatsuba post-processing
    eor v1.16b, v1.16b, v2.16b
    eor v1.16b, v1.16b, v16.16b     // Xm overlaps Xh.lo and Xl.hi
    ins v0.d[1], v1.d[0]            // Xh|Xl - 256-bit result
    // This is a no-op due to the ins instruction below.
    // ins v2.d[0], v1.d[1]

    // equivalent of reduction_avx from ghash-x86_64.pl
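    // This reduces the 256-bit product modulo the GHASH polynomial
    // x^128 + x^7 + x^2 + x + 1 in two phases; the shifts below (57, 62, 63,
    // then 1, 6, 1) correspond to the x^7, x^2 and x terms in the
    // bit-reflected representation used throughout this file.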
    shl v17.2d, v0.2d, #57          // 1st phase
    shl v18.2d, v0.2d, #62
    eor v18.16b, v18.16b, v17.16b   //
    shl v17.2d, v0.2d, #63
    eor v18.16b, v18.16b, v17.16b   //
    // Note Xm contains {Xl.d[1], Xh.d[0]}.
    eor v18.16b, v18.16b, v1.16b
    ins v0.d[1], v18.d[0]           // Xl.d[1] ^= t2.d[0]
    ins v2.d[0], v18.d[1]           // Xh.d[0] ^= t2.d[1]

    ushr v18.2d, v0.2d, #1          // 2nd phase
    eor v2.16b, v2.16b, v0.16b
    eor v0.16b, v0.16b, v18.16b     //
    ushr v18.2d, v18.2d, #6
    ushr v0.2d, v0.2d, #1           //
    eor v0.16b, v0.16b, v2.16b      //
    eor v0.16b, v0.16b, v18.16b     //

    subs x3, x3, #16
    bne Loop_neon

    rev64 v0.16b, v0.16b            // byteswap Xi and write
    ext v0.16b, v0.16b, v0.16b, #8
    st1 {v0.16b}, [x0]

    ret


.section __TEXT,__const
.align 4
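// These are the $kN masks described in the comments above;
// gcm_gmult_neon/gcm_ghash_neon load them with ld1 {v24.2d, v25.2d} as
// v24 = {k48, k32} and v25 = {k16, k0}.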
Lmasks:
.quad 0x0000ffffffffffff // k48
.quad 0x00000000ffffffff // k32
.quad 0x000000000000ffff // k16
.quad 0x0000000000000000 // k0
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif // !OPENSSL_NO_ASM