// blob: be0e283c366d7dfa2405b137a9a4bd9c89379f31
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

// Provide a fallback so that __has_feature() can be used in the conditional
// below on compilers that do not define it.
#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
// When building with MemorySanitizer, define OPENSSL_NO_ASM so that the
// assembly in this file is compiled out.
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

// Everything from here to the closing #endif at the end of the file is
// assembled only when OPENSSL_NO_ASM is not defined.
#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

//-----------------------------------------------------------------------
// _gcm_init_v8
//
// Builds the three-entry key table used by _gcm_gmult_v8 and _gcm_ghash_v8.
//
// In:    x0 = Htable, destination for 48 bytes (three 128-bit entries)
//        x1 = H, 16 bytes, loaded as two 64-bit lanes
//             NOTE(review): no byte reversal is applied to H here, so the
//             caller presumably supplies it in host lane order; confirm.
// Out:   Htable[0] = "twisted" H (H rotated left by one bit, with the
//                    0xc2....01 constant folded in when the carry is set)
//        Htable[1] = packed Karatsuba pre-processed halves of H and H^2
//        Htable[2] = H^2 (after the two-phase reduction)
// Clobb: x0 (advanced by 16 by the post-indexed store), v0-v3, v16-v22
// Stack: none (leaf function)
//-----------------------------------------------------------------------
.globl	_gcm_init_v8
.private_extern	_gcm_init_v8

.align	4
_gcm_init_v8:
	ld1	{v17.2d},[x1]			//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8	//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8	//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8	//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0]		//store Htable[1..2]

	ret

//-----------------------------------------------------------------------
// _gcm_gmult_v8
//
// Multiplies the 128-bit value Xi by H in place: one Karatsuba carry-less
// multiplication (three pmull/pmull2) followed by a two-phase reduction
// using the 0xc2.0 constant.
//
// In:    x0 = Xi, 16 bytes, read and then overwritten with the product
//        x1 = Htable; the first 32 bytes are read (twisted H and the
//             packed Karatsuba entry)
// Out:   [x0] = updated Xi
// Clobb: v0-v3, v17-v21
// Stack: none (leaf function)
//
// On little-endian builds (__ARMEB__ not defined) Xi is byte-reversed
// within each 64-bit lane after loading and again before storing.
//-----------------------------------------------------------------------
.globl	_gcm_gmult_v8
.private_extern	_gcm_gmult_v8

.align	4
_gcm_gmult_v8:
	ld1	{v17.2d},[x0]			//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]		//load twisted H, ...
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]			//write out Xi

	ret

//-----------------------------------------------------------------------
// _gcm_ghash_v8
//
// Folds a run of 16-byte input blocks into Xi. The main loop consumes
// two blocks (32 bytes) per iteration using H^2 and H; a single
// remaining block is handled by the odd tail using H alone.
//
// In:    x0 = Xi, 16 bytes, read and then overwritten with the result
//        x1 = Htable; 48 bytes are read (twisted H, packed Karatsuba
//             entry, H^2)
//        x2 = input pointer
//        x3 = input length in bytes
//             NOTE(review): the length is not validated. The first block
//             is loaded unconditionally and a length of 0 falls into the
//             odd tail, so callers presumably pass a non-zero multiple
//             of 16; confirm against callers.
// Out:   [x0] = updated Xi
// Clobb: x1, x2, x3, x12, condition flags, v0-v7, v16-v22
// Stack: none (leaf function)
//
// On little-endian builds (__ARMEB__ not defined) Xi and every input
// block are byte-reversed within each 64-bit lane after loading, and Xi
// is reversed again before it is stored.
//-----------------------------------------------------------------------
.globl	_gcm_ghash_v8
.private_extern	_gcm_ghash_v8

.align	4
_gcm_ghash_v8:
	ld1	{v0.2d},[x0]			//load [rotated] Xi
						//"[rotated]" means that
						//loaded value would have
						//to be rotated in order to
						//make it appear as in
						//algorithm specification
	subs	x3,x3,#32			//see if x3 is 32 or larger
	mov	x12,#16				//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16		//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8	//rotate I[0]
	// The flags tested here still come from the subs above; none of the
	// instructions in between modify them.
	b.lo	Lodd_tail_v8			//x3 was less than 32
	ld1	{v17.2d},[x2],x12		//load [rotated] I[1]
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	Loop_mod2x_v8

.align	4
Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32			//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12		//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12		//load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	Loop_mod2x_v8			//there was at least 32 more bytes

	// Loop exit: undo the early accumulation so that v0 holds Xi and
	// v3/v16 hold the last loaded block, as the odd tail expects.
	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8	//re-construct v3.16b
	adds	x3,x3,#32			//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	Ldone_v8			//is x3 zero?
Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

Ldone_v8:
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]			//write out Xi

	ret

// NUL-terminated ASCII identification string:
// "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif  // !OPENSSL_NO_ASM