// Provenance recorded by the web viewer this text was captured from, kept
// as comments so the preprocessor and assembler never see it as input:
//   blob:  01d32321cb2252f3106d2d8770da94022083fb7e
//   blame: Robert Sloan, 8ff0355, 2017-06-14 12:40:58 -0700
//
// Target: AArch64 with the polynomial-multiply (PMULL/PMULL2) instructions.
// Symbols carry a leading underscore and are marked .private_extern.
// Comment syntax is "//"; the file is run through the C preprocessor.
#include <openssl/arm_arch.h>

.text
#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH)
// NOTE(review): this guarded region is empty in this copy of the file.
// Presumably it is where an architecture-selection directive would go;
// confirm against the generator before adding anything here.
#endif
//-----------------------------------------------------------------------
// _gcm_init_v8 -- precompute the key-dependent table for the GHASH code.
//
// In:    x0 = table pointer; 48 bytes (three 16-byte entries) are written
//        x1 = pointer to the 16-byte hash key H
// Out:   Htable[0] = "twisted" H (H rotated left by one bit, conditionally
//                    folded with the 0xc2....01 constant)
//        Htable[1] = packed Karatsuba terms: low half  = H.lo ^ H.hi,
//                                            high half = H^2.lo ^ H^2.hi
//        Htable[2] = H^2 (product of twisted H with itself, reduced)
// Clobb: x0 (post-incremented by 16), v0-v3, v16-v22. Flags untouched.
// NOTE(review): presumably declared in C as
//        void gcm_init_v8(u128 Htable[16], const uint64_t H[2]);
//        confirm against the C header, only three entries are written here.
//-----------------------------------------------------------------------
.globl  _gcm_init_v8
.private_extern _gcm_init_v8

.align  4
_gcm_init_v8:
    ld1     {v17.2d},[x1]               //load input H
    movi    v19.16b,#0xe1
    shl     v19.2d,v19.2d,#57           //0xc2.0
    ext     v3.16b,v17.16b,v17.16b,#8
    ushr    v18.2d,v19.2d,#63
    dup     v17.4s,v17.s[1]
    ext     v16.16b,v18.16b,v19.16b,#8  //t0=0xc2....01
    ushr    v18.2d,v3.2d,#63
    sshr    v17.4s,v17.4s,#31           //broadcast carry bit
    and     v18.16b,v18.16b,v16.16b
    shl     v3.2d,v3.2d,#1
    ext     v18.16b,v18.16b,v18.16b,#8
    and     v16.16b,v16.16b,v17.16b
    orr     v3.16b,v3.16b,v18.16b       //H<<<=1
    eor     v20.16b,v3.16b,v16.16b      //twisted H
    st1     {v20.2d},[x0],#16           //store Htable[0]

    //calculate H^2
    ext     v16.16b,v20.16b,v20.16b,#8  //Karatsuba pre-processing
    pmull   v0.1q,v20.1d,v20.1d
    eor     v16.16b,v16.16b,v20.16b     //both halves = H.lo ^ H.hi
    pmull2  v2.1q,v20.2d,v20.2d
    pmull   v1.1q,v16.1d,v16.1d

    ext     v17.16b,v0.16b,v2.16b,#8    //Karatsuba post-processing
    eor     v18.16b,v0.16b,v2.16b
    eor     v1.16b,v1.16b,v17.16b
    eor     v1.16b,v1.16b,v18.16b
    pmull   v18.1q,v0.1d,v19.1d         //1st phase

    ins     v2.d[0],v1.d[1]
    ins     v1.d[1],v0.d[0]
    eor     v0.16b,v1.16b,v18.16b

    ext     v18.16b,v0.16b,v0.16b,#8    //2nd phase
    pmull   v0.1q,v0.1d,v19.1d
    eor     v18.16b,v18.16b,v2.16b
    eor     v22.16b,v0.16b,v18.16b      //v22 = H^2

    ext     v17.16b,v22.16b,v22.16b,#8  //Karatsuba pre-processing
    eor     v17.16b,v17.16b,v22.16b     //both halves = H^2.lo ^ H^2.hi
    ext     v21.16b,v16.16b,v17.16b,#8  //pack Karatsuba pre-processed
    st1     {v21.2d,v22.2d},[x0]        //store Htable[1..2]

    ret

//-----------------------------------------------------------------------
// _gcm_gmult_v8 -- multiply the 16-byte value Xi by the hash key, in place.
//
// One Karatsuba carry-less multiply (three PMULLs) followed by a two-phase
// reduction with the 0xc2.0 constant.
//
// In:    x0 = pointer to Xi (16 bytes, read then overwritten)
//        x1 = table pointer; 32 bytes are read: twisted H followed by the
//             packed Karatsuba term whose low half is used here
// Out:   Xi updated in memory
// Clobb: v0-v3, v17-v21. x0/x1 and the flags are left unchanged.
// NOTE(review): presumably declared in C as
//        void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]);
//        confirm against the C header.
//-----------------------------------------------------------------------
.globl  _gcm_gmult_v8
.private_extern _gcm_gmult_v8

.align  4
_gcm_gmult_v8:
    ld1     {v17.2d},[x0]               //load Xi
    movi    v19.16b,#0xe1
    ld1     {v20.2d,v21.2d},[x1]        //load twisted H, ...
    shl     v19.2d,v19.2d,#57           //compose 0xc2.0 constant
#ifndef __ARMEB__
    rev64   v17.16b,v17.16b             //byte-reverse each half when __ARMEB__ is not defined
#endif
    ext     v3.16b,v17.16b,v17.16b,#8

    pmull   v0.1q,v20.1d,v3.1d          //H.lo·Xi.lo
    eor     v17.16b,v17.16b,v3.16b      //Karatsuba pre-processing
    pmull2  v2.1q,v20.2d,v3.2d          //H.hi·Xi.hi
    pmull   v1.1q,v21.1d,v17.1d         //(H.lo+H.hi)·(Xi.lo+Xi.hi)

    ext     v17.16b,v0.16b,v2.16b,#8    //Karatsuba post-processing
    eor     v18.16b,v0.16b,v2.16b
    eor     v1.16b,v1.16b,v17.16b
    eor     v1.16b,v1.16b,v18.16b
    pmull   v18.1q,v0.1d,v19.1d         //1st phase of reduction

    ins     v2.d[0],v1.d[1]
    ins     v1.d[1],v0.d[0]
    eor     v0.16b,v1.16b,v18.16b

    ext     v18.16b,v0.16b,v0.16b,#8    //2nd phase of reduction
    pmull   v0.1q,v0.1d,v19.1d
    eor     v18.16b,v18.16b,v2.16b
    eor     v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
    rev64   v0.16b,v0.16b               //undo the byte reversal done on load
#endif
    ext     v0.16b,v0.16b,v0.16b,#8
    st1     {v0.2d},[x0]                //write out Xi

    ret

//-----------------------------------------------------------------------
// _gcm_ghash_v8 -- fold a run of 16-byte input blocks into Xi.
//
// The main loop consumes two blocks per iteration (the older one is
// multiplied by H^2, the newer one by H, with a single shared reduction).
// A final single block, if any, is handled at Lodd_tail_v8.
//
// In:    x0 = pointer to Xi (16 bytes, read then overwritten)
//        x1 = table pointer; 48 bytes are read (twisted H, packed
//             Karatsuba terms, H^2)
//        x2 = input pointer
//        x3 = input length in bytes
// Out:   Xi updated in memory
// Clobb: x1 (post-incremented by 32), x2, x3, x12, flags,
//        v0-v7, v16-v22
// NOTE(review): the first block is loaded unconditionally and any length
//        below 32 goes straight to the single-block tail, so a length of
//        zero is not handled here. Presumably callers always pass a
//        non-zero multiple of 16; confirm against the callers.
// NOTE(review): presumably declared in C as
//        void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16],
//                          const uint8_t *inp, size_t len);
//        confirm against the C header.
//-----------------------------------------------------------------------
.globl  _gcm_ghash_v8
.private_extern _gcm_ghash_v8

.align  4
_gcm_ghash_v8:
    ld1     {v0.2d},[x0]                //load [rotated] Xi
                                        //"[rotated]" means that
                                        //loaded value would have
                                        //to be rotated in order to
                                        //make it appear as in
                                        //algorithm specification
    subs    x3,x3,#32                   //see if x3 is 32 or larger
    mov     x12,#16                     //x12 is used as post-
                                        //increment for input pointer;
                                        //as loop is modulo-scheduled
                                        //x12 is zeroed just in time
                                        //to preclude overstepping
                                        //inp[len], which means that
                                        //last block[s] are actually
                                        //loaded twice, but last
                                        //copy is not processed
    ld1     {v20.2d,v21.2d},[x1],#32    //load twisted H, ..., H^2
    movi    v19.16b,#0xe1
    ld1     {v22.2d},[x1]
    csel    x12,xzr,x12,eq              //is it time to zero x12?
    ext     v0.16b,v0.16b,v0.16b,#8     //rotate Xi
    ld1     {v16.2d},[x2],#16           //load [rotated] I[0]
    shl     v19.2d,v19.2d,#57           //compose 0xc2.0 constant
#ifndef __ARMEB__
    rev64   v16.16b,v16.16b
    rev64   v0.16b,v0.16b
#endif
    ext     v3.16b,v16.16b,v16.16b,#8   //rotate I[0]
    b.lo    Lodd_tail_v8                //x3 was less than 32
                                        //(flags are still those of the subs above)
    ld1     {v17.2d},[x2],x12           //load [rotated] I[1]
#ifndef __ARMEB__
    rev64   v17.16b,v17.16b
#endif
    ext     v7.16b,v17.16b,v17.16b,#8
    eor     v3.16b,v3.16b,v0.16b        //I[i]^=Xi
    pmull   v4.1q,v20.1d,v7.1d          //H·Ii+1
    eor     v17.16b,v17.16b,v7.16b      //Karatsuba pre-processing
    pmull2  v6.1q,v20.2d,v7.2d
    b       Loop_mod2x_v8

.align  4
Loop_mod2x_v8:
    ext     v18.16b,v3.16b,v3.16b,#8
    subs    x3,x3,#32                   //is there more data?
    pmull   v0.1q,v22.1d,v3.1d          //H^2.lo·Xi.lo
    csel    x12,xzr,x12,lo              //is it time to zero x12?

    pmull   v5.1q,v21.1d,v17.1d
    eor     v18.16b,v18.16b,v3.16b      //Karatsuba pre-processing
    pmull2  v2.1q,v22.2d,v3.2d          //H^2.hi·Xi.hi
    eor     v0.16b,v0.16b,v4.16b        //accumulate
    pmull2  v1.1q,v21.2d,v18.2d         //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
    ld1     {v16.2d},[x2],x12           //load [rotated] I[i+2]

    eor     v2.16b,v2.16b,v6.16b
    csel    x12,xzr,x12,eq              //is it time to zero x12?
    eor     v1.16b,v1.16b,v5.16b

    ext     v17.16b,v0.16b,v2.16b,#8    //Karatsuba post-processing
    eor     v18.16b,v0.16b,v2.16b
    eor     v1.16b,v1.16b,v17.16b
    ld1     {v17.2d},[x2],x12           //load [rotated] I[i+3]
#ifndef __ARMEB__
    rev64   v16.16b,v16.16b
#endif
    eor     v1.16b,v1.16b,v18.16b
    pmull   v18.1q,v0.1d,v19.1d         //1st phase of reduction

#ifndef __ARMEB__
    rev64   v17.16b,v17.16b
#endif
    ins     v2.d[0],v1.d[1]
    ins     v1.d[1],v0.d[0]
    ext     v7.16b,v17.16b,v17.16b,#8
    ext     v3.16b,v16.16b,v16.16b,#8
    eor     v0.16b,v1.16b,v18.16b
    pmull   v4.1q,v20.1d,v7.1d          //H·Ii+1
    eor     v3.16b,v3.16b,v2.16b        //accumulate v3.16b early

    ext     v18.16b,v0.16b,v0.16b,#8    //2nd phase of reduction
    pmull   v0.1q,v0.1d,v19.1d
    eor     v3.16b,v3.16b,v18.16b
    eor     v17.16b,v17.16b,v7.16b      //Karatsuba pre-processing
    eor     v3.16b,v3.16b,v0.16b
    pmull2  v6.1q,v20.2d,v7.2d
    b.hs    Loop_mod2x_v8               //there was at least 32 more bytes
                                        //(flags from the subs at loop top)

    eor     v2.16b,v2.16b,v18.16b
    ext     v3.16b,v16.16b,v16.16b,#8   //re-construct v3.16b
    adds    x3,x3,#32                   //re-construct x3
    eor     v0.16b,v0.16b,v2.16b        //re-construct v0.16b
    b.eq    Ldone_v8                    //is x3 zero?
Lodd_tail_v8:
    ext     v18.16b,v0.16b,v0.16b,#8
    eor     v3.16b,v3.16b,v0.16b        //inp^=Xi
    eor     v17.16b,v16.16b,v18.16b     //v17.16b is rotated inp^Xi

    pmull   v0.1q,v20.1d,v3.1d          //H.lo·Xi.lo
    eor     v17.16b,v17.16b,v3.16b      //Karatsuba pre-processing
    pmull2  v2.1q,v20.2d,v3.2d          //H.hi·Xi.hi
    pmull   v1.1q,v21.1d,v17.1d         //(H.lo+H.hi)·(Xi.lo+Xi.hi)

    ext     v17.16b,v0.16b,v2.16b,#8    //Karatsuba post-processing
    eor     v18.16b,v0.16b,v2.16b
    eor     v1.16b,v1.16b,v17.16b
    eor     v1.16b,v1.16b,v18.16b
    pmull   v18.1q,v0.1d,v19.1d         //1st phase of reduction

    ins     v2.d[0],v1.d[1]
    ins     v1.d[1],v0.d[0]
    eor     v0.16b,v1.16b,v18.16b

    ext     v18.16b,v0.16b,v0.16b,#8    //2nd phase of reduction
    pmull   v0.1q,v0.1d,v19.1d
    eor     v18.16b,v18.16b,v2.16b
    eor     v0.16b,v0.16b,v18.16b

Ldone_v8:
#ifndef __ARMEB__
    rev64   v0.16b,v0.16b
#endif
    ext     v0.16b,v0.16b,v0.16b,#8
    st1     {v0.2d},[x0]                //write out Xi

    ret

// NUL-terminated identification string, emitted after the last routine:
//   "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte   71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Re-align after the 51-byte string. The directive appears twice in the
// original; the second one is a no-op and is kept for fidelity.
.align  2
.align  2