// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text
.arch	armv8-a+crypto
//-----------------------------------------------------------------------
// void gcm_init_v8(Htable, H)
//
// Precompute the GHASH key schedule from the raw hash subkey H.
// ABI:   AAPCS64 (leaf function; no stack use, no callee-saved regs touched)
// In:    x0 = Htable output pointer (3 x 16 bytes are stored: Htable[0..2])
//        x1 = pointer to 16-byte hash subkey H
// Out:   Htable[0] = "twisted" H, Htable[1] = packed Karatsuba
//        pre-processed halves, Htable[2] = twisted H^2
// Clobb: v0-v3, v16-v22, NZCV unmodified
// Note:  uses the ARMv8 Crypto Extension PMULL/PMULL2 64x64->128
//        polynomial multiplies; v19 holds the 0xc2...0 reduction constant
//        for the GCM polynomial.
//-----------------------------------------------------------------------
.globl	gcm_init_v8
.hidden	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	ld1	{v17.2d},[x1]			//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8	//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8	//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8	//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0]		//store Htable[1..2]

	ret
.size	gcm_init_v8,.-gcm_init_v8
//-----------------------------------------------------------------------
// void gcm_gmult_v8(Xi, Htable)
//
// Multiply the 128-bit accumulator Xi by the hash subkey H in GF(2^128)
// (one GHASH block step), writing the product back to Xi.
// ABI:   AAPCS64 (leaf function; no stack use, no callee-saved regs touched)
// In:    x0 = pointer to 16-byte Xi (read and overwritten)
//        x1 = Htable from gcm_init_v8 (twisted H + packed Karatsuba halves)
// Clobb: v0-v3, v16-v21
// Note:  schoolbook-via-Karatsuba PMULL multiply followed by the two-phase
//        reduction with the 0xc2...0 constant in v19; byte order is
//        adjusted with rev64 on little-endian (__ARMEB__ not defined).
//-----------------------------------------------------------------------
.globl	gcm_gmult_v8
.hidden	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	ld1	{v17.2d},[x0]			//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]		//load twisted H, ...
	shl	v19.2d,v19.2d,#57
#ifndef	__ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef	__ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]			//write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8
//-----------------------------------------------------------------------
// void gcm_ghash_v8(Xi, Htable, inp, len)
//
// Fold `len` bytes of input into the GHASH accumulator Xi.
// ABI:   AAPCS64 (leaf function; no stack use, no callee-saved regs touched)
// In:    x0 = pointer to 16-byte Xi (read and overwritten)
//        x1 = Htable from gcm_init_v8 (twisted H, packed halves, H^2)
//        x2 = input pointer
//        x3 = input length in bytes (code consumes 16-byte blocks;
//             presumably a multiple of 16 — confirm against caller)
// Clobb: x12, v0-v7, v16-v22, NZCV
// Note:  the main loop is modulo-scheduled and processes two blocks per
//        iteration using H^2; the instruction order is load-latency
//        driven and must not be rearranged. x12 is the post-increment
//        for the input pointer and is zeroed just in time so the final
//        (speculative) loads re-read the last block instead of running
//        past inp[len]; the duplicate copy is never processed.
//-----------------------------------------------------------------------
.globl	gcm_ghash_v8
.hidden	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	ld1	{v0.2d},[x0]			//load [rotated] Xi
						//"[rotated]" means that
						//loaded value would have
						//to be rotated in order to
						//make it appear as in
						//algorithm specification
	subs	x3,x3,#32			//see if x3 is 32 or larger
	mov	x12,#16				//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16		//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef	__ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8	//rotate I[0]
	b.lo	.Lodd_tail_v8			//x3 was less than 32
	ld1	{v17.2d},[x2],x12		//load [rotated] I[1]
#ifndef	__ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32			//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12		//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12		//load [rotated] I[i+3]
#ifndef	__ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef	__ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	.Loop_mod2x_v8			//there was at least 32 more bytes

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8	//re-construct v3.16b
	adds	x3,x3,#32			//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	.Ldone_v8			//is x3 zero?
.Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef	__ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]			//write out Xi

	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
#endif  // !OPENSSL_NO_ASM