// Source blob: a6113243fbdb2ff4d656785043357b65b2f1a679
// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

// Under MemorySanitizer, force OPENSSL_NO_ASM so that everything guarded by
// the !defined(OPENSSL_NO_ASM) test below is compiled out.
#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text
.arch   armv8-a+crypto
//-----------------------------------------------------------------------
// gcm_init_v8: precompute the GHASH key table from the hash key H.
// In:    x0 = Htable (output; three 16-byte entries are written)
//        x1 = H (input; 16 bytes are read)
// Out:   Htable[0] = twisted H
//        Htable[1] = packed Karatsuba pre-processed values:
//                    low half from H, high half from H^2
//        Htable[2] = H^2
// Clobb: x0 (advanced by 16), v0-v3, v16-v22
// NOTE(review): the C prototype is not visible in this file; the register
// roles above are read off the loads and stores below.
//-----------------------------------------------------------------------
.globl  gcm_init_v8
.hidden gcm_init_v8
.type   gcm_init_v8,%function
.align  4
gcm_init_v8:
        ld1     {v17.2d},[x1]                   //load input H
        movi    v19.16b,#0xe1
        shl     v19.2d,v19.2d,#57               //0xc2.0
        ext     v3.16b,v17.16b,v17.16b,#8
        ushr    v18.2d,v19.2d,#63
        dup     v17.4s,v17.s[1]
        ext     v16.16b,v18.16b,v19.16b,#8      //t0=0xc2....01
        ushr    v18.2d,v3.2d,#63
        sshr    v17.4s,v17.4s,#31               //broadcast carry bit
        and     v18.16b,v18.16b,v16.16b
        shl     v3.2d,v3.2d,#1
        ext     v18.16b,v18.16b,v18.16b,#8
        and     v16.16b,v16.16b,v17.16b
        orr     v3.16b,v3.16b,v18.16b           //H<<<=1
        eor     v20.16b,v3.16b,v16.16b          //twisted H
        st1     {v20.2d},[x0],#16               //store Htable[0]

        //calculate H^2
        ext     v16.16b,v20.16b,v20.16b,#8      //Karatsuba pre-processing
        pmull   v0.1q,v20.1d,v20.1d
        eor     v16.16b,v16.16b,v20.16b
        pmull2  v2.1q,v20.2d,v20.2d
        pmull   v1.1q,v16.1d,v16.1d

        ext     v17.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v17.16b
        eor     v1.16b,v1.16b,v18.16b
        pmull   v18.1q,v0.1d,v19.1d             //1st phase

        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
        eor     v0.16b,v1.16b,v18.16b

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase
        pmull   v0.1q,v0.1d,v19.1d
        eor     v18.16b,v18.16b,v2.16b
        eor     v22.16b,v0.16b,v18.16b          //v22 = H^2

        ext     v17.16b,v22.16b,v22.16b,#8      //Karatsuba pre-processing
        eor     v17.16b,v17.16b,v22.16b
        ext     v21.16b,v16.16b,v17.16b,#8      //pack Karatsuba pre-processed
        st1     {v21.2d,v22.2d},[x0]            //store Htable[1..2]

        ret
.size   gcm_init_v8,.-gcm_init_v8
//-----------------------------------------------------------------------
// gcm_gmult_v8: carry-less multiply Xi by the twisted H, reduce the
// product in two phases, and write the result back over Xi.
// In:    x0 = Xi (16 bytes, read and then overwritten)
//        x1 = Htable (32 bytes read: twisted H and the packed
//             Karatsuba pre-processed entry)
// Out:   [x0] = updated Xi
// Clobb: v0-v3, v17-v21
// On little-endian builds (no __ARMEB__) Xi is byte-reversed within each
// 64-bit lane after loading and again before storing.
// NOTE(review): the C prototype is not visible in this file; the register
// roles above are read off the loads and stores below.
//-----------------------------------------------------------------------
.globl  gcm_gmult_v8
.hidden gcm_gmult_v8
.type   gcm_gmult_v8,%function
.align  4
gcm_gmult_v8:
        ld1     {v17.2d},[x0]                   //load Xi
        movi    v19.16b,#0xe1
        ld1     {v20.2d,v21.2d},[x1]            //load twisted H, ...
        shl     v19.2d,v19.2d,#57               //reduction constant, as in gcm_ghash_v8
#ifndef __ARMEB__
        rev64   v17.16b,v17.16b
#endif
        ext     v3.16b,v17.16b,v17.16b,#8

        pmull   v0.1q,v20.1d,v3.1d              //H.lo·Xi.lo
        eor     v17.16b,v17.16b,v3.16b          //Karatsuba pre-processing
        pmull2  v2.1q,v20.2d,v3.2d              //H.hi·Xi.hi
        pmull   v1.1q,v21.1d,v17.1d             //(H.lo+H.hi)·(Xi.lo+Xi.hi)

        ext     v17.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v17.16b
        eor     v1.16b,v1.16b,v18.16b
        pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction

        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
        eor     v0.16b,v1.16b,v18.16b

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase of reduction
        pmull   v0.1q,v0.1d,v19.1d
        eor     v18.16b,v18.16b,v2.16b
        eor     v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
        rev64   v0.16b,v0.16b
#endif
        ext     v0.16b,v0.16b,v0.16b,#8
        st1     {v0.2d},[x0]                    //write out Xi

        ret
.size   gcm_gmult_v8,.-gcm_gmult_v8
//-----------------------------------------------------------------------
// gcm_ghash_v8: fold a run of 16-byte input blocks into Xi.
// In:    x0 = Xi (16 bytes, read and then overwritten)
//        x1 = Htable (48 bytes read: twisted H, packed Karatsuba
//             pre-processed entry, H^2)
//        x2 = inp (input pointer)
//        x3 = len (byte count)
// Out:   [x0] = updated Xi
// Clobb: x1, x2, x3, x12, flags, v0-v7, v16-v22
// Structure: .Loop_mod2x_v8 consumes 32 bytes per iteration using H^2 and
// H; a single remaining block is handled at .Lodd_tail_v8.
// NOTE(review): the first block is loaded and processed unconditionally,
// so len is presumably a nonzero multiple of 16 -- confirm against callers.
// NOTE(review): the C prototype is not visible in this file; the register
// roles above are read off the loads and stores below.
//-----------------------------------------------------------------------
.globl  gcm_ghash_v8
.hidden gcm_ghash_v8
.type   gcm_ghash_v8,%function
.align  4
gcm_ghash_v8:
        ld1     {v0.2d},[x0]                    //load [rotated] Xi
                                                //"[rotated]" means that
                                                //loaded value would have
                                                //to be rotated in order to
                                                //make it appear as in
                                                //algorithm specification
        subs    x3,x3,#32                       //see if x3 is 32 or larger
        mov     x12,#16                         //x12 is used as post-
                                                //increment for input pointer;
                                                //as loop is modulo-scheduled
                                                //x12 is zeroed just in time
                                                //to preclude overstepping
                                                //inp[len], which means that
                                                //last block[s] are actually
                                                //loaded twice, but last
                                                //copy is not processed
        ld1     {v20.2d,v21.2d},[x1],#32        //load twisted H, ..., H^2
        movi    v19.16b,#0xe1
        ld1     {v22.2d},[x1]
        csel    x12,xzr,x12,eq                  //is it time to zero x12?
        ext     v0.16b,v0.16b,v0.16b,#8         //rotate Xi
        ld1     {v16.2d},[x2],#16               //load [rotated] I[0]
        shl     v19.2d,v19.2d,#57               //compose 0xc2.0 constant
#ifndef __ARMEB__
        rev64   v16.16b,v16.16b
        rev64   v0.16b,v0.16b
#endif
        ext     v3.16b,v16.16b,v16.16b,#8       //rotate I[0]
        b.lo    .Lodd_tail_v8                   //x3 was less than 32
        ld1     {v17.2d},[x2],x12               //load [rotated] I[1]
#ifndef __ARMEB__
        rev64   v17.16b,v17.16b
#endif
        ext     v7.16b,v17.16b,v17.16b,#8
        eor     v3.16b,v3.16b,v0.16b            //I[i]^=Xi
        pmull   v4.1q,v20.1d,v7.1d              //H·Ii+1
        eor     v17.16b,v17.16b,v7.16b          //Karatsuba pre-processing
        pmull2  v6.1q,v20.2d,v7.2d
        b       .Loop_mod2x_v8

.align  4
.Loop_mod2x_v8:
        ext     v18.16b,v3.16b,v3.16b,#8
        subs    x3,x3,#32                       //is there more data?
        pmull   v0.1q,v22.1d,v3.1d              //H^2.lo·Xi.lo
        csel    x12,xzr,x12,lo                  //is it time to zero x12?

        pmull   v5.1q,v21.1d,v17.1d
        eor     v18.16b,v18.16b,v3.16b          //Karatsuba pre-processing
        pmull2  v2.1q,v22.2d,v3.2d              //H^2.hi·Xi.hi
        eor     v0.16b,v0.16b,v4.16b            //accumulate
        pmull2  v1.1q,v21.2d,v18.2d             //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
        ld1     {v16.2d},[x2],x12               //load [rotated] I[i+2]

        eor     v2.16b,v2.16b,v6.16b
        csel    x12,xzr,x12,eq                  //is it time to zero x12?
        eor     v1.16b,v1.16b,v5.16b

        ext     v17.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v17.16b
        ld1     {v17.2d},[x2],x12               //load [rotated] I[i+3]
#ifndef __ARMEB__
        rev64   v16.16b,v16.16b
#endif
        eor     v1.16b,v1.16b,v18.16b
        pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction

#ifndef __ARMEB__
        rev64   v17.16b,v17.16b
#endif
        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
        ext     v7.16b,v17.16b,v17.16b,#8
        ext     v3.16b,v16.16b,v16.16b,#8
        eor     v0.16b,v1.16b,v18.16b
        pmull   v4.1q,v20.1d,v7.1d              //H·Ii+1
        eor     v3.16b,v3.16b,v2.16b            //accumulate v3.16b early

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase of reduction
        pmull   v0.1q,v0.1d,v19.1d
        eor     v3.16b,v3.16b,v18.16b
        eor     v17.16b,v17.16b,v7.16b          //Karatsuba pre-processing
        eor     v3.16b,v3.16b,v0.16b
        pmull2  v6.1q,v20.2d,v7.2d
        b.hs    .Loop_mod2x_v8                  //there was at least 32 more bytes

        eor     v2.16b,v2.16b,v18.16b
        ext     v3.16b,v16.16b,v16.16b,#8       //re-construct v3.16b
        adds    x3,x3,#32                       //re-construct x3
        eor     v0.16b,v0.16b,v2.16b            //re-construct v0.16b
        b.eq    .Ldone_v8                       //is x3 zero?
.Lodd_tail_v8:
        ext     v18.16b,v0.16b,v0.16b,#8
        eor     v3.16b,v3.16b,v0.16b            //inp^=Xi
        eor     v17.16b,v16.16b,v18.16b         //v17.16b is rotated inp^Xi

        pmull   v0.1q,v20.1d,v3.1d              //H.lo·Xi.lo
        eor     v17.16b,v17.16b,v3.16b          //Karatsuba pre-processing
        pmull2  v2.1q,v20.2d,v3.2d              //H.hi·Xi.hi
        pmull   v1.1q,v21.1d,v17.1d             //(H.lo+H.hi)·(Xi.lo+Xi.hi)

        ext     v17.16b,v0.16b,v2.16b,#8        //Karatsuba post-processing
        eor     v18.16b,v0.16b,v2.16b
        eor     v1.16b,v1.16b,v17.16b
        eor     v1.16b,v1.16b,v18.16b
        pmull   v18.1q,v0.1d,v19.1d             //1st phase of reduction

        ins     v2.d[0],v1.d[1]
        ins     v1.d[1],v0.d[0]
        eor     v0.16b,v1.16b,v18.16b

        ext     v18.16b,v0.16b,v0.16b,#8        //2nd phase of reduction
        pmull   v0.1q,v0.1d,v19.1d
        eor     v18.16b,v18.16b,v2.16b
        eor     v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __ARMEB__
        rev64   v0.16b,v0.16b
#endif
        ext     v0.16b,v0.16b,v0.16b,#8
        st1     {v0.2d},[x0]                    //write out Xi

        ret
.size   gcm_ghash_v8,.-gcm_ghash_v8
// NUL-terminated ASCII ident string:
// "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte   71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align  2
.align  2
#endif  // __aarch64__
#endif  // !OPENSSL_NO_ASM