#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
#
# June 2014
#
# Initial version was developed in tight cooperation with Ard
# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
# other assembly modules. Just like aesv8-armx.pl this module
# supports both AArch32 and AArch64 execution modes.
#
# July 2014
#
# Implement 2x aggregated reduction [see ghash-x86.pl for background
# information].
#
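# As a rough sketch of what the 2x aggregation buys (the same identity
# is spelled out again just before .Loop_mod2x_v8 below): two input
# blocks are folded per reduction,
#
#	Xi+2 = [H*(Ii+1 + Xi+1)] mod P
#	     = [(H*Ii+1) + H^2*(Ii + Xi)] mod P
#
# so the main loop pays for one reduction per 32 bytes of input rather
# than one per 16-byte block.
#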
# Current performance in cycles per processed byte:
#
#		PMULL[2]	32-bit NEON(*)
# Apple A7	0.92		5.62
# Cortex-A53	1.01		8.39
# Cortex-A57	1.17		7.61
# Denver	0.71		6.02
#
# (*)	presented for reference/comparison purposes;

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$Xi="x0";	# argument block
$Htbl="x1";
$inp="x2";
$len="x3";

$inc="x12";

{
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$xC2,$H,$Hhl,$H2)=map("q$_",(8..14));

$code=<<___;
#include <openssl/arm_arch.h>

.text
___
$code.=<<___ if ($flavour =~ /64/);
#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH)
.arch	armv8-a+crypto
#endif
___
$code.=".fpu	neon\n.code	32\n" if ($flavour !~ /64/);
################################################################################
# void gcm_init_v8(u128 Htable[16],const u64 H[2]);
#
# input:	128-bit H - secret parameter E(K,0^128)
# output:	precomputed table filled with degrees of twisted H;
#		H is twisted to handle reverse bitness of GHASH;
#		only a few of the 16 slots of Htable[16] are used;
#		data is opaque to the outside world (which allows the
#		code to be optimized independently);
#
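# A minimal caller sketch (in C, following the prototypes documented in
# this file; the sequence below is illustrative only - the real driver
# is the surrounding GCM code, and `inp'/`len' are placeholders):
#
#	u128 Htable[16];
#	u64  Xi[2] = { 0, 0 };
#	gcm_init_v8(Htable, H);			/* H = E(K,0^128)	 */
#	gcm_ghash_v8(Xi, Htable, inp, len);	/* len multiple of 16	 */
#	gcm_gmult_v8(Xi, Htable);		/* single-block multiply */
#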
$code.=<<___;
.global	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	vld1.64		{$t1},[x1]		@ load input H
	vmov.i8		$xC2,#0xe1
	vshl.i64	$xC2,$xC2,#57		@ 0xc2.0
	vext.8		$IN,$t1,$t1,#8
	vshr.u64	$t2,$xC2,#63
	vdup.32		$t1,${t1}[1]
	vext.8		$t0,$t2,$xC2,#8		@ t0=0xc2....01
	vshr.u64	$t2,$IN,#63
	vshr.s32	$t1,$t1,#31		@ broadcast carry bit
	vand		$t2,$t2,$t0
	vshl.i64	$IN,$IN,#1
	vext.8		$t2,$t2,$t2,#8
	vand		$t0,$t0,$t1
	vorr		$IN,$IN,$t2		@ H<<<=1
	veor		$H,$IN,$t0		@ twisted H
	vst1.64		{$H},[x0],#16		@ store Htable[0]

	@ calculate H^2
	vext.8		$t0,$H,$H,#8		@ Karatsuba pre-processing
	vpmull.p64	$Xl,$H,$H
	veor		$t0,$t0,$H
	vpmull2.p64	$Xh,$H,$H
	vpmull.p64	$Xm,$t0,$t0

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$H2,$Xl,$t2

	vext.8		$t1,$H2,$H2,#8		@ Karatsuba pre-processing
	veor		$t1,$t1,$H2
	vext.8		$Hhl,$t0,$t1,#8		@ pack Karatsuba pre-processed
	vst1.64		{$Hhl-$H2},[x0]		@ store Htable[1..2]

	ret
.size	gcm_init_v8,.-gcm_init_v8
___
################################################################################
# void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
#
# input:	Xi - current hash value;
#		Htable - table precomputed in gcm_init_v8;
# output:	Xi - next hash value Xi;
#
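# In broad strokes (a sketch of the Karatsuba split used throughout this
# file), each GF(2^128) multiply below is built from three 64x64
# carry-less multiplies plus the two-phase reduction by the 0xc2 constant:
#
#	Xl  = H.lo·Xi.lo
#	Xh  = H.hi·Xi.hi
#	Xm  = (H.lo+H.hi)·(Xi.lo+Xi.hi) + Xl + Xh	(middle 64x64 term)
#	Xi' = reduce(Xh:Xm:Xl)				(fold 256 bits mod P)
#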
$code.=<<___;
.global	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	vld1.64		{$t1},[$Xi]		@ load Xi
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H-$Hhl},[$Htbl]	@ load twisted H, ...
	vshl.u64	$xC2,$xC2,#57
#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vext.8		$IN,$t1,$t1,#8

	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2

#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$Xl,$Xl,$Xl,#8
	vst1.64		{$Xl},[$Xi]		@ write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8
___
################################################################################
# void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#
# input:	table precomputed in gcm_init_v8;
#		current hash value Xi;
#		pointer to input data;
#		length of input data in bytes, which must be divisible by
#		the block size;
# output:	next hash value Xi;
#
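# In outline (a reading aid, not extra code): the input is consumed 32
# bytes at a time in .Loop_mod2x_v8 using H^2 and the aggregation
# identity above; $inc is zeroed just before the final iteration so the
# pointer never runs past inp[len], and a single trailing 16-byte block,
# if any, is handled by .Lodd_tail_v8.
#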
$code.=<<___;
.global	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
___
$code.=<<___ if ($flavour !~ /64/);
	vstmdb		sp!,{d8-d15}		@ 32-bit ABI says so
___
$code.=<<___;
	vld1.64		{$Xl},[$Xi]		@ load [rotated] Xi
						@ "[rotated]" means that
						@ loaded value would have
						@ to be rotated in order to
						@ make it appear as in
						@ algorithm specification
	subs		$len,$len,#32		@ see if $len is 32 or larger
	mov		$inc,#16		@ $inc is used as post-
						@ increment for input pointer;
						@ as loop is modulo-scheduled
						@ $inc is zeroed just in time
						@ to preclude overstepping
						@ inp[len], which means that
						@ last block[s] are actually
						@ loaded twice, but last
						@ copy is not processed
	vld1.64		{$H-$Hhl},[$Htbl],#32	@ load twisted H, ..., H^2
	vmov.i8		$xC2,#0xe1
	vld1.64		{$H2},[$Htbl]
	cclr		$inc,eq			@ is it time to zero $inc?
	vext.8		$Xl,$Xl,$Xl,#8		@ rotate Xi
	vld1.64		{$t0},[$inp],#16	@ load [rotated] I[0]
	vshl.u64	$xC2,$xC2,#57		@ compose 0xc2.0 constant
#ifndef __ARMEB__
	vrev64.8	$t0,$t0
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$IN,$t0,$t0,#8		@ rotate I[0]
	b.lo		.Lodd_tail_v8		@ $len was less than 32
___
{ my ($Xln,$Xmn,$Xhn,$In) = map("q$_",(4..7));
	#######
	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
$code.=<<___;
	vld1.64		{$t1},[$inp],$inc	@ load [rotated] I[1]
#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vext.8		$In,$t1,$t1,#8
	veor		$IN,$IN,$Xl		@ I[i]^=Xi
	vpmull.p64	$Xln,$H,$In		@ H·Ii+1
	veor		$t1,$t1,$In		@ Karatsuba pre-processing
	vpmull2.p64	$Xhn,$H,$In
	b		.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	vext.8		$t2,$IN,$IN,#8
	subs		$len,$len,#32		@ is there more data?
	vpmull.p64	$Xl,$H2,$IN		@ H^2.lo·Xi.lo
	cclr		$inc,lo			@ is it time to zero $inc?

	vpmull.p64	$Xmn,$Hhl,$t1
	veor		$t2,$t2,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H2,$IN		@ H^2.hi·Xi.hi
	veor		$Xl,$Xl,$Xln		@ accumulate
	vpmull2.p64	$Xm,$Hhl,$t2		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	vld1.64		{$t0},[$inp],$inc	@ load [rotated] I[i+2]

	veor		$Xh,$Xh,$Xhn
	cclr		$inc,eq			@ is it time to zero $inc?
	veor		$Xm,$Xm,$Xmn

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	vld1.64		{$t1},[$inp],$inc	@ load [rotated] I[i+3]
#ifndef __ARMEB__
	vrev64.8	$t0,$t0
#endif
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

#ifndef __ARMEB__
	vrev64.8	$t1,$t1
#endif
	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	vext.8		$In,$t1,$t1,#8
	vext.8		$IN,$t0,$t0,#8
	veor		$Xl,$Xm,$t2
	vpmull.p64	$Xln,$H,$In		@ H·Ii+1
	veor		$IN,$IN,$Xh		@ accumulate $IN early

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$IN,$IN,$t2
	veor		$t1,$t1,$In		@ Karatsuba pre-processing
	veor		$IN,$IN,$Xl
	vpmull2.p64	$Xhn,$H,$In
	b.hs		.Loop_mod2x_v8		@ there were at least 32 more bytes

	veor		$Xh,$Xh,$t2
	vext.8		$IN,$t0,$t0,#8		@ re-construct $IN
	adds		$len,$len,#32		@ re-construct $len
	veor		$Xl,$Xl,$Xh		@ re-construct $Xl
	b.eq		.Ldone_v8		@ is $len zero?
___
}
$code.=<<___;
.Lodd_tail_v8:
	vext.8		$t2,$Xl,$Xl,#8
	veor		$IN,$IN,$Xl		@ inp^=Xi
	veor		$t1,$t0,$t2		@ $t1 is rotated inp^Xi

	vpmull.p64	$Xl,$H,$IN		@ H.lo·Xi.lo
	veor		$t1,$t1,$IN		@ Karatsuba pre-processing
	vpmull2.p64	$Xh,$H,$IN		@ H.hi·Xi.hi
	vpmull.p64	$Xm,$Hhl,$t1		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8		$t1,$Xl,$Xh,#8		@ Karatsuba post-processing
	veor		$t2,$Xl,$Xh
	veor		$Xm,$Xm,$t1
	veor		$Xm,$Xm,$t2
	vpmull.p64	$t2,$Xl,$xC2		@ 1st phase of reduction

	vmov		$Xh#lo,$Xm#hi		@ Xh|Xm - 256-bit result
	vmov		$Xm#hi,$Xl#lo		@ Xm is rotated Xl
	veor		$Xl,$Xm,$t2

	vext.8		$t2,$Xl,$Xl,#8		@ 2nd phase of reduction
	vpmull.p64	$Xl,$Xl,$xC2
	veor		$t2,$t2,$Xh
	veor		$Xl,$Xl,$t2

.Ldone_v8:
#ifndef __ARMEB__
	vrev64.8	$Xl,$Xl
#endif
	vext.8		$Xl,$Xl,$Xl,#8
	vst1.64		{$Xl},[$Xi]		@ write out Xi

___
$code.=<<___ if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}		@ 32-bit ABI says so
___
$code.=<<___;
	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
___
}
$code.=<<___;
.asciz	"GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

if ($flavour =~ /64/) {				######## 64-bit code
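    # The substitutions below mechanically lower the unified 32-bit NEON
    # syntax used above to AArch64 assembly. As a worked example (assuming
    # the q0..q14 assignments from the top of this file, i.e. $Xl=q0,
    # $H=q12, $IN=q3), the line
    #
    #	vpmull.p64	$Xl,$H,$IN
    #
    # should come out as
    #
    #	pmull	v0.1q,v20.1d,v3.1d
    #
    # once registers are renamed (q8-q14 map to v16-v22) and the vector
    # arrangement suffixes are fixed up.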
    sub unvmov {
	my $arg=shift;

	$arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
	sprintf "ins v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
    }
    foreach(split("\n",$code)) {
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o	or
	s/vmov\.i8/movi/o		or	# fix up legacy mnemonics
	s/vmov\s+(.*)/unvmov($1)/geo	or
	s/vext\.8/ext/o			or
	s/vshr\.s/sshr\.s/o		or
	s/vshr/ushr/o			or
	s/^(\s+)v/$1/o			or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;					# old->new style commentary

	# fix up remaining legacy suffixes
	s/\.[ui]?8(\s)/$1/o;
	s/\.[uis]?32//o and s/\.16b/\.4s/go;
	m/\.p64/o and s/\.16b/\.1q/o;		# 1st pmull argument
	m/l\.p64/o and s/\.16b/\.1d/go;		# 2nd and 3rd pmull arguments
	s/\.[uisp]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {					######## 32-bit code
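    # The 32-bit path converts the code back to traditional ARM NEON
    # syntax. The 64-bit polynomial multiply is hand-assembled into .byte
    # directives by unvpmullp64 below, since (as its comment notes) older
    # assemblers can't be relied on to accept the mnemonic or the .inst
    # directive.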
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
    sub unvpmullp64 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
	    my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
				 |(($2&7)<<17)|(($2&8)<<4)
				 |(($3&7)<<1) |(($3&8)<<2);
	    $word |= 0x00010001	 if ($mnemonic =~ "2");
	    # Emit the raw encoding byte by byte; ARMv7 instructions are
	    # always encoded little-endian. The correct solution would be
	    # the .inst directive, but older assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }

    foreach(split("\n",$code)) {
	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\],#[0-9]+/]!/o;

	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo				or
	s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo		or
	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/^(\s+)b\./$1b/o						or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT; # enforce flush