blob: 183fe60bb1791052976c406ce9dcd0f4543a15a1 [file] [log] [blame]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# April 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+32 bytes shared table]. There is no
15# experimental performance data available yet. The only approximation
16# that can be made at this point is based on code size. Inner loop is
17# 32 instructions long and on single-issue core should execute in <40
18# cycles. Having verified that gcc 3.4 didn't unroll corresponding
19# loop, this assembler loop body was found to be ~3x smaller than
20# compiler-generated one...
21#
22# July 2010
23#
24# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
25# Cortex A8 core and ~25 cycles per processed byte (which was observed
26# to be ~3 times faster than gcc-generated code:-)
27#
28# February 2011
29#
30# Profiler-assisted and platform-specific optimization resulted in 7%
31# improvement on Cortex A8 core and ~23.5 cycles per byte.
32#
33# March 2011
34#
35# Add NEON implementation featuring polynomial multiplication, i.e. no
36# lookup tables involved. On Cortex A8 it was measured to process one
37# byte in 15 cycles or 55% faster than integer-only code.
38#
39# April 2014
40#
41# Switch to multiplication algorithm suggested in paper referred
42# below and combine it with reduction algorithm from x86 module.
43# Performance improvement over previous version varies from 65% on
44# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
Adam Langleye9ada862015-05-11 17:20:37 -070045# processes one byte in 8.45 cycles, A9 - in 10.2, A15 - in 7.63,
46# Snapdragon S4 - in 9.33.
Adam Langleyd9e397b2015-01-22 14:27:53 -080047#
Kenny Rootb8494592015-09-25 02:29:14 +000048# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
Adam Langleyd9e397b2015-01-22 14:27:53 -080049# Polynomial Multiplication on ARM Processors using the NEON Engine.
Robert Sloana94fe052017-02-21 08:49:28 -080050#
Adam Langleyd9e397b2015-01-22 14:27:53 -080051# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
52
53# ====================================================================
# Note about "528B" variant. On ARM it makes less sense to implement
# it, for the following reasons:
#
# - performance improvement won't be anywhere near 50%, because 128-
#   bit shift operation is neatly fused with 128-bit xor here, and
#   "528B" variant would eliminate only 4-5 instructions out of 32
#   in the inner loop (meaning that estimated improvement is ~15%);
# - ARM-based systems are often embedded ones and extra memory
#   consumption might be unappreciated (for so little improvement);
63#
64# Byte order [in]dependence. =========================================
65#
66# Caller is expected to maintain specific *dword* order in Htable,
67# namely with *least* significant dword of 128-bit value at *lower*
68# address. This differs completely from C code and has everything to
69# do with ldm instruction and order in which dwords are "consumed" by
70# algorithm. *Byte* order within these dwords in turn is whatever
71# *native* byte order on current platform. See gcm128.c for working
72# example...
73
# Command line: [flavour] [...] output-file
#
# The first argument is taken as the output file if it looks like a file
# name ("name.ext"); otherwise it is the flavour and the following
# arguments are scanned until one looks like a file name (or the argument
# list is exhausted, leaving $output false).
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # A real flavour was requested: pipe everything we print through
    # arm-xlate.pl, looked up next to this script first and then in the
    # shared perlasm directory.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Fail loudly if the translator cannot be started instead of silently
    # writing untranslated output to the inherited STDOUT.
    open(STDOUT,"| \"$^X\" $xlate $flavour $output")
        or die "can't call $xlate: $!";
} elsif ($output) {
    # No translation: write straight to the output file. When no output
    # file was named, the inherited STDOUT is left untouched.
    open(STDOUT,">$output")
        or die "can't open $output: $!";
}
Adam Langleyd9e397b2015-01-22 14:27:53 -080088
# Register allocation for the integer-only code path.

# Argument block: r0..r3.
($Xi,$Htbl,$inp,$len)=map("r$_",(0..3));

# Variables: r4..r7 hold the four Z words, r8..r11 the four T words.
($Zll,$Zlh,$Zhl,$Zhh)=map("r$_",(4..7));
($Tll,$Tlh,$Thl,$Thh)=map("r$_",(8..11));
$nlo="r12";
# r13 is the stack pointer and is therefore skipped.
$nhi="r14";
# r15 is the program counter.

# Aliases: $rem_4bit shares $inp's register (used in gcm_gmult_4bit) and
# $cnt shares $len's register.
$rem_4bit=$inp;
$cnt=$len;
109
# Zsmash([INSN, ...])
#
# Appends to $code the instructions that store the four Z words
# ($Zll, $Zlh, $Zhl, $Zhh, in that order) to [$Xi,#12], [$Xi,#8],
# [$Xi,#4] and [$Xi,#0]. Three variants are emitted under preprocessor
# conditionals:
#   - ARMv7+ little-endian: byte-reverse with "rev", then one word store;
#   - big-endian:           one word store;
#   - otherwise:            four byte stores, using $Tlh/$Thl/$Thh as
#                           scratch registers.
# After each word's store sequence one caller-supplied instruction is
# appended (tab-prefixed), so callers can interleave unrelated work with
# the stores. When fewer than four instructions are supplied, the
# remaining slots are emitted as a line holding only the tab.
#
# The sub is declared without a prototype because it does take
# arguments; all callers in this file use the "&Zsmash(...)" form.
sub Zsmash {
    my @interleave = @_;
    my $off = 12;

    foreach my $reg ($Zll,$Zlh,$Zhl,$Zhh) {
        $code.=<<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
 rev $reg,$reg
 str $reg,[$Xi,#$off]
#elif defined(__ARMEB__)
 str $reg,[$Xi,#$off]
#else
 mov $Tlh,$reg,lsr#8
 strb $reg,[$Xi,#$off+3]
 mov $Thl,$reg,lsr#16
 strb $Tlh,[$Xi,#$off+2]
 mov $Thh,$reg,lsr#24
 strb $Thl,[$Xi,#$off+1]
 strb $Thh,[$Xi,#$off]
#endif
___
        # Do not concatenate undef once the caller's list is exhausted.
        my $insn = @interleave ? shift(@interleave) : "";
        $code.="\t".$insn."\n";
        $off-=4;
    }
}
134
# Integer-only ("4-bit") code path, first chunk. It starts $code with:
#   - the file preamble (arm_arch.h include, unified syntax, ARM mode);
#   - rem_4bit, a table of 16 halfwords (32 bytes);
#   - rem_4bit_get, which computes &rem_4bit relative to pc into
#     $rem_4bit and branches to .Lrem_4bit_got (inside gcm_gmult_4bit);
#   - gcm_ghash_4bit up to the end of its inner loop.
# gcm_ghash_4bit turns $len into an end pointer, saves it together with
# the callee-saved registers, and copies rem_4bit onto the stack so the
# loops can index it as [sp,...]. The pc-relative offsets (#32, #48)
# depend on the exact layout of the table and of rem_4bit_get, so the
# order and size of what precedes gcm_ghash_4bit must not change.
# The write-back of Xi is appended by the Zsmash call that follows this
# heredoc.
$code=<<___;
#include <openssl/arm_arch.h>

.syntax unified

.text
.code 32

#ifdef __clang__
#define ldrplb ldrbpl
#define ldrneb ldrbne
#endif

.type rem_4bit,%object
.align 5
rem_4bit:
.short 0x0000,0x1C20,0x3840,0x2460
.short 0x7080,0x6CA0,0x48C0,0x54E0
.short 0xE100,0xFD20,0xD940,0xC560
.short 0x9180,0x8DA0,0xA9C0,0xB5E0
.size rem_4bit,.-rem_4bit

.type rem_4bit_get,%function
rem_4bit_get:
 sub $rem_4bit,pc,#8
 sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
 b .Lrem_4bit_got
 nop
.size rem_4bit_get,.-rem_4bit_get

.global gcm_ghash_4bit
.type gcm_ghash_4bit,%function
gcm_ghash_4bit:
 sub r12,pc,#8
 add $len,$inp,$len @ $len to point at the end
 stmdb sp!,{r3-r11,lr} @ save $len/end too
 sub r12,r12,#48 @ &rem_4bit

 ldmia r12,{r4-r11} @ copy rem_4bit ...
 stmdb sp!,{r4-r11} @ ... to stack

 ldrb $nlo,[$inp,#15]
 ldrb $nhi,[$Xi,#15]
.Louter:
 eor $nlo,$nlo,$nhi
 and $nhi,$nlo,#0xf0
 and $nlo,$nlo,#0x0f
 mov $cnt,#14

 add $Zhh,$Htbl,$nlo,lsl#4
 ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
 add $Thh,$Htbl,$nhi
 ldrb $nlo,[$inp,#14]

 and $nhi,$Zll,#0xf @ rem
 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
 add $nhi,$nhi,$nhi
 eor $Zll,$Tll,$Zll,lsr#4
 ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
 eor $Zll,$Zll,$Zlh,lsl#28
 ldrb $nhi,[$Xi,#14]
 eor $Zlh,$Tlh,$Zlh,lsr#4
 eor $Zlh,$Zlh,$Zhl,lsl#28
 eor $Zhl,$Thl,$Zhl,lsr#4
 eor $Zhl,$Zhl,$Zhh,lsl#28
 eor $Zhh,$Thh,$Zhh,lsr#4
 eor $nlo,$nlo,$nhi
 and $nhi,$nlo,#0xf0
 and $nlo,$nlo,#0x0f
 eor $Zhh,$Zhh,$Tll,lsl#16

.Linner:
 add $Thh,$Htbl,$nlo,lsl#4
 and $nlo,$Zll,#0xf @ rem
 subs $cnt,$cnt,#1
 add $nlo,$nlo,$nlo
 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
 eor $Zll,$Tll,$Zll,lsr#4
 eor $Zll,$Zll,$Zlh,lsl#28
 eor $Zlh,$Tlh,$Zlh,lsr#4
 eor $Zlh,$Zlh,$Zhl,lsl#28
 ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
 eor $Zhl,$Thl,$Zhl,lsr#4
 ldrbpl $nlo,[$inp,$cnt]
 eor $Zhl,$Zhl,$Zhh,lsl#28
 eor $Zhh,$Thh,$Zhh,lsr#4

 add $Thh,$Htbl,$nhi
 and $nhi,$Zll,#0xf @ rem
 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
 add $nhi,$nhi,$nhi
 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
 eor $Zll,$Tll,$Zll,lsr#4
 ldrbpl $Tll,[$Xi,$cnt]
 eor $Zll,$Zll,$Zlh,lsl#28
 eor $Zlh,$Tlh,$Zlh,lsr#4
 ldrh $Tlh,[sp,$nhi]
 eor $Zlh,$Zlh,$Zhl,lsl#28
 eor $Zhl,$Thl,$Zhl,lsr#4
 eor $Zhl,$Zhl,$Zhh,lsl#28
 eorpl $nlo,$nlo,$Tll
 eor $Zhh,$Thh,$Zhh,lsr#4
 andpl $nhi,$nlo,#0xf0
 andpl $nlo,$nlo,#0x0f
 eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
 bpl .Linner

 ldr $len,[sp,#32] @ re-load $len/end
 add $inp,$inp,#16
 mov $nhi,$Zll
___
 # Store Z back to Xi for gcm_ghash_4bit. The end-of-input compare and
 # the conditional load of the next block's last byte are interleaved
 # with the stores; the "bne .Louter" emitted right after relies on the
 # flags set by that compare.
 &Zsmash("cmp\t$inp,$len","ldrbne\t$nlo,[$inp,#15]");
# Second chunk: the gcm_ghash_4bit epilogue (drop the 36 bytes holding
# the rem_4bit copy and the saved $len, restore registers and return),
# followed by gcm_gmult_4bit up to the end of its loop. gcm_gmult_4bit
# reads rem_4bit in place through $rem_4bit, which it obtains by
# branching to rem_4bit_get and resuming at .Lrem_4bit_got.
$code.=<<___;
 bne .Louter

 add sp,sp,#36
#if __ARM_ARCH__>=5
 ldmia sp!,{r4-r11,pc}
#else
 ldmia sp!,{r4-r11,lr}
 tst lr,#1
 moveq pc,lr @ be binary compatible with V4, yet
 bx lr @ interoperable with Thumb ISA:-)
#endif
.size gcm_ghash_4bit,.-gcm_ghash_4bit

.global gcm_gmult_4bit
.type gcm_gmult_4bit,%function
gcm_gmult_4bit:
 stmdb sp!,{r4-r11,lr}
 ldrb $nlo,[$Xi,#15]
 b rem_4bit_get
.Lrem_4bit_got:
 and $nhi,$nlo,#0xf0
 and $nlo,$nlo,#0x0f
 mov $cnt,#14

 add $Zhh,$Htbl,$nlo,lsl#4
 ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
 ldrb $nlo,[$Xi,#14]

 add $Thh,$Htbl,$nhi
 and $nhi,$Zll,#0xf @ rem
 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
 add $nhi,$nhi,$nhi
 eor $Zll,$Tll,$Zll,lsr#4
 ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
 eor $Zll,$Zll,$Zlh,lsl#28
 eor $Zlh,$Tlh,$Zlh,lsr#4
 eor $Zlh,$Zlh,$Zhl,lsl#28
 eor $Zhl,$Thl,$Zhl,lsr#4
 eor $Zhl,$Zhl,$Zhh,lsl#28
 eor $Zhh,$Thh,$Zhh,lsr#4
 and $nhi,$nlo,#0xf0
 eor $Zhh,$Zhh,$Tll,lsl#16
 and $nlo,$nlo,#0x0f

.Loop:
 add $Thh,$Htbl,$nlo,lsl#4
 and $nlo,$Zll,#0xf @ rem
 subs $cnt,$cnt,#1
 add $nlo,$nlo,$nlo
 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
 eor $Zll,$Tll,$Zll,lsr#4
 eor $Zll,$Zll,$Zlh,lsl#28
 eor $Zlh,$Tlh,$Zlh,lsr#4
 eor $Zlh,$Zlh,$Zhl,lsl#28
 ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
 eor $Zhl,$Thl,$Zhl,lsr#4
 ldrbpl $nlo,[$Xi,$cnt]
 eor $Zhl,$Zhl,$Zhh,lsl#28
 eor $Zhh,$Thh,$Zhh,lsr#4

 add $Thh,$Htbl,$nhi
 and $nhi,$Zll,#0xf @ rem
 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
 add $nhi,$nhi,$nhi
 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
 eor $Zll,$Tll,$Zll,lsr#4
 eor $Zll,$Zll,$Zlh,lsl#28
 eor $Zlh,$Tlh,$Zlh,lsr#4
 ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
 eor $Zlh,$Zlh,$Zhl,lsl#28
 eor $Zhl,$Thl,$Zhl,lsr#4
 eor $Zhl,$Zhl,$Zhh,lsl#28
 eor $Zhh,$Thh,$Zhh,lsr#4
 andpl $nhi,$nlo,#0xf0
 andpl $nlo,$nlo,#0x0f
 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
 bpl .Loop
___
 # Store Z back to Xi for gcm_gmult_4bit; there is nothing to
 # interleave with the stores here, so no instructions are passed.
 &Zsmash();
# Third chunk: the gcm_gmult_4bit epilogue. It restores the registers
# pushed on entry and returns; the pre-ARMv5 variant returns through lr
# so that it stays usable from Thumb callers.
$code.=<<___;
#if __ARM_ARCH__>=5
 ldmia sp!,{r4-r11,pc}
#else
 ldmia sp!,{r4-r11,lr}
 tst lr,#1
 moveq pc,lr @ be binary compatible with V4, yet
 bx lr @ interoperable with Thumb ISA:-)
#endif
.size gcm_gmult_4bit,.-gcm_gmult_4bit
___
338{
# NEON register map, lexical to the enclosing block.
#   $Xl,$Xm,$Xh,$IN          - q0..q3
#   $t0..$t3                 - q8..q11, scratch for clmul64x64 and the
#                              reduction
#   $Hlo,$Hhi,$Hhl           - d26..d28
#   $k48,$k32,$k16           - d29..d31, bit masks used by clmul64x64
# The scratch range is 8..11: exactly one register per variable (the
# previous 8..12 produced a fifth value that was silently discarded).
my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
my ($t0,$t1,$t2,$t3)=map("q$_",(8..11));
my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));

# clmul64x64($r,$a,$b)
#
# Appends to $code a NEON instruction sequence that multiplies the two
# 64-bit polynomials in D registers $a and $b and leaves the product in
# Q register $r. The product is assembled from eight vmull.p8 (8x8-bit
# polynomial) multiplications of byte-rotated copies of the operands,
# whose partial results are masked with $k48/$k32/$k16, shifted into
# position with vext.8 and xor-ed together.
#
#   $r - name of the destination Q register (its "#lo" half is also used
#        as a temporary before the final result is written);
#   $a - name of the first source D register;
#   $b - name of the second source D register.
#
# $t0..$t3 are clobbered. The masks in $k48, $k32 and $k16 are only read
# here; the emitted entry points load them before reaching this code.
sub clmul64x64 {
my ($r,$a,$b)=@_;
$code.=<<___;
 vext.8 $t0#lo, $a, $a, #1 @ A1
 vmull.p8 $t0, $t0#lo, $b @ F = A1*B
 vext.8 $r#lo, $b, $b, #1 @ B1
 vmull.p8 $r, $a, $r#lo @ E = A*B1
 vext.8 $t1#lo, $a, $a, #2 @ A2
 vmull.p8 $t1, $t1#lo, $b @ H = A2*B
 vext.8 $t3#lo, $b, $b, #2 @ B2
 vmull.p8 $t3, $a, $t3#lo @ G = A*B2
 vext.8 $t2#lo, $a, $a, #3 @ A3
 veor $t0, $t0, $r @ L = E + F
 vmull.p8 $t2, $t2#lo, $b @ J = A3*B
 vext.8 $r#lo, $b, $b, #3 @ B3
 veor $t1, $t1, $t3 @ M = G + H
 vmull.p8 $r, $a, $r#lo @ I = A*B3
 veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
 vand $t0#hi, $t0#hi, $k48
 vext.8 $t3#lo, $b, $b, #4 @ B4
 veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
 vand $t1#hi, $t1#hi, $k32
 vmull.p8 $t3, $a, $t3#lo @ K = A*B4
 veor $t2, $t2, $r @ N = I + J
 veor $t0#lo, $t0#lo, $t0#hi
 veor $t1#lo, $t1#lo, $t1#hi
 veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
 vand $t2#hi, $t2#hi, $k16
 vext.8 $t0, $t0, $t0, #15
 veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
 vmov.i64 $t3#hi, #0
 vext.8 $t1, $t1, $t1, #14
 veor $t2#lo, $t2#lo, $t2#hi
 vmull.p8 $r, $a, $b @ D = A*B
 vext.8 $t3, $t3, $t3, #12
 vext.8 $t2, $t2, $t2, #13
 veor $t0, $t0, $t1
 veor $t2, $t2, $t3
 veor $r, $r, $t0
 veor $r, $r, $t2
___
}
385
# NEON code path, assembled only when __ARM_MAX_ARCH__>=7. This chunk
# emits:
#   - gcm_init_neon: loads H from [r1], shifts it left by one bit with
#     the conditional xor of the 0xc2....01 constant, and stores the
#     resulting "twisted H" at [r0];
#   - gcm_gmult_neon: loads Xi and twisted H, sets up the $k48/$k32/$k16
#     masks, sets $len to 16 and branches to .Lgmult_neon, so the shared
#     body below runs exactly once;
#   - the head of gcm_ghash_neon: same setup, then the per-block loop
#     that loads 16 bytes of input and xors them with Xi.
# Both entry points advance $Xi by 16 while loading; the shared tail
# subtracts 16 again before writing the result back.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon

.global gcm_init_neon
.type gcm_init_neon,%function
.align 4
gcm_init_neon:
 vld1.64 $IN#hi,[r1]! @ load H
 vmov.i8 $t0,#0xe1
 vld1.64 $IN#lo,[r1]
 vshl.i64 $t0#hi,#57
 vshr.u64 $t0#lo,#63 @ t0=0xc2....01
 vdup.8 $t1,$IN#hi[7]
 vshr.u64 $Hlo,$IN#lo,#63
 vshr.s8 $t1,#7 @ broadcast carry bit
 vshl.i64 $IN,$IN,#1
 vand $t0,$t0,$t1
 vorr $IN#hi,$Hlo @ H<<<=1
 veor $IN,$IN,$t0 @ twisted H
 vstmia r0,{$IN}

 ret @ bx lr
.size gcm_init_neon,.-gcm_init_neon

.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
 vld1.64 $IN#hi,[$Xi]! @ load Xi
 vld1.64 $IN#lo,[$Xi]!
 vmov.i64 $k48,#0x0000ffffffffffff
 vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
 vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
 vrev64.8 $IN,$IN
#endif
 vmov.i64 $k16,#0x000000000000ffff
 veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing
 mov $len,#16
 b .Lgmult_neon
.size gcm_gmult_neon,.-gcm_gmult_neon

.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
 vld1.64 $Xl#hi,[$Xi]! @ load Xi
 vld1.64 $Xl#lo,[$Xi]!
 vmov.i64 $k48,#0x0000ffffffffffff
 vldmia $Htbl,{$Hlo-$Hhi} @ load twisted H
 vmov.i64 $k32,#0x00000000ffffffff
#ifdef __ARMEL__
 vrev64.8 $Xl,$Xl
#endif
 vmov.i64 $k16,#0x000000000000ffff
 veor $Hhl,$Hlo,$Hhi @ Karatsuba pre-processing

.Loop_neon:
 vld1.64 $IN#hi,[$inp]! @ load inp
 vld1.64 $IN#lo,[$inp]!
#ifdef __ARMEL__
 vrev64.8 $IN,$IN
#endif
 veor $IN,$Xl @ inp^=Xi
.Lgmult_neon:
___
 # Karatsuba multiplication: three 64x64-bit carry-less products. The
 # first product must be emitted before $IN#lo is overwritten with
 # lo^hi by the pre-processing step that follows it.
 &clmul64x64 ($Xl,$Hlo,"$IN#lo"); # H.lo·Xi.lo
$code.=<<___;
 veor $IN#lo,$IN#lo,$IN#hi @ Karatsuba pre-processing
___
 &clmul64x64 ($Xm,$Hhl,"$IN#lo"); # (H.lo+H.hi)·(Xi.lo+Xi.hi)
 &clmul64x64 ($Xh,$Hhi,"$IN#hi"); # H.hi·Xi.hi
# Tail shared by gcm_gmult_neon and gcm_ghash_neon: combine the three
# products into the 256-bit value Xh|Xl, reduce it in two phases, count
# $len down by 16 and loop while input remains, then byte-swap on
# little-endian and write Xi back.
$code.=<<___;
 veor $Xm,$Xm,$Xl @ Karatsuba post-processing
 veor $Xm,$Xm,$Xh
 veor $Xl#hi,$Xl#hi,$Xm#lo
 veor $Xh#lo,$Xh#lo,$Xm#hi @ Xh|Xl - 256-bit result

 @ equivalent of reduction_avx from ghash-x86_64.pl
 vshl.i64 $t1,$Xl,#57 @ 1st phase
 vshl.i64 $t2,$Xl,#62
 veor $t2,$t2,$t1 @
 vshl.i64 $t1,$Xl,#63
 veor $t2, $t2, $t1 @
 veor $Xl#hi,$Xl#hi,$t2#lo @
 veor $Xh#lo,$Xh#lo,$t2#hi

 vshr.u64 $t2,$Xl,#1 @ 2nd phase
 veor $Xh,$Xh,$Xl
 veor $Xl,$Xl,$t2 @
 vshr.u64 $t2,$t2,#6
 vshr.u64 $Xl,$Xl,#1 @
 veor $Xl,$Xl,$Xh @
 veor $Xl,$Xl,$t2 @

 subs $len,#16
 bne .Loop_neon

#ifdef __ARMEL__
 vrev64.8 $Xl,$Xl
#endif
 sub $Xi,#16
 vst1.64 $Xl#hi,[$Xi]! @ write out Xi
 vst1.64 $Xl#lo,[$Xi]

 ret @ bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
497}
# Identification string placed at the end of the generated file.
$code.=<<___;
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___

# Post-process the accumulated assembly line by line and print it.
#   1. Any `...` span is replaced by the result of evaluating it as Perl.
#   2. Then the first of these rewrites that matches is applied; the
#      "or" chain skips the remaining ones for that line:
#        qN#lo / qN#hi -> d(2N) / d(2N+1)
#        ret           -> bx lr
#        bx lr         -> .word 0xe12fff1e
#      Because of the short-circuit, a "bx lr" produced from "ret" stays
#      as text; only a "bx lr" written literally in the source becomes
#      a .word.
foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/geo;

    s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
    s/\bret\b/bx lr/go or
    s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4

    print $_,"\n";
}

# Closing flushes the output. When STDOUT is a pipe to arm-xlate.pl the
# close also waits for that process, so a failed write or a failing
# translator is reported here instead of leaving truncated output behind.
close STDOUT or die "error closing STDOUT: $!";