#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because this is
# always a virtualized setup with a possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.

# May 2016
#
# 2x aggregated reduction improves performance by 50% (resulting
# performance on POWER8 is 1 cycle per processed byte), and 4x
# aggregated reduction - by 170% or 2.7x (resulting in 0.55 cpb).

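# "N-x aggregated reduction" above refers to the usual GHASH aggregation
# trick: instead of reducing after every block, several blocks are folded
# together with pre-computed powers of H and reduced once, e.g. for two
# blocks
#
#	Xi+1 = (Xi ^ Ci)*H^2 ^ Ci+1*H	(mod P)
#
# which is why gcm_init_p8 below pre-computes and stores H, H^2, H^3, H^4.
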
$flavour=shift;
$output =shift;

if ($flavour =~ /64/) {
 $SIZE_T=8;
 $LRSAVE=2*$SIZE_T;
 $STU="stdu";
 $POP="ld";
 $PUSH="std";
 $UCMP="cmpld";
 $SHRI="srdi";
} elsif ($flavour =~ /32/) {
 $SIZE_T=4;
 $LRSAVE=$SIZE_T;
 $STU="stwu";
 $POP="lwz";
 $PUSH="stw";
 $UCMP="cmplw";
 $SHRI="srwi";
} else { die "nonsense $flavour"; }

$sp="r1";
$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6)); # argument block

my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
my $vrsave="r12";

$code=<<___;
.machine "any"

.text

.globl .gcm_init_p8
.align 5
.gcm_init_p8:
 li r0,-4096
 li r8,0x10
 mfspr $vrsave,256
 li r9,0x20
 mtspr 256,r0
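 # VRSAVE := 0xfffff000, i.e. flag v0-v19 as in use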
 li r10,0x30
 lvx_u $H,0,r4 # load H

 vspltisb $xC2,-16 # 0xf0
 vspltisb $t0,1 # one
 vaddubm $xC2,$xC2,$xC2 # 0xe0
 vxor $zero,$zero,$zero
 vor $xC2,$xC2,$t0 # 0xe1
 vsldoi $xC2,$xC2,$zero,15 # 0xe1...
 vsldoi $t1,$zero,$t0,1 # ...1
 vaddubm $xC2,$xC2,$xC2 # 0xc2...
 vspltisb $t2,7
 vor $xC2,$xC2,$t1 # 0xc2....01
 vspltb $t1,$H,0 # most significant byte
 vsl $H,$H,$t0 # H<<=1
 vsrab $t1,$t1,$t2 # broadcast carry bit
 vand $t1,$t1,$xC2
 vxor $IN,$H,$t1 # twisted H

 vsldoi $H,$IN,$IN,8 # twist even more ...
 vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
 vsldoi $Hl,$zero,$H,8 # ... and split
 vsldoi $Hh,$H,$zero,8

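 # At this point $IN holds the "twisted" H (H shifted left by one bit with
 # the carry folded back in via the 0xc2.. constant), $H the same value
 # with its doublewords swapped, and $Hl/$Hh the two 64-bit halves in
 # isolation. These are the operand forms the vpmsumd multiplications
 # below expect: each one yields a 128-bit slice (lo, mid, hi) of the
 # 256-bit carry-less product.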
 stvx_u $xC2,0,r3 # save pre-computed table
 stvx_u $Hl,r8,r3
 li r8,0x40
 stvx_u $H, r9,r3
 li r9,0x50
 stvx_u $Hh,r10,r3
 li r10,0x60

 vpmsumd $Xl,$IN,$Hl # H.lo·H.lo
 vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi
 vpmsumd $Xh,$IN,$Hh # H.hi·H.hi

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1

 vsldoi $Xl,$Xl,$Xl,8
 vxor $Xl,$Xl,$t2

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vpmsumd $Xl,$Xl,$xC2
 vxor $t1,$t1,$Xh
 vxor $IN1,$Xl,$t1
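 # $IN1 now holds the twisted H^2, in the same representation as $IN,
 # ready to be swapped/split and stored just like H above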

 vsldoi $H2,$IN1,$IN1,8
 vsldoi $H2l,$zero,$H2,8
 vsldoi $H2h,$H2,$zero,8

 stvx_u $H2l,r8,r3 # save H^2
 li r8,0x70
 stvx_u $H2,r9,r3
 li r9,0x80
 stvx_u $H2h,r10,r3
 li r10,0x90
___
{
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
$code.=<<___;
 vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo
 vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo
 vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi
 vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi
 vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi
 vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
 vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vsldoi $t4,$Xm1,$zero,8
 vsldoi $t5,$zero,$Xm1,8
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1
 vxor $Xl1,$Xl1,$t4
 vxor $Xh1,$Xh1,$t5

 vsldoi $Xl,$Xl,$Xl,8
 vsldoi $Xl1,$Xl1,$Xl1,8
 vxor $Xl,$Xl,$t2
 vxor $Xl1,$Xl1,$t6

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase
 vpmsumd $Xl,$Xl,$xC2
 vpmsumd $Xl1,$Xl1,$xC2
 vxor $t1,$t1,$Xh
 vxor $t5,$t5,$Xh1
 vxor $Xl,$Xl,$t1
 vxor $Xl1,$Xl1,$t5

 vsldoi $H,$Xl,$Xl,8
 vsldoi $H2,$Xl1,$Xl1,8
 vsldoi $Hl,$zero,$H,8
 vsldoi $Hh,$H,$zero,8
 vsldoi $H2l,$zero,$H2,8
 vsldoi $H2h,$H2,$zero,8

 stvx_u $Hl,r8,r3 # save H^3
 li r8,0xa0
 stvx_u $H,r9,r3
 li r9,0xb0
 stvx_u $Hh,r10,r3
 li r10,0xc0
 stvx_u $H2l,r8,r3 # save H^4
 stvx_u $H2,r9,r3
 stvx_u $H2h,r10,r3

 mtspr 256,$vrsave
 blr
 .long 0
 .byte 0,12,0x14,0,0,0,2,0
 .long 0
.size .gcm_init_p8,.-.gcm_init_p8
___
}
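# Resulting table layout (16-byte slots at these offsets from the table
# pointer), as written by gcm_init_p8 above and consumed by gcm_gmult_p8
# and gcm_ghash_p8 below:
#   0x00                0xc2.. reduction constant
#   0x10, 0x20, 0x30    twisted H   (lo half, swapped halves, hi half)
#   0x40, 0x50, 0x60    twisted H^2
#   0x70, 0x80, 0x90    twisted H^3
#   0xa0, 0xb0, 0xc0    twisted H^4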
$code.=<<___;
.globl .gcm_gmult_p8
.align 5
.gcm_gmult_p8:
 lis r0,0xfff8
 li r8,0x10
 mfspr $vrsave,256
 li r9,0x20
 mtspr 256,r0
 li r10,0x30
 lvx_u $IN,0,$Xip # load Xi

 lvx_u $Hl,r8,$Htbl # load pre-computed table
 le?lvsl $lemask,r0,r0
 lvx_u $H, r9,$Htbl
 le?vspltisb $t0,0x07
 lvx_u $Hh,r10,$Htbl
 le?vxor $lemask,$lemask,$t0
 lvx_u $xC2,0,$Htbl
 le?vperm $IN,$IN,$IN,$lemask
 vxor $zero,$zero,$zero

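 # Xi*H in GF(2^128): the three vpmsumd below produce the low, middle and
 # high 128-bit slices of the 256-bit carry-less product, the middle slice
 # is folded into the outer two, and the two multiplications by the 0xc2..
 # constant perform the two-phase reduction modulo the GCM polynomial
 # x^128+x^7+x^2+x+1 (bit-reflected convention).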
 vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
 vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
 vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1

 vsldoi $Xl,$Xl,$Xl,8
 vxor $Xl,$Xl,$t2

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vpmsumd $Xl,$Xl,$xC2
 vxor $t1,$t1,$Xh
 vxor $Xl,$Xl,$t1

 le?vperm $Xl,$Xl,$Xl,$lemask
 stvx_u $Xl,0,$Xip # write out Xi

 mtspr 256,$vrsave
 blr
 .long 0
 .byte 0,12,0x14,0,0,0,2,0
 .long 0
.size .gcm_gmult_p8,.-.gcm_gmult_p8

.globl .gcm_ghash_p8
.align 5
.gcm_ghash_p8:
 li r0,-4096
 li r8,0x10
 mfspr $vrsave,256
 li r9,0x20
 mtspr 256,r0
 li r10,0x30
 lvx_u $Xl,0,$Xip # load Xi

 lvx_u $Hl,r8,$Htbl # load pre-computed table
 li r8,0x40
 le?lvsl $lemask,r0,r0
 lvx_u $H, r9,$Htbl
 li r9,0x50
 le?vspltisb $t0,0x07
 lvx_u $Hh,r10,$Htbl
 li r10,0x60
 le?vxor $lemask,$lemask,$t0
 lvx_u $xC2,0,$Htbl
 le?vperm $Xl,$Xl,$Xl,$lemask
 vxor $zero,$zero,$zero

 ${UCMP}i $len,64
 bge Lgcm_ghash_p8_4x

 lvx_u $IN,0,$inp
 addi $inp,$inp,16
 subic. $len,$len,16
 le?vperm $IN,$IN,$IN,$lemask
 vxor $IN,$IN,$Xl
 beq Lshort

 lvx_u $H2l,r8,$Htbl # load H^2
 li r8,16
 lvx_u $H2, r9,$Htbl
 add r9,$inp,$len # end of input
 lvx_u $H2h,r10,$Htbl
 be?b Loop_2x

.align 5
Loop_2x:
 lvx_u $IN1,0,$inp
 le?vperm $IN1,$IN1,$IN1,$lemask

 subic $len,$len,32
 vpmsumd $Xl,$IN,$H2l # H^2.lo·Xi.lo
 vpmsumd $Xl1,$IN1,$Hl # H.lo·Xi+1.lo
 subfe r0,r0,r0 # borrow?-1:0
 vpmsumd $Xm,$IN,$H2 # H^2.hi·Xi.lo+H^2.lo·Xi.hi
 vpmsumd $Xm1,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi
 and r0,r0,$len
 vpmsumd $Xh,$IN,$H2h # H^2.hi·Xi.hi
 vpmsumd $Xh1,$IN1,$Hh # H.hi·Xi+1.hi
 add $inp,$inp,r0
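 # borrow trick: once $len has gone negative this is the last iteration,
 # and $inp is biased back by the (negative) overshoot so that the load of
 # the next $IN at $inp+16 below cannot run past the end of the input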

 vxor $Xl,$Xl,$Xl1
 vxor $Xm,$Xm,$Xm1

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vxor $Xh,$Xh,$Xh1
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1

 vsldoi $Xl,$Xl,$Xl,8
 vxor $Xl,$Xl,$t2
 lvx_u $IN,r8,$inp
 addi $inp,$inp,32

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vpmsumd $Xl,$Xl,$xC2
 le?vperm $IN,$IN,$IN,$lemask
 vxor $t1,$t1,$Xh
 vxor $IN,$IN,$t1
 vxor $IN,$IN,$Xl
 $UCMP r9,$inp
 bgt Loop_2x # done yet?

 cmplwi $len,0
 bne Leven

Lshort:
 vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
 vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
 vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1

 vsldoi $Xl,$Xl,$Xl,8
 vxor $Xl,$Xl,$t2

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vpmsumd $Xl,$Xl,$xC2
 vxor $t1,$t1,$Xh

Leven:
 vxor $Xl,$Xl,$t1
 le?vperm $Xl,$Xl,$Xl,$lemask
 stvx_u $Xl,0,$Xip # write out Xi

 mtspr 256,$vrsave
 blr
 .long 0
 .byte 0,12,0x14,0,0,0,4,0
 .long 0
___
{
my ($Xl3,$Xm2,$IN2,$H3l,$H3,$H3h,
 $Xh3,$Xm3,$IN3,$H4l,$H4,$H4h) = map("v$_",(20..31));
my $IN0=$IN;
my ($H21l,$H21h,$loperm,$hiperm) = ($Hl,$Hh,$H2l,$H2h);

$code.=<<___;
.align 5
.gcm_ghash_p8_4x:
Lgcm_ghash_p8_4x:
 $STU $sp,-$FRAME($sp)
 li r10,`15+6*$SIZE_T`
 li r11,`31+6*$SIZE_T`
 stvx v20,r10,$sp
 addi r10,r10,32
 stvx v21,r11,$sp
 addi r11,r11,32
 stvx v22,r10,$sp
 addi r10,r10,32
 stvx v23,r11,$sp
 addi r11,r11,32
 stvx v24,r10,$sp
 addi r10,r10,32
 stvx v25,r11,$sp
 addi r11,r11,32
 stvx v26,r10,$sp
 addi r10,r10,32
 stvx v27,r11,$sp
 addi r11,r11,32
 stvx v28,r10,$sp
 addi r10,r10,32
 stvx v29,r11,$sp
 addi r11,r11,32
 stvx v30,r10,$sp
 li r10,0x60
 stvx v31,r11,$sp
 li r0,-1
 stw $vrsave,`$FRAME-4`($sp) # save vrsave
 mtspr 256,r0 # preserve all AltiVec registers

 lvsl $t0,0,r8 # 0x0001..0e0f
 #lvx_u $H2l,r8,$Htbl # load H^2
 li r8,0x70
 lvx_u $H2, r9,$Htbl
 li r9,0x80
 vspltisb $t1,8 # 0x0808..0808
 #lvx_u $H2h,r10,$Htbl
 li r10,0x90
 lvx_u $H3l,r8,$Htbl # load H^3
 li r8,0xa0
 lvx_u $H3, r9,$Htbl
 li r9,0xb0
 lvx_u $H3h,r10,$Htbl
 li r10,0xc0
 lvx_u $H4l,r8,$Htbl # load H^4
 li r8,0x10
 lvx_u $H4, r9,$Htbl
 li r9,0x20
 lvx_u $H4h,r10,$Htbl
 li r10,0x30

 vsldoi $t2,$zero,$t1,8 # 0x0000..0808
 vaddubm $hiperm,$t0,$t2 # 0x0001..1617
 vaddubm $loperm,$t1,$hiperm # 0x0809..1e1f
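 # $hiperm/$loperm gather matching 64-bit halves from two source vectors,
 # so a single vperm packs H^2 with H (and Xi+2 with Xi+3); one vpmsumd on
 # the packed operands then covers two blocks at once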

 $SHRI $len,$len,4 # this allows using the sign bit
     # as carry
 lvx_u $IN0,0,$inp # load input
 lvx_u $IN1,r8,$inp
 subic. $len,$len,8
 lvx_u $IN2,r9,$inp
 lvx_u $IN3,r10,$inp
 addi $inp,$inp,0x40
 le?vperm $IN0,$IN0,$IN0,$lemask
 le?vperm $IN1,$IN1,$IN1,$lemask
 le?vperm $IN2,$IN2,$IN2,$lemask
 le?vperm $IN3,$IN3,$IN3,$lemask

 vxor $Xh,$IN0,$Xl

 vpmsumd $Xl1,$IN1,$H3l
 vpmsumd $Xm1,$IN1,$H3
 vpmsumd $Xh1,$IN1,$H3h

 vperm $H21l,$H2,$H,$hiperm
 vperm $t0,$IN2,$IN3,$loperm
 vperm $H21h,$H2,$H,$loperm
 vperm $t1,$IN2,$IN3,$hiperm
 vpmsumd $Xm2,$IN2,$H2 # H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
 vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
 vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
 vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+2.hi+H.hi·Xi+3.hi

 vxor $Xm2,$Xm2,$Xm1
 vxor $Xl3,$Xl3,$Xl1
 vxor $Xm3,$Xm3,$Xm2
 vxor $Xh3,$Xh3,$Xh1

 blt Ltail_4x

Loop_4x:
 lvx_u $IN0,0,$inp
 lvx_u $IN1,r8,$inp
 subic. $len,$len,4
 lvx_u $IN2,r9,$inp
 lvx_u $IN3,r10,$inp
 addi $inp,$inp,0x40
 le?vperm $IN1,$IN1,$IN1,$lemask
 le?vperm $IN2,$IN2,$IN2,$lemask
 le?vperm $IN3,$IN3,$IN3,$lemask
 le?vperm $IN0,$IN0,$IN0,$lemask

 vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
 vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
 vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi
 vpmsumd $Xl1,$IN1,$H3l
 vpmsumd $Xm1,$IN1,$H3
 vpmsumd $Xh1,$IN1,$H3h

 vxor $Xl,$Xl,$Xl3
 vxor $Xm,$Xm,$Xm3
 vxor $Xh,$Xh,$Xh3
 vperm $t0,$IN2,$IN3,$loperm
 vperm $t1,$IN2,$IN3,$hiperm

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
 vpmsumd $Xl3,$t0,$H21l # H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
 vpmsumd $Xh3,$t1,$H21h # H.hi·Xi+3.hi +H^2.hi·Xi+2.hi

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1

 vsldoi $Xl,$Xl,$Xl,8
 vxor $Xl,$Xl,$t2

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vpmsumd $Xm2,$IN2,$H2 # H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
 vpmsumd $Xm3,$IN3,$H # H.hi·Xi+3.lo +H.lo·Xi+3.hi
 vpmsumd $Xl,$Xl,$xC2

 vxor $Xl3,$Xl3,$Xl1
 vxor $Xh3,$Xh3,$Xh1
 vxor $Xh,$Xh,$IN0
 vxor $Xm2,$Xm2,$Xm1
 vxor $Xh,$Xh,$t1
 vxor $Xm3,$Xm3,$Xm2
 vxor $Xh,$Xh,$Xl
 bge Loop_4x

Ltail_4x:
 vpmsumd $Xl,$Xh,$H4l # H^4.lo·Xi.lo
 vpmsumd $Xm,$Xh,$H4 # H^4.hi·Xi.lo+H^4.lo·Xi.hi
 vpmsumd $Xh,$Xh,$H4h # H^4.hi·Xi.hi

 vxor $Xl,$Xl,$Xl3
 vxor $Xm,$Xm,$Xm3

 vpmsumd $t2,$Xl,$xC2 # 1st reduction phase

 vsldoi $t0,$Xm,$zero,8
 vsldoi $t1,$zero,$Xm,8
 vxor $Xh,$Xh,$Xh3
 vxor $Xl,$Xl,$t0
 vxor $Xh,$Xh,$t1

 vsldoi $Xl,$Xl,$Xl,8
 vxor $Xl,$Xl,$t2

 vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
 vpmsumd $Xl,$Xl,$xC2
 vxor $t1,$t1,$Xh
 vxor $Xl,$Xl,$t1

 addic. $len,$len,4
 beq Ldone_4x

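 # 1..3 blocks left: Lone/Ltwo/Lthree substitute the matching power of H
 # (H, H^2 or H^3) for $H4* and reuse Ltail_4x for the final
 # multiply-and-reduce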
 lvx_u $IN0,0,$inp
 ${UCMP}i $len,2
 li $len,-4
 blt Lone
 lvx_u $IN1,r8,$inp
 beq Ltwo

Lthree:
 lvx_u $IN2,r9,$inp
 le?vperm $IN0,$IN0,$IN0,$lemask
 le?vperm $IN1,$IN1,$IN1,$lemask
 le?vperm $IN2,$IN2,$IN2,$lemask

 vxor $Xh,$IN0,$Xl
 vmr $H4l,$H3l
 vmr $H4, $H3
 vmr $H4h,$H3h

 vperm $t0,$IN1,$IN2,$loperm
 vperm $t1,$IN1,$IN2,$hiperm
 vpmsumd $Xm2,$IN1,$H2 # H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
 vpmsumd $Xm3,$IN2,$H # H.hi·Xi+2.lo +H.lo·Xi+2.hi
 vpmsumd $Xl3,$t0,$H21l # H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
 vpmsumd $Xh3,$t1,$H21h # H^2.hi·Xi+1.hi+H.hi·Xi+2.hi

 vxor $Xm3,$Xm3,$Xm2
 b Ltail_4x

.align 4
Ltwo:
 le?vperm $IN0,$IN0,$IN0,$lemask
 le?vperm $IN1,$IN1,$IN1,$lemask

 vxor $Xh,$IN0,$Xl
 vperm $t0,$zero,$IN1,$loperm
 vperm $t1,$zero,$IN1,$hiperm

 vsldoi $H4l,$zero,$H2,8
 vmr $H4, $H2
 vsldoi $H4h,$H2,$zero,8

 vpmsumd $Xl3,$t0, $H21l # H.lo·Xi+1.lo
 vpmsumd $Xm3,$IN1,$H # H.hi·Xi+1.lo+H.lo·Xi+1.hi
 vpmsumd $Xh3,$t1, $H21h # H.hi·Xi+1.hi

 b Ltail_4x

.align 4
Lone:
 le?vperm $IN0,$IN0,$IN0,$lemask

 vsldoi $H4l,$zero,$H,8
 vmr $H4, $H
 vsldoi $H4h,$H,$zero,8

 vxor $Xh,$IN0,$Xl
 vxor $Xl3,$Xl3,$Xl3
 vxor $Xm3,$Xm3,$Xm3
 vxor $Xh3,$Xh3,$Xh3

 b Ltail_4x

Ldone_4x:
 le?vperm $Xl,$Xl,$Xl,$lemask
 stvx_u $Xl,0,$Xip # write out Xi

 li r10,`15+6*$SIZE_T`
 li r11,`31+6*$SIZE_T`
 mtspr 256,$vrsave
 lvx v20,r10,$sp
 addi r10,r10,32
 lvx v21,r11,$sp
 addi r11,r11,32
 lvx v22,r10,$sp
 addi r10,r10,32
 lvx v23,r11,$sp
 addi r11,r11,32
 lvx v24,r10,$sp
 addi r10,r10,32
 lvx v25,r11,$sp
 addi r11,r11,32
 lvx v26,r10,$sp
 addi r10,r10,32
 lvx v27,r11,$sp
 addi r11,r11,32
 lvx v28,r10,$sp
 addi r10,r10,32
 lvx v29,r11,$sp
 addi r11,r11,32
 lvx v30,r10,$sp
 lvx v31,r11,$sp
 addi $sp,$sp,$FRAME
 blr
 .long 0
 .byte 0,12,0x04,0,0x80,0,4,0
 .long 0
___
}
$code.=<<___;
.size .gcm_ghash_p8,.-.gcm_ghash_p8

.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___

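# Post-processing: evaluate `...` expressions and resolve the le?/be?
# instruction prefixes, keeping whichever variant matches the target
# endianness and commenting out the other.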
foreach (split("\n",$code)) {
 s/\`([^\`]*)\`/eval $1/geo;

 if ($flavour =~ /le$/o) { # little-endian
 s/le\?//o or
 s/be\?/#be#/o;
 } else {
 s/le\?/#le#/o or
 s/be\?//o;
 }
 print $_,"\n";
}

close STDOUT; # enforce flush