1#!/usr/bin/env perl
2#
3# Implemented as a Perl wrapper as we want to support several different
4# architectures with a single file. We pick up the target based on the
5# file name we are asked to generate.
6#
7# It should be noted though that this perl code is nothing like
8# <openssl>/crypto/perlasm/x86*. In this case perl is used pretty much
9# as a pre-processor to cover for platform differences in name decoration,
10# linker tables, 32-/64-bit instruction sets...
11#
12# As you might know, there are several PowerPC ABIs in use. Most notably,
13# Linux and AIX use different 32-bit ABIs. The good news is that these ABIs
14# are similar enough to implement leaf(!) functions, which can be ABI-
15# neutral. And that's what you find here: ABI-neutral leaf functions.
16# In case you wonder what that is...
17#
18# AIX performance
19#
20# MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
21#
22# The following is the performance of 32-bit compiler
23# generated code:
24#
25# OpenSSL 0.9.6c 21 dec 2001
26# built on: Tue Jun 11 11:06:51 EDT 2002
27# options:bn(64,32) ...
28#compiler: cc -DTHREADS -DAIX -DB_ENDIAN -DBN_LLONG -O3
29# sign verify sign/s verify/s
30#rsa 512 bits 0.0098s 0.0009s 102.0 1170.6
31#rsa 1024 bits 0.0507s 0.0026s 19.7 387.5
32#rsa 2048 bits 0.3036s 0.0085s 3.3 117.1
33#rsa 4096 bits 2.0040s 0.0299s 0.5 33.4
34#dsa 512 bits 0.0087s 0.0106s 114.3 94.5
35#dsa 1024 bits 0.0256s 0.0313s 39.0 32.0
36#
37# Same benchmark with this assembler code:
38#
39#rsa 512 bits 0.0056s 0.0005s 178.6 2049.2
40#rsa 1024 bits 0.0283s 0.0015s 35.3 674.1
41#rsa 2048 bits 0.1744s 0.0050s 5.7 201.2
42#rsa 4096 bits 1.1644s 0.0179s 0.9 55.7
43#dsa 512 bits 0.0052s 0.0062s 191.6 162.0
44#dsa 1024 bits 0.0149s 0.0180s 67.0 55.5
45#
46# Number of operations increases by almost 75%
47#
48# Here are performance numbers for 64-bit compiler
49# generated code:
50#
51# OpenSSL 0.9.6g [engine] 9 Aug 2002
52# built on: Fri Apr 18 16:59:20 EDT 2003
53# options:bn(64,64) ...
54# compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
55# sign verify sign/s verify/s
56#rsa 512 bits 0.0028s 0.0003s 357.1 3844.4
57#rsa 1024 bits 0.0148s 0.0008s 67.5 1239.7
58#rsa 2048 bits 0.0963s 0.0028s 10.4 353.0
59#rsa 4096 bits 0.6538s 0.0102s 1.5 98.1
60#dsa 512 bits 0.0026s 0.0032s 382.5 313.7
61#dsa 1024 bits 0.0081s 0.0099s 122.8 100.6
62#
63# Same benchmark with this assembler code:
64#
65#rsa 512 bits 0.0020s 0.0002s 510.4 6273.7
66#rsa 1024 bits 0.0088s 0.0005s 114.1 2128.3
67#rsa 2048 bits 0.0540s 0.0016s 18.5 622.5
68#rsa 4096 bits 0.3700s 0.0058s 2.7 171.0
69#dsa 512 bits 0.0016s 0.0020s 610.7 507.1
70#dsa 1024 bits 0.0047s 0.0058s 212.5 173.2
71#
72# Again, performance increases by about 75%
73#
74# Mac OS X, Apple G5 1.8GHz (Note this is 32-bit code)
75# OpenSSL 0.9.7c 30 Sep 2003
76#
77# Original code.
78#
79#rsa 512 bits 0.0011s 0.0001s 906.1 11012.5
80#rsa 1024 bits 0.0060s 0.0003s 166.6 3363.1
81#rsa 2048 bits 0.0370s 0.0010s 27.1 982.4
82#rsa 4096 bits 0.2426s 0.0036s 4.1 280.4
83#dsa 512 bits 0.0010s 0.0012s 1038.1 841.5
84#dsa 1024 bits 0.0030s 0.0037s 329.6 269.7
85#dsa 2048 bits 0.0101s 0.0127s 98.9 78.6
86#
87# Same benchmark with this assembler code:
88#
89#rsa 512 bits 0.0007s 0.0001s 1416.2 16645.9
90#rsa 1024 bits 0.0036s 0.0002s 274.4 5380.6
91#rsa 2048 bits 0.0222s 0.0006s 45.1 1589.5
92#rsa 4096 bits 0.1469s 0.0022s 6.8 449.6
93#dsa 512 bits 0.0006s 0.0007s 1664.2 1376.2
94#dsa 1024 bits 0.0018s 0.0023s 545.0 442.2
95#dsa 2048 bits 0.0061s 0.0075s 163.5 132.8
96#
97# Performance increase of ~60%
98#
99# If you have comments or suggestions to improve the code, send
100# me a note at schari@us.ibm.com
101#
102
103$flavour = shift;
104
105if ($flavour =~ /32/) {
106 $BITS= 32;
107 $BNSZ= $BITS/8;
108 $ISA= "\"ppc\"";
109
110 $LD= "lwz"; # load
111 $LDU= "lwzu"; # load and update
112 $ST= "stw"; # store
113 $STU= "stwu"; # store and update
114 $UMULL= "mullw"; # unsigned multiply low
115 $UMULH= "mulhwu"; # unsigned multiply high
116 $UDIV= "divwu"; # unsigned divide
117 $UCMPI= "cmplwi"; # unsigned compare with immediate
118 $UCMP= "cmplw"; # unsigned compare
119 $CNTLZ= "cntlzw"; # count leading zeros
120 $SHL= "slw"; # shift left
121 $SHR= "srw"; # unsigned shift right
122 $SHRI= "srwi"; # unsigned shift right by immediate
123 $SHLI= "slwi"; # shift left by immediate
124 $CLRU= "clrlwi"; # clear upper bits
125 $INSR= "insrwi"; # insert right
126 $ROTL= "rotlwi"; # rotate left by immediate
127 $TR= "tw"; # conditional trap
128} elsif ($flavour =~ /64/) {
129 $BITS= 64;
130 $BNSZ= $BITS/8;
131 $ISA= "\"ppc64\"";
132
133 # same as above, but 64-bit mnemonics...
134 $LD= "ld"; # load
135 $LDU= "ldu"; # load and update
136 $ST= "std"; # store
137 $STU= "stdu"; # store and update
138 $UMULL= "mulld"; # unsigned multiply low
139 $UMULH= "mulhdu"; # unsigned multiply high
140 $UDIV= "divdu"; # unsigned divide
141 $UCMPI= "cmpldi"; # unsigned compare with immediate
142 $UCMP= "cmpld"; # unsigned compare
143 $CNTLZ= "cntlzd"; # count leading zeros
144 $SHL= "sld"; # shift left
145 $SHR= "srd"; # unsigned shift right
146 $SHRI= "srdi"; # unsigned shift right by immediate
147 $SHLI= "sldi"; # shift left by immediate
148 $CLRU= "clrldi"; # clear upper bits
149 $INSR= "insrdi"; # insert right
150 $ROTL= "rotldi"; # rotate left by immediate
151 $TR= "td"; # conditional trap
152} else { die "nonsense $flavour"; }
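# Illustrative invocation (a sketch only; the flavour strings and output
# names below are examples, the real build system supplies them):
#
#	perl ppc.pl linux32 bn-ppc.s	# 32-bit mnemonics (lwz/stw/mullw/...)
#	perl ppc.pl linux64 bn-ppc64.s	# 64-bit mnemonics (ld/std/mulld/...)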
153
154$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
155( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
156( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
157die "can't locate ppc-xlate.pl";
158
159open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
160
161$data=<<EOF;
162#--------------------------------------------------------------------
163#
164#
165#
166#
167# File: ppc32.s
168#
169# Created by: Suresh Chari
170# IBM Thomas J. Watson Research Library
171# Hawthorne, NY
172#
173#
174# Description: Optimized assembly routines for OpenSSL crypto
175# on the 32-bit PowerPC platform.
176#
177#
178# Version History
179#
180# 2. Fixed bn_add,bn_sub and bn_div_words, added comments,
181# cleaned up code. Also made a single version which can
182# be used for both the AIX and Linux compilers. See NOTE
183# below.
184# 12/05/03 Suresh Chari
185# (with lots of help from) Andy Polyakov
186##
187# 1. Initial version 10/20/02 Suresh Chari
188#
189#
190# The following file works for the xlc,cc
191# and gcc compilers.
192#
193# NOTE: To get the file to link correctly with the gcc compiler
194# you have to change the names of the routines and remove
195# the first .(dot) character. This should automatically
196# be done in the build process.
197#
198# Hand optimized assembly code for the following routines
199#
200# bn_sqr_comba4
201# bn_sqr_comba8
202# bn_mul_comba4
203# bn_mul_comba8
204# bn_sub_words
205# bn_add_words
206# bn_div_words
207# bn_sqr_words
208# bn_mul_words
209# bn_mul_add_words
210#
211# NOTE: It is possible to optimize this code more for
212# specific PowerPC or Power architectures. On the Northstar
213# architecture the optimizations in this file do
214# NOT provide much improvement.
215#
216# If you have comments or suggestions to improve the code, send
217# me a note at schari\@us.ibm.com
218#
219#--------------------------------------------------------------------------
220#
221# Defines to be used in the assembly code.
222#
223#.set r0,0 # we use it as storage for value of 0
224#.set SP,1 # preserved
225#.set RTOC,2 # preserved
226#.set r3,3 # 1st argument/return value
227#.set r4,4 # 2nd argument/volatile register
228#.set r5,5 # 3rd argument/volatile register
229#.set r6,6 # ...
230#.set r7,7
231#.set r8,8
232#.set r9,9
233#.set r10,10
234#.set r11,11
235#.set r12,12
236#.set r13,13 # not used, nor any other "below" it...
237
238# Declare function names to be global
239# NOTE: For gcc these names MUST be changed to remove
240# the first . i.e. for example change ".bn_sqr_comba4"
241# to "bn_sqr_comba4". This should be automatically done
242# in the build.
243
244 .globl .bn_sqr_comba4
245 .globl .bn_sqr_comba8
246 .globl .bn_mul_comba4
247 .globl .bn_mul_comba8
248 .globl .bn_sub_words
249 .globl .bn_add_words
250 .globl .bn_div_words
251 .globl .bn_sqr_words
252 .globl .bn_mul_words
253 .globl .bn_mul_add_words
254
255# .text section
256
257 .machine "any"
258
259#
260# NOTE: The following label name should be changed to
261# "bn_sqr_comba4" i.e. remove the first dot
262# for the gcc compiler. This should be automatically
263# done in the build
264#
265
266.align 4
267.bn_sqr_comba4:
268#
269# Optimized version of bn_sqr_comba4.
270#
271# void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
272# r3 contains r
273# r4 contains a
274#
275# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
276#
277# r5,r6 are the two BN_ULONGs being multiplied.
278# r7,r8 are the results of the 32x32 giving 64 bit multiply.
279# r9,r10, r11 are the equivalents of c1,c2, c3.
280# Here's the assembly
281#
282#
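#
# Rough C-level sketch of the macros being unrolled below (cf. sqr_add_c
# and sqr_add_c2 in OpenSSL's bn_lcl.h; illustrative only, not the exact
# reference code):
#	sqr_add_c(a,i,c1,c2,c3):	(c3,c2,c1) += a[i]*a[i]
#	sqr_add_c2(a,i,j,c1,c2,c3):	(c3,c2,c1) += 2*a[i]*a[j]
#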
283 xor r0,r0,r0 # set r0 = 0. Used in the addze
284 # instructions below
285
286 #sqr_add_c(a,0,c1,c2,c3)
287 $LD r5,`0*$BNSZ`(r4)
288 $UMULL r9,r5,r5
289 $UMULH r10,r5,r5 #in first iteration. No need
290 #to add since c1=c2=c3=0.
291 # Note c3(r11) is NOT set to 0
292 # but will be.
293
294 $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
295 # sqr_add_c2(a,1,0,c2,c3,c1);
296 $LD r6,`1*$BNSZ`(r4)
297 $UMULL r7,r5,r6
298 $UMULH r8,r5,r6
299
300 addc r7,r7,r7 # compute (r7,r8)=2*(r7,r8)
301 adde r8,r8,r8
302 addze r9,r0 # catch carry if any.
303 # r9= r0(=0) and carry
304
305 addc r10,r7,r10 # now add to temp result.
306 addze r11,r8 # r8 added to r11 which is 0
307 addze r9,r9
308
309 $ST r10,`1*$BNSZ`(r3) #r[1]=c2;
310 #sqr_add_c(a,1,c3,c1,c2)
311 $UMULL r7,r6,r6
312 $UMULH r8,r6,r6
313 addc r11,r7,r11
314 adde r9,r8,r9
315 addze r10,r0
316 #sqr_add_c2(a,2,0,c3,c1,c2)
317 $LD r6,`2*$BNSZ`(r4)
318 $UMULL r7,r5,r6
319 $UMULH r8,r5,r6
320
321 addc r7,r7,r7
322 adde r8,r8,r8
323 addze r10,r10
324
325 addc r11,r7,r11
326 adde r9,r8,r9
327 addze r10,r10
328 $ST r11,`2*$BNSZ`(r3) #r[2]=c3
329 #sqr_add_c2(a,3,0,c1,c2,c3);
330 $LD r6,`3*$BNSZ`(r4)
331 $UMULL r7,r5,r6
332 $UMULH r8,r5,r6
333 addc r7,r7,r7
334 adde r8,r8,r8
335 addze r11,r0
336
337 addc r9,r7,r9
338 adde r10,r8,r10
339 addze r11,r11
340 #sqr_add_c2(a,2,1,c1,c2,c3);
341 $LD r5,`1*$BNSZ`(r4)
342 $LD r6,`2*$BNSZ`(r4)
343 $UMULL r7,r5,r6
344 $UMULH r8,r5,r6
345
346 addc r7,r7,r7
347 adde r8,r8,r8
348 addze r11,r11
349 addc r9,r7,r9
350 adde r10,r8,r10
351 addze r11,r11
352 $ST r9,`3*$BNSZ`(r3) #r[3]=c1
353 #sqr_add_c(a,2,c2,c3,c1);
354 $UMULL r7,r6,r6
355 $UMULH r8,r6,r6
356 addc r10,r7,r10
357 adde r11,r8,r11
358 addze r9,r0
359 #sqr_add_c2(a,3,1,c2,c3,c1);
360 $LD r6,`3*$BNSZ`(r4)
361 $UMULL r7,r5,r6
362 $UMULH r8,r5,r6
363 addc r7,r7,r7
364 adde r8,r8,r8
365 addze r9,r9
366
367 addc r10,r7,r10
368 adde r11,r8,r11
369 addze r9,r9
370 $ST r10,`4*$BNSZ`(r3) #r[4]=c2
371 #sqr_add_c2(a,3,2,c3,c1,c2);
372 $LD r5,`2*$BNSZ`(r4)
373 $UMULL r7,r5,r6
374 $UMULH r8,r5,r6
375 addc r7,r7,r7
376 adde r8,r8,r8
377 addze r10,r0
378
379 addc r11,r7,r11
380 adde r9,r8,r9
381 addze r10,r10
382 $ST r11,`5*$BNSZ`(r3) #r[5] = c3
383 #sqr_add_c(a,3,c1,c2,c3);
384 $UMULL r7,r6,r6
385 $UMULH r8,r6,r6
386 addc r9,r7,r9
387 adde r10,r8,r10
388
389 $ST r9,`6*$BNSZ`(r3) #r[6]=c1
390 $ST r10,`7*$BNSZ`(r3) #r[7]=c2
391 blr
392 .long 0x00000000
393
394#
395# NOTE: The following label name should be changed to
396# "bn_sqr_comba8" i.e. remove the first dot
397# for the gcc compiler. This should be automatically
398# done in the build
399#
400
401.align 4
402.bn_sqr_comba8:
403#
404# This is an optimized version of the bn_sqr_comba8 routine.
405# It makes heavy use of the adde instruction
406#
407#
408# void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
409# r3 contains r
410# r4 contains a
411#
412# Freely use registers r5,r6,r7,r8,r9,r10,r11 as follows:
413#
414# r5,r6 are the two BN_ULONGs being multiplied.
415# r7,r8 are the results of the 32x32 giving 64 bit multiply.
416# r9,r10, r11 are the equivalents of c1,c2, c3.
417#
418# The possible optimization of loading all 8 longs of a into registers
419# doesn't provide any speedup
420#
421
422 xor r0,r0,r0 #set r0 = 0.Used in addze
423 #instructions below.
424
425 #sqr_add_c(a,0,c1,c2,c3);
426 $LD r5,`0*$BNSZ`(r4)
427 $UMULL r9,r5,r5 #1st iteration: no carries.
428 $UMULH r10,r5,r5
429 $ST r9,`0*$BNSZ`(r3) # r[0]=c1;
430 #sqr_add_c2(a,1,0,c2,c3,c1);
431 $LD r6,`1*$BNSZ`(r4)
432 $UMULL r7,r5,r6
433 $UMULH r8,r5,r6
434
435 addc r10,r7,r10 #add the two register number
436 adde r11,r8,r0 # (r8,r7) to the three register
437 addze r9,r0 # number (r9,r11,r10).NOTE:r0=0
438
439 addc r10,r7,r10 #add the two register number
440 adde r11,r8,r11 # (r8,r7) to the three register
441 addze r9,r9 # number (r9,r11,r10).
442
443 $ST r10,`1*$BNSZ`(r3) # r[1]=c2
444
445 #sqr_add_c(a,1,c3,c1,c2);
446 $UMULL r7,r6,r6
447 $UMULH r8,r6,r6
448 addc r11,r7,r11
449 adde r9,r8,r9
450 addze r10,r0
451 #sqr_add_c2(a,2,0,c3,c1,c2);
452 $LD r6,`2*$BNSZ`(r4)
453 $UMULL r7,r5,r6
454 $UMULH r8,r5,r6
455
456 addc r11,r7,r11
457 adde r9,r8,r9
458 addze r10,r10
459
460 addc r11,r7,r11
461 adde r9,r8,r9
462 addze r10,r10
463
464 $ST r11,`2*$BNSZ`(r3) #r[2]=c3
465 #sqr_add_c2(a,3,0,c1,c2,c3);
466 $LD r6,`3*$BNSZ`(r4) #r6 = a[3]. r5 is already a[0].
467 $UMULL r7,r5,r6
468 $UMULH r8,r5,r6
469
470 addc r9,r7,r9
471 adde r10,r8,r10
472 addze r11,r0
473
474 addc r9,r7,r9
475 adde r10,r8,r10
476 addze r11,r11
477 #sqr_add_c2(a,2,1,c1,c2,c3);
478 $LD r5,`1*$BNSZ`(r4)
479 $LD r6,`2*$BNSZ`(r4)
480 $UMULL r7,r5,r6
481 $UMULH r8,r5,r6
482
483 addc r9,r7,r9
484 adde r10,r8,r10
485 addze r11,r11
486
487 addc r9,r7,r9
488 adde r10,r8,r10
489 addze r11,r11
490
491 $ST r9,`3*$BNSZ`(r3) #r[3]=c1;
492 #sqr_add_c(a,2,c2,c3,c1);
493 $UMULL r7,r6,r6
494 $UMULH r8,r6,r6
495
496 addc r10,r7,r10
497 adde r11,r8,r11
498 addze r9,r0
499 #sqr_add_c2(a,3,1,c2,c3,c1);
500 $LD r6,`3*$BNSZ`(r4)
501 $UMULL r7,r5,r6
502 $UMULH r8,r5,r6
503
504 addc r10,r7,r10
505 adde r11,r8,r11
506 addze r9,r9
507
508 addc r10,r7,r10
509 adde r11,r8,r11
510 addze r9,r9
511 #sqr_add_c2(a,4,0,c2,c3,c1);
512 $LD r5,`0*$BNSZ`(r4)
513 $LD r6,`4*$BNSZ`(r4)
514 $UMULL r7,r5,r6
515 $UMULH r8,r5,r6
516
517 addc r10,r7,r10
518 adde r11,r8,r11
519 addze r9,r9
520
521 addc r10,r7,r10
522 adde r11,r8,r11
523 addze r9,r9
524 $ST r10,`4*$BNSZ`(r3) #r[4]=c2;
525 #sqr_add_c2(a,5,0,c3,c1,c2);
526 $LD r6,`5*$BNSZ`(r4)
527 $UMULL r7,r5,r6
528 $UMULH r8,r5,r6
529
530 addc r11,r7,r11
531 adde r9,r8,r9
532 addze r10,r0
533
534 addc r11,r7,r11
535 adde r9,r8,r9
536 addze r10,r10
537 #sqr_add_c2(a,4,1,c3,c1,c2);
538 $LD r5,`1*$BNSZ`(r4)
539 $LD r6,`4*$BNSZ`(r4)
540 $UMULL r7,r5,r6
541 $UMULH r8,r5,r6
542
543 addc r11,r7,r11
544 adde r9,r8,r9
545 addze r10,r10
546
547 addc r11,r7,r11
548 adde r9,r8,r9
549 addze r10,r10
550 #sqr_add_c2(a,3,2,c3,c1,c2);
551 $LD r5,`2*$BNSZ`(r4)
552 $LD r6,`3*$BNSZ`(r4)
553 $UMULL r7,r5,r6
554 $UMULH r8,r5,r6
555
556 addc r11,r7,r11
557 adde r9,r8,r9
558 addze r10,r10
559
560 addc r11,r7,r11
561 adde r9,r8,r9
562 addze r10,r10
563 $ST r11,`5*$BNSZ`(r3) #r[5]=c3;
564 #sqr_add_c(a,3,c1,c2,c3);
565 $UMULL r7,r6,r6
566 $UMULH r8,r6,r6
567 addc r9,r7,r9
568 adde r10,r8,r10
569 addze r11,r0
570 #sqr_add_c2(a,4,2,c1,c2,c3);
571 $LD r6,`4*$BNSZ`(r4)
572 $UMULL r7,r5,r6
573 $UMULH r8,r5,r6
574
575 addc r9,r7,r9
576 adde r10,r8,r10
577 addze r11,r11
578
579 addc r9,r7,r9
580 adde r10,r8,r10
581 addze r11,r11
582 #sqr_add_c2(a,5,1,c1,c2,c3);
583 $LD r5,`1*$BNSZ`(r4)
584 $LD r6,`5*$BNSZ`(r4)
585 $UMULL r7,r5,r6
586 $UMULH r8,r5,r6
587
588 addc r9,r7,r9
589 adde r10,r8,r10
590 addze r11,r11
591
592 addc r9,r7,r9
593 adde r10,r8,r10
594 addze r11,r11
595 #sqr_add_c2(a,6,0,c1,c2,c3);
596 $LD r5,`0*$BNSZ`(r4)
597 $LD r6,`6*$BNSZ`(r4)
598 $UMULL r7,r5,r6
599 $UMULH r8,r5,r6
600 addc r9,r7,r9
601 adde r10,r8,r10
602 addze r11,r11
603 addc r9,r7,r9
604 adde r10,r8,r10
605 addze r11,r11
606 $ST r9,`6*$BNSZ`(r3) #r[6]=c1;
607 #sqr_add_c2(a,7,0,c2,c3,c1);
608 $LD r6,`7*$BNSZ`(r4)
609 $UMULL r7,r5,r6
610 $UMULH r8,r5,r6
611
612 addc r10,r7,r10
613 adde r11,r8,r11
614 addze r9,r0
615 addc r10,r7,r10
616 adde r11,r8,r11
617 addze r9,r9
618 #sqr_add_c2(a,6,1,c2,c3,c1);
619 $LD r5,`1*$BNSZ`(r4)
620 $LD r6,`6*$BNSZ`(r4)
621 $UMULL r7,r5,r6
622 $UMULH r8,r5,r6
623
624 addc r10,r7,r10
625 adde r11,r8,r11
626 addze r9,r9
627 addc r10,r7,r10
628 adde r11,r8,r11
629 addze r9,r9
630 #sqr_add_c2(a,5,2,c2,c3,c1);
631 $LD r5,`2*$BNSZ`(r4)
632 $LD r6,`5*$BNSZ`(r4)
633 $UMULL r7,r5,r6
634 $UMULH r8,r5,r6
635 addc r10,r7,r10
636 adde r11,r8,r11
637 addze r9,r9
638 addc r10,r7,r10
639 adde r11,r8,r11
640 addze r9,r9
641 #sqr_add_c2(a,4,3,c2,c3,c1);
642 $LD r5,`3*$BNSZ`(r4)
643 $LD r6,`4*$BNSZ`(r4)
644 $UMULL r7,r5,r6
645 $UMULH r8,r5,r6
646
647 addc r10,r7,r10
648 adde r11,r8,r11
649 addze r9,r9
650 addc r10,r7,r10
651 adde r11,r8,r11
652 addze r9,r9
653 $ST r10,`7*$BNSZ`(r3) #r[7]=c2;
654 #sqr_add_c(a,4,c3,c1,c2);
655 $UMULL r7,r6,r6
656 $UMULH r8,r6,r6
657 addc r11,r7,r11
658 adde r9,r8,r9
659 addze r10,r0
660 #sqr_add_c2(a,5,3,c3,c1,c2);
661 $LD r6,`5*$BNSZ`(r4)
662 $UMULL r7,r5,r6
663 $UMULH r8,r5,r6
664 addc r11,r7,r11
665 adde r9,r8,r9
666 addze r10,r10
667 addc r11,r7,r11
668 adde r9,r8,r9
669 addze r10,r10
670 #sqr_add_c2(a,6,2,c3,c1,c2);
671 $LD r5,`2*$BNSZ`(r4)
672 $LD r6,`6*$BNSZ`(r4)
673 $UMULL r7,r5,r6
674 $UMULH r8,r5,r6
675 addc r11,r7,r11
676 adde r9,r8,r9
677 addze r10,r10
678
679 addc r11,r7,r11
680 adde r9,r8,r9
681 addze r10,r10
682 #sqr_add_c2(a,7,1,c3,c1,c2);
683 $LD r5,`1*$BNSZ`(r4)
684 $LD r6,`7*$BNSZ`(r4)
685 $UMULL r7,r5,r6
686 $UMULH r8,r5,r6
687 addc r11,r7,r11
688 adde r9,r8,r9
689 addze r10,r10
690 addc r11,r7,r11
691 adde r9,r8,r9
692 addze r10,r10
693 $ST r11,`8*$BNSZ`(r3) #r[8]=c3;
694 #sqr_add_c2(a,7,2,c1,c2,c3);
695 $LD r5,`2*$BNSZ`(r4)
696 $UMULL r7,r5,r6
697 $UMULH r8,r5,r6
698
699 addc r9,r7,r9
700 adde r10,r8,r10
701 addze r11,r0
702 addc r9,r7,r9
703 adde r10,r8,r10
704 addze r11,r11
705 #sqr_add_c2(a,6,3,c1,c2,c3);
706 $LD r5,`3*$BNSZ`(r4)
707 $LD r6,`6*$BNSZ`(r4)
708 $UMULL r7,r5,r6
709 $UMULH r8,r5,r6
710 addc r9,r7,r9
711 adde r10,r8,r10
712 addze r11,r11
713 addc r9,r7,r9
714 adde r10,r8,r10
715 addze r11,r11
716 #sqr_add_c2(a,5,4,c1,c2,c3);
717 $LD r5,`4*$BNSZ`(r4)
718 $LD r6,`5*$BNSZ`(r4)
719 $UMULL r7,r5,r6
720 $UMULH r8,r5,r6
721 addc r9,r7,r9
722 adde r10,r8,r10
723 addze r11,r11
724 addc r9,r7,r9
725 adde r10,r8,r10
726 addze r11,r11
727 $ST r9,`9*$BNSZ`(r3) #r[9]=c1;
728 #sqr_add_c(a,5,c2,c3,c1);
729 $UMULL r7,r6,r6
730 $UMULH r8,r6,r6
731 addc r10,r7,r10
732 adde r11,r8,r11
733 addze r9,r0
734 #sqr_add_c2(a,6,4,c2,c3,c1);
735 $LD r6,`6*$BNSZ`(r4)
736 $UMULL r7,r5,r6
737 $UMULH r8,r5,r6
738 addc r10,r7,r10
739 adde r11,r8,r11
740 addze r9,r9
741 addc r10,r7,r10
742 adde r11,r8,r11
743 addze r9,r9
744 #sqr_add_c2(a,7,3,c2,c3,c1);
745 $LD r5,`3*$BNSZ`(r4)
746 $LD r6,`7*$BNSZ`(r4)
747 $UMULL r7,r5,r6
748 $UMULH r8,r5,r6
749 addc r10,r7,r10
750 adde r11,r8,r11
751 addze r9,r9
752 addc r10,r7,r10
753 adde r11,r8,r11
754 addze r9,r9
755 $ST r10,`10*$BNSZ`(r3) #r[10]=c2;
756 #sqr_add_c2(a,7,4,c3,c1,c2);
757 $LD r5,`4*$BNSZ`(r4)
758 $UMULL r7,r5,r6
759 $UMULH r8,r5,r6
760 addc r11,r7,r11
761 adde r9,r8,r9
762 addze r10,r0
763 addc r11,r7,r11
764 adde r9,r8,r9
765 addze r10,r10
766 #sqr_add_c2(a,6,5,c3,c1,c2);
767 $LD r5,`5*$BNSZ`(r4)
768 $LD r6,`6*$BNSZ`(r4)
769 $UMULL r7,r5,r6
770 $UMULH r8,r5,r6
771 addc r11,r7,r11
772 adde r9,r8,r9
773 addze r10,r10
774 addc r11,r7,r11
775 adde r9,r8,r9
776 addze r10,r10
777 $ST r11,`11*$BNSZ`(r3) #r[11]=c3;
778 #sqr_add_c(a,6,c1,c2,c3);
779 $UMULL r7,r6,r6
780 $UMULH r8,r6,r6
781 addc r9,r7,r9
782 adde r10,r8,r10
783 addze r11,r0
784 #sqr_add_c2(a,7,5,c1,c2,c3)
785 $LD r6,`7*$BNSZ`(r4)
786 $UMULL r7,r5,r6
787 $UMULH r8,r5,r6
788 addc r9,r7,r9
789 adde r10,r8,r10
790 addze r11,r11
791 addc r9,r7,r9
792 adde r10,r8,r10
793 addze r11,r11
794 $ST r9,`12*$BNSZ`(r3) #r[12]=c1;
795
796 #sqr_add_c2(a,7,6,c2,c3,c1)
797 $LD r5,`6*$BNSZ`(r4)
798 $UMULL r7,r5,r6
799 $UMULH r8,r5,r6
800 addc r10,r7,r10
801 adde r11,r8,r11
802 addze r9,r0
803 addc r10,r7,r10
804 adde r11,r8,r11
805 addze r9,r9
806 $ST r10,`13*$BNSZ`(r3) #r[13]=c2;
807 #sqr_add_c(a,7,c3,c1,c2);
808 $UMULL r7,r6,r6
809 $UMULH r8,r6,r6
810 addc r11,r7,r11
811 adde r9,r8,r9
812 $ST r11,`14*$BNSZ`(r3) #r[14]=c3;
813 $ST r9, `15*$BNSZ`(r3) #r[15]=c1;
814
815
816 blr
817
818 .long 0x00000000
819
820#
821# NOTE: The following label name should be changed to
822# "bn_mul_comba4" i.e. remove the first dot
823# for the gcc compiler. This should be automatically
824# done in the build
825#
826
827.align 4
828.bn_mul_comba4:
829#
830# This is an optimized version of the bn_mul_comba4 routine.
831#
832# void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
833# r3 contains r
834# r4 contains a
835# r5 contains b
836# r6, r7 are the 2 BN_ULONGs being multiplied.
837# r8, r9 are the results of the 32x32 giving 64 multiply.
838# r10, r11, r12 are the equivalents of c1, c2, and c3.
839#
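# Rough sketch of the macro being unrolled below (cf. mul_add_c in
# OpenSSL's bn_lcl.h; illustrative only, not the exact reference code):
#	mul_add_c(a,b,c1,c2,c3):	(c3,c2,c1) += a*b
#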
840 xor r0,r0,r0 #r0=0. Used in addze below.
841 #mul_add_c(a[0],b[0],c1,c2,c3);
842 $LD r6,`0*$BNSZ`(r4)
843 $LD r7,`0*$BNSZ`(r5)
844 $UMULL r10,r6,r7
845 $UMULH r11,r6,r7
846 $ST r10,`0*$BNSZ`(r3) #r[0]=c1
847 #mul_add_c(a[0],b[1],c2,c3,c1);
848 $LD r7,`1*$BNSZ`(r5)
849 $UMULL r8,r6,r7
850 $UMULH r9,r6,r7
851 addc r11,r8,r11
852 adde r12,r9,r0
853 addze r10,r0
854 #mul_add_c(a[1],b[0],c2,c3,c1);
855 $LD r6, `1*$BNSZ`(r4)
856 $LD r7, `0*$BNSZ`(r5)
857 $UMULL r8,r6,r7
858 $UMULH r9,r6,r7
859 addc r11,r8,r11
860 adde r12,r9,r12
861 addze r10,r10
862 $ST r11,`1*$BNSZ`(r3) #r[1]=c2
863 #mul_add_c(a[2],b[0],c3,c1,c2);
864 $LD r6,`2*$BNSZ`(r4)
865 $UMULL r8,r6,r7
866 $UMULH r9,r6,r7
867 addc r12,r8,r12
868 adde r10,r9,r10
869 addze r11,r0
870 #mul_add_c(a[1],b[1],c3,c1,c2);
871 $LD r6,`1*$BNSZ`(r4)
872 $LD r7,`1*$BNSZ`(r5)
873 $UMULL r8,r6,r7
874 $UMULH r9,r6,r7
875 addc r12,r8,r12
876 adde r10,r9,r10
877 addze r11,r11
878 #mul_add_c(a[0],b[2],c3,c1,c2);
879 $LD r6,`0*$BNSZ`(r4)
880 $LD r7,`2*$BNSZ`(r5)
881 $UMULL r8,r6,r7
882 $UMULH r9,r6,r7
883 addc r12,r8,r12
884 adde r10,r9,r10
885 addze r11,r11
886 $ST r12,`2*$BNSZ`(r3) #r[2]=c3
887 #mul_add_c(a[0],b[3],c1,c2,c3);
888 $LD r7,`3*$BNSZ`(r5)
889 $UMULL r8,r6,r7
890 $UMULH r9,r6,r7
891 addc r10,r8,r10
892 adde r11,r9,r11
893 addze r12,r0
894 #mul_add_c(a[1],b[2],c1,c2,c3);
895 $LD r6,`1*$BNSZ`(r4)
896 $LD r7,`2*$BNSZ`(r5)
897 $UMULL r8,r6,r7
898 $UMULH r9,r6,r7
899 addc r10,r8,r10
900 adde r11,r9,r11
901 addze r12,r12
902 #mul_add_c(a[2],b[1],c1,c2,c3);
903 $LD r6,`2*$BNSZ`(r4)
904 $LD r7,`1*$BNSZ`(r5)
905 $UMULL r8,r6,r7
906 $UMULH r9,r6,r7
907 addc r10,r8,r10
908 adde r11,r9,r11
909 addze r12,r12
910 #mul_add_c(a[3],b[0],c1,c2,c3);
911 $LD r6,`3*$BNSZ`(r4)
912 $LD r7,`0*$BNSZ`(r5)
913 $UMULL r8,r6,r7
914 $UMULH r9,r6,r7
915 addc r10,r8,r10
916 adde r11,r9,r11
917 addze r12,r12
918 $ST r10,`3*$BNSZ`(r3) #r[3]=c1
919 #mul_add_c(a[3],b[1],c2,c3,c1);
920 $LD r7,`1*$BNSZ`(r5)
921 $UMULL r8,r6,r7
922 $UMULH r9,r6,r7
923 addc r11,r8,r11
924 adde r12,r9,r12
925 addze r10,r0
926 #mul_add_c(a[2],b[2],c2,c3,c1);
927 $LD r6,`2*$BNSZ`(r4)
928 $LD r7,`2*$BNSZ`(r5)
929 $UMULL r8,r6,r7
930 $UMULH r9,r6,r7
931 addc r11,r8,r11
932 adde r12,r9,r12
933 addze r10,r10
934 #mul_add_c(a[1],b[3],c2,c3,c1);
935 $LD r6,`1*$BNSZ`(r4)
936 $LD r7,`3*$BNSZ`(r5)
937 $UMULL r8,r6,r7
938 $UMULH r9,r6,r7
939 addc r11,r8,r11
940 adde r12,r9,r12
941 addze r10,r10
942 $ST r11,`4*$BNSZ`(r3) #r[4]=c2
943 #mul_add_c(a[2],b[3],c3,c1,c2);
944 $LD r6,`2*$BNSZ`(r4)
945 $UMULL r8,r6,r7
946 $UMULH r9,r6,r7
947 addc r12,r8,r12
948 adde r10,r9,r10
949 addze r11,r0
950 #mul_add_c(a[3],b[2],c3,c1,c2);
951 $LD r6,`3*$BNSZ`(r4)
952	$LD	r7,`2*$BNSZ`(r5)
953 $UMULL r8,r6,r7
954 $UMULH r9,r6,r7
955 addc r12,r8,r12
956 adde r10,r9,r10
957 addze r11,r11
958 $ST r12,`5*$BNSZ`(r3) #r[5]=c3
959 #mul_add_c(a[3],b[3],c1,c2,c3);
960 $LD r7,`3*$BNSZ`(r5)
961 $UMULL r8,r6,r7
962 $UMULH r9,r6,r7
963 addc r10,r8,r10
964 adde r11,r9,r11
965
966 $ST r10,`6*$BNSZ`(r3) #r[6]=c1
967 $ST r11,`7*$BNSZ`(r3) #r[7]=c2
968 blr
969 .long 0x00000000
970
971#
972# NOTE: The following label name should be changed to
973# "bn_mul_comba8" i.e. remove the first dot
974# for the gcc compiler. This should be automatically
975# done in the build
976#
977
978.align 4
979.bn_mul_comba8:
980#
981# Optimized version of the bn_mul_comba8 routine.
982#
983# void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
984# r3 contains r
985# r4 contains a
986# r5 contains b
987# r6, r7 are the 2 BN_ULONGs being multiplied.
988# r8, r9 are the results of the 32x32 giving 64 multiply.
989# r10, r11, r12 are the equivalents of c1, c2, and c3.
990#
991 xor r0,r0,r0 #r0=0. Used in addze below.
992
993 #mul_add_c(a[0],b[0],c1,c2,c3);
994 $LD r6,`0*$BNSZ`(r4) #a[0]
995 $LD r7,`0*$BNSZ`(r5) #b[0]
996 $UMULL r10,r6,r7
997 $UMULH r11,r6,r7
998 $ST r10,`0*$BNSZ`(r3) #r[0]=c1;
999 #mul_add_c(a[0],b[1],c2,c3,c1);
1000 $LD r7,`1*$BNSZ`(r5)
1001 $UMULL r8,r6,r7
1002 $UMULH r9,r6,r7
1003 addc r11,r11,r8
1004	addze	r12,r9	# since we didn't set r12 to zero before.
1005 addze r10,r0
1006 #mul_add_c(a[1],b[0],c2,c3,c1);
1007 $LD r6,`1*$BNSZ`(r4)
1008 $LD r7,`0*$BNSZ`(r5)
1009 $UMULL r8,r6,r7
1010 $UMULH r9,r6,r7
1011 addc r11,r11,r8
1012 adde r12,r12,r9
1013 addze r10,r10
1014 $ST r11,`1*$BNSZ`(r3) #r[1]=c2;
1015 #mul_add_c(a[2],b[0],c3,c1,c2);
1016 $LD r6,`2*$BNSZ`(r4)
1017 $UMULL r8,r6,r7
1018 $UMULH r9,r6,r7
1019 addc r12,r12,r8
1020 adde r10,r10,r9
1021 addze r11,r0
1022 #mul_add_c(a[1],b[1],c3,c1,c2);
1023 $LD r6,`1*$BNSZ`(r4)
1024 $LD r7,`1*$BNSZ`(r5)
1025 $UMULL r8,r6,r7
1026 $UMULH r9,r6,r7
1027 addc r12,r12,r8
1028 adde r10,r10,r9
1029 addze r11,r11
1030 #mul_add_c(a[0],b[2],c3,c1,c2);
1031 $LD r6,`0*$BNSZ`(r4)
1032 $LD r7,`2*$BNSZ`(r5)
1033 $UMULL r8,r6,r7
1034 $UMULH r9,r6,r7
1035 addc r12,r12,r8
1036 adde r10,r10,r9
1037 addze r11,r11
1038 $ST r12,`2*$BNSZ`(r3) #r[2]=c3;
1039 #mul_add_c(a[0],b[3],c1,c2,c3);
1040 $LD r7,`3*$BNSZ`(r5)
1041 $UMULL r8,r6,r7
1042 $UMULH r9,r6,r7
1043 addc r10,r10,r8
1044 adde r11,r11,r9
1045 addze r12,r0
1046 #mul_add_c(a[1],b[2],c1,c2,c3);
1047 $LD r6,`1*$BNSZ`(r4)
1048 $LD r7,`2*$BNSZ`(r5)
1049 $UMULL r8,r6,r7
1050 $UMULH r9,r6,r7
1051 addc r10,r10,r8
1052 adde r11,r11,r9
1053 addze r12,r12
1054
1055 #mul_add_c(a[2],b[1],c1,c2,c3);
1056 $LD r6,`2*$BNSZ`(r4)
1057 $LD r7,`1*$BNSZ`(r5)
1058 $UMULL r8,r6,r7
1059 $UMULH r9,r6,r7
1060 addc r10,r10,r8
1061 adde r11,r11,r9
1062 addze r12,r12
1063 #mul_add_c(a[3],b[0],c1,c2,c3);
1064 $LD r6,`3*$BNSZ`(r4)
1065 $LD r7,`0*$BNSZ`(r5)
1066 $UMULL r8,r6,r7
1067 $UMULH r9,r6,r7
1068 addc r10,r10,r8
1069 adde r11,r11,r9
1070 addze r12,r12
1071 $ST r10,`3*$BNSZ`(r3) #r[3]=c1;
1072 #mul_add_c(a[4],b[0],c2,c3,c1);
1073 $LD r6,`4*$BNSZ`(r4)
1074 $UMULL r8,r6,r7
1075 $UMULH r9,r6,r7
1076 addc r11,r11,r8
1077 adde r12,r12,r9
1078 addze r10,r0
1079 #mul_add_c(a[3],b[1],c2,c3,c1);
1080 $LD r6,`3*$BNSZ`(r4)
1081 $LD r7,`1*$BNSZ`(r5)
1082 $UMULL r8,r6,r7
1083 $UMULH r9,r6,r7
1084 addc r11,r11,r8
1085 adde r12,r12,r9
1086 addze r10,r10
1087 #mul_add_c(a[2],b[2],c2,c3,c1);
1088 $LD r6,`2*$BNSZ`(r4)
1089 $LD r7,`2*$BNSZ`(r5)
1090 $UMULL r8,r6,r7
1091 $UMULH r9,r6,r7
1092 addc r11,r11,r8
1093 adde r12,r12,r9
1094 addze r10,r10
1095 #mul_add_c(a[1],b[3],c2,c3,c1);
1096 $LD r6,`1*$BNSZ`(r4)
1097 $LD r7,`3*$BNSZ`(r5)
1098 $UMULL r8,r6,r7
1099 $UMULH r9,r6,r7
1100 addc r11,r11,r8
1101 adde r12,r12,r9
1102 addze r10,r10
1103 #mul_add_c(a[0],b[4],c2,c3,c1);
1104 $LD r6,`0*$BNSZ`(r4)
1105 $LD r7,`4*$BNSZ`(r5)
1106 $UMULL r8,r6,r7
1107 $UMULH r9,r6,r7
1108 addc r11,r11,r8
1109 adde r12,r12,r9
1110 addze r10,r10
1111 $ST r11,`4*$BNSZ`(r3) #r[4]=c2;
1112 #mul_add_c(a[0],b[5],c3,c1,c2);
1113 $LD r7,`5*$BNSZ`(r5)
1114 $UMULL r8,r6,r7
1115 $UMULH r9,r6,r7
1116 addc r12,r12,r8
1117 adde r10,r10,r9
1118 addze r11,r0
1119 #mul_add_c(a[1],b[4],c3,c1,c2);
1120 $LD r6,`1*$BNSZ`(r4)
1121 $LD r7,`4*$BNSZ`(r5)
1122 $UMULL r8,r6,r7
1123 $UMULH r9,r6,r7
1124 addc r12,r12,r8
1125 adde r10,r10,r9
1126 addze r11,r11
1127 #mul_add_c(a[2],b[3],c3,c1,c2);
1128 $LD r6,`2*$BNSZ`(r4)
1129 $LD r7,`3*$BNSZ`(r5)
1130 $UMULL r8,r6,r7
1131 $UMULH r9,r6,r7
1132 addc r12,r12,r8
1133 adde r10,r10,r9
1134 addze r11,r11
1135 #mul_add_c(a[3],b[2],c3,c1,c2);
1136 $LD r6,`3*$BNSZ`(r4)
1137 $LD r7,`2*$BNSZ`(r5)
1138 $UMULL r8,r6,r7
1139 $UMULH r9,r6,r7
1140 addc r12,r12,r8
1141 adde r10,r10,r9
1142 addze r11,r11
1143 #mul_add_c(a[4],b[1],c3,c1,c2);
1144 $LD r6,`4*$BNSZ`(r4)
1145 $LD r7,`1*$BNSZ`(r5)
1146 $UMULL r8,r6,r7
1147 $UMULH r9,r6,r7
1148 addc r12,r12,r8
1149 adde r10,r10,r9
1150 addze r11,r11
1151 #mul_add_c(a[5],b[0],c3,c1,c2);
1152 $LD r6,`5*$BNSZ`(r4)
1153 $LD r7,`0*$BNSZ`(r5)
1154 $UMULL r8,r6,r7
1155 $UMULH r9,r6,r7
1156 addc r12,r12,r8
1157 adde r10,r10,r9
1158 addze r11,r11
1159 $ST r12,`5*$BNSZ`(r3) #r[5]=c3;
1160 #mul_add_c(a[6],b[0],c1,c2,c3);
1161 $LD r6,`6*$BNSZ`(r4)
1162 $UMULL r8,r6,r7
1163 $UMULH r9,r6,r7
1164 addc r10,r10,r8
1165 adde r11,r11,r9
1166 addze r12,r0
1167 #mul_add_c(a[5],b[1],c1,c2,c3);
1168 $LD r6,`5*$BNSZ`(r4)
1169 $LD r7,`1*$BNSZ`(r5)
1170 $UMULL r8,r6,r7
1171 $UMULH r9,r6,r7
1172 addc r10,r10,r8
1173 adde r11,r11,r9
1174 addze r12,r12
1175 #mul_add_c(a[4],b[2],c1,c2,c3);
1176 $LD r6,`4*$BNSZ`(r4)
1177 $LD r7,`2*$BNSZ`(r5)
1178 $UMULL r8,r6,r7
1179 $UMULH r9,r6,r7
1180 addc r10,r10,r8
1181 adde r11,r11,r9
1182 addze r12,r12
1183 #mul_add_c(a[3],b[3],c1,c2,c3);
1184 $LD r6,`3*$BNSZ`(r4)
1185 $LD r7,`3*$BNSZ`(r5)
1186 $UMULL r8,r6,r7
1187 $UMULH r9,r6,r7
1188 addc r10,r10,r8
1189 adde r11,r11,r9
1190 addze r12,r12
1191 #mul_add_c(a[2],b[4],c1,c2,c3);
1192 $LD r6,`2*$BNSZ`(r4)
1193 $LD r7,`4*$BNSZ`(r5)
1194 $UMULL r8,r6,r7
1195 $UMULH r9,r6,r7
1196 addc r10,r10,r8
1197 adde r11,r11,r9
1198 addze r12,r12
1199 #mul_add_c(a[1],b[5],c1,c2,c3);
1200 $LD r6,`1*$BNSZ`(r4)
1201 $LD r7,`5*$BNSZ`(r5)
1202 $UMULL r8,r6,r7
1203 $UMULH r9,r6,r7
1204 addc r10,r10,r8
1205 adde r11,r11,r9
1206 addze r12,r12
1207 #mul_add_c(a[0],b[6],c1,c2,c3);
1208 $LD r6,`0*$BNSZ`(r4)
1209 $LD r7,`6*$BNSZ`(r5)
1210 $UMULL r8,r6,r7
1211 $UMULH r9,r6,r7
1212 addc r10,r10,r8
1213 adde r11,r11,r9
1214 addze r12,r12
1215 $ST r10,`6*$BNSZ`(r3) #r[6]=c1;
1216 #mul_add_c(a[0],b[7],c2,c3,c1);
1217 $LD r7,`7*$BNSZ`(r5)
1218 $UMULL r8,r6,r7
1219 $UMULH r9,r6,r7
1220 addc r11,r11,r8
1221 adde r12,r12,r9
1222 addze r10,r0
1223 #mul_add_c(a[1],b[6],c2,c3,c1);
1224 $LD r6,`1*$BNSZ`(r4)
1225 $LD r7,`6*$BNSZ`(r5)
1226 $UMULL r8,r6,r7
1227 $UMULH r9,r6,r7
1228 addc r11,r11,r8
1229 adde r12,r12,r9
1230 addze r10,r10
1231 #mul_add_c(a[2],b[5],c2,c3,c1);
1232 $LD r6,`2*$BNSZ`(r4)
1233 $LD r7,`5*$BNSZ`(r5)
1234 $UMULL r8,r6,r7
1235 $UMULH r9,r6,r7
1236 addc r11,r11,r8
1237 adde r12,r12,r9
1238 addze r10,r10
1239 #mul_add_c(a[3],b[4],c2,c3,c1);
1240 $LD r6,`3*$BNSZ`(r4)
1241 $LD r7,`4*$BNSZ`(r5)
1242 $UMULL r8,r6,r7
1243 $UMULH r9,r6,r7
1244 addc r11,r11,r8
1245 adde r12,r12,r9
1246 addze r10,r10
1247 #mul_add_c(a[4],b[3],c2,c3,c1);
1248 $LD r6,`4*$BNSZ`(r4)
1249 $LD r7,`3*$BNSZ`(r5)
1250 $UMULL r8,r6,r7
1251 $UMULH r9,r6,r7
1252 addc r11,r11,r8
1253 adde r12,r12,r9
1254 addze r10,r10
1255 #mul_add_c(a[5],b[2],c2,c3,c1);
1256 $LD r6,`5*$BNSZ`(r4)
1257 $LD r7,`2*$BNSZ`(r5)
1258 $UMULL r8,r6,r7
1259 $UMULH r9,r6,r7
1260 addc r11,r11,r8
1261 adde r12,r12,r9
1262 addze r10,r10
1263 #mul_add_c(a[6],b[1],c2,c3,c1);
1264 $LD r6,`6*$BNSZ`(r4)
1265 $LD r7,`1*$BNSZ`(r5)
1266 $UMULL r8,r6,r7
1267 $UMULH r9,r6,r7
1268 addc r11,r11,r8
1269 adde r12,r12,r9
1270 addze r10,r10
1271 #mul_add_c(a[7],b[0],c2,c3,c1);
1272 $LD r6,`7*$BNSZ`(r4)
1273 $LD r7,`0*$BNSZ`(r5)
1274 $UMULL r8,r6,r7
1275 $UMULH r9,r6,r7
1276 addc r11,r11,r8
1277 adde r12,r12,r9
1278 addze r10,r10
1279 $ST r11,`7*$BNSZ`(r3) #r[7]=c2;
1280 #mul_add_c(a[7],b[1],c3,c1,c2);
1281 $LD r7,`1*$BNSZ`(r5)
1282 $UMULL r8,r6,r7
1283 $UMULH r9,r6,r7
1284 addc r12,r12,r8
1285 adde r10,r10,r9
1286 addze r11,r0
1287 #mul_add_c(a[6],b[2],c3,c1,c2);
1288 $LD r6,`6*$BNSZ`(r4)
1289 $LD r7,`2*$BNSZ`(r5)
1290 $UMULL r8,r6,r7
1291 $UMULH r9,r6,r7
1292 addc r12,r12,r8
1293 adde r10,r10,r9
1294 addze r11,r11
1295 #mul_add_c(a[5],b[3],c3,c1,c2);
1296 $LD r6,`5*$BNSZ`(r4)
1297 $LD r7,`3*$BNSZ`(r5)
1298 $UMULL r8,r6,r7
1299 $UMULH r9,r6,r7
1300 addc r12,r12,r8
1301 adde r10,r10,r9
1302 addze r11,r11
1303 #mul_add_c(a[4],b[4],c3,c1,c2);
1304 $LD r6,`4*$BNSZ`(r4)
1305 $LD r7,`4*$BNSZ`(r5)
1306 $UMULL r8,r6,r7
1307 $UMULH r9,r6,r7
1308 addc r12,r12,r8
1309 adde r10,r10,r9
1310 addze r11,r11
1311 #mul_add_c(a[3],b[5],c3,c1,c2);
1312 $LD r6,`3*$BNSZ`(r4)
1313 $LD r7,`5*$BNSZ`(r5)
1314 $UMULL r8,r6,r7
1315 $UMULH r9,r6,r7
1316 addc r12,r12,r8
1317 adde r10,r10,r9
1318 addze r11,r11
1319 #mul_add_c(a[2],b[6],c3,c1,c2);
1320 $LD r6,`2*$BNSZ`(r4)
1321 $LD r7,`6*$BNSZ`(r5)
1322 $UMULL r8,r6,r7
1323 $UMULH r9,r6,r7
1324 addc r12,r12,r8
1325 adde r10,r10,r9
1326 addze r11,r11
1327 #mul_add_c(a[1],b[7],c3,c1,c2);
1328 $LD r6,`1*$BNSZ`(r4)
1329 $LD r7,`7*$BNSZ`(r5)
1330 $UMULL r8,r6,r7
1331 $UMULH r9,r6,r7
1332 addc r12,r12,r8
1333 adde r10,r10,r9
1334 addze r11,r11
1335 $ST r12,`8*$BNSZ`(r3) #r[8]=c3;
1336 #mul_add_c(a[2],b[7],c1,c2,c3);
1337 $LD r6,`2*$BNSZ`(r4)
1338 $UMULL r8,r6,r7
1339 $UMULH r9,r6,r7
1340 addc r10,r10,r8
1341 adde r11,r11,r9
1342 addze r12,r0
1343 #mul_add_c(a[3],b[6],c1,c2,c3);
1344 $LD r6,`3*$BNSZ`(r4)
1345 $LD r7,`6*$BNSZ`(r5)
1346 $UMULL r8,r6,r7
1347 $UMULH r9,r6,r7
1348 addc r10,r10,r8
1349 adde r11,r11,r9
1350 addze r12,r12
1351 #mul_add_c(a[4],b[5],c1,c2,c3);
1352 $LD r6,`4*$BNSZ`(r4)
1353 $LD r7,`5*$BNSZ`(r5)
1354 $UMULL r8,r6,r7
1355 $UMULH r9,r6,r7
1356 addc r10,r10,r8
1357 adde r11,r11,r9
1358 addze r12,r12
1359 #mul_add_c(a[5],b[4],c1,c2,c3);
1360 $LD r6,`5*$BNSZ`(r4)
1361 $LD r7,`4*$BNSZ`(r5)
1362 $UMULL r8,r6,r7
1363 $UMULH r9,r6,r7
1364 addc r10,r10,r8
1365 adde r11,r11,r9
1366 addze r12,r12
1367 #mul_add_c(a[6],b[3],c1,c2,c3);
1368 $LD r6,`6*$BNSZ`(r4)
1369 $LD r7,`3*$BNSZ`(r5)
1370 $UMULL r8,r6,r7
1371 $UMULH r9,r6,r7
1372 addc r10,r10,r8
1373 adde r11,r11,r9
1374 addze r12,r12
1375 #mul_add_c(a[7],b[2],c1,c2,c3);
1376 $LD r6,`7*$BNSZ`(r4)
1377 $LD r7,`2*$BNSZ`(r5)
1378 $UMULL r8,r6,r7
1379 $UMULH r9,r6,r7
1380 addc r10,r10,r8
1381 adde r11,r11,r9
1382 addze r12,r12
1383 $ST r10,`9*$BNSZ`(r3) #r[9]=c1;
1384 #mul_add_c(a[7],b[3],c2,c3,c1);
1385 $LD r7,`3*$BNSZ`(r5)
1386 $UMULL r8,r6,r7
1387 $UMULH r9,r6,r7
1388 addc r11,r11,r8
1389 adde r12,r12,r9
1390 addze r10,r0
1391 #mul_add_c(a[6],b[4],c2,c3,c1);
1392 $LD r6,`6*$BNSZ`(r4)
1393 $LD r7,`4*$BNSZ`(r5)
1394 $UMULL r8,r6,r7
1395 $UMULH r9,r6,r7
1396 addc r11,r11,r8
1397 adde r12,r12,r9
1398 addze r10,r10
1399 #mul_add_c(a[5],b[5],c2,c3,c1);
1400 $LD r6,`5*$BNSZ`(r4)
1401 $LD r7,`5*$BNSZ`(r5)
1402 $UMULL r8,r6,r7
1403 $UMULH r9,r6,r7
1404 addc r11,r11,r8
1405 adde r12,r12,r9
1406 addze r10,r10
1407 #mul_add_c(a[4],b[6],c2,c3,c1);
1408 $LD r6,`4*$BNSZ`(r4)
1409 $LD r7,`6*$BNSZ`(r5)
1410 $UMULL r8,r6,r7
1411 $UMULH r9,r6,r7
1412 addc r11,r11,r8
1413 adde r12,r12,r9
1414 addze r10,r10
1415 #mul_add_c(a[3],b[7],c2,c3,c1);
1416 $LD r6,`3*$BNSZ`(r4)
1417 $LD r7,`7*$BNSZ`(r5)
1418 $UMULL r8,r6,r7
1419 $UMULH r9,r6,r7
1420 addc r11,r11,r8
1421 adde r12,r12,r9
1422 addze r10,r10
1423 $ST r11,`10*$BNSZ`(r3) #r[10]=c2;
1424 #mul_add_c(a[4],b[7],c3,c1,c2);
1425 $LD r6,`4*$BNSZ`(r4)
1426 $UMULL r8,r6,r7
1427 $UMULH r9,r6,r7
1428 addc r12,r12,r8
1429 adde r10,r10,r9
1430 addze r11,r0
1431 #mul_add_c(a[5],b[6],c3,c1,c2);
1432 $LD r6,`5*$BNSZ`(r4)
1433 $LD r7,`6*$BNSZ`(r5)
1434 $UMULL r8,r6,r7
1435 $UMULH r9,r6,r7
1436 addc r12,r12,r8
1437 adde r10,r10,r9
1438 addze r11,r11
1439 #mul_add_c(a[6],b[5],c3,c1,c2);
1440 $LD r6,`6*$BNSZ`(r4)
1441 $LD r7,`5*$BNSZ`(r5)
1442 $UMULL r8,r6,r7
1443 $UMULH r9,r6,r7
1444 addc r12,r12,r8
1445 adde r10,r10,r9
1446 addze r11,r11
1447 #mul_add_c(a[7],b[4],c3,c1,c2);
1448 $LD r6,`7*$BNSZ`(r4)
1449 $LD r7,`4*$BNSZ`(r5)
1450 $UMULL r8,r6,r7
1451 $UMULH r9,r6,r7
1452 addc r12,r12,r8
1453 adde r10,r10,r9
1454 addze r11,r11
1455 $ST r12,`11*$BNSZ`(r3) #r[11]=c3;
1456 #mul_add_c(a[7],b[5],c1,c2,c3);
1457 $LD r7,`5*$BNSZ`(r5)
1458 $UMULL r8,r6,r7
1459 $UMULH r9,r6,r7
1460 addc r10,r10,r8
1461 adde r11,r11,r9
1462 addze r12,r0
1463 #mul_add_c(a[6],b[6],c1,c2,c3);
1464 $LD r6,`6*$BNSZ`(r4)
1465 $LD r7,`6*$BNSZ`(r5)
1466 $UMULL r8,r6,r7
1467 $UMULH r9,r6,r7
1468 addc r10,r10,r8
1469 adde r11,r11,r9
1470 addze r12,r12
1471 #mul_add_c(a[5],b[7],c1,c2,c3);
1472 $LD r6,`5*$BNSZ`(r4)
1473 $LD r7,`7*$BNSZ`(r5)
1474 $UMULL r8,r6,r7
1475 $UMULH r9,r6,r7
1476 addc r10,r10,r8
1477 adde r11,r11,r9
1478 addze r12,r12
1479 $ST r10,`12*$BNSZ`(r3) #r[12]=c1;
1480 #mul_add_c(a[6],b[7],c2,c3,c1);
1481 $LD r6,`6*$BNSZ`(r4)
1482 $UMULL r8,r6,r7
1483 $UMULH r9,r6,r7
1484 addc r11,r11,r8
1485 adde r12,r12,r9
1486 addze r10,r0
1487 #mul_add_c(a[7],b[6],c2,c3,c1);
1488 $LD r6,`7*$BNSZ`(r4)
1489 $LD r7,`6*$BNSZ`(r5)
1490 $UMULL r8,r6,r7
1491 $UMULH r9,r6,r7
1492 addc r11,r11,r8
1493 adde r12,r12,r9
1494 addze r10,r10
1495 $ST r11,`13*$BNSZ`(r3) #r[13]=c2;
1496 #mul_add_c(a[7],b[7],c3,c1,c2);
1497 $LD r7,`7*$BNSZ`(r5)
1498 $UMULL r8,r6,r7
1499 $UMULH r9,r6,r7
1500 addc r12,r12,r8
1501 adde r10,r10,r9
1502 $ST r12,`14*$BNSZ`(r3) #r[14]=c3;
1503 $ST r10,`15*$BNSZ`(r3) #r[15]=c1;
1504 blr
1505 .long 0x00000000
1506
1507#
1508# NOTE: The following label name should be changed to
1509# "bn_sub_words" i.e. remove the first dot
1510# for the gcc compiler. This should be automatically
1511# done in the build
1512#
1513#
1514.align 4
1515.bn_sub_words:
1516#
1517# Handcoded version of bn_sub_words
1518#
1519#BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1520#
1521# r3 = r
1522# r4 = a
1523# r5 = b
1524# r6 = n
1525#
1526# Note: No loop unrolling done since this is not a performance
1527# critical loop.
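#
# Rough C sketch of the contract (illustrative only, not the reference code):
#	for (i = 0; i < n; i++)  r[i] = a[i] - b[i] - borrow;	/* borrow chains */
#	return the final borrow (0 or 1)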
1528
1529 xor r0,r0,r0 #set r0 = 0
1530#
1531# check for r6 = 0 AND set carry bit.
1532#
1533 subfc. r7,r0,r6 # If r6 is 0 then result is 0.
1534 # if r6 > 0 then result !=0
1535 # In either case carry bit is set.
1536 beq Lppcasm_sub_adios
1537 addi r4,r4,-$BNSZ
1538 addi r3,r3,-$BNSZ
1539 addi r5,r5,-$BNSZ
1540 mtctr r6
1541Lppcasm_sub_mainloop:
1542 $LDU r7,$BNSZ(r4)
1543 $LDU r8,$BNSZ(r5)
1544	subfe	r6,r8,r7	# r6 = r7 + carry bit + ones' complement of r8
1545 # if carry = 1 this is r7-r8. Else it
1546 # is r7-r8 -1 as we need.
1547 $STU r6,$BNSZ(r3)
1548 bdnz- Lppcasm_sub_mainloop
1549Lppcasm_sub_adios:
1550 subfze r3,r0 # if carry bit is set then r3 = 0 else -1
1551 andi. r3,r3,1 # keep only last bit.
1552 blr
1553 .long 0x00000000
1554
1555
1556#
1557# NOTE: The following label name should be changed to
1558# "bn_add_words" i.e. remove the first dot
1559# for the gcc compiler. This should be automatically
1560# done in the build
1561#
1562
1563.align 4
1564.bn_add_words:
1565#
1566# Handcoded version of bn_add_words
1567#
1568#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
1569#
1570# r3 = r
1571# r4 = a
1572# r5 = b
1573# r6 = n
1574#
1575# Note: No loop unrolling done since this is not a performance
1576# critical loop.
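#
# Rough C sketch of the contract (illustrative only, not the reference code):
#	for (i = 0; i < n; i++)  r[i] = a[i] + b[i] + carry;	/* carry chains */
#	return the final carry (0 or 1)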
1577
1578 xor r0,r0,r0
1579#
1580# check for r6 = 0. Is this needed?
1581#
1582 addic. r6,r6,0 #test r6 and clear carry bit.
1583 beq Lppcasm_add_adios
1584 addi r4,r4,-$BNSZ
1585 addi r3,r3,-$BNSZ
1586 addi r5,r5,-$BNSZ
1587 mtctr r6
1588Lppcasm_add_mainloop:
1589 $LDU r7,$BNSZ(r4)
1590 $LDU r8,$BNSZ(r5)
1591 adde r8,r7,r8
1592 $STU r8,$BNSZ(r3)
1593 bdnz- Lppcasm_add_mainloop
1594Lppcasm_add_adios:
1595 addze r3,r0 #return carry bit.
1596 blr
1597 .long 0x00000000
1598
1599#
1600# NOTE: The following label name should be changed to
1601# "bn_div_words" i.e. remove the first dot
1602# for the gcc compiler. This should be automatically
1603# done in the build
1604#
1605
1606.align 4
1607.bn_div_words:
1608#
1609# This is a cleaned up version of code generated by
1610# the AIX compiler. The only optimization is to use
1611# the PPC instruction to count leading zeros instead
1612# of a call to num_bits_word. Since this was compiled
1613# only at level -O2, we could probably squeeze it some more.
1614#
1615# r3 = h
1616# r4 = l
1617# r5 = d
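#
# Loosely, in C terms (a sketch assuming h < d so the quotient fits in one
# word; compare the portable bn_div_words in crypto/bn):
#	q = ((h << BN_BITS2) | l) / d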
1618
1619 $UCMPI 0,r5,0 # compare r5 and 0
1620 bne Lppcasm_div1 # proceed if d!=0
1621 li r3,-1 # d=0 return -1
1622 blr
1623Lppcasm_div1:
1624 xor r0,r0,r0 #r0=0
1625 li r8,$BITS
1626 $CNTLZ. r7,r5 #r7 = num leading 0s in d.
1627 beq Lppcasm_div2 #proceed if no leading zeros
1628 subf r8,r7,r8 #r8 = BN_num_bits_word(d)
1629 $SHR. r9,r3,r8 #are there any bits above r8'th?
1630	$TR	16,r9,r0	#if there are, signal to dump core...
1631Lppcasm_div2:
1632 $UCMP 0,r3,r5 #h>=d?
1633 blt Lppcasm_div3 #goto Lppcasm_div3 if not
1634 subf r3,r5,r3 #h-=d ;
1635Lppcasm_div3: #r7 = BN_BITS2-i. so r7=i
1636 cmpi 0,0,r7,0 # is (i == 0)?
1637 beq Lppcasm_div4
1638 $SHL r3,r3,r7 # h = (h<< i)
1639 $SHR r8,r4,r8 # r8 = (l >> BN_BITS2 -i)
1640 $SHL r5,r5,r7 # d<<=i
1641 or r3,r3,r8 # h = (h<<i)|(l>>(BN_BITS2-i))
1642 $SHL r4,r4,r7 # l <<=i
1643Lppcasm_div4:
1644 $SHRI r9,r5,`$BITS/2` # r9 = dh
1645 # dl will be computed when needed
1646 # as it saves registers.
1647 li r6,2 #r6=2
1648	mtctr	r6	#counter will be in CTR.
1649Lppcasm_divouterloop:
1650 $SHRI r8,r3,`$BITS/2` #r8 = (h>>BN_BITS4)
1651 $SHRI r11,r4,`$BITS/2` #r11= (l&BN_MASK2h)>>BN_BITS4
1652 # compute here for innerloop.
1653 $UCMP 0,r8,r9 # is (h>>BN_BITS4)==dh
1654 bne Lppcasm_div5 # goto Lppcasm_div5 if not
1655
1656 li r8,-1
1657 $CLRU r8,r8,`$BITS/2` #q = BN_MASK2l
1658 b Lppcasm_div6
1659Lppcasm_div5:
1660 $UDIV r8,r3,r9 #q = h/dh
1661Lppcasm_div6:
1662 $UMULL r12,r9,r8 #th = q*dh
1663 $CLRU r10,r5,`$BITS/2` #r10=dl
1664 $UMULL r6,r8,r10 #tl = q*dl
1665
1666Lppcasm_divinnerloop:
1667 subf r10,r12,r3 #t = h -th
1668 $SHRI r7,r10,`$BITS/2` #r7= (t &BN_MASK2H), sort of...
1669 addic. r7,r7,0 #test if r7 == 0. used below.
1670 # now want to compute
1671 # r7 = (t<<BN_BITS4)|((l&BN_MASK2h)>>BN_BITS4)
1672 # the following 2 instructions do that
1673 $SHLI r7,r10,`$BITS/2` # r7 = (t<<BN_BITS4)
1674 or r7,r7,r11 # r7|=((l&BN_MASK2h)>>BN_BITS4)
1675 $UCMP cr1,r6,r7 # compare (tl <= r7)
1676 bne Lppcasm_divinnerexit
1677 ble cr1,Lppcasm_divinnerexit
1678 addi r8,r8,-1 #q--
1679 subf r12,r9,r12 #th -=dh
1680 $CLRU r10,r5,`$BITS/2` #r10=dl. t is no longer needed in loop.
1681 subf r6,r10,r6 #tl -=dl
1682 b Lppcasm_divinnerloop
1683Lppcasm_divinnerexit:
1684 $SHRI r10,r6,`$BITS/2` #t=(tl>>BN_BITS4)
1685 $SHLI r11,r6,`$BITS/2` #tl=(tl<<BN_BITS4)&BN_MASK2h;
1686 $UCMP cr1,r4,r11 # compare l and tl
1687 add r12,r12,r10 # th+=t
1688 bge cr1,Lppcasm_div7 # if (l>=tl) goto Lppcasm_div7
1689 addi r12,r12,1 # th++
1690Lppcasm_div7:
1691 subf r11,r11,r4 #r11=l-tl
1692 $UCMP cr1,r3,r12 #compare h and th
1693 bge cr1,Lppcasm_div8 #if (h>=th) goto Lppcasm_div8
1694 addi r8,r8,-1 # q--
1695 add r3,r5,r3 # h+=d
1696Lppcasm_div8:
1697 subf r12,r12,r3 #r12 = h-th
1698 $SHLI r4,r11,`$BITS/2` #l=(l&BN_MASK2l)<<BN_BITS4
1699 # want to compute
1700 # h = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
1701 # the following 2 instructions will do this.
1702 $INSR r11,r12,`$BITS/2`,`$BITS/2` # r11 is the value we want rotated $BITS/2.
1703 $ROTL r3,r11,`$BITS/2` # rotate by $BITS/2 and store in r3
1704 bdz Lppcasm_div9 #if (count==0) break ;
1705 $SHLI r0,r8,`$BITS/2` #ret =q<<BN_BITS4
1706 b Lppcasm_divouterloop
1707Lppcasm_div9:
1708 or r3,r8,r0
1709 blr
1710 .long 0x00000000
1711
1712#
1713# NOTE: The following label name should be changed to
1714# "bn_sqr_words" i.e. remove the first dot
1715# for the gcc compiler. This should be automatically
1716# done in the build
1717#
1718.align 4
1719.bn_sqr_words:
1720#
1721# Optimized version of bn_sqr_words
1722#
1723# void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
1724#
1725# r3 = r
1726# r4 = a
1727# r5 = n
1728#
1729# r6 = a[i].
1730# r7,r8 = product.
1731#
1732# No unrolling done here. Not performance critical.
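#
# Rough C sketch of the contract (illustrative only, not the reference code):
#	for (i = 0; i < n; i++) { r[2*i] = low(a[i]*a[i]); r[2*i+1] = high(a[i]*a[i]); }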
1733
1734 addic. r5,r5,0 #test r5.
1735 beq Lppcasm_sqr_adios
1736 addi r4,r4,-$BNSZ
1737 addi r3,r3,-$BNSZ
1738 mtctr r5
1739Lppcasm_sqr_mainloop:
1740 #sqr(r[0],r[1],a[0]);
1741 $LDU r6,$BNSZ(r4)
1742 $UMULL r7,r6,r6
1743 $UMULH r8,r6,r6
1744 $STU r7,$BNSZ(r3)
1745 $STU r8,$BNSZ(r3)
1746 bdnz- Lppcasm_sqr_mainloop
1747Lppcasm_sqr_adios:
1748 blr
1749 .long 0x00000000
1750
1751
1752#
1753# NOTE: The following label name should be changed to
1754# "bn_mul_words" i.e. remove the first dot
1755# for the gcc compiler. This should be automatically
1756# done in the build
1757#
1758
1759.align 4
1760.bn_mul_words:
1761#
1762# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1763#
1764# r3 = rp
1765# r4 = ap
1766# r5 = num
1767# r6 = w
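#
# Rough C sketch of the contract (illustrative only, not the reference code):
#	for (i = 0; i < num; i++) { rp[i] = low(ap[i]*w + c); c = high(ap[i]*w + c); }
#	return c;	/* the final carry word */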
1768 xor r0,r0,r0
1769 xor r12,r12,r12 # used for carry
1770 rlwinm. r7,r5,30,2,31 # num >> 2
1771 beq Lppcasm_mw_REM
1772 mtctr r7
1773Lppcasm_mw_LOOP:
1774 #mul(rp[0],ap[0],w,c1);
1775 $LD r8,`0*$BNSZ`(r4)
1776 $UMULL r9,r6,r8
1777 $UMULH r10,r6,r8
1778 addc r9,r9,r12
1779 #addze r10,r10 #carry is NOT ignored.
1780 #will be taken care of
1781 #in second spin below
1782 #using adde.
1783 $ST r9,`0*$BNSZ`(r3)
1784 #mul(rp[1],ap[1],w,c1);
1785 $LD r8,`1*$BNSZ`(r4)
1786 $UMULL r11,r6,r8
1787 $UMULH r12,r6,r8
1788 adde r11,r11,r10
1789 #addze r12,r12
1790 $ST r11,`1*$BNSZ`(r3)
1791 #mul(rp[2],ap[2],w,c1);
1792 $LD r8,`2*$BNSZ`(r4)
1793 $UMULL r9,r6,r8
1794 $UMULH r10,r6,r8
1795 adde r9,r9,r12
1796 #addze r10,r10
1797 $ST r9,`2*$BNSZ`(r3)
1798	#mul(rp[3],ap[3],w,c1);
1799 $LD r8,`3*$BNSZ`(r4)
1800 $UMULL r11,r6,r8
1801 $UMULH r12,r6,r8
1802 adde r11,r11,r10
1803 addze r12,r12 #this spin we collect carry into
1804 #r12
1805 $ST r11,`3*$BNSZ`(r3)
1806
1807 addi r3,r3,`4*$BNSZ`
1808 addi r4,r4,`4*$BNSZ`
1809 bdnz- Lppcasm_mw_LOOP
1810
1811Lppcasm_mw_REM:
1812 andi. r5,r5,0x3
1813 beq Lppcasm_mw_OVER
1814 #mul(rp[0],ap[0],w,c1);
1815 $LD r8,`0*$BNSZ`(r4)
1816 $UMULL r9,r6,r8
1817 $UMULH r10,r6,r8
1818 addc r9,r9,r12
1819 addze r10,r10
1820 $ST r9,`0*$BNSZ`(r3)
1821 addi r12,r10,0
1822
1823 addi r5,r5,-1
1824 cmpli 0,0,r5,0
1825 beq Lppcasm_mw_OVER
1826
1827
1828 #mul(rp[1],ap[1],w,c1);
1829 $LD r8,`1*$BNSZ`(r4)
1830 $UMULL r9,r6,r8
1831 $UMULH r10,r6,r8
1832 addc r9,r9,r12
1833 addze r10,r10
1834 $ST r9,`1*$BNSZ`(r3)
1835 addi r12,r10,0
1836
1837 addi r5,r5,-1
1838 cmpli 0,0,r5,0
1839 beq Lppcasm_mw_OVER
1840
1841	#mul(rp[2],ap[2],w,c1);
1842 $LD r8,`2*$BNSZ`(r4)
1843 $UMULL r9,r6,r8
1844 $UMULH r10,r6,r8
1845 addc r9,r9,r12
1846 addze r10,r10
1847 $ST r9,`2*$BNSZ`(r3)
1848 addi r12,r10,0
1849
1850Lppcasm_mw_OVER:
1851 addi r3,r12,0
1852 blr
1853 .long 0x00000000
1854
1855#
1856# NOTE: The following label name should be changed to
1857# "bn_mul_add_words" i.e. remove the first dot
1858# for the gcc compiler. This should be automatically
1859# done in the build
1860#
1861
1862.align 4
1863.bn_mul_add_words:
1864#
1865# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
1866#
1867# r3 = rp
1868# r4 = ap
1869# r5 = num
1870# r6 = w
1871#
1872# empirical evidence suggests that the unrolled version performs best!!
1873#
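# Rough C sketch of the contract (illustrative only, not the reference code):
#	for (i = 0; i < num; i++) { t = rp[i] + ap[i]*w + c; rp[i] = low(t); c = high(t); }
#	return c;	/* the final carry word */
#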
1874 xor r0,r0,r0 #r0 = 0
1875 xor r12,r12,r12 #r12 = 0 . used for carry
1876 rlwinm. r7,r5,30,2,31 # num >> 2
1877	beq	Lppcasm_maw_leftover	# if (num < 4) goto Lppcasm_maw_leftover
1878 mtctr r7
1879Lppcasm_maw_mainloop:
1880 #mul_add(rp[0],ap[0],w,c1);
1881 $LD r8,`0*$BNSZ`(r4)
1882 $LD r11,`0*$BNSZ`(r3)
1883 $UMULL r9,r6,r8
1884 $UMULH r10,r6,r8
1885 addc r9,r9,r12 #r12 is carry.
1886 addze r10,r10
1887 addc r9,r9,r11
1888 #addze r10,r10
1889 #the above instruction addze
1890 #is NOT needed. Carry will NOT
1891 #be ignored. It's not affected
1892 #by multiply and will be collected
1893 #in the next spin
1894 $ST r9,`0*$BNSZ`(r3)
1895
1896 #mul_add(rp[1],ap[1],w,c1);
1897 $LD r8,`1*$BNSZ`(r4)
1898 $LD r9,`1*$BNSZ`(r3)
1899 $UMULL r11,r6,r8
1900 $UMULH r12,r6,r8
1901 adde r11,r11,r10 #r10 is carry.
1902 addze r12,r12
1903 addc r11,r11,r9
1904 #addze r12,r12
1905 $ST r11,`1*$BNSZ`(r3)
1906
1907 #mul_add(rp[2],ap[2],w,c1);
1908 $LD r8,`2*$BNSZ`(r4)
1909 $UMULL r9,r6,r8
1910 $LD r11,`2*$BNSZ`(r3)
1911 $UMULH r10,r6,r8
1912 adde r9,r9,r12
1913 addze r10,r10
1914 addc r9,r9,r11
1915 #addze r10,r10
1916 $ST r9,`2*$BNSZ`(r3)
1917
1918 #mul_add(rp[3],ap[3],w,c1);
1919 $LD r8,`3*$BNSZ`(r4)
1920 $UMULL r11,r6,r8
1921 $LD r9,`3*$BNSZ`(r3)
1922 $UMULH r12,r6,r8
1923 adde r11,r11,r10
1924 addze r12,r12
1925 addc r11,r11,r9
1926 addze r12,r12
1927 $ST r11,`3*$BNSZ`(r3)
1928 addi r3,r3,`4*$BNSZ`
1929 addi r4,r4,`4*$BNSZ`
1930 bdnz- Lppcasm_maw_mainloop
1931
1932Lppcasm_maw_leftover:
1933 andi. r5,r5,0x3
1934 beq Lppcasm_maw_adios
1935 addi r3,r3,-$BNSZ
1936 addi r4,r4,-$BNSZ
1937 #mul_add(rp[0],ap[0],w,c1);
1938 mtctr r5
1939 $LDU r8,$BNSZ(r4)
1940 $UMULL r9,r6,r8
1941 $UMULH r10,r6,r8
1942 $LDU r11,$BNSZ(r3)
1943 addc r9,r9,r11
1944 addze r10,r10
1945 addc r9,r9,r12
1946 addze r12,r10
1947 $ST r9,0(r3)
1948
1949 bdz Lppcasm_maw_adios
1950 #mul_add(rp[1],ap[1],w,c1);
1951 $LDU r8,$BNSZ(r4)
1952 $UMULL r9,r6,r8
1953 $UMULH r10,r6,r8
1954 $LDU r11,$BNSZ(r3)
1955 addc r9,r9,r11
1956 addze r10,r10
1957 addc r9,r9,r12
1958 addze r12,r10
1959 $ST r9,0(r3)
1960
1961 bdz Lppcasm_maw_adios
1962 #mul_add(rp[2],ap[2],w,c1);
1963 $LDU r8,$BNSZ(r4)
1964 $UMULL r9,r6,r8
1965 $UMULH r10,r6,r8
1966 $LDU r11,$BNSZ(r3)
1967 addc r9,r9,r11
1968 addze r10,r10
1969 addc r9,r9,r12
1970 addze r12,r10
1971 $ST r9,0(r3)
1972
1973Lppcasm_maw_adios:
1974 addi r3,r12,0
1975 blr
1976 .long 0x00000000
1977 .align 4
1978EOF
1979$data =~ s/\`([^\`]*)\`/eval $1/gem;
1980print $data;
1981close STDOUT;