#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to a powers table computed in BN_mod_exp_mont_consttime.
# In addition, a subroutine that scatters elements of the powers table
# is implemented, so that scattering/gathering can be tuned without
# modifying bn_exp.c.

# August 2013.
#
# Add MULX/AD*X code paths and additional interfaces to optimize for
# the branch prediction unit. For input lengths that are multiples of 8,
# the np argument is not just the modulus value, but one interleaved
# with 0. This is to optimize post-condition...

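# The gather idea in rough C terms (an illustrative, hedged sketch only; the
# names below are not identifiers used in this file or in bn_exp.c):
#
#	/* All 2^5 = 32 table entries are touched on every access and the */
#	/* wanted one is selected with a mask, so load addresses never    */
#	/* depend on the secret index.                                     */
#	BN_ULONG gather_word(const BN_ULONG *table, unsigned idx, unsigned j)
#	{
#		BN_ULONG acc = 0;
#		for (unsigned k = 0; k < 32; k++) {
#			BN_ULONG mask = 0 - (BN_ULONG)(k == idx);
#			acc |= table[32*j + k] & mask;
#		}
#		return acc;
#	}
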
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Set $addx to one once build problems are resolved.
$addx = 0;

# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" within $bp holding
		#		# pre-computed powers of a', interlaced
		#		# in such a manner that b[0] is $bp[idx],
		#		# b[1] is $bp[2^5+idx], etc.
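		# (Layout sketch, hedged: with the interlacing described
		# above, word j of power "idx" lives at $bp[2^5*j + idx],
		# i.e. a gather of one power reads exactly one word from
		# every group of 32 consecutive words.)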
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,\@function,6
.align	64
bn_mul_mont_gather5:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	test	\$7,${num}d
	jnz	.Lmul_enter
___
$code.=<<___ if ($addx);
	leaq	OPENSSL_ia32cap_P(%rip),%r11
	mov	8(%r11),%r11d
___
$code.=<<___;
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-280(%rsp,$num,8),%r10	# future alloca(8*(num+2)+256+8)
	neg	$num			# restore $num
	and	\$-1024,%r10		# minimize TLB usage

	# An OS-agnostic version of __chkstk.
	#
	# Some OSes (Windows) insist on the stack being "wired" to
	# physical memory in a strictly sequential manner, i.e. if a stack
	# allocation spans two pages, then a reference to the farthest one
	# can be punished with SEGV. But page walking does good even on
	# other OSes, because it guarantees that a villain thread hits
	# the guard page before it can damage an innocent one...
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.Lmul_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:
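	# The walk above, roughly, as C (an illustrative sketch only, not
	# part of the original code):
	#
	#	while (rsp > target) {		/* target is the new frame bottom */
	#		rsp -= 4096;
	#		(void)*(volatile char *)rsp;	/* touch each page in turn */
	#	}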

	lea	.Linc(%rip),%r10
	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul_body:

	lea	128($bp),%r12		# reassign $bp (+size optimization)
___
		$bp="%r12";
		$STRIDE=2**5*8;		# 5 is "window size"
		$N=$STRIDE/4;		# should match cache line size
$code.=<<___;
	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
	lea	24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
	and	\$-16,%r10

	pshufd	\$0,%xmm5,%xmm5		# broadcast index
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..31 to index and save result to stack
#
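# (A hedged scalar sketch of the mask-and-gather below, illustrative only:
#
#	for (k = 0; k < 32; k++)
#		mask[k] = (k == idx) ? ~0 : 0;	/* stored on the stack   */
#	for (k = 0; k < 32; k++)
#		m0 |= bp[k] & mask[k];		/* branch-free selection */
#
# so the only secret-dependent values are register contents, never addresses.)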
$code.=<<___;
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
	.byte	0x67
	movdqa	%xmm4,%xmm3
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
	movdqa	%xmm0,`16*($k+0)+112`(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
	movdqa	%xmm1,`16*($k+1)+112`(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	movdqa	%xmm4,%xmm3
___
}
$code.=<<___;				# last iteration can be optimized
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,`16*($k+0)+112`(%r10)

	paddd	%xmm2,%xmm3
	.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,`16*($k+1)+112`(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,`16*($k+2)+112`(%r10)
	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register

	pand	`16*($k+1)-128`($bp),%xmm1
	pand	`16*($k+2)-128`($bp),%xmm2
	movdqa	%xmm3,`16*($k+3)+112`(%r10)
	pand	`16*($k+3)-128`($bp),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm4
	movdqa	`16*($k+1)-128`($bp),%xmm5
	movdqa	`16*($k+2)-128`($bp),%xmm2
	pand	`16*($k+0)+112`(%r10),%xmm4
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+1)+112`(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	`16*($k+2)+112`(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	`16*($k+3)+112`(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
___
}
$code.=<<___;
	por	%xmm1,%xmm0
	pshufd	\$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	lea	$STRIDE($bp),$bp
	movq	%xmm0,$m0		# m0=bp[0]

	mov	($n0),$n0		# pull n0[0] value
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st			# note that upon exit $j==$num, so
					# they can be used interchangeably

	add	%rax,$hi1
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	lea	24+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
	and	\$-16,%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
___
for($k=0;$k<$STRIDE/16;$k+=4) {
$code.=<<___;
	movdqa	`16*($k+0)-128`($bp),%xmm0
	movdqa	`16*($k+1)-128`($bp),%xmm1
	movdqa	`16*($k+2)-128`($bp),%xmm2
	movdqa	`16*($k+3)-128`($bp),%xmm3
	pand	`16*($k+0)-128`(%rdx),%xmm0
	pand	`16*($k+1)-128`(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	`16*($k+2)-128`(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	`16*($k+3)-128`(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
___
}
$code.=<<___;
	por	%xmm5,%xmm4
	pshufd	\$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	lea	$STRIDE($bp),$bp

	mov	($ap),%rax		# ap[0]
	movq	%xmm0,$m0		# m0=bp[i]

	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0

	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner			# note that upon exit $j==$num, so
					# they can be used interchangeably
	add	%rax,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$num,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:
	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy
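	# (Sketch of the selection above, illustrative only: %rax ends up
	# all-ones when the full subtraction borrowed and zero otherwise, so
	#	src = (tp & borrow_mask) | (rp & ~borrow_mask);
	#	rp[i] = src[i];  tp[i] = i;	/* tp[] is zapped */
	# picks the reduced value or the original without branching on it.)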

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi,8
	mov	\$1,%rax

	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont_gather5,\@function,6
.align	32
bn_mul4x_mont_gather5:
.cfi_startproc
	.byte	0x67
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
	and	\$0x80108,%r11d
	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
	je	.Lmulx4x_enter
___
$code.=<<___;
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lmul4x_prologue:

	.byte	0x67
	shl	\$3,${num}d		# convert $num to bytes
	lea	($num,$num,2),%r10	# 3*$num in bytes
	neg	$num			# -$num

	##############################################################
	# Ensure that the stack frame doesn't alias with $rptr+3*$num
	# modulo 4096, which covers ret[num], am[num] and n[num]
	# (see bn_exp.c). This is done to allow the memory disambiguation
	# logic to do its magic. [Extra [num] is allocated in order
	# to align with bn_power5's frame, which is cleansed after
	# completing exponentiation. Extra 256 bytes is for the power mask
	# calculated from the 7th argument, the index.]
Adam Langleyd9e397b2015-01-22 14:27:53 -0800493 #
David Benjamin4969cc92016-04-22 15:02:23 -0400494 lea -320(%rsp,$num,2),%r11
Robert Sloana94fe052017-02-21 08:49:28 -0800495 mov %rsp,%rbp
David Benjamin4969cc92016-04-22 15:02:23 -0400496 sub $rp,%r11
Adam Langleyd9e397b2015-01-22 14:27:53 -0800497 and \$4095,%r11
498 cmp %r11,%r10
499 jb .Lmul4xsp_alt
Robert Sloana94fe052017-02-21 08:49:28 -0800500 sub %r11,%rbp # align with $rp
501 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800502 jmp .Lmul4xsp_done
503
504.align 32
505.Lmul4xsp_alt:
David Benjamin4969cc92016-04-22 15:02:23 -0400506 lea 4096-320(,$num,2),%r10
Robert Sloana94fe052017-02-21 08:49:28 -0800507 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800508 sub %r10,%r11
509 mov \$0,%r10
510 cmovc %r10,%r11
Robert Sloana94fe052017-02-21 08:49:28 -0800511 sub %r11,%rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800512.Lmul4xsp_done:
Robert Sloana94fe052017-02-21 08:49:28 -0800513 and \$-64,%rbp
514 mov %rsp,%r11
515 sub %rbp,%r11
516 and \$-4096,%r11
517 lea (%rbp,%r11),%rsp
518 mov (%rsp),%r10
519 cmp %rbp,%rsp
520 ja .Lmul4x_page_walk
521 jmp .Lmul4x_page_walk_done
522
523.Lmul4x_page_walk:
524 lea -4096(%rsp),%rsp
525 mov (%rsp),%r10
526 cmp %rbp,%rsp
527 ja .Lmul4x_page_walk
528.Lmul4x_page_walk_done:
529
Adam Langleyd9e397b2015-01-22 14:27:53 -0800530 neg $num
531
532 mov %rax,40(%rsp)
Robert Sloana94fe052017-02-21 08:49:28 -0800533.cfi_cfa_expression %rsp+40,deref,+8
Adam Langleyd9e397b2015-01-22 14:27:53 -0800534.Lmul4x_body:
535
536 call mul4x_internal
537
538 mov 40(%rsp),%rsi # restore %rsp
Robert Sloana94fe052017-02-21 08:49:28 -0800539.cfi_def_cfa %rsi,8
Adam Langleyd9e397b2015-01-22 14:27:53 -0800540 mov \$1,%rax
David Benjamin4969cc92016-04-22 15:02:23 -0400541
Adam Langleyd9e397b2015-01-22 14:27:53 -0800542 mov -48(%rsi),%r15
Robert Sloana94fe052017-02-21 08:49:28 -0800543.cfi_restore %r15
Adam Langleyd9e397b2015-01-22 14:27:53 -0800544 mov -40(%rsi),%r14
Robert Sloana94fe052017-02-21 08:49:28 -0800545.cfi_restore %r14
Adam Langleyd9e397b2015-01-22 14:27:53 -0800546 mov -32(%rsi),%r13
Robert Sloana94fe052017-02-21 08:49:28 -0800547.cfi_restore %r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800548 mov -24(%rsi),%r12
Robert Sloana94fe052017-02-21 08:49:28 -0800549.cfi_restore %r12
Adam Langleyd9e397b2015-01-22 14:27:53 -0800550 mov -16(%rsi),%rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800551.cfi_restore %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800552 mov -8(%rsi),%rbx
Robert Sloana94fe052017-02-21 08:49:28 -0800553.cfi_restore %rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800554 lea (%rsi),%rsp
Robert Sloana94fe052017-02-21 08:49:28 -0800555.cfi_def_cfa_register %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800556.Lmul4x_epilogue:
557 ret
Robert Sloana94fe052017-02-21 08:49:28 -0800558.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -0800559.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
560
561.type mul4x_internal,\@abi-omnipotent
562.align 32
563mul4x_internal:
David Benjamin4969cc92016-04-22 15:02:23 -0400564 shl \$5,$num # $num was in bytes
565 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index
566 lea .Linc(%rip),%rax
567 lea 128(%rdx,$num),%r13 # end of powers table (+size optimization)
Adam Langleyd9e397b2015-01-22 14:27:53 -0800568 shr \$5,$num # restore $num
569___
570 $bp="%r12";
571 $STRIDE=2**5*8; # 5 is "window size"
572 $N=$STRIDE/4; # should match cache line size
573 $tp=$i;
574$code.=<<___;
David Benjamin4969cc92016-04-22 15:02:23 -0400575 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
576 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
577 lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization)
578 lea 128(%rdx),$bp # size optimization
Adam Langleyd9e397b2015-01-22 14:27:53 -0800579
David Benjamin4969cc92016-04-22 15:02:23 -0400580 pshufd \$0,%xmm5,%xmm5 # broadcast index
581 movdqa %xmm1,%xmm4
582 .byte 0x67,0x67
583 movdqa %xmm1,%xmm2
584___
585########################################################################
586# calculate mask by comparing 0..31 to index and save result to stack
587#
588$code.=<<___;
589 paddd %xmm0,%xmm1
590 pcmpeqd %xmm5,%xmm0 # compare to 1,0
Adam Langleyd9e397b2015-01-22 14:27:53 -0800591 .byte 0x67
David Benjamin4969cc92016-04-22 15:02:23 -0400592 movdqa %xmm4,%xmm3
593___
594for($i=0;$i<$STRIDE/16-4;$i+=4) {
595$code.=<<___;
596 paddd %xmm1,%xmm2
597 pcmpeqd %xmm5,%xmm1 # compare to 3,2
598 movdqa %xmm0,`16*($i+0)+112`(%r10)
599 movdqa %xmm4,%xmm0
600
601 paddd %xmm2,%xmm3
602 pcmpeqd %xmm5,%xmm2 # compare to 5,4
603 movdqa %xmm1,`16*($i+1)+112`(%r10)
604 movdqa %xmm4,%xmm1
605
606 paddd %xmm3,%xmm0
607 pcmpeqd %xmm5,%xmm3 # compare to 7,6
608 movdqa %xmm2,`16*($i+2)+112`(%r10)
609 movdqa %xmm4,%xmm2
610
611 paddd %xmm0,%xmm1
612 pcmpeqd %xmm5,%xmm0
613 movdqa %xmm3,`16*($i+3)+112`(%r10)
614 movdqa %xmm4,%xmm3
615___
616}
617$code.=<<___; # last iteration can be optimized
618 paddd %xmm1,%xmm2
619 pcmpeqd %xmm5,%xmm1
620 movdqa %xmm0,`16*($i+0)+112`(%r10)
621
622 paddd %xmm2,%xmm3
Adam Langleyd9e397b2015-01-22 14:27:53 -0800623 .byte 0x67
David Benjamin4969cc92016-04-22 15:02:23 -0400624 pcmpeqd %xmm5,%xmm2
625 movdqa %xmm1,`16*($i+1)+112`(%r10)
626
627 pcmpeqd %xmm5,%xmm3
628 movdqa %xmm2,`16*($i+2)+112`(%r10)
629 pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register
630
631 pand `16*($i+1)-128`($bp),%xmm1
632 pand `16*($i+2)-128`($bp),%xmm2
633 movdqa %xmm3,`16*($i+3)+112`(%r10)
634 pand `16*($i+3)-128`($bp),%xmm3
Adam Langleyd9e397b2015-01-22 14:27:53 -0800635 por %xmm2,%xmm0
David Benjamin4969cc92016-04-22 15:02:23 -0400636 por %xmm3,%xmm1
637___
638for($i=0;$i<$STRIDE/16-4;$i+=4) {
639$code.=<<___;
640 movdqa `16*($i+0)-128`($bp),%xmm4
641 movdqa `16*($i+1)-128`($bp),%xmm5
642 movdqa `16*($i+2)-128`($bp),%xmm2
643 pand `16*($i+0)+112`(%r10),%xmm4
644 movdqa `16*($i+3)-128`($bp),%xmm3
645 pand `16*($i+1)+112`(%r10),%xmm5
646 por %xmm4,%xmm0
647 pand `16*($i+2)+112`(%r10),%xmm2
648 por %xmm5,%xmm1
649 pand `16*($i+3)+112`(%r10),%xmm3
650 por %xmm2,%xmm0
651 por %xmm3,%xmm1
652___
653}
654$code.=<<___;
655 por %xmm1,%xmm0
656 pshufd \$0x4e,%xmm0,%xmm1
657 por %xmm1,%xmm0
658 lea $STRIDE($bp),$bp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800659 movq %xmm0,$m0 # m0=bp[0]
David Benjamin4969cc92016-04-22 15:02:23 -0400660
Adam Langleyd9e397b2015-01-22 14:27:53 -0800661 mov %r13,16+8(%rsp) # save end of b[num]
662 mov $rp, 56+8(%rsp) # save $rp
663
664 mov ($n0),$n0 # pull n0[0] value
665 mov ($ap),%rax
666 lea ($ap,$num),$ap # end of a[num]
667 neg $num
668
669 mov $n0,$m1
670 mulq $m0 # ap[0]*bp[0]
671 mov %rax,$A[0]
672 mov ($np),%rax
673
Adam Langleyd9e397b2015-01-22 14:27:53 -0800674 imulq $A[0],$m1 # "tp[0]"*n0
David Benjamin4969cc92016-04-22 15:02:23 -0400675 lea 64+8(%rsp),$tp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800676 mov %rdx,$A[1]
677
Adam Langleyd9e397b2015-01-22 14:27:53 -0800678 mulq $m1 # np[0]*m1
679 add %rax,$A[0] # discarded
680 mov 8($ap,$num),%rax
681 adc \$0,%rdx
682 mov %rdx,$N[1]
683
684 mulq $m0
685 add %rax,$A[1]
David Benjamin4969cc92016-04-22 15:02:23 -0400686 mov 8*1($np),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800687 adc \$0,%rdx
688 mov %rdx,$A[0]
689
690 mulq $m1
691 add %rax,$N[1]
692 mov 16($ap,$num),%rax
693 adc \$0,%rdx
694 add $A[1],$N[1]
695 lea 4*8($num),$j # j=4
David Benjamin4969cc92016-04-22 15:02:23 -0400696 lea 8*4($np),$np
Adam Langleyd9e397b2015-01-22 14:27:53 -0800697 adc \$0,%rdx
698 mov $N[1],($tp)
699 mov %rdx,$N[0]
700 jmp .L1st4x
701
702.align 32
703.L1st4x:
704 mulq $m0 # ap[j]*bp[0]
705 add %rax,$A[0]
David Benjamin4969cc92016-04-22 15:02:23 -0400706 mov -8*2($np),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800707 lea 32($tp),$tp
708 adc \$0,%rdx
709 mov %rdx,$A[1]
710
711 mulq $m1 # np[j]*m1
712 add %rax,$N[0]
713 mov -8($ap,$j),%rax
714 adc \$0,%rdx
715 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
716 adc \$0,%rdx
717 mov $N[0],-24($tp) # tp[j-1]
718 mov %rdx,$N[1]
719
720 mulq $m0 # ap[j]*bp[0]
721 add %rax,$A[1]
David Benjamin4969cc92016-04-22 15:02:23 -0400722 mov -8*1($np),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800723 adc \$0,%rdx
724 mov %rdx,$A[0]
725
726 mulq $m1 # np[j]*m1
727 add %rax,$N[1]
728 mov ($ap,$j),%rax
729 adc \$0,%rdx
730 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
731 adc \$0,%rdx
732 mov $N[1],-16($tp) # tp[j-1]
733 mov %rdx,$N[0]
734
735 mulq $m0 # ap[j]*bp[0]
736 add %rax,$A[0]
David Benjamin4969cc92016-04-22 15:02:23 -0400737 mov 8*0($np),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800738 adc \$0,%rdx
739 mov %rdx,$A[1]
740
741 mulq $m1 # np[j]*m1
742 add %rax,$N[0]
743 mov 8($ap,$j),%rax
744 adc \$0,%rdx
745 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
746 adc \$0,%rdx
747 mov $N[0],-8($tp) # tp[j-1]
748 mov %rdx,$N[1]
749
750 mulq $m0 # ap[j]*bp[0]
751 add %rax,$A[1]
David Benjamin4969cc92016-04-22 15:02:23 -0400752 mov 8*1($np),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800753 adc \$0,%rdx
754 mov %rdx,$A[0]
755
756 mulq $m1 # np[j]*m1
757 add %rax,$N[1]
758 mov 16($ap,$j),%rax
759 adc \$0,%rdx
760 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
David Benjamin4969cc92016-04-22 15:02:23 -0400761 lea 8*4($np),$np
Adam Langleyd9e397b2015-01-22 14:27:53 -0800762 adc \$0,%rdx
763 mov $N[1],($tp) # tp[j-1]
764 mov %rdx,$N[0]
765
766 add \$32,$j # j+=4
767 jnz .L1st4x
768
769 mulq $m0 # ap[j]*bp[0]
770 add %rax,$A[0]
David Benjamin4969cc92016-04-22 15:02:23 -0400771 mov -8*2($np),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800772 lea 32($tp),$tp
773 adc \$0,%rdx
774 mov %rdx,$A[1]
775
776 mulq $m1 # np[j]*m1
777 add %rax,$N[0]
778 mov -8($ap),%rax
779 adc \$0,%rdx
780 add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
781 adc \$0,%rdx
782 mov $N[0],-24($tp) # tp[j-1]
783 mov %rdx,$N[1]
784
785 mulq $m0 # ap[j]*bp[0]
786 add %rax,$A[1]
David Benjamin4969cc92016-04-22 15:02:23 -0400787 mov -8*1($np),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800788 adc \$0,%rdx
789 mov %rdx,$A[0]
790
791 mulq $m1 # np[j]*m1
792 add %rax,$N[1]
793 mov ($ap,$num),%rax # ap[0]
794 adc \$0,%rdx
795 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
796 adc \$0,%rdx
797 mov $N[1],-16($tp) # tp[j-1]
798 mov %rdx,$N[0]
799
David Benjamin4969cc92016-04-22 15:02:23 -0400800 lea ($np,$num),$np # rewind $np
Adam Langleyd9e397b2015-01-22 14:27:53 -0800801
802 xor $N[1],$N[1]
803 add $A[0],$N[0]
804 adc \$0,$N[1]
805 mov $N[0],-8($tp)
806
807 jmp .Louter4x
808
809.align 32
810.Louter4x:
David Benjamin4969cc92016-04-22 15:02:23 -0400811 lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization)
812 pxor %xmm4,%xmm4
813 pxor %xmm5,%xmm5
814___
815for($i=0;$i<$STRIDE/16;$i+=4) {
816$code.=<<___;
817 movdqa `16*($i+0)-128`($bp),%xmm0
818 movdqa `16*($i+1)-128`($bp),%xmm1
819 movdqa `16*($i+2)-128`($bp),%xmm2
820 movdqa `16*($i+3)-128`($bp),%xmm3
821 pand `16*($i+0)-128`(%rdx),%xmm0
822 pand `16*($i+1)-128`(%rdx),%xmm1
823 por %xmm0,%xmm4
824 pand `16*($i+2)-128`(%rdx),%xmm2
825 por %xmm1,%xmm5
826 pand `16*($i+3)-128`(%rdx),%xmm3
827 por %xmm2,%xmm4
828 por %xmm3,%xmm5
829___
830}
831$code.=<<___;
832 por %xmm5,%xmm4
833 pshufd \$0x4e,%xmm4,%xmm0
834 por %xmm4,%xmm0
835 lea $STRIDE($bp),$bp
836 movq %xmm0,$m0 # m0=bp[i]
837
Adam Langleyd9e397b2015-01-22 14:27:53 -0800838 mov ($tp,$num),$A[0]
839 mov $n0,$m1
840 mulq $m0 # ap[0]*bp[i]
841 add %rax,$A[0] # ap[0]*bp[i]+tp[0]
842 mov ($np),%rax
843 adc \$0,%rdx
844
Adam Langleyd9e397b2015-01-22 14:27:53 -0800845 imulq $A[0],$m1 # tp[0]*n0
Adam Langleyd9e397b2015-01-22 14:27:53 -0800846 mov %rdx,$A[1]
847 mov $N[1],($tp) # store upmost overflow bit
848
Adam Langleyd9e397b2015-01-22 14:27:53 -0800849 lea ($tp,$num),$tp # rewind $tp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800850
851 mulq $m1 # np[0]*m1
852 add %rax,$A[0] # "$N[0]", discarded
853 mov 8($ap,$num),%rax
854 adc \$0,%rdx
855 mov %rdx,$N[1]
856
857 mulq $m0 # ap[j]*bp[i]
858 add %rax,$A[1]
David Benjamin4969cc92016-04-22 15:02:23 -0400859 mov 8*1($np),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800860 adc \$0,%rdx
861 add 8($tp),$A[1] # +tp[1]
862 adc \$0,%rdx
863 mov %rdx,$A[0]
864
865 mulq $m1 # np[j]*m1
866 add %rax,$N[1]
867 mov 16($ap,$num),%rax
868 adc \$0,%rdx
869 add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
870 lea 4*8($num),$j # j=4
David Benjamin4969cc92016-04-22 15:02:23 -0400871 lea 8*4($np),$np
Adam Langleyd9e397b2015-01-22 14:27:53 -0800872 adc \$0,%rdx
873 mov %rdx,$N[0]
874 jmp .Linner4x
875
876.align 32
877.Linner4x:
878 mulq $m0 # ap[j]*bp[i]
879 add %rax,$A[0]
David Benjamin4969cc92016-04-22 15:02:23 -0400880 mov -8*2($np),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800881 adc \$0,%rdx
882 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
883 lea 32($tp),$tp
884 adc \$0,%rdx
885 mov %rdx,$A[1]
886
887 mulq $m1 # np[j]*m1
888 add %rax,$N[0]
889 mov -8($ap,$j),%rax
890 adc \$0,%rdx
891 add $A[0],$N[0]
892 adc \$0,%rdx
893 mov $N[1],-32($tp) # tp[j-1]
894 mov %rdx,$N[1]
895
896 mulq $m0 # ap[j]*bp[i]
897 add %rax,$A[1]
David Benjamin4969cc92016-04-22 15:02:23 -0400898 mov -8*1($np),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800899 adc \$0,%rdx
900 add -8($tp),$A[1]
901 adc \$0,%rdx
902 mov %rdx,$A[0]
903
904 mulq $m1 # np[j]*m1
905 add %rax,$N[1]
906 mov ($ap,$j),%rax
907 adc \$0,%rdx
908 add $A[1],$N[1]
909 adc \$0,%rdx
910 mov $N[0],-24($tp) # tp[j-1]
911 mov %rdx,$N[0]
912
913 mulq $m0 # ap[j]*bp[i]
914 add %rax,$A[0]
David Benjamin4969cc92016-04-22 15:02:23 -0400915 mov 8*0($np),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800916 adc \$0,%rdx
917 add ($tp),$A[0] # ap[j]*bp[i]+tp[j]
918 adc \$0,%rdx
919 mov %rdx,$A[1]
920
921 mulq $m1 # np[j]*m1
922 add %rax,$N[0]
923 mov 8($ap,$j),%rax
924 adc \$0,%rdx
925 add $A[0],$N[0]
926 adc \$0,%rdx
927 mov $N[1],-16($tp) # tp[j-1]
928 mov %rdx,$N[1]
929
930 mulq $m0 # ap[j]*bp[i]
931 add %rax,$A[1]
David Benjamin4969cc92016-04-22 15:02:23 -0400932 mov 8*1($np),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800933 adc \$0,%rdx
934 add 8($tp),$A[1]
935 adc \$0,%rdx
936 mov %rdx,$A[0]
937
938 mulq $m1 # np[j]*m1
939 add %rax,$N[1]
940 mov 16($ap,$j),%rax
941 adc \$0,%rdx
942 add $A[1],$N[1]
David Benjamin4969cc92016-04-22 15:02:23 -0400943 lea 8*4($np),$np
Adam Langleyd9e397b2015-01-22 14:27:53 -0800944 adc \$0,%rdx
945 mov $N[0],-8($tp) # tp[j-1]
946 mov %rdx,$N[0]
947
948 add \$32,$j # j+=4
949 jnz .Linner4x
950
951 mulq $m0 # ap[j]*bp[i]
952 add %rax,$A[0]
David Benjamin4969cc92016-04-22 15:02:23 -0400953 mov -8*2($np),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800954 adc \$0,%rdx
955 add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
956 lea 32($tp),$tp
957 adc \$0,%rdx
958 mov %rdx,$A[1]
959
960 mulq $m1 # np[j]*m1
961 add %rax,$N[0]
962 mov -8($ap),%rax
963 adc \$0,%rdx
964 add $A[0],$N[0]
965 adc \$0,%rdx
966 mov $N[1],-32($tp) # tp[j-1]
967 mov %rdx,$N[1]
968
969 mulq $m0 # ap[j]*bp[i]
970 add %rax,$A[1]
971 mov $m1,%rax
David Benjamin4969cc92016-04-22 15:02:23 -0400972 mov -8*1($np),$m1
Adam Langleyd9e397b2015-01-22 14:27:53 -0800973 adc \$0,%rdx
974 add -8($tp),$A[1]
975 adc \$0,%rdx
976 mov %rdx,$A[0]
977
978 mulq $m1 # np[j]*m1
979 add %rax,$N[1]
980 mov ($ap,$num),%rax # ap[0]
981 adc \$0,%rdx
982 add $A[1],$N[1]
983 adc \$0,%rdx
984 mov $N[0],-24($tp) # tp[j-1]
985 mov %rdx,$N[0]
986
Adam Langleyd9e397b2015-01-22 14:27:53 -0800987 mov $N[1],-16($tp) # tp[j-1]
David Benjamin4969cc92016-04-22 15:02:23 -0400988 lea ($np,$num),$np # rewind $np
Adam Langleyd9e397b2015-01-22 14:27:53 -0800989
990 xor $N[1],$N[1]
991 add $A[0],$N[0]
992 adc \$0,$N[1]
993 add ($tp),$N[0] # pull upmost overflow bit
994 adc \$0,$N[1] # upmost overflow bit
995 mov $N[0],-8($tp)
996
997 cmp 16+8(%rsp),$bp
998 jb .Louter4x
999___
1000if (1) {
1001$code.=<<___;
David Benjamin4969cc92016-04-22 15:02:23 -04001002 xor %rax,%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001003 sub $N[0],$m1 # compare top-most words
1004 adc $j,$j # $j is zero
1005 or $j,$N[1]
David Benjamin4969cc92016-04-22 15:02:23 -04001006 sub $N[1],%rax # %rax=-$N[1]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001007 lea ($tp,$num),%rbx # tptr in .sqr4x_sub
David Benjamin4969cc92016-04-22 15:02:23 -04001008 mov ($np),%r12
1009 lea ($np),%rbp # nptr in .sqr4x_sub
Adam Langleyd9e397b2015-01-22 14:27:53 -08001010 mov %r9,%rcx
David Benjamin4969cc92016-04-22 15:02:23 -04001011 sar \$3+2,%rcx
Adam Langleyd9e397b2015-01-22 14:27:53 -08001012 mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub
David Benjamin4969cc92016-04-22 15:02:23 -04001013 dec %r12 # so that after 'not' we get -n[0]
1014 xor %r10,%r10
1015 mov 8*1(%rbp),%r13
1016 mov 8*2(%rbp),%r14
1017 mov 8*3(%rbp),%r15
1018 jmp .Lsqr4x_sub_entry
Adam Langleyd9e397b2015-01-22 14:27:53 -08001019___
1020} else {
1021my @ri=("%rax",$bp,$m0,$m1);
1022my $rp="%rdx";
1023$code.=<<___
1024 xor \$1,$N[1]
1025 lea ($tp,$num),$tp # rewind $tp
1026 sar \$5,$num # cf=0
1027 lea ($np,$N[1],8),$np
1028 mov 56+8(%rsp),$rp # restore $rp
1029 jmp .Lsub4x
1030
1031.align 32
1032.Lsub4x:
1033 .byte 0x66
1034 mov 8*0($tp),@ri[0]
1035 mov 8*1($tp),@ri[1]
1036 .byte 0x66
1037 sbb 16*0($np),@ri[0]
1038 mov 8*2($tp),@ri[2]
1039 sbb 16*1($np),@ri[1]
1040 mov 3*8($tp),@ri[3]
1041 lea 4*8($tp),$tp
1042 sbb 16*2($np),@ri[2]
1043 mov @ri[0],8*0($rp)
1044 sbb 16*3($np),@ri[3]
1045 lea 16*4($np),$np
1046 mov @ri[1],8*1($rp)
1047 mov @ri[2],8*2($rp)
1048 mov @ri[3],8*3($rp)
1049 lea 8*4($rp),$rp
1050
1051 inc $num
1052 jnz .Lsub4x
1053
1054 ret
1055___
1056}
1057$code.=<<___;
1058.size mul4x_internal,.-mul4x_internal
1059___
1060}}}
1061 {{{
######################################################################
# void bn_power5(
my $rptr="%rdi";	# BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# const void *table,
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0,
my $num ="%r9";		# int num, has to be divisible by 8
			# int pwr);
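# (Hedged summary: with all operands kept in Montgomery form, bn_power5
# squares a[] five times and then multiplies by the entry gathered from
# the powers table, i.e. roughly
#	r = a^(2^5) * table[pwr] mod n
# using the same constant-time scatter/gather scheme as
# bn_mul_mont_gather5.)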
Adam Langleyd9e397b2015-01-22 14:27:53 -08001071
1072my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
1073my @A0=("%r10","%r11");
1074my @A1=("%r12","%r13");
1075my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
1076
1077$code.=<<___;
1078.globl bn_power5
1079.type bn_power5,\@function,6
1080.align 32
1081bn_power5:
Robert Sloana94fe052017-02-21 08:49:28 -08001082.cfi_startproc
1083 mov %rsp,%rax
1084.cfi_def_cfa_register %rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001085___
1086$code.=<<___ if ($addx);
Robert Sloan8ff03552017-06-14 12:40:58 -07001087 leaq OPENSSL_ia32cap_P(%rip),%r11
1088 mov 8(%r11),%r11d
David Benjamin4969cc92016-04-22 15:02:23 -04001089 and \$0x80108,%r11d
1090 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
Adam Langleyd9e397b2015-01-22 14:27:53 -08001091 je .Lpowerx5_enter
1092___
1093$code.=<<___;
Adam Langleyd9e397b2015-01-22 14:27:53 -08001094 push %rbx
Robert Sloana94fe052017-02-21 08:49:28 -08001095.cfi_push %rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -08001096 push %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08001097.cfi_push %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08001098 push %r12
Robert Sloana94fe052017-02-21 08:49:28 -08001099.cfi_push %r12
Adam Langleyd9e397b2015-01-22 14:27:53 -08001100 push %r13
Robert Sloana94fe052017-02-21 08:49:28 -08001101.cfi_push %r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08001102 push %r14
Robert Sloana94fe052017-02-21 08:49:28 -08001103.cfi_push %r14
Adam Langleyd9e397b2015-01-22 14:27:53 -08001104 push %r15
Robert Sloana94fe052017-02-21 08:49:28 -08001105.cfi_push %r15
1106.Lpower5_prologue:
David Benjamin4969cc92016-04-22 15:02:23 -04001107
Adam Langleyd9e397b2015-01-22 14:27:53 -08001108 shl \$3,${num}d # convert $num to bytes
David Benjamin4969cc92016-04-22 15:02:23 -04001109 lea ($num,$num,2),%r10d # 3*$num
Adam Langleyd9e397b2015-01-22 14:27:53 -08001110 neg $num
1111 mov ($n0),$n0 # *n0
1112
	##############################################################
	# Ensure that the stack frame doesn't alias with $rptr+3*$num
	# modulo 4096, which covers ret[num], am[num] and n[num]
	# (see bn_exp.c). This is done to allow the memory disambiguation
	# logic to do its magic. [Extra 256 bytes is for the power mask
	# calculated from the 7th argument, the index.]
	#
	lea	-320(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	sub	$rptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lpwr_sp_alt
	sub	%r11,%rbp		# align with $aptr
	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
	jmp	.Lpwr_sp_done

.align	32
.Lpwr_sp_alt:
	lea	4096-320(,$num,2),%r10
	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rbp
.Lpwr_sp_done:
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lpwr_page_walk
	jmp	.Lpwr_page_walk_done

.Lpwr_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lpwr_page_walk
.Lpwr_page_walk_done:

	mov	$num,%r10
	neg	$num

	##############################################################
	# Stack layout
	#
	# +0	saved $num, used in reduction section
	# +8	&t[2*$num], used in reduction section
	# +32	saved *n0
	# +40	saved %rsp
	# +48	t[2*$num]
	#
	mov	$n0,  32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
.Lpower5_body:
	movq	$rptr,%xmm1		# save $rptr, used in sqr8x
	movq	$nptr,%xmm2		# save $nptr
	movq	%r10, %xmm3		# -$num, used in sqr8x
	movq	$bptr,%xmm4

	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal

	movq	%xmm2,$nptr
	movq	%xmm4,$bptr
	mov	$aptr,$rptr
	mov	40(%rsp),%rax
	lea	32(%rsp),$n0

	call	mul4x_internal

	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpower5_epilogue:
	ret
.cfi_endproc
.size	bn_power5,.-bn_power5

.globl	bn_sqr8x_internal
.hidden	bn_sqr8x_internal
.type	bn_sqr8x_internal,\@abi-omnipotent
.align	32
bn_sqr8x_internal:
__bn_sqr8x_internal:
	##############################################################
	# Squaring part:
	#
	# a) multiply-n-add everything but a[i]*a[i];
	# b) shift result of a) by 1 to the left and accumulate
	#    a[i]*a[i] products;
	#
	##############################################################
	#	a[1]a[0]
	#	a[2]a[0]
	#	a[3]a[0]
	#	a[2]a[1]
	#	a[4]a[0]
	#	a[3]a[1]
	#	a[5]a[0]
	#	a[4]a[1]
	#	a[3]a[2]
	#	a[6]a[0]
	#	a[5]a[1]
	#	a[4]a[2]
	#	a[7]a[0]
	#	a[6]a[1]
	#	a[5]a[2]
	#	a[4]a[3]
	#	a[7]a[1]
	#	a[6]a[2]
	#	a[5]a[3]
	#	a[7]a[2]
	#	a[6]a[3]
	#	a[5]a[4]
	#	a[7]a[3]
	#	a[6]a[4]
	#	a[7]a[4]
	#	a[6]a[5]
	#	a[7]a[5]
	#	a[7]a[6]
	#	a[1]a[0]
	#	a[2]a[0]
	#	a[3]a[0]
	#	a[4]a[0]
	#	a[5]a[0]
	#	a[6]a[0]
	#	a[7]a[0]
	#	a[2]a[1]
	#	a[3]a[1]
	#	a[4]a[1]
	#	a[5]a[1]
	#	a[6]a[1]
	#	a[7]a[1]
	#	a[3]a[2]
	#	a[4]a[2]
	#	a[5]a[2]
	#	a[6]a[2]
	#	a[7]a[2]
	#	a[4]a[3]
	#	a[5]a[3]
	#	a[6]a[3]
	#	a[7]a[3]
	#	a[5]a[4]
	#	a[6]a[4]
	#	a[7]a[4]
	#	a[6]a[5]
	#	a[7]a[5]
	#	a[7]a[6]
	#	a[0]a[0]
	#	a[1]a[1]
	#	a[2]a[2]
	#	a[3]a[3]
	#	a[4]a[4]
	#	a[5]a[5]
	#	a[6]a[6]
	#	a[7]a[7]

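	# A rough scalar sketch of steps a) and b) above (illustrative only):
	#
	#	/* a) off-diagonal products */
	#	for (i = 0; i < num; i++)
	#		for (j = i+1; j < num; j++)
	#			t[i+j] += a[i]*a[j];	/* with carry propagation */
	#	/* b) double, then fold in the squares a[i]*a[i] */
	#	t <<= 1;
	#	for (i = 0; i < num; i++)
	#		t[2*i..2*i+1] += a[i]*a[i];
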
	lea	32(%r10),$i		# $i=-($num-32)
	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]

	mov	$num,$j			# $j=$num

					# comments apply to $num==8 case
	mov	-32($aptr,$i),$a0	# a[0]
	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
	mov	-24($aptr,$i),%rax	# a[1]
	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
	mov	-16($aptr,$i),$ai	# a[2]
	mov	%rax,$a1

	mul	$a0			# a[1]*a[0]
	mov	%rax,$A0[0]		# a[1]*a[0]
	mov	$ai,%rax		# a[2]
	mov	%rdx,$A0[1]
	mov	$A0[0],-24($tptr,$i)	# t[1]

	mul	$a0			# a[2]*a[0]
	add	%rax,$A0[1]
	mov	$ai,%rax
	adc	\$0,%rdx
	mov	$A0[1],-16($tptr,$i)	# t[2]
	mov	%rdx,$A0[0]


	mov	-8($aptr,$i),$ai	# a[3]
	mul	$a1			# a[2]*a[1]
	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
	mov	$ai,%rax
	mov	%rdx,$A1[1]

	lea	($i),$j
	mul	$a0			# a[3]*a[0]
	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
	mov	$ai,%rax
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]
	add	$A1[0],$A0[0]
	adc	\$0,$A0[1]
	mov	$A0[0],-8($tptr,$j)	# t[3]
	jmp	.Lsqr4x_1st

.align	32
.Lsqr4x_1st:
	mov	($aptr,$j),$ai		# a[4]
	mul	$a1			# a[3]*a[1]
	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
	mov	$ai,%rax
	mov	%rdx,$A1[0]
	adc	\$0,$A1[0]

	mul	$a0			# a[4]*a[0]
	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
	mov	$ai,%rax		# a[3]
	mov	8($aptr,$j),$ai		# a[5]
	mov	%rdx,$A0[0]
	adc	\$0,$A0[0]
	add	$A1[1],$A0[1]
	adc	\$0,$A0[0]


	mul	$a1			# a[4]*a[3]
	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
	mov	$ai,%rax
	mov	$A0[1],($tptr,$j)	# t[4]
	mov	%rdx,$A1[1]
	adc	\$0,$A1[1]

	mul	$a0			# a[5]*a[2]
	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
	mov	$ai,%rax
	mov	16($aptr,$j),$ai	# a[6]
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]
	add	$A1[0],$A0[0]
	adc	\$0,$A0[1]

	mul	$a1			# a[5]*a[3]
	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
	mov	$ai,%rax
	mov	$A0[0],8($tptr,$j)	# t[5]
	mov	%rdx,$A1[0]
	adc	\$0,$A1[0]

	mul	$a0			# a[6]*a[2]
	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
	mov	$ai,%rax		# a[3]
	mov	24($aptr,$j),$ai	# a[7]
	mov	%rdx,$A0[0]
	adc	\$0,$A0[0]
	add	$A1[1],$A0[1]
	adc	\$0,$A0[0]


	mul	$a1			# a[6]*a[5]
	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
	mov	$ai,%rax
	mov	$A0[1],16($tptr,$j)	# t[6]
	mov	%rdx,$A1[1]
	adc	\$0,$A1[1]
	lea	32($j),$j

	mul	$a0			# a[7]*a[4]
	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
	mov	$ai,%rax
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]
	add	$A1[0],$A0[0]
	adc	\$0,$A0[1]
	mov	$A0[0],-8($tptr,$j)	# t[7]

	cmp	\$0,$j
	jne	.Lsqr4x_1st

	mul	$a1			# a[7]*a[5]
	add	%rax,$A1[1]
	lea	16($i),$i
	adc	\$0,%rdx
	add	$A0[1],$A1[1]
	adc	\$0,%rdx

	mov	$A1[1],($tptr)		# t[8]
	mov	%rdx,$A1[0]
	mov	%rdx,8($tptr)		# t[9]
	jmp	.Lsqr4x_outer

.align	32
.Lsqr4x_outer:				# comments apply to $num==6 case
	mov	-32($aptr,$i),$a0	# a[0]
	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
	mov	-24($aptr,$i),%rax	# a[1]
	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
	mov	-16($aptr,$i),$ai	# a[2]
	mov	%rax,$a1

	mul	$a0			# a[1]*a[0]
	mov	-24($tptr,$i),$A0[0]	# t[1]
	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
	mov	$ai,%rax		# a[2]
	adc	\$0,%rdx
	mov	$A0[0],-24($tptr,$i)	# t[1]
	mov	%rdx,$A0[1]

	mul	$a0			# a[2]*a[0]
	add	%rax,$A0[1]
	mov	$ai,%rax
	adc	\$0,%rdx
	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
	mov	%rdx,$A0[0]
	adc	\$0,$A0[0]
	mov	$A0[1],-16($tptr,$i)	# t[2]

	xor	$A1[0],$A1[0]

	mov	-8($aptr,$i),$ai	# a[3]
	mul	$a1			# a[2]*a[1]
	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
	mov	$ai,%rax
	adc	\$0,%rdx
	add	-8($tptr,$i),$A1[0]
	mov	%rdx,$A1[1]
	adc	\$0,$A1[1]

	mul	$a0			# a[3]*a[0]
	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
	mov	$ai,%rax
	adc	\$0,%rdx
	add	$A1[0],$A0[0]
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]
	mov	$A0[0],-8($tptr,$i)	# t[3]

	lea	($i),$j
	jmp	.Lsqr4x_inner

.align	32
.Lsqr4x_inner:
	mov	($aptr,$j),$ai		# a[4]
	mul	$a1			# a[3]*a[1]
	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
	mov	$ai,%rax
	mov	%rdx,$A1[0]
	adc	\$0,$A1[0]
	add	($tptr,$j),$A1[1]
	adc	\$0,$A1[0]

	.byte	0x67
	mul	$a0			# a[4]*a[0]
	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
	mov	$ai,%rax		# a[3]
	mov	8($aptr,$j),$ai		# a[5]
	mov	%rdx,$A0[0]
	adc	\$0,$A0[0]
	add	$A1[1],$A0[1]
	adc	\$0,$A0[0]

	mul	$a1			# a[4]*a[3]
	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
	mov	$A0[1],($tptr,$j)	# t[4]
	mov	$ai,%rax
	mov	%rdx,$A1[1]
	adc	\$0,$A1[1]
	add	8($tptr,$j),$A1[0]
	lea	16($j),$j		# j++
	adc	\$0,$A1[1]

	mul	$a0			# a[5]*a[2]
	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
	mov	$ai,%rax
	adc	\$0,%rdx
	add	$A1[0],$A0[0]
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]
	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below

	cmp	\$0,$j
	jne	.Lsqr4x_inner

	.byte	0x67
	mul	$a1			# a[5]*a[3]
	add	%rax,$A1[1]
	adc	\$0,%rdx
	add	$A0[1],$A1[1]
	adc	\$0,%rdx

	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
	mov	%rdx,$A1[0]
	mov	%rdx,8($tptr)		# t[7], "preloaded t[3]" below

	add	\$16,$i
	jnz	.Lsqr4x_outer

					# comments apply to $num==4 case
	mov	-32($aptr),$a0		# a[0]
	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
	mov	-24($aptr),%rax		# a[1]
	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
	mov	-16($aptr),$ai		# a[2]
	mov	%rax,$a1

	mul	$a0			# a[1]*a[0]
	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
	mov	$ai,%rax		# a[2]
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]

	mul	$a0			# a[2]*a[0]
	add	%rax,$A0[1]
	mov	$ai,%rax
	mov	$A0[0],-24($tptr)	# t[1]
	mov	%rdx,$A0[0]
	adc	\$0,$A0[0]
	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
	mov	-8($aptr),$ai		# a[3]
	adc	\$0,$A0[0]

	mul	$a1			# a[2]*a[1]
	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
	mov	$ai,%rax
	mov	$A0[1],-16($tptr)	# t[2]
	mov	%rdx,$A1[1]
	adc	\$0,$A1[1]

	mul	$a0			# a[3]*a[0]
	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
	mov	$ai,%rax
	mov	%rdx,$A0[1]
	adc	\$0,$A0[1]
	add	$A1[0],$A0[0]
	adc	\$0,$A0[1]
	mov	$A0[0],-8($tptr)	# t[3]

	mul	$a1			# a[3]*a[1]
	add	%rax,$A1[1]
	mov	-16($aptr),%rax		# a[2]
	adc	\$0,%rdx
	add	$A0[1],$A1[1]
	adc	\$0,%rdx

	mov	$A1[1],($tptr)		# t[4]
	mov	%rdx,$A1[0]
	mov	%rdx,8($tptr)		# t[5]

	mul	$ai			# a[2]*a[3]
___
{
my ($shift,$carry)=($a0,$a1);
my @S=(@A1,$ai,$n0);
$code.=<<___;
	add	\$16,$i
	xor	$shift,$shift
	sub	$num,$i			# $i=16-$num
	xor	$carry,$carry

	add	$A1[0],%rax		# t[5]
	adc	\$0,%rdx
	mov	%rax,8($tptr)		# t[5]
	mov	%rdx,16($tptr)		# t[6]
	mov	$carry,24($tptr)	# t[7]

	mov	-16($aptr,$i),%rax	# a[0]
	lea	48+8(%rsp),$tptr
	xor	$A0[0],$A0[0]		# t[0]
	mov	8($tptr),$A0[1]		# t[1]

	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[0],($tptr)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
	mov	$S[1],8($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[2]
	mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[2],16($tptr)
	adc	%rdx,$S[3]
	lea	16($i),$i
	mov	$S[3],24($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	lea	64($tptr),$tptr
	jmp	.Lsqr4x_shift_n_add

.align	32
.Lsqr4x_shift_n_add:
	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[0],-32($tptr)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
	mov	$S[1],-24($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	mov	0($tptr),$A0[0]		# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	8($tptr),$A0[1]		# t[2*i+2+1]	# prefetch
	adc	%rax,$S[2]
	mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[2],-16($tptr)
	adc	%rdx,$S[3]

	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	mov	$S[3],-8($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[0],0($tptr)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
	mov	$S[1],8($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[2]
	mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[2],16($tptr)
	adc	%rdx,$S[3]
	mov	$S[3],24($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	lea	64($tptr),$tptr
	add	\$32,$i
	jnz	.Lsqr4x_shift_n_add

	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	.byte	0x67
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	mov	-8($aptr),%rax		# a[i+1]	# prefetch
	mov	$S[0],-32($tptr)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
	mov	$S[1],-24($tptr)
	sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	adc	%rax,$S[2]
	adc	%rdx,$S[3]
	mov	$S[2],-16($tptr)
	mov	$S[3],-8($tptr)
___
}
######################################################################
# Montgomery reduction part, "word-by-word" algorithm.
#
# This new path is inspired by multiple submissions from Intel, by
# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
# Vinodh Gopal...
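#
# A minimal sketch of the word-by-word approach (hedged, illustrative only;
# t[] is the 2*num-word product and n0 = -n[0]^-1 mod 2^64):
#
#	for (i = 0; i < num; i++) {
#		m = t[i] * n0;				/* mod 2^64 */
#		carry = 0;
#		for (j = 0; j < num; j++)
#			t[i+j] = muladd(t[i+j], m, n[j], &carry);
#		propagate(carry, &t[i+num]);		/* into the upper words */
#	}
#	/* result is t[num..2*num-1]; subtract n once if it is still >= n */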
{
my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");

$code.=<<___;
	movq	%xmm2,$nptr
__bn_sqr8x_reduction:
	xor	%rax,%rax
	lea	($nptr,$num),%rcx	# end of n[]
	lea	48+8(%rsp,$num,2),%rdx	# end of t[] buffer
	mov	%rcx,0+8(%rsp)
	lea	48+8(%rsp,$num),$tptr	# end of initial t[] window
	mov	%rdx,8+8(%rsp)
	neg	$num
	jmp	.L8x_reduction_loop

.align	32
.L8x_reduction_loop:
	lea	($tptr,$num),$tptr	# start of current t[] window
	.byte	0x66
	mov	8*0($tptr),$m0
	mov	8*1($tptr),%r9
	mov	8*2($tptr),%r10
	mov	8*3($tptr),%r11
	mov	8*4($tptr),%r12
	mov	8*5($tptr),%r13
	mov	8*6($tptr),%r14
	mov	8*7($tptr),%r15
	mov	%rax,(%rdx)		# store top-most carry bit
	lea	8*8($tptr),$tptr

	.byte	0x67
	mov	$m0,%r8
	imulq	32+8(%rsp),$m0		# n0*a[0]
	mov	8*0($nptr),%rax		# n[0]
	mov	\$8,%ecx
	jmp	.L8x_reduce

.align	32
.L8x_reduce:
	mulq	$m0
	mov	8*1($nptr),%rax		# n[1]
	neg	%r8
	mov	%rdx,%r8
	adc	\$0,%r8

	mulq	$m0
	add	%rax,%r9
	mov	8*2($nptr),%rax
	adc	\$0,%rdx
	add	%r9,%r8
	mov	$m0,48-8+8(%rsp,%rcx,8)	# put aside n0*a[i]
	mov	%rdx,%r9
	adc	\$0,%r9

	mulq	$m0
	add	%rax,%r10
	mov	8*3($nptr),%rax
	adc	\$0,%rdx
	add	%r10,%r9
	mov	32+8(%rsp),$carry	# pull n0, borrow $carry
	mov	%rdx,%r10
	adc	\$0,%r10

	mulq	$m0
	add	%rax,%r11
	mov	8*4($nptr),%rax
	adc	\$0,%rdx
	imulq	%r8,$carry		# modulo-scheduled
	add	%r11,%r10
	mov	%rdx,%r11
	adc	\$0,%r11

	mulq	$m0
	add	%rax,%r12
	mov	8*5($nptr),%rax
	adc	\$0,%rdx
	add	%r12,%r11
	mov	%rdx,%r12
	adc	\$0,%r12

	mulq	$m0
	add	%rax,%r13
	mov	8*6($nptr),%rax
	adc	\$0,%rdx
	add	%r13,%r12
	mov	%rdx,%r13
	adc	\$0,%r13

	mulq	$m0
	add	%rax,%r14
	mov	8*7($nptr),%rax
	adc	\$0,%rdx
	add	%r14,%r13
	mov	%rdx,%r14
	adc	\$0,%r14

	mulq	$m0
	mov	$carry,$m0		# n0*a[i]
	add	%rax,%r15
	mov	8*0($nptr),%rax		# n[0]
	adc	\$0,%rdx
	add	%r15,%r14
	mov	%rdx,%r15
	adc	\$0,%r15

	dec	%ecx
	jnz	.L8x_reduce

	lea	8*8($nptr),$nptr
	xor	%rax,%rax
	mov	8+8(%rsp),%rdx		# pull end of t[]
	cmp	0+8(%rsp),$nptr		# end of n[]?
	jae	.L8x_no_tail

	.byte	0x66
	add	8*0($tptr),%r8
	adc	8*1($tptr),%r9
	adc	8*2($tptr),%r10
	adc	8*3($tptr),%r11
	adc	8*4($tptr),%r12
	adc	8*5($tptr),%r13
	adc	8*6($tptr),%r14
	adc	8*7($tptr),%r15
	sbb	$carry,$carry		# top carry

	mov	48+56+8(%rsp),$m0	# pull n0*a[0]
	mov	\$8,%ecx
	mov	8*0($nptr),%rax
	jmp	.L8x_tail

.align	32
.L8x_tail:
	mulq	$m0
	add	%rax,%r8
	mov	8*1($nptr),%rax
	mov	%r8,($tptr)		# save result
	mov	%rdx,%r8
	adc	\$0,%r8

	mulq	$m0
	add	%rax,%r9
	mov	8*2($nptr),%rax
	adc	\$0,%rdx
	add	%r9,%r8
	lea	8($tptr),$tptr		# $tptr++
	mov	%rdx,%r9
1896 mov %rdx,%r9
1897 adc \$0,%r9
1898
1899 mulq $m0
1900 add %rax,%r10
David Benjamin4969cc92016-04-22 15:02:23 -04001901 mov 8*3($nptr),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001902 adc \$0,%rdx
1903 add %r10,%r9
1904 mov %rdx,%r10
1905 adc \$0,%r10
1906
1907 mulq $m0
1908 add %rax,%r11
David Benjamin4969cc92016-04-22 15:02:23 -04001909 mov 8*4($nptr),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001910 adc \$0,%rdx
1911 add %r11,%r10
1912 mov %rdx,%r11
1913 adc \$0,%r11
1914
1915 mulq $m0
1916 add %rax,%r12
David Benjamin4969cc92016-04-22 15:02:23 -04001917 mov 8*5($nptr),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001918 adc \$0,%rdx
1919 add %r12,%r11
1920 mov %rdx,%r12
1921 adc \$0,%r12
1922
1923 mulq $m0
1924 add %rax,%r13
David Benjamin4969cc92016-04-22 15:02:23 -04001925 mov 8*6($nptr),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001926 adc \$0,%rdx
1927 add %r13,%r12
1928 mov %rdx,%r13
1929 adc \$0,%r13
1930
1931 mulq $m0
1932 add %rax,%r14
David Benjamin4969cc92016-04-22 15:02:23 -04001933 mov 8*7($nptr),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001934 adc \$0,%rdx
1935 add %r14,%r13
1936 mov %rdx,%r14
1937 adc \$0,%r14
1938
1939 mulq $m0
1940 mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i]
1941 add %rax,%r15
1942 adc \$0,%rdx
1943 add %r15,%r14
David Benjamin4969cc92016-04-22 15:02:23 -04001944 mov 8*0($nptr),%rax # pull n[0]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001945 mov %rdx,%r15
1946 adc \$0,%r15
1947
1948 dec %ecx
1949 jnz .L8x_tail
1950
David Benjamin4969cc92016-04-22 15:02:23 -04001951 lea 8*8($nptr),$nptr
Adam Langleyd9e397b2015-01-22 14:27:53 -08001952 mov 8+8(%rsp),%rdx # pull end of t[]
1953 cmp 0+8(%rsp),$nptr # end of n[]?
1954 jae .L8x_tail_done # break out of loop
1955
1956 mov 48+56+8(%rsp),$m0 # pull n0*a[0]
1957 neg $carry
1958 mov 8*0($nptr),%rax # pull n[0]
1959 adc 8*0($tptr),%r8
1960 adc 8*1($tptr),%r9
1961 adc 8*2($tptr),%r10
1962 adc 8*3($tptr),%r11
1963 adc 8*4($tptr),%r12
1964 adc 8*5($tptr),%r13
1965 adc 8*6($tptr),%r14
1966 adc 8*7($tptr),%r15
1967 sbb $carry,$carry # top carry
1968
1969 mov \$8,%ecx
1970 jmp .L8x_tail
1971
1972.align 32
1973.L8x_tail_done:
Robert Sloan4d1ac502017-02-06 08:36:14 -08001974 xor %rax,%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001975 add (%rdx),%r8 # can this overflow?
Adam Langley4139edb2016-01-13 15:00:54 -08001976 adc \$0,%r9
1977 adc \$0,%r10
1978 adc \$0,%r11
1979 adc \$0,%r12
1980 adc \$0,%r13
1981 adc \$0,%r14
Robert Sloan4d1ac502017-02-06 08:36:14 -08001982 adc \$0,%r15
1983 adc \$0,%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001984
1985 neg $carry
1986.L8x_no_tail:
1987 adc 8*0($tptr),%r8
1988 adc 8*1($tptr),%r9
1989 adc 8*2($tptr),%r10
1990 adc 8*3($tptr),%r11
1991 adc 8*4($tptr),%r12
1992 adc 8*5($tptr),%r13
1993 adc 8*6($tptr),%r14
1994 adc 8*7($tptr),%r15
1995 adc \$0,%rax # top-most carry
David Benjamin4969cc92016-04-22 15:02:23 -04001996 mov -8($nptr),%rcx # np[num-1]
Adam Langleyd9e397b2015-01-22 14:27:53 -08001997 xor $carry,$carry
1998
1999 movq %xmm2,$nptr # restore $nptr
2000
2001 mov %r8,8*0($tptr) # store top 512 bits
2002 mov %r9,8*1($tptr)
2003 movq %xmm3,$num # $num is %r9, can't be moved upwards
2004 mov %r10,8*2($tptr)
2005 mov %r11,8*3($tptr)
2006 mov %r12,8*4($tptr)
2007 mov %r13,8*5($tptr)
2008 mov %r14,8*6($tptr)
2009 mov %r15,8*7($tptr)
2010 lea 8*8($tptr),$tptr
2011
2012 cmp %rdx,$tptr # end of t[]?
2013 jb .L8x_reduction_loop
David Benjamin4969cc92016-04-22 15:02:23 -04002014 ret
2015.size bn_sqr8x_internal,.-bn_sqr8x_internal
Adam Langleyd9e397b2015-01-22 14:27:53 -08002016___
2017}
2018##############################################################
2019# Post-condition, 4x unrolled
2020#
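# Functionally this is a constant-time conditional subtraction of the modulus:
# the code below realizes it as an addition of the two's complement
# (dec/not/and/adc), so the same loads and stores happen whether or not the
# subtraction takes effect.  A hypothetical C equivalent (names invented for
# illustration):
#
#	/* mask is all-ones when n[] must be subtracted, zero otherwise */
#	static void cond_sub_ref(uint64_t *r, const uint64_t *t,
#	                         const uint64_t *n, uint64_t mask, int num)
#	{
#		uint64_t borrow = 0;
#		for (int i = 0; i < num; i++) {
#			uint64_t sub = n[i] & mask;	/* 0 or n[i] */
#			uint64_t lo  = t[i] - sub;
#			uint64_t b1  = lo > t[i];
#			uint64_t out = lo - borrow;
#			uint64_t b2  = out > lo;
#			r[i] = out;
#			borrow = b1 | b2;
#		}
#	}
#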
2021{
2022my ($tptr,$nptr)=("%rbx","%rbp");
2023$code.=<<___;
David Benjamin4969cc92016-04-22 15:02:23 -04002024.type __bn_post4x_internal,\@abi-omnipotent
Adam Langleyd9e397b2015-01-22 14:27:53 -08002025.align 32
David Benjamin4969cc92016-04-22 15:02:23 -04002026__bn_post4x_internal:
2027 mov 8*0($nptr),%r12
2028 lea (%rdi,$num),$tptr # %rdi was $tptr above
2029 mov $num,%rcx
2030 movq %xmm1,$rptr # restore $rptr
2031 neg %rax
2032 movq %xmm1,$aptr # prepare for back-to-back call
2033 sar \$3+2,%rcx
2034 dec %r12 # so that after 'not' we get -n[0]
2035 xor %r10,%r10
2036 mov 8*1($nptr),%r13
2037 mov 8*2($nptr),%r14
2038 mov 8*3($nptr),%r15
2039 jmp .Lsqr4x_sub_entry
2040
2041.align 16
Adam Langleyd9e397b2015-01-22 14:27:53 -08002042.Lsqr4x_sub:
David Benjamin4969cc92016-04-22 15:02:23 -04002043 mov 8*0($nptr),%r12
2044 mov 8*1($nptr),%r13
2045 mov 8*2($nptr),%r14
2046 mov 8*3($nptr),%r15
2047.Lsqr4x_sub_entry:
2048 lea 8*4($nptr),$nptr
2049 not %r12
2050 not %r13
2051 not %r14
2052 not %r15
2053 and %rax,%r12
2054 and %rax,%r13
2055 and %rax,%r14
2056 and %rax,%r15
2057
2058 neg %r10 # mov %r10,%cf
2059 adc 8*0($tptr),%r12
2060 adc 8*1($tptr),%r13
2061 adc 8*2($tptr),%r14
2062 adc 8*3($tptr),%r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002063 mov %r12,8*0($rptr)
David Benjamin4969cc92016-04-22 15:02:23 -04002064 lea 8*4($tptr),$tptr
Adam Langleyd9e397b2015-01-22 14:27:53 -08002065 mov %r13,8*1($rptr)
David Benjamin4969cc92016-04-22 15:02:23 -04002066 sbb %r10,%r10 # mov %cf,%r10
Adam Langleyd9e397b2015-01-22 14:27:53 -08002067 mov %r14,8*2($rptr)
2068 mov %r15,8*3($rptr)
2069 lea 8*4($rptr),$rptr
2070
2071 inc %rcx # pass %cf
2072 jnz .Lsqr4x_sub
David Benjamin4969cc92016-04-22 15:02:23 -04002073
Adam Langleyd9e397b2015-01-22 14:27:53 -08002074 mov $num,%r10 # prepare for back-to-back call
Robert Sloana94fe052017-02-21 08:49:28 -08002075 neg $num # restore $num
Adam Langleyd9e397b2015-01-22 14:27:53 -08002076 ret
David Benjamin4969cc92016-04-22 15:02:23 -04002077.size __bn_post4x_internal,.-__bn_post4x_internal
Adam Langleyd9e397b2015-01-22 14:27:53 -08002078___
David Benjamin4969cc92016-04-22 15:02:23 -04002079}
Adam Langleyd9e397b2015-01-22 14:27:53 -08002080{
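# bn_from_montgomery() converts out of Montgomery representation, i.e. it
# returns a*R^-1 mod n for R = 2^(64*num): the input is copied into the lower
# half of t[], the upper half is zeroed (.Lmul_by_1 below), and a single
# Montgomery reduction plus post-condition finishes the job.  A hypothetical
# sketch (helper names invented, mont_reduce_ref as in the reference earlier
# in this file; assumes <string.h> and C99 VLAs):
#
#	static void from_mont_ref(uint64_t *r, const uint64_t *a,
#	                          const uint64_t *n, uint64_t n0, int num)
#	{
#		uint64_t t[2 * num];
#		memset(t, 0, sizeof(t));
#		memcpy(t, a, num * sizeof(uint64_t));	/* high half stays 0 */
#		mont_reduce_ref(t, n, n0, num);
#		/* result is t[num..2*num-1], up to one conditional
#		 * subtraction of n[] (the post-condition) */
#		memcpy(r, t + num, num * sizeof(uint64_t));
#	}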
2081$code.=<<___;
2082.globl bn_from_montgomery
2083.type bn_from_montgomery,\@abi-omnipotent
2084.align 32
2085bn_from_montgomery:
2086 testl \$7,`($win64?"48(%rsp)":"%r9d")`
2087 jz bn_from_mont8x
2088 xor %eax,%eax
2089 ret
2090.size bn_from_montgomery,.-bn_from_montgomery
2091
2092.type bn_from_mont8x,\@function,6
2093.align 32
2094bn_from_mont8x:
Robert Sloana94fe052017-02-21 08:49:28 -08002095.cfi_startproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08002096 .byte 0x67
2097 mov %rsp,%rax
Robert Sloana94fe052017-02-21 08:49:28 -08002098.cfi_def_cfa_register %rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08002099 push %rbx
Robert Sloana94fe052017-02-21 08:49:28 -08002100.cfi_push %rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -08002101 push %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08002102.cfi_push %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08002103 push %r12
Robert Sloana94fe052017-02-21 08:49:28 -08002104.cfi_push %r12
Adam Langleyd9e397b2015-01-22 14:27:53 -08002105 push %r13
Robert Sloana94fe052017-02-21 08:49:28 -08002106.cfi_push %r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08002107 push %r14
Robert Sloana94fe052017-02-21 08:49:28 -08002108.cfi_push %r14
Adam Langleyd9e397b2015-01-22 14:27:53 -08002109 push %r15
Robert Sloana94fe052017-02-21 08:49:28 -08002110.cfi_push %r15
2111.Lfrom_prologue:
David Benjamin4969cc92016-04-22 15:02:23 -04002112
Adam Langleyd9e397b2015-01-22 14:27:53 -08002113 shl \$3,${num}d # convert $num to bytes
David Benjamin4969cc92016-04-22 15:02:23 -04002114 lea ($num,$num,2),%r10 # 3*$num in bytes
Adam Langleyd9e397b2015-01-22 14:27:53 -08002115 neg $num
2116 mov ($n0),$n0 # *n0
2117
2118 ##############################################################
David Benjamin4969cc92016-04-22 15:02:23 -04002119 # Ensure that stack frame doesn't alias with $rptr+3*$num
2120 # modulo 4096, which covers ret[num], am[num] and n[num]
2121 # (see bn_exp.c). The stack is allocated so that it aligns with
2122 # bn_power5's frame, and as bn_from_montgomery happens to be the
2123 # last operation, we use the opportunity to cleanse it.
Adam Langleyd9e397b2015-01-22 14:27:53 -08002124 #
David Benjamin4969cc92016-04-22 15:02:23 -04002125 lea -320(%rsp,$num,2),%r11
Robert Sloana94fe052017-02-21 08:49:28 -08002126 mov %rsp,%rbp
David Benjamin4969cc92016-04-22 15:02:23 -04002127 sub $rptr,%r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08002128 and \$4095,%r11
2129 cmp %r11,%r10
2130 jb .Lfrom_sp_alt
Robert Sloana94fe052017-02-21 08:49:28 -08002131 sub %r11,%rbp # align with $aptr
2132 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002133 jmp .Lfrom_sp_done
2134
2135.align 32
2136.Lfrom_sp_alt:
David Benjamin4969cc92016-04-22 15:02:23 -04002137 lea 4096-320(,$num,2),%r10
Robert Sloana94fe052017-02-21 08:49:28 -08002138 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002139 sub %r10,%r11
2140 mov \$0,%r10
2141 cmovc %r10,%r11
Robert Sloana94fe052017-02-21 08:49:28 -08002142 sub %r11,%rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08002143.Lfrom_sp_done:
Robert Sloana94fe052017-02-21 08:49:28 -08002144 and \$-64,%rbp
2145 mov %rsp,%r11
2146 sub %rbp,%r11
2147 and \$-4096,%r11
2148 lea (%rbp,%r11),%rsp
2149 mov (%rsp),%r10
2150 cmp %rbp,%rsp
2151 ja .Lfrom_page_walk
2152 jmp .Lfrom_page_walk_done
2153
2154.Lfrom_page_walk:
2155 lea -4096(%rsp),%rsp
2156 mov (%rsp),%r10
2157 cmp %rbp,%rsp
2158 ja .Lfrom_page_walk
2159.Lfrom_page_walk_done:
2160
2161 mov $num,%r10
Adam Langleyd9e397b2015-01-22 14:27:53 -08002162 neg $num
2163
2164 ##############################################################
2165 # Stack layout
2166 #
2167 # +0 saved $num, used in reduction section
2168 # +8 &t[2*$num], used in reduction section
2169 # +32 saved *n0
2170 # +40 saved %rsp
2171 # +48 t[2*$num]
2172 #
2173 mov $n0, 32(%rsp)
2174 mov %rax, 40(%rsp) # save original %rsp
Robert Sloana94fe052017-02-21 08:49:28 -08002175.cfi_cfa_expression %rsp+40,deref,+8
Adam Langleyd9e397b2015-01-22 14:27:53 -08002176.Lfrom_body:
2177 mov $num,%r11
2178 lea 48(%rsp),%rax
2179 pxor %xmm0,%xmm0
2180 jmp .Lmul_by_1
2181
2182.align 32
2183.Lmul_by_1:
2184 movdqu ($aptr),%xmm1
2185 movdqu 16($aptr),%xmm2
2186 movdqu 32($aptr),%xmm3
2187 movdqa %xmm0,(%rax,$num)
2188 movdqu 48($aptr),%xmm4
2189 movdqa %xmm0,16(%rax,$num)
2190 .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr
2191 movdqa %xmm1,(%rax)
2192 movdqa %xmm0,32(%rax,$num)
2193 movdqa %xmm2,16(%rax)
2194 movdqa %xmm0,48(%rax,$num)
2195 movdqa %xmm3,32(%rax)
2196 movdqa %xmm4,48(%rax)
2197 lea 64(%rax),%rax
2198 sub \$64,%r11
2199 jnz .Lmul_by_1
2200
2201 movq $rptr,%xmm1
2202 movq $nptr,%xmm2
2203 .byte 0x67
2204 mov $nptr,%rbp
2205 movq %r10, %xmm3 # -num
2206___
2207$code.=<<___ if ($addx);
Robert Sloan8ff03552017-06-14 12:40:58 -07002208 leaq OPENSSL_ia32cap_P(%rip),%r11
2209 mov 8(%r11),%r11d
David Benjamin4969cc92016-04-22 15:02:23 -04002210 and \$0x80108,%r11d
2211 cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1
Adam Langleyd9e397b2015-01-22 14:27:53 -08002212 jne .Lfrom_mont_nox
2213
2214 lea (%rax,$num),$rptr
David Benjamin4969cc92016-04-22 15:02:23 -04002215 call __bn_sqrx8x_reduction
2216 call __bn_postx4x_internal
Adam Langleyd9e397b2015-01-22 14:27:53 -08002217
2218 pxor %xmm0,%xmm0
2219 lea 48(%rsp),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08002220 jmp .Lfrom_mont_zero
2221
2222.align 32
2223.Lfrom_mont_nox:
2224___
2225$code.=<<___;
David Benjamin4969cc92016-04-22 15:02:23 -04002226 call __bn_sqr8x_reduction
2227 call __bn_post4x_internal
Adam Langleyd9e397b2015-01-22 14:27:53 -08002228
2229 pxor %xmm0,%xmm0
2230 lea 48(%rsp),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08002231 jmp .Lfrom_mont_zero
2232
2233.align 32
2234.Lfrom_mont_zero:
Robert Sloana94fe052017-02-21 08:49:28 -08002235 mov 40(%rsp),%rsi # restore %rsp
2236.cfi_def_cfa %rsi,8
Adam Langleyd9e397b2015-01-22 14:27:53 -08002237 movdqa %xmm0,16*0(%rax)
2238 movdqa %xmm0,16*1(%rax)
2239 movdqa %xmm0,16*2(%rax)
2240 movdqa %xmm0,16*3(%rax)
2241 lea 16*4(%rax),%rax
2242 sub \$32,$num
2243 jnz .Lfrom_mont_zero
2244
2245 mov \$1,%rax
2246 mov -48(%rsi),%r15
Robert Sloana94fe052017-02-21 08:49:28 -08002247.cfi_restore %r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002248 mov -40(%rsi),%r14
Robert Sloana94fe052017-02-21 08:49:28 -08002249.cfi_restore %r14
Adam Langleyd9e397b2015-01-22 14:27:53 -08002250 mov -32(%rsi),%r13
Robert Sloana94fe052017-02-21 08:49:28 -08002251.cfi_restore %r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08002252 mov -24(%rsi),%r12
Robert Sloana94fe052017-02-21 08:49:28 -08002253.cfi_restore %r12
Adam Langleyd9e397b2015-01-22 14:27:53 -08002254 mov -16(%rsi),%rbp
Robert Sloana94fe052017-02-21 08:49:28 -08002255.cfi_restore %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08002256 mov -8(%rsi),%rbx
Robert Sloana94fe052017-02-21 08:49:28 -08002257.cfi_restore %rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -08002258 lea (%rsi),%rsp
Robert Sloana94fe052017-02-21 08:49:28 -08002259.cfi_def_cfa_register %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -08002260.Lfrom_epilogue:
2261 ret
Robert Sloana94fe052017-02-21 08:49:28 -08002262.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08002263.size bn_from_mont8x,.-bn_from_mont8x
2264___
2265}
2266}}}
2267
2268if ($addx) {{{
2269my $bp="%rdx"; # restore original value
2270
2271$code.=<<___;
2272.type bn_mulx4x_mont_gather5,\@function,6
2273.align 32
2274bn_mulx4x_mont_gather5:
Robert Sloana94fe052017-02-21 08:49:28 -08002275.cfi_startproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08002276 mov %rsp,%rax
Robert Sloana94fe052017-02-21 08:49:28 -08002277.cfi_def_cfa_register %rax
2278.Lmulx4x_enter:
Adam Langleyd9e397b2015-01-22 14:27:53 -08002279 push %rbx
Robert Sloana94fe052017-02-21 08:49:28 -08002280.cfi_push %rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -08002281 push %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08002282.cfi_push %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08002283 push %r12
Robert Sloana94fe052017-02-21 08:49:28 -08002284.cfi_push %r12
Adam Langleyd9e397b2015-01-22 14:27:53 -08002285 push %r13
Robert Sloana94fe052017-02-21 08:49:28 -08002286.cfi_push %r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08002287 push %r14
Robert Sloana94fe052017-02-21 08:49:28 -08002288.cfi_push %r14
Adam Langleyd9e397b2015-01-22 14:27:53 -08002289 push %r15
Robert Sloana94fe052017-02-21 08:49:28 -08002290.cfi_push %r15
2291.Lmulx4x_prologue:
David Benjamin4969cc92016-04-22 15:02:23 -04002292
Adam Langleyd9e397b2015-01-22 14:27:53 -08002293 shl \$3,${num}d # convert $num to bytes
David Benjamin4969cc92016-04-22 15:02:23 -04002294 lea ($num,$num,2),%r10 # 3*$num in bytes
Adam Langleyd9e397b2015-01-22 14:27:53 -08002295 neg $num # -$num
2296 mov ($n0),$n0 # *n0
2297
2298 ##############################################################
David Benjamin4969cc92016-04-22 15:02:23 -04002299 # Ensure that stack frame doesn't alias with $rptr+3*$num
2300 # modulo 4096, which covers ret[num], am[num] and n[num]
2301 # (see bn_exp.c). This is done to allow memory disambiguation
2302 # logic do its magic. [Extra [num] is allocated in order
2303 # to align with bn_power5's frame, which is cleansed after
2304 # completing exponentiation. Extra 256 bytes is for power mask
2305 # calculated from 7th argument, the index.]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002306 #
David Benjamin4969cc92016-04-22 15:02:23 -04002307 lea -320(%rsp,$num,2),%r11
Robert Sloana94fe052017-02-21 08:49:28 -08002308 mov %rsp,%rbp
David Benjamin4969cc92016-04-22 15:02:23 -04002309 sub $rp,%r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08002310 and \$4095,%r11
2311 cmp %r11,%r10
2312 jb .Lmulx4xsp_alt
Robert Sloana94fe052017-02-21 08:49:28 -08002313 sub %r11,%rbp # align with $aptr
2314 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002315 jmp .Lmulx4xsp_done
2316
Adam Langleyd9e397b2015-01-22 14:27:53 -08002317.Lmulx4xsp_alt:
David Benjamin4969cc92016-04-22 15:02:23 -04002318 lea 4096-320(,$num,2),%r10
Robert Sloana94fe052017-02-21 08:49:28 -08002319 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002320 sub %r10,%r11
2321 mov \$0,%r10
2322 cmovc %r10,%r11
Robert Sloana94fe052017-02-21 08:49:28 -08002323 sub %r11,%rbp
2324.Lmulx4xsp_done:
2325 and \$-64,%rbp # ensure alignment
2326 mov %rsp,%r11
2327 sub %rbp,%r11
2328 and \$-4096,%r11
2329 lea (%rbp,%r11),%rsp
2330 mov (%rsp),%r10
2331 cmp %rbp,%rsp
2332 ja .Lmulx4x_page_walk
2333 jmp .Lmulx4x_page_walk_done
2334
2335.Lmulx4x_page_walk:
2336 lea -4096(%rsp),%rsp
2337 mov (%rsp),%r10
2338 cmp %rbp,%rsp
2339 ja .Lmulx4x_page_walk
2340.Lmulx4x_page_walk_done:
2341
Adam Langleyd9e397b2015-01-22 14:27:53 -08002342 ##############################################################
2343 # Stack layout
2344 # +0 -num
2345 # +8 off-loaded &b[i]
2346 # +16 end of b[num]
2347 # +24 inner counter
2348 # +32 saved n0
2349 # +40 saved %rsp
2350 # +48
2351 # +56 saved rp
2352 # +64 tmp[num+1]
2353 #
2354 mov $n0, 32(%rsp) # save *n0
2355 mov %rax,40(%rsp) # save original %rsp
Robert Sloana94fe052017-02-21 08:49:28 -08002356.cfi_cfa_expression %rsp+40,deref,+8
Adam Langleyd9e397b2015-01-22 14:27:53 -08002357.Lmulx4x_body:
2358 call mulx4x_internal
2359
2360 mov 40(%rsp),%rsi # restore %rsp
Robert Sloana94fe052017-02-21 08:49:28 -08002361.cfi_def_cfa %rsi,8
Adam Langleyd9e397b2015-01-22 14:27:53 -08002362 mov \$1,%rax
David Benjamin4969cc92016-04-22 15:02:23 -04002363
Adam Langleyd9e397b2015-01-22 14:27:53 -08002364 mov -48(%rsi),%r15
Robert Sloana94fe052017-02-21 08:49:28 -08002365.cfi_restore %r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002366 mov -40(%rsi),%r14
Robert Sloana94fe052017-02-21 08:49:28 -08002367.cfi_restore %r14
Adam Langleyd9e397b2015-01-22 14:27:53 -08002368 mov -32(%rsi),%r13
Robert Sloana94fe052017-02-21 08:49:28 -08002369.cfi_restore %r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08002370 mov -24(%rsi),%r12
Robert Sloana94fe052017-02-21 08:49:28 -08002371.cfi_restore %r12
Adam Langleyd9e397b2015-01-22 14:27:53 -08002372 mov -16(%rsi),%rbp
Robert Sloana94fe052017-02-21 08:49:28 -08002373.cfi_restore %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08002374 mov -8(%rsi),%rbx
Robert Sloana94fe052017-02-21 08:49:28 -08002375.cfi_restore %rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -08002376 lea (%rsi),%rsp
Robert Sloana94fe052017-02-21 08:49:28 -08002377.cfi_def_cfa_register %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -08002378.Lmulx4x_epilogue:
2379 ret
Robert Sloana94fe052017-02-21 08:49:28 -08002380.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08002381.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2382
2383.type mulx4x_internal,\@abi-omnipotent
2384.align 32
2385mulx4x_internal:
David Benjamin4969cc92016-04-22 15:02:23 -04002386 mov $num,8(%rsp) # save -$num (it was in bytes)
2387 mov $num,%r10
Adam Langleyd9e397b2015-01-22 14:27:53 -08002388 neg $num # restore $num
2389 shl \$5,$num
David Benjamin4969cc92016-04-22 15:02:23 -04002390 neg %r10 # restore $num
2391 lea 128($bp,$num),%r13 # end of powers table (+size optimization)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002392 shr \$5+5,$num
David Benjamin4969cc92016-04-22 15:02:23 -04002393 movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument
Adam Langleyd9e397b2015-01-22 14:27:53 -08002394 sub \$1,$num
David Benjamin4969cc92016-04-22 15:02:23 -04002395 lea .Linc(%rip),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08002396 mov %r13,16+8(%rsp) # end of b[num]
2397 mov $num,24+8(%rsp) # inner counter
2398 mov $rp, 56+8(%rsp) # save $rp
2399___
2400my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
2401 ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
2402my $rptr=$bptr;
2403my $STRIDE=2**5*8; # 5 is "window size"
2404my $N=$STRIDE/4; # should match cache line size
2405$code.=<<___;
David Benjamin4969cc92016-04-22 15:02:23 -04002406 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
2407 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
2408 lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimizaton)
2409 lea 128($bp),$bptr # size optimization
Adam Langleyd9e397b2015-01-22 14:27:53 -08002410
David Benjamin4969cc92016-04-22 15:02:23 -04002411 pshufd \$0,%xmm5,%xmm5 # broadcast index
2412 movdqa %xmm1,%xmm4
2413 .byte 0x67
2414 movdqa %xmm1,%xmm2
2415___
2416########################################################################
2417# calculate mask by comparing 0..31 to index and save result to stack
2418#
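# The gather itself is cache-neutral: every entry of the power table is
# loaded and ANDed with a mask that is all-ones only for the requested
# index, so the memory access pattern does not depend on the (secret)
# index.  A hypothetical scalar C model of what the SSE2 code computes
# (names invented; the real table is interleaved across the 32 slots):
#
#	static void gather_ref(uint64_t *out,
#	                       const uint64_t table[32][WORDS], size_t idx)
#	{
#		for (size_t j = 0; j < WORDS; j++)
#			out[j] = 0;
#		for (size_t i = 0; i < 32; i++) {
#			uint64_t mask = 0 - (uint64_t)(i == idx);
#			for (size_t j = 0; j < WORDS; j++)
#				out[j] |= table[i][j] & mask;
#		}
#	}
#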
2419$code.=<<___;
2420 .byte 0x67
2421 paddd %xmm0,%xmm1
2422 pcmpeqd %xmm5,%xmm0 # compare to 1,0
2423 movdqa %xmm4,%xmm3
2424___
2425for($i=0;$i<$STRIDE/16-4;$i+=4) {
2426$code.=<<___;
2427 paddd %xmm1,%xmm2
2428 pcmpeqd %xmm5,%xmm1 # compare to 3,2
2429 movdqa %xmm0,`16*($i+0)+112`(%r10)
2430 movdqa %xmm4,%xmm0
2431
2432 paddd %xmm2,%xmm3
2433 pcmpeqd %xmm5,%xmm2 # compare to 5,4
2434 movdqa %xmm1,`16*($i+1)+112`(%r10)
2435 movdqa %xmm4,%xmm1
2436
2437 paddd %xmm3,%xmm0
2438 pcmpeqd %xmm5,%xmm3 # compare to 7,6
2439 movdqa %xmm2,`16*($i+2)+112`(%r10)
2440 movdqa %xmm4,%xmm2
2441
2442 paddd %xmm0,%xmm1
2443 pcmpeqd %xmm5,%xmm0
2444 movdqa %xmm3,`16*($i+3)+112`(%r10)
2445 movdqa %xmm4,%xmm3
2446___
2447}
2448$code.=<<___; # last iteration can be optimized
2449 .byte 0x67
2450 paddd %xmm1,%xmm2
2451 pcmpeqd %xmm5,%xmm1
2452 movdqa %xmm0,`16*($i+0)+112`(%r10)
2453
2454 paddd %xmm2,%xmm3
2455 pcmpeqd %xmm5,%xmm2
2456 movdqa %xmm1,`16*($i+1)+112`(%r10)
2457
2458 pcmpeqd %xmm5,%xmm3
2459 movdqa %xmm2,`16*($i+2)+112`(%r10)
2460
2461 pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register
2462 pand `16*($i+1)-128`($bptr),%xmm1
2463 pand `16*($i+2)-128`($bptr),%xmm2
2464 movdqa %xmm3,`16*($i+3)+112`(%r10)
2465 pand `16*($i+3)-128`($bptr),%xmm3
Adam Langleyd9e397b2015-01-22 14:27:53 -08002466 por %xmm2,%xmm0
David Benjamin4969cc92016-04-22 15:02:23 -04002467 por %xmm3,%xmm1
2468___
2469for($i=0;$i<$STRIDE/16-4;$i+=4) {
2470$code.=<<___;
2471 movdqa `16*($i+0)-128`($bptr),%xmm4
2472 movdqa `16*($i+1)-128`($bptr),%xmm5
2473 movdqa `16*($i+2)-128`($bptr),%xmm2
2474 pand `16*($i+0)+112`(%r10),%xmm4
2475 movdqa `16*($i+3)-128`($bptr),%xmm3
2476 pand `16*($i+1)+112`(%r10),%xmm5
2477 por %xmm4,%xmm0
2478 pand `16*($i+2)+112`(%r10),%xmm2
2479 por %xmm5,%xmm1
2480 pand `16*($i+3)+112`(%r10),%xmm3
2481 por %xmm2,%xmm0
2482 por %xmm3,%xmm1
2483___
2484}
2485$code.=<<___;
2486 pxor %xmm1,%xmm0
2487 pshufd \$0x4e,%xmm0,%xmm1
2488 por %xmm1,%xmm0
2489 lea $STRIDE($bptr),$bptr
Adam Langleyd9e397b2015-01-22 14:27:53 -08002490 movq %xmm0,%rdx # bp[0]
David Benjamin4969cc92016-04-22 15:02:23 -04002491 lea 64+8*4+8(%rsp),$tptr
Adam Langleyd9e397b2015-01-22 14:27:53 -08002492
2493 mov %rdx,$bi
2494 mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
2495 mulx 1*8($aptr),%r11,%r12 # a[1]*b[0]
2496 add %rax,%r11
2497 mulx 2*8($aptr),%rax,%r13 # ...
2498 adc %rax,%r12
2499 adc \$0,%r13
2500 mulx 3*8($aptr),%rax,%r14
2501
2502 mov $mi,%r15
2503 imulq 32+8(%rsp),$mi # "t[0]"*n0
2504 xor $zero,$zero # cf=0, of=0
2505 mov $mi,%rdx
2506
Adam Langleyd9e397b2015-01-22 14:27:53 -08002507 mov $bptr,8+8(%rsp) # off-load &b[i]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002508
David Benjamin4969cc92016-04-22 15:02:23 -04002509 lea 4*8($aptr),$aptr
Adam Langleyd9e397b2015-01-22 14:27:53 -08002510 adcx %rax,%r13
2511 adcx $zero,%r14 # cf=0
2512
David Benjamin4969cc92016-04-22 15:02:23 -04002513 mulx 0*8($nptr),%rax,%r10
Adam Langleyd9e397b2015-01-22 14:27:53 -08002514 adcx %rax,%r15 # discarded
2515 adox %r11,%r10
David Benjamin4969cc92016-04-22 15:02:23 -04002516 mulx 1*8($nptr),%rax,%r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08002517 adcx %rax,%r10
2518 adox %r12,%r11
David Benjamin4969cc92016-04-22 15:02:23 -04002519 mulx 2*8($nptr),%rax,%r12
Adam Langleyd9e397b2015-01-22 14:27:53 -08002520 mov 24+8(%rsp),$bptr # counter value
Adam Langleyd9e397b2015-01-22 14:27:53 -08002521 mov %r10,-8*4($tptr)
2522 adcx %rax,%r11
2523 adox %r13,%r12
David Benjamin4969cc92016-04-22 15:02:23 -04002524 mulx 3*8($nptr),%rax,%r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002525 mov $bi,%rdx
2526 mov %r11,-8*3($tptr)
2527 adcx %rax,%r12
2528 adox $zero,%r15 # of=0
David Benjamin4969cc92016-04-22 15:02:23 -04002529 lea 4*8($nptr),$nptr
Adam Langleyd9e397b2015-01-22 14:27:53 -08002530 mov %r12,-8*2($tptr)
David Benjamin4969cc92016-04-22 15:02:23 -04002531 jmp .Lmulx4x_1st
Adam Langleyd9e397b2015-01-22 14:27:53 -08002532
2533.align 32
2534.Lmulx4x_1st:
2535 adcx $zero,%r15 # cf=0, modulo-scheduled
2536 mulx 0*8($aptr),%r10,%rax # a[4]*b[0]
2537 adcx %r14,%r10
2538 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0]
2539 adcx %rax,%r11
2540 mulx 2*8($aptr),%r12,%rax # ...
2541 adcx %r14,%r12
2542 mulx 3*8($aptr),%r13,%r14
2543 .byte 0x67,0x67
2544 mov $mi,%rdx
2545 adcx %rax,%r13
2546 adcx $zero,%r14 # cf=0
2547 lea 4*8($aptr),$aptr
2548 lea 4*8($tptr),$tptr
2549
2550 adox %r15,%r10
David Benjamin4969cc92016-04-22 15:02:23 -04002551 mulx 0*8($nptr),%rax,%r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002552 adcx %rax,%r10
2553 adox %r15,%r11
David Benjamin4969cc92016-04-22 15:02:23 -04002554 mulx 1*8($nptr),%rax,%r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002555 adcx %rax,%r11
2556 adox %r15,%r12
David Benjamin4969cc92016-04-22 15:02:23 -04002557 mulx 2*8($nptr),%rax,%r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002558 mov %r10,-5*8($tptr)
2559 adcx %rax,%r12
2560 mov %r11,-4*8($tptr)
2561 adox %r15,%r13
David Benjamin4969cc92016-04-22 15:02:23 -04002562 mulx 3*8($nptr),%rax,%r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002563 mov $bi,%rdx
2564 mov %r12,-3*8($tptr)
2565 adcx %rax,%r13
2566 adox $zero,%r15
David Benjamin4969cc92016-04-22 15:02:23 -04002567 lea 4*8($nptr),$nptr
Adam Langleyd9e397b2015-01-22 14:27:53 -08002568 mov %r13,-2*8($tptr)
2569
2570 dec $bptr # of=0, pass cf
2571 jnz .Lmulx4x_1st
2572
2573 mov 8(%rsp),$num # load -num
Adam Langleyd9e397b2015-01-22 14:27:53 -08002574 adc $zero,%r15 # modulo-scheduled
2575 lea ($aptr,$num),$aptr # rewind $aptr
2576 add %r15,%r14
2577 mov 8+8(%rsp),$bptr # re-load &b[i]
2578 adc $zero,$zero # top-most carry
2579 mov %r14,-1*8($tptr)
2580 jmp .Lmulx4x_outer
2581
2582.align 32
2583.Lmulx4x_outer:
David Benjamin4969cc92016-04-22 15:02:23 -04002584 lea 16-256($tptr),%r10 # where 256-byte mask is (+density control)
2585 pxor %xmm4,%xmm4
2586 .byte 0x67,0x67
2587 pxor %xmm5,%xmm5
2588___
2589for($i=0;$i<$STRIDE/16;$i+=4) {
2590$code.=<<___;
2591 movdqa `16*($i+0)-128`($bptr),%xmm0
2592 movdqa `16*($i+1)-128`($bptr),%xmm1
2593 movdqa `16*($i+2)-128`($bptr),%xmm2
2594 pand `16*($i+0)+256`(%r10),%xmm0
2595 movdqa `16*($i+3)-128`($bptr),%xmm3
2596 pand `16*($i+1)+256`(%r10),%xmm1
2597 por %xmm0,%xmm4
2598 pand `16*($i+2)+256`(%r10),%xmm2
2599 por %xmm1,%xmm5
2600 pand `16*($i+3)+256`(%r10),%xmm3
2601 por %xmm2,%xmm4
2602 por %xmm3,%xmm5
2603___
2604}
2605$code.=<<___;
2606 por %xmm5,%xmm4
2607 pshufd \$0x4e,%xmm4,%xmm0
2608 por %xmm4,%xmm0
2609 lea $STRIDE($bptr),$bptr
2610 movq %xmm0,%rdx # m0=bp[i]
2611
Adam Langleyd9e397b2015-01-22 14:27:53 -08002612 mov $zero,($tptr) # save top-most carry
2613 lea 4*8($tptr,$num),$tptr # rewind $tptr
2614 mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
2615 xor $zero,$zero # cf=0, of=0
2616 mov %rdx,$bi
2617 mulx 1*8($aptr),%r14,%r12 # a[1]*b[i]
2618 adox -4*8($tptr),$mi # +t[0]
2619 adcx %r14,%r11
2620 mulx 2*8($aptr),%r15,%r13 # ...
2621 adox -3*8($tptr),%r11
2622 adcx %r15,%r12
2623 mulx 3*8($aptr),%rdx,%r14
2624 adox -2*8($tptr),%r12
2625 adcx %rdx,%r13
David Benjamin4969cc92016-04-22 15:02:23 -04002626 lea ($nptr,$num),$nptr # rewind $nptr
Adam Langleyd9e397b2015-01-22 14:27:53 -08002627 lea 4*8($aptr),$aptr
2628 adox -1*8($tptr),%r13
2629 adcx $zero,%r14
2630 adox $zero,%r14
2631
Adam Langleyd9e397b2015-01-22 14:27:53 -08002632 mov $mi,%r15
2633 imulq 32+8(%rsp),$mi # "t[0]"*n0
2634
Adam Langleyd9e397b2015-01-22 14:27:53 -08002635 mov $mi,%rdx
Adam Langleyd9e397b2015-01-22 14:27:53 -08002636 xor $zero,$zero # cf=0, of=0
2637 mov $bptr,8+8(%rsp) # off-load &b[i]
2638
David Benjamin4969cc92016-04-22 15:02:23 -04002639 mulx 0*8($nptr),%rax,%r10
Adam Langleyd9e397b2015-01-22 14:27:53 -08002640 adcx %rax,%r15 # discarded
2641 adox %r11,%r10
David Benjamin4969cc92016-04-22 15:02:23 -04002642 mulx 1*8($nptr),%rax,%r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08002643 adcx %rax,%r10
2644 adox %r12,%r11
David Benjamin4969cc92016-04-22 15:02:23 -04002645 mulx 2*8($nptr),%rax,%r12
Adam Langleyd9e397b2015-01-22 14:27:53 -08002646 adcx %rax,%r11
2647 adox %r13,%r12
David Benjamin4969cc92016-04-22 15:02:23 -04002648 mulx 3*8($nptr),%rax,%r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002649 mov $bi,%rdx
Adam Langleyd9e397b2015-01-22 14:27:53 -08002650 mov 24+8(%rsp),$bptr # counter value
2651 mov %r10,-8*4($tptr)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002652 adcx %rax,%r12
2653 mov %r11,-8*3($tptr)
2654 adox $zero,%r15 # of=0
2655 mov %r12,-8*2($tptr)
David Benjamin4969cc92016-04-22 15:02:23 -04002656 lea 4*8($nptr),$nptr
Adam Langleyd9e397b2015-01-22 14:27:53 -08002657 jmp .Lmulx4x_inner
2658
2659.align 32
2660.Lmulx4x_inner:
2661 mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
2662 adcx $zero,%r15 # cf=0, modulo-scheduled
2663 adox %r14,%r10
2664 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
2665 adcx 0*8($tptr),%r10
2666 adox %rax,%r11
2667 mulx 2*8($aptr),%r12,%rax # ...
2668 adcx 1*8($tptr),%r11
2669 adox %r14,%r12
2670 mulx 3*8($aptr),%r13,%r14
2671 mov $mi,%rdx
2672 adcx 2*8($tptr),%r12
2673 adox %rax,%r13
2674 adcx 3*8($tptr),%r13
2675 adox $zero,%r14 # of=0
2676 lea 4*8($aptr),$aptr
2677 lea 4*8($tptr),$tptr
2678 adcx $zero,%r14 # cf=0
2679
2680 adox %r15,%r10
David Benjamin4969cc92016-04-22 15:02:23 -04002681 mulx 0*8($nptr),%rax,%r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002682 adcx %rax,%r10
2683 adox %r15,%r11
David Benjamin4969cc92016-04-22 15:02:23 -04002684 mulx 1*8($nptr),%rax,%r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002685 adcx %rax,%r11
2686 adox %r15,%r12
David Benjamin4969cc92016-04-22 15:02:23 -04002687 mulx 2*8($nptr),%rax,%r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002688 mov %r10,-5*8($tptr)
2689 adcx %rax,%r12
2690 adox %r15,%r13
2691 mov %r11,-4*8($tptr)
David Benjamin4969cc92016-04-22 15:02:23 -04002692 mulx 3*8($nptr),%rax,%r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002693 mov $bi,%rdx
David Benjamin4969cc92016-04-22 15:02:23 -04002694 lea 4*8($nptr),$nptr
Adam Langleyd9e397b2015-01-22 14:27:53 -08002695 mov %r12,-3*8($tptr)
2696 adcx %rax,%r13
2697 adox $zero,%r15
2698 mov %r13,-2*8($tptr)
2699
2700 dec $bptr # of=0, pass cf
2701 jnz .Lmulx4x_inner
2702
2703 mov 0+8(%rsp),$num # load -num
Adam Langleyd9e397b2015-01-22 14:27:53 -08002704 adc $zero,%r15 # modulo-scheduled
2705 sub 0*8($tptr),$bptr # pull top-most carry to %cf
2706 mov 8+8(%rsp),$bptr # re-load &b[i]
2707 mov 16+8(%rsp),%r10
2708 adc %r15,%r14
2709 lea ($aptr,$num),$aptr # rewind $aptr
2710 adc $zero,$zero # top-most carry
2711 mov %r14,-1*8($tptr)
2712
2713 cmp %r10,$bptr
2714 jb .Lmulx4x_outer
2715
David Benjamin4969cc92016-04-22 15:02:23 -04002716 mov -8($nptr),%r10
2717 mov $zero,%r8
2718 mov ($nptr,$num),%r12
2719 lea ($nptr,$num),%rbp # rewind $nptr
2720 mov $num,%rcx
2721 lea ($tptr,$num),%rdi # rewind $tptr
2722 xor %eax,%eax
Adam Langleyd9e397b2015-01-22 14:27:53 -08002723 xor %r15,%r15
2724 sub %r14,%r10 # compare top-most words
2725 adc %r15,%r15
David Benjamin4969cc92016-04-22 15:02:23 -04002726 or %r15,%r8
2727 sar \$3+2,%rcx
2728 sub %r8,%rax # %rax=-%r8
Adam Langleyd9e397b2015-01-22 14:27:53 -08002729 mov 56+8(%rsp),%rdx # restore rp
David Benjamin4969cc92016-04-22 15:02:23 -04002730 dec %r12 # so that after 'not' we get -n[0]
2731 mov 8*1(%rbp),%r13
2732 xor %r8,%r8
2733 mov 8*2(%rbp),%r14
2734 mov 8*3(%rbp),%r15
2735 jmp .Lsqrx4x_sub_entry # common post-condition
Adam Langleyd9e397b2015-01-22 14:27:53 -08002736.size mulx4x_internal,.-mulx4x_internal
2737___
2738} {
2739######################################################################
2740# void bn_power5(
2741my $rptr="%rdi"; # BN_ULONG *rptr,
2742my $aptr="%rsi"; # const BN_ULONG *aptr,
2743my $bptr="%rdx"; # const void *table,
2744my $nptr="%rcx"; # const BN_ULONG *nptr,
2745my $n0 ="%r8"; # const BN_ULONG *n0);
2746my $num ="%r9"; # int num, has to be divisible by 8
2747 # int pwr);
2748
2749my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
2750my @A0=("%r10","%r11");
2751my @A1=("%r12","%r13");
2752my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
2753
2754$code.=<<___;
2755.type bn_powerx5,\@function,6
2756.align 32
2757bn_powerx5:
Robert Sloana94fe052017-02-21 08:49:28 -08002758.cfi_startproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08002759 mov %rsp,%rax
Robert Sloana94fe052017-02-21 08:49:28 -08002760.cfi_def_cfa_register %rax
2761.Lpowerx5_enter:
Adam Langleyd9e397b2015-01-22 14:27:53 -08002762 push %rbx
Robert Sloana94fe052017-02-21 08:49:28 -08002763.cfi_push %rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -08002764 push %rbp
Robert Sloana94fe052017-02-21 08:49:28 -08002765.cfi_push %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08002766 push %r12
Robert Sloana94fe052017-02-21 08:49:28 -08002767.cfi_push %r12
Adam Langleyd9e397b2015-01-22 14:27:53 -08002768 push %r13
Robert Sloana94fe052017-02-21 08:49:28 -08002769.cfi_push %r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08002770 push %r14
Robert Sloana94fe052017-02-21 08:49:28 -08002771.cfi_push %r14
Adam Langleyd9e397b2015-01-22 14:27:53 -08002772 push %r15
Robert Sloana94fe052017-02-21 08:49:28 -08002773.cfi_push %r15
2774.Lpowerx5_prologue:
David Benjamin4969cc92016-04-22 15:02:23 -04002775
Adam Langleyd9e397b2015-01-22 14:27:53 -08002776 shl \$3,${num}d # convert $num to bytes
David Benjamin4969cc92016-04-22 15:02:23 -04002777 lea ($num,$num,2),%r10 # 3*$num in bytes
Adam Langleyd9e397b2015-01-22 14:27:53 -08002778 neg $num
2779 mov ($n0),$n0 # *n0
2780
2781 ##############################################################
David Benjamin4969cc92016-04-22 15:02:23 -04002782 # Ensure that stack frame doesn't alias with $rptr+3*$num
2783 # modulo 4096, which covers ret[num], am[num] and n[num]
2784 # (see bn_exp.c). This is done to allow the memory disambiguation
2785 # logic to do its magic. [An extra 256 bytes is for the power mask
2786 # calculated from the 7th argument, the index.]
Adam Langleyd9e397b2015-01-22 14:27:53 -08002787 #
David Benjamin4969cc92016-04-22 15:02:23 -04002788 lea -320(%rsp,$num,2),%r11
Robert Sloana94fe052017-02-21 08:49:28 -08002789 mov %rsp,%rbp
David Benjamin4969cc92016-04-22 15:02:23 -04002790 sub $rptr,%r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08002791 and \$4095,%r11
2792 cmp %r11,%r10
2793 jb .Lpwrx_sp_alt
Robert Sloana94fe052017-02-21 08:49:28 -08002794 sub %r11,%rbp # align with $aptr
2795 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002796 jmp .Lpwrx_sp_done
2797
2798.align 32
2799.Lpwrx_sp_alt:
David Benjamin4969cc92016-04-22 15:02:23 -04002800 lea 4096-320(,$num,2),%r10
Robert Sloana94fe052017-02-21 08:49:28 -08002801 lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256)
Adam Langleyd9e397b2015-01-22 14:27:53 -08002802 sub %r10,%r11
2803 mov \$0,%r10
2804 cmovc %r10,%r11
Robert Sloana94fe052017-02-21 08:49:28 -08002805 sub %r11,%rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08002806.Lpwrx_sp_done:
Robert Sloana94fe052017-02-21 08:49:28 -08002807 and \$-64,%rbp
2808 mov %rsp,%r11
2809 sub %rbp,%r11
2810 and \$-4096,%r11
2811 lea (%rbp,%r11),%rsp
2812 mov (%rsp),%r10
2813 cmp %rbp,%rsp
2814 ja .Lpwrx_page_walk
2815 jmp .Lpwrx_page_walk_done
2816
2817.Lpwrx_page_walk:
2818 lea -4096(%rsp),%rsp
2819 mov (%rsp),%r10
2820 cmp %rbp,%rsp
2821 ja .Lpwrx_page_walk
2822.Lpwrx_page_walk_done:
2823
2824 mov $num,%r10
Adam Langleyd9e397b2015-01-22 14:27:53 -08002825 neg $num
2826
2827 ##############################################################
2828 # Stack layout
2829 #
2830 # +0 saved $num, used in reduction section
2831 # +8 &t[2*$num], used in reduction section
2832 # +16 intermediate carry bit
2833 # +24 top-most carry bit, used in reduction section
2834 # +32 saved *n0
2835 # +40 saved %rsp
2836 # +48 t[2*$num]
2837 #
2838 pxor %xmm0,%xmm0
2839 movq $rptr,%xmm1 # save $rptr
2840 movq $nptr,%xmm2 # save $nptr
2841 movq %r10, %xmm3 # -$num
2842 movq $bptr,%xmm4
2843 mov $n0, 32(%rsp)
2844 mov %rax, 40(%rsp) # save original %rsp
Robert Sloana94fe052017-02-21 08:49:28 -08002845.cfi_cfa_expression %rsp+40,deref,+8
Adam Langleyd9e397b2015-01-22 14:27:53 -08002846.Lpowerx5_body:
2847
2848 call __bn_sqrx8x_internal
David Benjamin4969cc92016-04-22 15:02:23 -04002849 call __bn_postx4x_internal
Adam Langleyd9e397b2015-01-22 14:27:53 -08002850 call __bn_sqrx8x_internal
David Benjamin4969cc92016-04-22 15:02:23 -04002851 call __bn_postx4x_internal
Adam Langleyd9e397b2015-01-22 14:27:53 -08002852 call __bn_sqrx8x_internal
David Benjamin4969cc92016-04-22 15:02:23 -04002853 call __bn_postx4x_internal
Adam Langleyd9e397b2015-01-22 14:27:53 -08002854 call __bn_sqrx8x_internal
David Benjamin4969cc92016-04-22 15:02:23 -04002855 call __bn_postx4x_internal
Adam Langleyd9e397b2015-01-22 14:27:53 -08002856 call __bn_sqrx8x_internal
David Benjamin4969cc92016-04-22 15:02:23 -04002857 call __bn_postx4x_internal
Adam Langleyd9e397b2015-01-22 14:27:53 -08002858
2859 mov %r10,$num # -num
2860 mov $aptr,$rptr
2861 movq %xmm2,$nptr
2862 movq %xmm4,$bptr
2863 mov 40(%rsp),%rax
2864
2865 call mulx4x_internal
2866
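	##############################################################
	# What the five squaring calls plus the final multiplication above
	# amount to (a hypothetical sketch; mont_sqr, mont_mul and table are
	# invented names): one 5-bit fixed-window step of the constant-time
	# exponentiation in bn_exp.c, with all values staying in Montgomery
	# form:
	#
	#	for (int k = 0; k < 5; k++)
	#		mont_sqr(r, r, n, n0, num);	/* r <- r^2 * R^-1 mod n */
	#	mont_mul(r, r, table[idx], n, n0, num);	/* r <- r*b * R^-1 mod n */
	#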
2867 mov 40(%rsp),%rsi # restore %rsp
Robert Sloana94fe052017-02-21 08:49:28 -08002868.cfi_def_cfa %rsi,8
Adam Langleyd9e397b2015-01-22 14:27:53 -08002869 mov \$1,%rax
David Benjamin4969cc92016-04-22 15:02:23 -04002870
Adam Langleyd9e397b2015-01-22 14:27:53 -08002871 mov -48(%rsi),%r15
Robert Sloana94fe052017-02-21 08:49:28 -08002872.cfi_restore %r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08002873 mov -40(%rsi),%r14
Robert Sloana94fe052017-02-21 08:49:28 -08002874.cfi_restore %r14
Adam Langleyd9e397b2015-01-22 14:27:53 -08002875 mov -32(%rsi),%r13
Robert Sloana94fe052017-02-21 08:49:28 -08002876.cfi_restore %r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08002877 mov -24(%rsi),%r12
Robert Sloana94fe052017-02-21 08:49:28 -08002878.cfi_restore %r12
Adam Langleyd9e397b2015-01-22 14:27:53 -08002879 mov -16(%rsi),%rbp
Robert Sloana94fe052017-02-21 08:49:28 -08002880.cfi_restore %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08002881 mov -8(%rsi),%rbx
Robert Sloana94fe052017-02-21 08:49:28 -08002882.cfi_restore %rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -08002883 lea (%rsi),%rsp
Robert Sloana94fe052017-02-21 08:49:28 -08002884.cfi_def_cfa_register %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -08002885.Lpowerx5_epilogue:
2886 ret
Robert Sloana94fe052017-02-21 08:49:28 -08002887.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08002888.size bn_powerx5,.-bn_powerx5
2889
2890.globl bn_sqrx8x_internal
2891.hidden bn_sqrx8x_internal
2892.type bn_sqrx8x_internal,\@abi-omnipotent
2893.align 32
2894bn_sqrx8x_internal:
2895__bn_sqrx8x_internal:
2896 ##################################################################
2897 # Squaring part:
2898 #
2899 # a) multiply-n-add everything but a[i]*a[i];
2900 # b) shift result of a) by 1 to the left and accumulate
2901 # a[i]*a[i] products;
2902 #
2903 ##################################################################
2904 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2905 # a[1]a[0]
2906 # a[2]a[0]
2907 # a[3]a[0]
2908 # a[2]a[1]
2909 # a[3]a[1]
2910 # a[3]a[2]
2911 #
2912 # a[4]a[0]
2913 # a[5]a[0]
2914 # a[6]a[0]
2915 # a[7]a[0]
2916 # a[4]a[1]
2917 # a[5]a[1]
2918 # a[6]a[1]
2919 # a[7]a[1]
2920 # a[4]a[2]
2921 # a[5]a[2]
2922 # a[6]a[2]
2923 # a[7]a[2]
2924 # a[4]a[3]
2925 # a[5]a[3]
2926 # a[6]a[3]
2927 # a[7]a[3]
2928 #
2929 # a[5]a[4]
2930 # a[6]a[4]
2931 # a[7]a[4]
2932 # a[6]a[5]
2933 # a[7]a[5]
2934 # a[7]a[6]
2935 # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2936___
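#
# The diagram above describes the classic squaring shortcut: compute each
# cross product a[i]*a[j] (i<j) once, double the accumulated result with a
# one-bit left shift, then add the diagonal squares a[i]*a[i].  A
# hypothetical C reference of steps a) and b) (names invented; assumes
# <stdint.h> and unsigned __int128):
#
#	static void sqr_ref(uint64_t *t, const uint64_t *a, int num)
#	{
#		for (int i = 0; i < 2 * num; i++) t[i] = 0;
#		for (int i = 0; i < num; i++) {		/* a) i<j products  */
#			uint64_t carry = 0;
#			for (int j = i + 1; j < num; j++) {
#				unsigned __int128 acc = (unsigned __int128)a[i] * a[j]
#				                      + t[i + j] + carry;
#				t[i + j] = (uint64_t)acc;
#				carry = (uint64_t)(acc >> 64);
#			}
#			t[i + num] = carry;
#		}
#		uint64_t msb = 0;			/* b) t = 2*t ...   */
#		for (int i = 0; i < 2 * num; i++) {
#			uint64_t next = t[i] >> 63;
#			t[i] = (t[i] << 1) | msb;
#			msb = next;
#		}
#		uint64_t carry = 0;			/* ... + a[i]*a[i]  */
#		for (int i = 0; i < num; i++) {
#			unsigned __int128 acc = (unsigned __int128)a[i] * a[i]
#			                      + t[2 * i] + carry;
#			t[2 * i] = (uint64_t)acc;
#			acc = (acc >> 64) + t[2 * i + 1];
#			t[2 * i + 1] = (uint64_t)acc;
#			carry = (uint64_t)(acc >> 64);
#		}
#	}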
2937{
2938my ($zero,$carry)=("%rbp","%rcx");
2939my $aaptr=$zero;
2940$code.=<<___;
2941 lea 48+8(%rsp),$tptr
2942 lea ($aptr,$num),$aaptr
2943 mov $num,0+8(%rsp) # save $num
2944 mov $aaptr,8+8(%rsp) # save end of $aptr
2945 jmp .Lsqr8x_zero_start
2946
2947.align 32
2948.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2949.Lsqrx8x_zero:
2950 .byte 0x3e
2951 movdqa %xmm0,0*8($tptr)
2952 movdqa %xmm0,2*8($tptr)
2953 movdqa %xmm0,4*8($tptr)
2954 movdqa %xmm0,6*8($tptr)
2955.Lsqr8x_zero_start: # aligned at 32
2956 movdqa %xmm0,8*8($tptr)
2957 movdqa %xmm0,10*8($tptr)
2958 movdqa %xmm0,12*8($tptr)
2959 movdqa %xmm0,14*8($tptr)
2960 lea 16*8($tptr),$tptr
2961 sub \$64,$num
2962 jnz .Lsqrx8x_zero
2963
2964 mov 0*8($aptr),%rdx # a[0], modulo-scheduled
2965 #xor %r9,%r9 # t[1], ex-$num, zero already
2966 xor %r10,%r10
2967 xor %r11,%r11
2968 xor %r12,%r12
2969 xor %r13,%r13
2970 xor %r14,%r14
2971 xor %r15,%r15
2972 lea 48+8(%rsp),$tptr
2973 xor $zero,$zero # cf=0, of=0
2974 jmp .Lsqrx8x_outer_loop
2975
2976.align 32
2977.Lsqrx8x_outer_loop:
2978 mulx 1*8($aptr),%r8,%rax # a[1]*a[0]
2979 adcx %r9,%r8 # a[1]*a[0]+=t[1]
2980 adox %rax,%r10
2981 mulx 2*8($aptr),%r9,%rax # a[2]*a[0]
2982 adcx %r10,%r9
2983 adox %rax,%r11
2984 .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ...
2985 adcx %r11,%r10
2986 adox %rax,%r12
2987 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax
2988 adcx %r12,%r11
2989 adox %rax,%r13
2990 mulx 5*8($aptr),%r12,%rax
2991 adcx %r13,%r12
2992 adox %rax,%r14
2993 mulx 6*8($aptr),%r13,%rax
2994 adcx %r14,%r13
2995 adox %r15,%rax
2996 mulx 7*8($aptr),%r14,%r15
2997 mov 1*8($aptr),%rdx # a[1]
2998 adcx %rax,%r14
2999 adox $zero,%r15
3000 adc 8*8($tptr),%r15
3001 mov %r8,1*8($tptr) # t[1]
3002 mov %r9,2*8($tptr) # t[2]
3003 sbb $carry,$carry # mov %cf,$carry
3004 xor $zero,$zero # cf=0, of=0
3005
3006
3007 mulx 2*8($aptr),%r8,%rbx # a[2]*a[1]
3008 mulx 3*8($aptr),%r9,%rax # a[3]*a[1]
3009 adcx %r10,%r8
3010 adox %rbx,%r9
3011 mulx 4*8($aptr),%r10,%rbx # ...
3012 adcx %r11,%r9
3013 adox %rax,%r10
3014 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax
3015 adcx %r12,%r10
3016 adox %rbx,%r11
3017 .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx
3018 adcx %r13,%r11
3019 adox %r14,%r12
3020 .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14
3021 mov 2*8($aptr),%rdx # a[2]
3022 adcx %rax,%r12
3023 adox %rbx,%r13
3024 adcx %r15,%r13
3025 adox $zero,%r14 # of=0
3026 adcx $zero,%r14 # cf=0
3027
3028 mov %r8,3*8($tptr) # t[3]
3029 mov %r9,4*8($tptr) # t[4]
3030
3031 mulx 3*8($aptr),%r8,%rbx # a[3]*a[2]
3032 mulx 4*8($aptr),%r9,%rax # a[4]*a[2]
3033 adcx %r10,%r8
3034 adox %rbx,%r9
3035 mulx 5*8($aptr),%r10,%rbx # ...
3036 adcx %r11,%r9
3037 adox %rax,%r10
3038 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax
3039 adcx %r12,%r10
3040 adox %r13,%r11
3041 .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13
3042 .byte 0x3e
3043 mov 3*8($aptr),%rdx # a[3]
3044 adcx %rbx,%r11
3045 adox %rax,%r12
3046 adcx %r14,%r12
3047 mov %r8,5*8($tptr) # t[5]
3048 mov %r9,6*8($tptr) # t[6]
3049 mulx 4*8($aptr),%r8,%rax # a[4]*a[3]
3050 adox $zero,%r13 # of=0
3051 adcx $zero,%r13 # cf=0
3052
3053 mulx 5*8($aptr),%r9,%rbx # a[5]*a[3]
3054 adcx %r10,%r8
3055 adox %rax,%r9
3056 mulx 6*8($aptr),%r10,%rax # ...
3057 adcx %r11,%r9
3058 adox %r12,%r10
3059 mulx 7*8($aptr),%r11,%r12
3060 mov 4*8($aptr),%rdx # a[4]
3061 mov 5*8($aptr),%r14 # a[5]
3062 adcx %rbx,%r10
3063 adox %rax,%r11
3064 mov 6*8($aptr),%r15 # a[6]
3065 adcx %r13,%r11
3066 adox $zero,%r12 # of=0
3067 adcx $zero,%r12 # cf=0
3068
3069 mov %r8,7*8($tptr) # t[7]
3070 mov %r9,8*8($tptr) # t[8]
3071
3072 mulx %r14,%r9,%rax # a[5]*a[4]
3073 mov 7*8($aptr),%r8 # a[7]
3074 adcx %r10,%r9
3075 mulx %r15,%r10,%rbx # a[6]*a[4]
3076 adox %rax,%r10
3077 adcx %r11,%r10
3078 mulx %r8,%r11,%rax # a[7]*a[4]
3079 mov %r14,%rdx # a[5]
3080 adox %rbx,%r11
3081 adcx %r12,%r11
3082 #adox $zero,%rax # of=0
3083 adcx $zero,%rax # cf=0
3084
3085 mulx %r15,%r14,%rbx # a[6]*a[5]
3086 mulx %r8,%r12,%r13 # a[7]*a[5]
3087 mov %r15,%rdx # a[6]
3088 lea 8*8($aptr),$aptr
3089 adcx %r14,%r11
3090 adox %rbx,%r12
3091 adcx %rax,%r12
3092 adox $zero,%r13
3093
3094 .byte 0x67,0x67
3095 mulx %r8,%r8,%r14 # a[7]*a[6]
3096 adcx %r8,%r13
3097 adcx $zero,%r14
3098
3099 cmp 8+8(%rsp),$aptr
3100 je .Lsqrx8x_outer_break
3101
3102 neg $carry # mov $carry,%cf
3103 mov \$-8,%rcx
3104 mov $zero,%r15
3105 mov 8*8($tptr),%r8
3106 adcx 9*8($tptr),%r9 # +=t[9]
3107 adcx 10*8($tptr),%r10 # ...
3108 adcx 11*8($tptr),%r11
3109 adc 12*8($tptr),%r12
3110 adc 13*8($tptr),%r13
3111 adc 14*8($tptr),%r14
3112 adc 15*8($tptr),%r15
3113 lea ($aptr),$aaptr
3114 lea 2*64($tptr),$tptr
3115 sbb %rax,%rax # mov %cf,$carry
3116
3117 mov -64($aptr),%rdx # a[0]
3118 mov %rax,16+8(%rsp) # offload $carry
3119 mov $tptr,24+8(%rsp)
3120
3121 #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above
3122 xor %eax,%eax # cf=0, of=0
3123 jmp .Lsqrx8x_loop
3124
3125.align 32
3126.Lsqrx8x_loop:
3127 mov %r8,%rbx
3128 mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i]
3129 adcx %rax,%rbx # +=t[8]
3130 adox %r9,%r8
3131
3132 mulx 1*8($aaptr),%rax,%r9 # ...
3133 adcx %rax,%r8
3134 adox %r10,%r9
3135
3136 mulx 2*8($aaptr),%rax,%r10
3137 adcx %rax,%r9
3138 adox %r11,%r10
3139
3140 mulx 3*8($aaptr),%rax,%r11
3141 adcx %rax,%r10
3142 adox %r12,%r11
3143
3144 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12
3145 adcx %rax,%r11
3146 adox %r13,%r12
3147
3148 mulx 5*8($aaptr),%rax,%r13
3149 adcx %rax,%r12
3150 adox %r14,%r13
3151
3152 mulx 6*8($aaptr),%rax,%r14
3153 mov %rbx,($tptr,%rcx,8) # store t[8+i]
3154 mov \$0,%ebx
3155 adcx %rax,%r13
3156 adox %r15,%r14
3157
3158 .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15
3159 mov 8($aptr,%rcx,8),%rdx # a[i]
3160 adcx %rax,%r14
3161 adox %rbx,%r15 # %rbx is 0, of=0
3162 adcx %rbx,%r15 # cf=0
3163
3164 .byte 0x67
3165 inc %rcx # of=0
3166 jnz .Lsqrx8x_loop
3167
3168 lea 8*8($aaptr),$aaptr
3169 mov \$-8,%rcx
3170 cmp 8+8(%rsp),$aaptr # done?
3171 je .Lsqrx8x_break
3172
3173 sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf
3174 .byte 0x66
3175 mov -64($aptr),%rdx
3176 adcx 0*8($tptr),%r8
3177 adcx 1*8($tptr),%r9
3178 adc 2*8($tptr),%r10
3179 adc 3*8($tptr),%r11
3180 adc 4*8($tptr),%r12
3181 adc 5*8($tptr),%r13
3182 adc 6*8($tptr),%r14
3183 adc 7*8($tptr),%r15
3184 lea 8*8($tptr),$tptr
3185 .byte 0x67
3186 sbb %rax,%rax # mov %cf,%rax
3187 xor %ebx,%ebx # cf=0, of=0
3188 mov %rax,16+8(%rsp) # offload carry
3189 jmp .Lsqrx8x_loop
3190
3191.align 32
3192.Lsqrx8x_break:
3193 sub 16+8(%rsp),%r8 # consume last carry
3194 mov 24+8(%rsp),$carry # initial $tptr, borrow $carry
3195 mov 0*8($aptr),%rdx # a[8], modulo-scheduled
3196 xor %ebp,%ebp # xor $zero,$zero
3197 mov %r8,0*8($tptr)
3198 cmp $carry,$tptr # cf=0, of=0
3199 je .Lsqrx8x_outer_loop
3200
3201 mov %r9,1*8($tptr)
3202 mov 1*8($carry),%r9
3203 mov %r10,2*8($tptr)
3204 mov 2*8($carry),%r10
3205 mov %r11,3*8($tptr)
3206 mov 3*8($carry),%r11
3207 mov %r12,4*8($tptr)
3208 mov 4*8($carry),%r12
3209 mov %r13,5*8($tptr)
3210 mov 5*8($carry),%r13
3211 mov %r14,6*8($tptr)
3212 mov 6*8($carry),%r14
3213 mov %r15,7*8($tptr)
3214 mov 7*8($carry),%r15
3215 mov $carry,$tptr
3216 jmp .Lsqrx8x_outer_loop
3217
3218.align 32
3219.Lsqrx8x_outer_break:
3220 mov %r9,9*8($tptr) # t[9]
3221 movq %xmm3,%rcx # -$num
3222 mov %r10,10*8($tptr) # ...
3223 mov %r11,11*8($tptr)
3224 mov %r12,12*8($tptr)
3225 mov %r13,13*8($tptr)
3226 mov %r14,14*8($tptr)
3227___
3228} {
3229my $i="%rcx";
3230$code.=<<___;
3231 lea 48+8(%rsp),$tptr
3232 mov ($aptr,$i),%rdx # a[0]
3233
3234 mov 8($tptr),$A0[1] # t[1]
3235 xor $A0[0],$A0[0] # t[0], of=0, cf=0
3236 mov 0+8(%rsp),$num # restore $num
3237 adox $A0[1],$A0[1]
3238 mov 16($tptr),$A1[0] # t[2] # prefetch
3239 mov 24($tptr),$A1[1] # t[3] # prefetch
3240 #jmp .Lsqrx4x_shift_n_add # happens to be aligned
3241
3242.align 32
3243.Lsqrx4x_shift_n_add:
3244 mulx %rdx,%rax,%rbx
3245 adox $A1[0],$A1[0]
3246 adcx $A0[0],%rax
3247 .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch
3248 .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch
3249 adox $A1[1],$A1[1]
3250 adcx $A0[1],%rbx
3251 mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch
3252 mov %rax,0($tptr)
3253 mov %rbx,8($tptr)
3254
3255 mulx %rdx,%rax,%rbx
3256 adox $A0[0],$A0[0]
3257 adcx $A1[0],%rax
3258 mov 16($aptr,$i),%rdx # a[i+2] # prefetch
3259 mov 48($tptr),$A1[0] # t[2*i+6] # prefetch
3260 adox $A0[1],$A0[1]
3261 adcx $A1[1],%rbx
3262 mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch
3263 mov %rax,16($tptr)
3264 mov %rbx,24($tptr)
3265
3266 mulx %rdx,%rax,%rbx
3267 adox $A1[0],$A1[0]
3268 adcx $A0[0],%rax
3269 mov 24($aptr,$i),%rdx # a[i+3] # prefetch
3270 lea 32($i),$i
3271 mov 64($tptr),$A0[0] # t[2*i+8] # prefetch
3272 adox $A1[1],$A1[1]
3273 adcx $A0[1],%rbx
3274 mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch
3275 mov %rax,32($tptr)
3276 mov %rbx,40($tptr)
3277
3278 mulx %rdx,%rax,%rbx
3279 adox $A0[0],$A0[0]
3280 adcx $A1[0],%rax
3281 jrcxz .Lsqrx4x_shift_n_add_break
3282 .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch
3283 adox $A0[1],$A0[1]
3284 adcx $A1[1],%rbx
3285 mov 80($tptr),$A1[0] # t[2*i+10] # prefetch
3286 mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch
3287 mov %rax,48($tptr)
3288 mov %rbx,56($tptr)
3289 lea 64($tptr),$tptr
3290 nop
3291 jmp .Lsqrx4x_shift_n_add
3292
3293.align 32
3294.Lsqrx4x_shift_n_add_break:
3295 adcx $A1[1],%rbx
3296 mov %rax,48($tptr)
3297 mov %rbx,56($tptr)
3298 lea 64($tptr),$tptr # end of t[] buffer
3299___
3300}
3301######################################################################
3302# Montgomery reduction part, "word-by-word" algorithm.
3303#
3304# This new path is inspired by multiple submissions from Intel, by
3305# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
3306# Vinodh Gopal...
3307{
3308my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
3309
3310$code.=<<___;
3311 movq %xmm2,$nptr
David Benjamin4969cc92016-04-22 15:02:23 -04003312__bn_sqrx8x_reduction:
Adam Langleyd9e397b2015-01-22 14:27:53 -08003313 xor %eax,%eax # initial top-most carry bit
3314 mov 32+8(%rsp),%rbx # n0
3315 mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr)
David Benjamin4969cc92016-04-22 15:02:23 -04003316 lea -8*8($nptr,$num),%rcx # end of n[]
Adam Langleyd9e397b2015-01-22 14:27:53 -08003317 #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer
3318 mov %rcx, 0+8(%rsp) # save end of n[]
3319 mov $tptr,8+8(%rsp) # save end of t[]
3320
3321 lea 48+8(%rsp),$tptr # initial t[] window
3322 jmp .Lsqrx8x_reduction_loop
3323
3324.align 32
3325.Lsqrx8x_reduction_loop:
3326 mov 8*1($tptr),%r9
3327 mov 8*2($tptr),%r10
3328 mov 8*3($tptr),%r11
3329 mov 8*4($tptr),%r12
3330 mov %rdx,%r8
3331 imulq %rbx,%rdx # n0*a[i]
3332 mov 8*5($tptr),%r13
3333 mov 8*6($tptr),%r14
3334 mov 8*7($tptr),%r15
3335 mov %rax,24+8(%rsp) # store top-most carry bit
3336
3337 lea 8*8($tptr),$tptr
3338 xor $carry,$carry # cf=0,of=0
3339 mov \$-8,%rcx
3340 jmp .Lsqrx8x_reduce
3341
3342.align 32
3343.Lsqrx8x_reduce:
3344 mov %r8, %rbx
David Benjamin4969cc92016-04-22 15:02:23 -04003345 mulx 8*0($nptr),%rax,%r8 # n[0]
Adam Langleyd9e397b2015-01-22 14:27:53 -08003346 adcx %rbx,%rax # discarded
3347 adox %r9,%r8
3348
David Benjamin4969cc92016-04-22 15:02:23 -04003349 mulx 8*1($nptr),%rbx,%r9 # n[1]
Adam Langleyd9e397b2015-01-22 14:27:53 -08003350 adcx %rbx,%r8
3351 adox %r10,%r9
3352
David Benjamin4969cc92016-04-22 15:02:23 -04003353 mulx 8*2($nptr),%rbx,%r10
Adam Langleyd9e397b2015-01-22 14:27:53 -08003354 adcx %rbx,%r9
3355 adox %r11,%r10
3356
David Benjamin4969cc92016-04-22 15:02:23 -04003357 mulx 8*3($nptr),%rbx,%r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08003358 adcx %rbx,%r10
3359 adox %r12,%r11
3360
David Benjamin4969cc92016-04-22 15:02:23 -04003361 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12
Adam Langleyd9e397b2015-01-22 14:27:53 -08003362 mov %rdx,%rax
3363 mov %r8,%rdx
3364 adcx %rbx,%r11
3365 adox %r13,%r12
3366
3367 mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded
3368 mov %rax,%rdx
3369 mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i]
3370
David Benjamin4969cc92016-04-22 15:02:23 -04003371 mulx 8*5($nptr),%rax,%r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08003372 adcx %rax,%r12
3373 adox %r14,%r13
3374
David Benjamin4969cc92016-04-22 15:02:23 -04003375 mulx 8*6($nptr),%rax,%r14
Adam Langleyd9e397b2015-01-22 14:27:53 -08003376 adcx %rax,%r13
3377 adox %r15,%r14
3378
David Benjamin4969cc92016-04-22 15:02:23 -04003379 mulx 8*7($nptr),%rax,%r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08003380 mov %rbx,%rdx
3381 adcx %rax,%r14
3382 adox $carry,%r15 # $carry is 0
3383 adcx $carry,%r15 # cf=0
3384
3385 .byte 0x67,0x67,0x67
3386 inc %rcx # of=0
3387 jnz .Lsqrx8x_reduce
3388
3389 mov $carry,%rax # xor %rax,%rax
3390 cmp 0+8(%rsp),$nptr # end of n[]?
3391 jae .Lsqrx8x_no_tail
3392
3393 mov 48+8(%rsp),%rdx # pull n0*a[0]
3394 add 8*0($tptr),%r8
David Benjamin4969cc92016-04-22 15:02:23 -04003395 lea 8*8($nptr),$nptr
Adam Langleyd9e397b2015-01-22 14:27:53 -08003396 mov \$-8,%rcx
3397 adcx 8*1($tptr),%r9
3398 adcx 8*2($tptr),%r10
3399 adc 8*3($tptr),%r11
3400 adc 8*4($tptr),%r12
3401 adc 8*5($tptr),%r13
3402 adc 8*6($tptr),%r14
3403 adc 8*7($tptr),%r15
3404 lea 8*8($tptr),$tptr
3405 sbb %rax,%rax # top carry
3406
3407 xor $carry,$carry # of=0, cf=0
3408 mov %rax,16+8(%rsp)
3409 jmp .Lsqrx8x_tail
3410
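# The tail loop replays the eight n0*a[i] values saved by .Lsqrx8x_reduce
# against the next eight limbs of n[], accumulating into the current t[]
# window, and repeats until the end of n[] recorded at 0+8(%rsp) is reached.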
3411.align 32
3412.Lsqrx8x_tail:
3413 mov %r8,%rbx
David Benjamin4969cc92016-04-22 15:02:23 -04003414 mulx 8*0($nptr),%rax,%r8
Adam Langleyd9e397b2015-01-22 14:27:53 -08003415 adcx %rax,%rbx
3416 adox %r9,%r8
3417
David Benjamin4969cc92016-04-22 15:02:23 -04003418 mulx 8*1($nptr),%rax,%r9
Adam Langleyd9e397b2015-01-22 14:27:53 -08003419 adcx %rax,%r8
3420 adox %r10,%r9
3421
David Benjamin4969cc92016-04-22 15:02:23 -04003422 mulx 8*2($nptr),%rax,%r10
Adam Langleyd9e397b2015-01-22 14:27:53 -08003423 adcx %rax,%r9
3424 adox %r11,%r10
3425
David Benjamin4969cc92016-04-22 15:02:23 -04003426 mulx 8*3($nptr),%rax,%r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08003427 adcx %rax,%r10
3428 adox %r12,%r11
3429
David Benjamin4969cc92016-04-22 15:02:23 -04003430 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12
Adam Langleyd9e397b2015-01-22 14:27:53 -08003431 adcx %rax,%r11
3432 adox %r13,%r12
3433
David Benjamin4969cc92016-04-22 15:02:23 -04003434 mulx 8*5($nptr),%rax,%r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08003435 adcx %rax,%r12
3436 adox %r14,%r13
3437
David Benjamin4969cc92016-04-22 15:02:23 -04003438 mulx 8*6($nptr),%rax,%r14
Adam Langleyd9e397b2015-01-22 14:27:53 -08003439 adcx %rax,%r13
3440 adox %r15,%r14
3441
David Benjamin4969cc92016-04-22 15:02:23 -04003442 mulx 8*7($nptr),%rax,%r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08003443 mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i]
3444 adcx %rax,%r14
3445 adox $carry,%r15
3446 mov %rbx,($tptr,%rcx,8) # save result
3447 mov %r8,%rbx
3448 adcx $carry,%r15 # cf=0
3449
3450 inc %rcx # of=0
3451 jnz .Lsqrx8x_tail
3452
3453 cmp 0+8(%rsp),$nptr # end of n[]?
3454 jae .Lsqrx8x_tail_done # break out of loop
3455
3456 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
3457 mov 48+8(%rsp),%rdx # pull n0*a[0]
David Benjamin4969cc92016-04-22 15:02:23 -04003458 lea 8*8($nptr),$nptr
Adam Langleyd9e397b2015-01-22 14:27:53 -08003459 adc 8*0($tptr),%r8
3460 adc 8*1($tptr),%r9
3461 adc 8*2($tptr),%r10
3462 adc 8*3($tptr),%r11
3463 adc 8*4($tptr),%r12
3464 adc 8*5($tptr),%r13
3465 adc 8*6($tptr),%r14
3466 adc 8*7($tptr),%r15
3467 lea 8*8($tptr),$tptr
3468 sbb %rax,%rax
3469 sub \$8,%rcx # mov \$-8,%rcx
3470
3471 xor $carry,$carry # of=0, cf=0
3472 mov %rax,16+8(%rsp)
3473 jmp .Lsqrx8x_tail
3474
3475.align 32
3476.Lsqrx8x_tail_done:
Robert Sloan4d1ac502017-02-06 08:36:14 -08003477 xor %rax,%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08003478 add 24+8(%rsp),%r8 # can this overflow?
Adam Langley4139edb2016-01-13 15:00:54 -08003479 adc \$0,%r9
3480 adc \$0,%r10
3481 adc \$0,%r11
3482 adc \$0,%r12
3483 adc \$0,%r13
3484 adc \$0,%r14
Robert Sloan4d1ac502017-02-06 08:36:14 -08003485 adc \$0,%r15
3486 adc \$0,%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08003487
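# All of n[] has been folded in for this window: restore the saved carry,
# add the next eight t[] limbs together with the previous top-most carry,
# then either loop for the next reduction window or fall through with the
# final carry left in %rax.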
3488 sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
3489.Lsqrx8x_no_tail: # %cf is 0 if jumped here
3490 adc 8*0($tptr),%r8
3491 movq %xmm3,%rcx
3492 adc 8*1($tptr),%r9
David Benjamin4969cc92016-04-22 15:02:23 -04003493 mov 8*7($nptr),$carry
Adam Langleyd9e397b2015-01-22 14:27:53 -08003494 movq %xmm2,$nptr # restore $nptr
3495 adc 8*2($tptr),%r10
3496 adc 8*3($tptr),%r11
3497 adc 8*4($tptr),%r12
3498 adc 8*5($tptr),%r13
3499 adc 8*6($tptr),%r14
3500 adc 8*7($tptr),%r15
Robert Sloan4d1ac502017-02-06 08:36:14 -08003501 adc \$0,%rax # top-most carry
Adam Langleyd9e397b2015-01-22 14:27:53 -08003502
3503 mov 32+8(%rsp),%rbx # n0
3504 mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8"
3505
3506 mov %r8,8*0($tptr) # store top 512 bits
3507 lea 8*8($tptr),%r8 # borrow %r8
3508 mov %r9,8*1($tptr)
3509 mov %r10,8*2($tptr)
3510 mov %r11,8*3($tptr)
3511 mov %r12,8*4($tptr)
3512 mov %r13,8*5($tptr)
3513 mov %r14,8*6($tptr)
3514 mov %r15,8*7($tptr)
3515
3516 lea 8*8($tptr,%rcx),$tptr # start of current t[] window
3517 cmp 8+8(%rsp),%r8 # end of t[]?
3518 jb .Lsqrx8x_reduction_loop
David Benjamin4969cc92016-04-22 15:02:23 -04003519 ret
3520.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
Adam Langleyd9e397b2015-01-22 14:27:53 -08003521___
3522}
3523##############################################################
3524# Post-condition, 4x unrolled
3525#
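# __bn_postx4x_internal performs the final conditional subtraction in
# constant time: the top-most carry in %rax is negated into an
# all-zeroes/all-ones mask, ANDN then selects either 0 or ~n[i] (n[0] is
# pre-decremented so the selected value acts as -n[0]), and a 4x-unrolled
# add-with-carry writes either t[] or t[]-n[] to rp[] without a
# data-dependent branch.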
3526{
3527my ($rptr,$nptr)=("%rdx","%rbp");
Adam Langleyd9e397b2015-01-22 14:27:53 -08003528$code.=<<___;
David Benjamin4969cc92016-04-22 15:02:23 -04003529.align 32
3530__bn_postx4x_internal:
3531 mov 8*0($nptr),%r12
Adam Langleyd9e397b2015-01-22 14:27:53 -08003532 mov %rcx,%r10 # -$num
Adam Langleyd9e397b2015-01-22 14:27:53 -08003533 mov %rcx,%r9 # -$num
David Benjamin4969cc92016-04-22 15:02:23 -04003534 neg %rax
3535 sar \$3+2,%rcx
Adam Langleyd9e397b2015-01-22 14:27:53 -08003536 #lea 48+8(%rsp,%r9),$tptr
Adam Langleyd9e397b2015-01-22 14:27:53 -08003537 movq %xmm1,$rptr # restore $rptr
3538 movq %xmm1,$aptr # prepare for back-to-back call
David Benjamin4969cc92016-04-22 15:02:23 -04003539 dec %r12 # so that after 'not' we get -n[0]
3540 mov 8*1($nptr),%r13
3541 xor %r8,%r8
3542 mov 8*2($nptr),%r14
3543 mov 8*3($nptr),%r15
3544 jmp .Lsqrx4x_sub_entry
Adam Langleyd9e397b2015-01-22 14:27:53 -08003545
David Benjamin4969cc92016-04-22 15:02:23 -04003546.align 16
Adam Langleyd9e397b2015-01-22 14:27:53 -08003547.Lsqrx4x_sub:
David Benjamin4969cc92016-04-22 15:02:23 -04003548 mov 8*0($nptr),%r12
3549 mov 8*1($nptr),%r13
3550 mov 8*2($nptr),%r14
3551 mov 8*3($nptr),%r15
3552.Lsqrx4x_sub_entry:
3553 andn %rax,%r12,%r12
3554 lea 8*4($nptr),$nptr
3555 andn %rax,%r13,%r13
3556 andn %rax,%r14,%r14
3557 andn %rax,%r15,%r15
3558
3559 neg %r8 # mov %r8,%cf
3560 adc 8*0($tptr),%r12
3561 adc 8*1($tptr),%r13
3562 adc 8*2($tptr),%r14
3563 adc 8*3($tptr),%r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08003564 mov %r12,8*0($rptr)
David Benjamin4969cc92016-04-22 15:02:23 -04003565 lea 8*4($tptr),$tptr
Adam Langleyd9e397b2015-01-22 14:27:53 -08003566 mov %r13,8*1($rptr)
David Benjamin4969cc92016-04-22 15:02:23 -04003567 sbb %r8,%r8 # mov %cf,%r8
Adam Langleyd9e397b2015-01-22 14:27:53 -08003568 mov %r14,8*2($rptr)
3569 mov %r15,8*3($rptr)
3570 lea 8*4($rptr),$rptr
3571
3572 inc %rcx
3573 jnz .Lsqrx4x_sub
David Benjamin4969cc92016-04-22 15:02:23 -04003574
Adam Langleyd9e397b2015-01-22 14:27:53 -08003575 neg %r9 # restore $num
3576
3577 ret
David Benjamin4969cc92016-04-22 15:02:23 -04003578.size __bn_postx4x_internal,.-__bn_postx4x_internal
Adam Langleyd9e397b2015-01-22 14:27:53 -08003579___
David Benjamin4969cc92016-04-22 15:02:23 -04003580}
Adam Langleyd9e397b2015-01-22 14:27:53 -08003581}}}
3582{
3583my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
3584 ("%rdi","%esi","%rdx","%ecx"); # Unix order
3585my $out=$inp;
3586my $STRIDE=2**5*8;
3587my $N=$STRIDE/4;
3588
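# bn_scatter5 stores a num-limb value into column $idx of the powers
# table: consecutive limbs of one power are $STRIDE (2^5*8 = 256) bytes
# apart, so the 32 pre-computed powers are interleaved limb by limb.
# bn_gather5 reads column $idx back by touching every column and masking,
# so its memory access pattern does not depend on the secret index.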
3589$code.=<<___;
3590.globl bn_scatter5
3591.type bn_scatter5,\@abi-omnipotent
3592.align 16
3593bn_scatter5:
3594 cmp \$0, $num
3595 jz .Lscatter_epilogue
3596 lea ($tbl,$idx,8),$tbl
3597.Lscatter:
3598 mov ($inp),%rax
3599 lea 8($inp),$inp
3600 mov %rax,($tbl)
3601 lea 32*8($tbl),$tbl
3602 sub \$1,$num
3603 jnz .Lscatter
3604.Lscatter_epilogue:
3605 ret
3606.size bn_scatter5,.-bn_scatter5
3607
3608.globl bn_gather5
3609.type bn_gather5,\@abi-omnipotent
David Benjamin4969cc92016-04-22 15:02:23 -04003610.align 32
Adam Langleyd9e397b2015-01-22 14:27:53 -08003611bn_gather5:
David Benjamin4969cc92016-04-22 15:02:23 -04003612.LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases
Adam Langleyd9e397b2015-01-22 14:27:53 -08003613 # I can't trust assembler to use specific encoding:-(
David Benjamin4969cc92016-04-22 15:02:23 -04003614 .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10
3615 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp
3616 lea .Linc(%rip),%rax
3617 and \$-16,%rsp # shouldn't be formally required
3618
3619 movd $idx,%xmm5
3620 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000
3621 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002
3622 lea 128($tbl),%r11 # size optimization
3623 lea 128(%rsp),%rax # size optimization
3624
3625 pshufd \$0,%xmm5,%xmm5 # broadcast $idx
3626 movdqa %xmm1,%xmm4
3627 movdqa %xmm1,%xmm2
3628___
3629########################################################################
3630# calculate mask by comparing 0..31 to $idx and save result to stack
3631#
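# In C-like pseudocode, the mask loop here and the .Lgather loop further
# down amount to the following (a sketch, assuming 32 columns of 64-bit
# limbs):
#
#	for (i = 0; i < 32; i++) mask[i] = (i == idx) ? ~0 : 0;
#	for (limb = 0; limb < num; limb++) {
#		acc = 0;
#		for (i = 0; i < 32; i++) acc |= tbl[limb][i] & mask[i];
#		out[limb] = acc;
#	}
#
# so every table element is loaded regardless of idx.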
3632for($i=0;$i<$STRIDE/16;$i+=4) {
3633$code.=<<___;
3634 paddd %xmm0,%xmm1
3635 pcmpeqd %xmm5,%xmm0 # compare to 1,0
3636___
3637$code.=<<___ if ($i);
3638 movdqa %xmm3,`16*($i-1)-128`(%rax)
Adam Langleyd9e397b2015-01-22 14:27:53 -08003639___
3640$code.=<<___;
David Benjamin4969cc92016-04-22 15:02:23 -04003641 movdqa %xmm4,%xmm3
Adam Langleyd9e397b2015-01-22 14:27:53 -08003642
David Benjamin4969cc92016-04-22 15:02:23 -04003643 paddd %xmm1,%xmm2
3644 pcmpeqd %xmm5,%xmm1 # compare to 3,2
3645 movdqa %xmm0,`16*($i+0)-128`(%rax)
3646 movdqa %xmm4,%xmm0
3647
3648 paddd %xmm2,%xmm3
3649 pcmpeqd %xmm5,%xmm2 # compare to 5,4
3650 movdqa %xmm1,`16*($i+1)-128`(%rax)
3651 movdqa %xmm4,%xmm1
3652
3653 paddd %xmm3,%xmm0
3654 pcmpeqd %xmm5,%xmm3 # compare to 7,6
3655 movdqa %xmm2,`16*($i+2)-128`(%rax)
3656 movdqa %xmm4,%xmm2
3657___
3658}
3659$code.=<<___;
3660 movdqa %xmm3,`16*($i-1)-128`(%rax)
3661 jmp .Lgather
3662
3663.align 32
3664.Lgather:
3665 pxor %xmm4,%xmm4
3666 pxor %xmm5,%xmm5
3667___
3668for($i=0;$i<$STRIDE/16;$i+=4) {
3669$code.=<<___;
3670 movdqa `16*($i+0)-128`(%r11),%xmm0
3671 movdqa `16*($i+1)-128`(%r11),%xmm1
3672 movdqa `16*($i+2)-128`(%r11),%xmm2
3673 pand `16*($i+0)-128`(%rax),%xmm0
3674 movdqa `16*($i+3)-128`(%r11),%xmm3
3675 pand `16*($i+1)-128`(%rax),%xmm1
3676 por %xmm0,%xmm4
3677 pand `16*($i+2)-128`(%rax),%xmm2
3678 por %xmm1,%xmm5
3679 pand `16*($i+3)-128`(%rax),%xmm3
3680 por %xmm2,%xmm4
3681 por %xmm3,%xmm5
3682___
3683}
3684$code.=<<___;
3685 por %xmm5,%xmm4
3686 lea $STRIDE(%r11),%r11
3687 pshufd \$0x4e,%xmm4,%xmm0
3688 por %xmm4,%xmm0
Adam Langleyd9e397b2015-01-22 14:27:53 -08003689 movq %xmm0,($out) # m0=bp[0]
3690 lea 8($out),$out
3691 sub \$1,$num
3692 jnz .Lgather
David Benjamin4969cc92016-04-22 15:02:23 -04003693
3694 lea (%r10),%rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -08003695 ret
3696.LSEH_end_bn_gather5:
3697.size bn_gather5,.-bn_gather5
3698___
3699}
3700$code.=<<___;
3701.align 64
David Benjamin4969cc92016-04-22 15:02:23 -04003702.Linc:
3703 .long 0,0, 1,1
3704 .long 2,2, 2,2
Adam Langleyd9e397b2015-01-22 14:27:53 -08003705.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
3706___
3707
3708# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3709# CONTEXT *context,DISPATCHER_CONTEXT *disp)
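# mul_handler is the common Win64 unwind handler for the routines above.
# It compares the faulting RIP against the prologue/body/epilogue labels
# recorded in HandlerData[] to decide whether the stack pointer and the
# callee-saved registers still need to be recovered, then lets
# RtlVirtualUnwind continue the unwind with the adjusted CONTEXT.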
3710if ($win64) {
3711$rec="%rcx";
3712$frame="%rdx";
3713$context="%r8";
3714$disp="%r9";
3715
3716$code.=<<___;
3717.extern __imp_RtlVirtualUnwind
3718.type mul_handler,\@abi-omnipotent
3719.align 16
3720mul_handler:
3721 push %rsi
3722 push %rdi
3723 push %rbx
3724 push %rbp
3725 push %r12
3726 push %r13
3727 push %r14
3728 push %r15
3729 pushfq
3730 sub \$64,%rsp
3731
3732 mov 120($context),%rax # pull context->Rax
3733 mov 248($context),%rbx # pull context->Rip
3734
3735 mov 8($disp),%rsi # disp->ImageBase
3736 mov 56($disp),%r11 # disp->HandlerData
3737
3738 mov 0(%r11),%r10d # HandlerData[0]
3739 lea (%rsi,%r10),%r10 # end of prologue label
3740 cmp %r10,%rbx # context->Rip<end of prologue label
3741 jb .Lcommon_seh_tail
3742
Robert Sloana94fe052017-02-21 08:49:28 -08003743 mov 4(%r11),%r10d # HandlerData[1]
3744 lea (%rsi,%r10),%r10 # beginning of body label
3745 cmp %r10,%rbx # context->Rip<body label
3746 jb .Lcommon_pop_regs
3747
Adam Langleyd9e397b2015-01-22 14:27:53 -08003748 mov 152($context),%rax # pull context->Rsp
3749
Robert Sloana94fe052017-02-21 08:49:28 -08003750 mov 8(%r11),%r10d # HandlerData[2]
Adam Langleyd9e397b2015-01-22 14:27:53 -08003751 lea (%rsi,%r10),%r10 # epilogue label
3752 cmp %r10,%rbx # context->Rip>=epilogue label
3753 jae .Lcommon_seh_tail
3754
3755 lea .Lmul_epilogue(%rip),%r10
3756 cmp %r10,%rbx
David Benjamin4969cc92016-04-22 15:02:23 -04003757 ja .Lbody_40
Adam Langleyd9e397b2015-01-22 14:27:53 -08003758
3759 mov 192($context),%r10 # pull $num
3760 mov 8(%rax,%r10,8),%rax # pull saved stack pointer
David Benjamin4969cc92016-04-22 15:02:23 -04003761
Robert Sloana94fe052017-02-21 08:49:28 -08003762 jmp .Lcommon_pop_regs
Adam Langleyd9e397b2015-01-22 14:27:53 -08003763
3764.Lbody_40:
3765 mov 40(%rax),%rax # pull saved stack pointer
Robert Sloana94fe052017-02-21 08:49:28 -08003766.Lcommon_pop_regs:
Adam Langleyd9e397b2015-01-22 14:27:53 -08003767 mov -8(%rax),%rbx
3768 mov -16(%rax),%rbp
3769 mov -24(%rax),%r12
3770 mov -32(%rax),%r13
3771 mov -40(%rax),%r14
3772 mov -48(%rax),%r15
3773 mov %rbx,144($context) # restore context->Rbx
3774 mov %rbp,160($context) # restore context->Rbp
3775 mov %r12,216($context) # restore context->R12
3776 mov %r13,224($context) # restore context->R13
3777 mov %r14,232($context) # restore context->R14
3778 mov %r15,240($context) # restore context->R15
Adam Langleyd9e397b2015-01-22 14:27:53 -08003779
3780.Lcommon_seh_tail:
3781 mov 8(%rax),%rdi
3782 mov 16(%rax),%rsi
3783 mov %rax,152($context) # restore context->Rsp
3784 mov %rsi,168($context) # restore context->Rsi
3785 mov %rdi,176($context) # restore context->Rdi
3786
3787 mov 40($disp),%rdi # disp->ContextRecord
3788 mov $context,%rsi # context
3789	mov	\$154,%ecx		# sizeof(CONTEXT) in quad-words
3790 .long 0xa548f3fc # cld; rep movsq
3791
3792 mov $disp,%rsi
3793 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3794 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3795 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3796 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3797 mov 40(%rsi),%r10 # disp->ContextRecord
3798 lea 56(%rsi),%r11 # &disp->HandlerData
3799 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3800 mov %r10,32(%rsp) # arg5
3801 mov %r11,40(%rsp) # arg6
3802 mov %r12,48(%rsp) # arg7
3803 mov %rcx,56(%rsp) # arg8, (NULL)
3804 call *__imp_RtlVirtualUnwind(%rip)
3805
3806 mov \$1,%eax # ExceptionContinueSearch
3807 add \$64,%rsp
3808 popfq
3809 pop %r15
3810 pop %r14
3811 pop %r13
3812 pop %r12
3813 pop %rbp
3814 pop %rbx
3815 pop %rdi
3816 pop %rsi
3817 ret
3818.size mul_handler,.-mul_handler
3819
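# The .pdata entries below map each routine's code range to its unwind
# information, and the .xdata records point every range at mul_handler
# together with its prologue/body/epilogue HandlerData labels
# (bn_gather5 uses raw UNWIND_INFO instead, since it needs no handler).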
3820.section .pdata
3821.align 4
3822 .rva .LSEH_begin_bn_mul_mont_gather5
3823 .rva .LSEH_end_bn_mul_mont_gather5
3824 .rva .LSEH_info_bn_mul_mont_gather5
3825
3826 .rva .LSEH_begin_bn_mul4x_mont_gather5
3827 .rva .LSEH_end_bn_mul4x_mont_gather5
3828 .rva .LSEH_info_bn_mul4x_mont_gather5
3829
3830 .rva .LSEH_begin_bn_power5
3831 .rva .LSEH_end_bn_power5
3832 .rva .LSEH_info_bn_power5
3833
3834 .rva .LSEH_begin_bn_from_mont8x
3835 .rva .LSEH_end_bn_from_mont8x
3836 .rva .LSEH_info_bn_from_mont8x
3837___
3838$code.=<<___ if ($addx);
3839 .rva .LSEH_begin_bn_mulx4x_mont_gather5
3840 .rva .LSEH_end_bn_mulx4x_mont_gather5
3841 .rva .LSEH_info_bn_mulx4x_mont_gather5
3842
3843 .rva .LSEH_begin_bn_powerx5
3844 .rva .LSEH_end_bn_powerx5
3845 .rva .LSEH_info_bn_powerx5
3846___
3847$code.=<<___;
3848 .rva .LSEH_begin_bn_gather5
3849 .rva .LSEH_end_bn_gather5
3850 .rva .LSEH_info_bn_gather5
3851
3852.section .xdata
3853.align 8
3854.LSEH_info_bn_mul_mont_gather5:
3855 .byte 9,0,0,0
3856 .rva mul_handler
Robert Sloana94fe052017-02-21 08:49:28 -08003857 .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[]
Adam Langleyd9e397b2015-01-22 14:27:53 -08003858.align 8
3859.LSEH_info_bn_mul4x_mont_gather5:
3860 .byte 9,0,0,0
3861 .rva mul_handler
Robert Sloana94fe052017-02-21 08:49:28 -08003862 .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
Adam Langleyd9e397b2015-01-22 14:27:53 -08003863.align 8
3864.LSEH_info_bn_power5:
3865 .byte 9,0,0,0
3866 .rva mul_handler
Robert Sloana94fe052017-02-21 08:49:28 -08003867 .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[]
Adam Langleyd9e397b2015-01-22 14:27:53 -08003868.align 8
3869.LSEH_info_bn_from_mont8x:
3870 .byte 9,0,0,0
3871 .rva mul_handler
Robert Sloana94fe052017-02-21 08:49:28 -08003872 .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[]
Adam Langleyd9e397b2015-01-22 14:27:53 -08003873___
3874$code.=<<___ if ($addx);
3875.align 8
3876.LSEH_info_bn_mulx4x_mont_gather5:
3877 .byte 9,0,0,0
3878 .rva mul_handler
Robert Sloana94fe052017-02-21 08:49:28 -08003879 .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
Adam Langleyd9e397b2015-01-22 14:27:53 -08003880.align 8
3881.LSEH_info_bn_powerx5:
3882 .byte 9,0,0,0
3883 .rva mul_handler
Robert Sloana94fe052017-02-21 08:49:28 -08003884 .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
Adam Langleyd9e397b2015-01-22 14:27:53 -08003885___
3886$code.=<<___;
3887.align 8
3888.LSEH_info_bn_gather5:
David Benjamin4969cc92016-04-22 15:02:23 -04003889 .byte 0x01,0x0b,0x03,0x0a
3890 .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108
3891 .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08003892.align 8
3893___
3894}
3895
3896$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3897
3898print $code;
3899close STDOUT;