#!/usr/bin/env perl

##############################################################################
#
# Copyright (c) 2012, Intel Corporation
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the
#   distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
##############################################################################
# Developers and authors:
# Shay Gueron (1, 2), and Vlad Krasnov (1)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
##############################################################################
# Reference:
# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular
#     Exponentiation, Using Advanced Vector Instructions Architectures",
#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,
#     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012
# [2] S. Gueron: "Efficient Software Implementations of Modular
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE
#     Proceedings of 9th International Conference on Information Technology:
#     New Generations (ITNG 2012), pp.821-823 (2012)
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
#     resistant 1024-bit modular exponentiation, for optimizing RSA2048
#     on AVX2 capable x86_64 platforms",
#     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest
##############################################################################
#
# +13% improvement over original submission by <appro@openssl.org>
#
# rsa2048 sign/sec	OpenSSL 1.0.1	scalar(*)	this
# 2.3GHz Haswell	621		765/+23%	1113/+79%
# 2.3GHz Broadwell(**)	688		1200(***)/+74%	1120/+63%
#
# (*)	if system doesn't support AVX2, for reference purposes;
# (**)	scaled to 2.3GHz to simplify comparison;
# (***)	scalar AD*X code is faster than AVX2 and is preferred code
#	path for Broadwell;

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable these after testing. $avx goes up to 2 and $addx to 1.
$avx = 2;
$addx = 1;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT = *OUT;

if ($avx>1) {{{
{ # void AMS_WW(
my $rp="%rdi"; # BN_ULONG *rp,
my $ap="%rsi"; # const BN_ULONG *ap,
my $np="%rdx"; # const BN_ULONG *np,
my $n0="%ecx"; # const BN_ULONG n0,
my $rep="%r8d"; # int repeat);

# The registers that hold the accumulated redundant result
# The AMM works on 1024-bit operands, and the redundant word size is 29 bits.
# Therefore: ceil(1024/29)/4 = 9
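#
# A worked version of that count (an illustration only, not used by the
# generator below): each 1024-bit operand is split into 29-bit digits, one
# digit per 64-bit lane, four digits per 256-bit ymm register.
#
#   use POSIX qw(ceil);
#   my $digits = ceil(1024/29);   # 36 redundant digits of 29 bits each
#   my $regs   = $digits/4;       # 9 ymm registers hold one full operand
#   my $slack  = 64 - 2*29;       # 6 bits of per-lane headroom for carries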
my $ACC0="%ymm0";
my $ACC1="%ymm1";
my $ACC2="%ymm2";
my $ACC3="%ymm3";
my $ACC4="%ymm4";
my $ACC5="%ymm5";
my $ACC6="%ymm6";
my $ACC7="%ymm7";
my $ACC8="%ymm8";
my $ACC9="%ymm9";
# Registers that hold the broadcasted words of bp, currently used
my $B1="%ymm10";
my $B2="%ymm11";
# Registers that hold the broadcasted words of Y, currently used
my $Y1="%ymm12";
my $Y2="%ymm13";
# Helper registers
my $TEMP1="%ymm14";
my $AND_MASK="%ymm15";
# alu registers that hold the first words of the ACC
my $r0="%r9";
my $r1="%r10";
my $r2="%r11";
my $r3="%r12";

my $i="%r14d"; # loop counter
my $tmp = "%r15";

my $FrameSize=32*18+32*8; # place for A^2 and 2*A

my $aap=$r0;
my $tp0="%rbx";
my $tp1=$r3;
my $tpa=$tmp;

$np="%r13"; # reassigned argument

$code.=<<___;
.text

.globl rsaz_1024_sqr_avx2
.type rsaz_1024_sqr_avx2,\@function,5
.align 64
rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2
Robert Sloana94fe052017-02-21 08:49:28 -0800148.cfi_startproc
Adam Langleyd9e397b2015-01-22 14:27:53 -0800149 lea (%rsp), %rax
Robert Sloana94fe052017-02-21 08:49:28 -0800150.cfi_def_cfa_register %rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800151 push %rbx
Robert Sloana94fe052017-02-21 08:49:28 -0800152.cfi_push %rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800153 push %rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800154.cfi_push %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800155 push %r12
Robert Sloana94fe052017-02-21 08:49:28 -0800156.cfi_push %r12
Adam Langleyd9e397b2015-01-22 14:27:53 -0800157 push %r13
Robert Sloana94fe052017-02-21 08:49:28 -0800158.cfi_push %r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800159 push %r14
Robert Sloana94fe052017-02-21 08:49:28 -0800160.cfi_push %r14
Adam Langleyd9e397b2015-01-22 14:27:53 -0800161 push %r15
Robert Sloana94fe052017-02-21 08:49:28 -0800162.cfi_push %r15
Adam Langleyd9e397b2015-01-22 14:27:53 -0800163 vzeroupper
164___
165$code.=<<___ if ($win64);
166 lea -0xa8(%rsp),%rsp
167 vmovaps %xmm6,-0xd8(%rax)
168 vmovaps %xmm7,-0xc8(%rax)
169 vmovaps %xmm8,-0xb8(%rax)
170 vmovaps %xmm9,-0xa8(%rax)
171 vmovaps %xmm10,-0x98(%rax)
172 vmovaps %xmm11,-0x88(%rax)
173 vmovaps %xmm12,-0x78(%rax)
174 vmovaps %xmm13,-0x68(%rax)
175 vmovaps %xmm14,-0x58(%rax)
176 vmovaps %xmm15,-0x48(%rax)
177.Lsqr_1024_body:
178___
179$code.=<<___;
180 mov %rax,%rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800181.cfi_def_cfa_register %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800182 mov %rdx, $np # reassigned argument
183 sub \$$FrameSize, %rsp
184 mov $np, $tmp
185 sub \$-128, $rp # size optimization
186 sub \$-128, $ap
187 sub \$-128, $np
188
189 and \$4095, $tmp # see if $np crosses page
190 add \$32*10, $tmp
191 shr \$12, $tmp
192 vpxor $ACC9,$ACC9,$ACC9
193 jz .Lsqr_1024_no_n_copy
194
	# unaligned 256-bit load that crosses page boundary can
	# cause >2x performance degradation here, so if $np does
	# cross page boundary, copy it to stack and make sure stack
	# frame doesn't...
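	#
	# (A sketch of the check above, in C-like pseudocode with made-up
	#  names: the ten 32-byte vectors of the modulus span 32*10 bytes, so
	#      crosses = ((np_addr & 4095) + 32*10) >> 12;
	#  is non-zero exactly when that region runs into the next 4K page,
	#  i.e. when one of the 256-bit loads could straddle a page boundary.)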
199 sub \$32*10,%rsp
200 vmovdqu 32*0-128($np), $ACC0
201 and \$-2048, %rsp
202 vmovdqu 32*1-128($np), $ACC1
203 vmovdqu 32*2-128($np), $ACC2
204 vmovdqu 32*3-128($np), $ACC3
205 vmovdqu 32*4-128($np), $ACC4
206 vmovdqu 32*5-128($np), $ACC5
207 vmovdqu 32*6-128($np), $ACC6
208 vmovdqu 32*7-128($np), $ACC7
209 vmovdqu 32*8-128($np), $ACC8
210 lea $FrameSize+128(%rsp),$np
211 vmovdqu $ACC0, 32*0-128($np)
212 vmovdqu $ACC1, 32*1-128($np)
213 vmovdqu $ACC2, 32*2-128($np)
214 vmovdqu $ACC3, 32*3-128($np)
215 vmovdqu $ACC4, 32*4-128($np)
216 vmovdqu $ACC5, 32*5-128($np)
217 vmovdqu $ACC6, 32*6-128($np)
218 vmovdqu $ACC7, 32*7-128($np)
219 vmovdqu $ACC8, 32*8-128($np)
220 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero
221
222.Lsqr_1024_no_n_copy:
223 and \$-1024, %rsp
224
225 vmovdqu 32*1-128($ap), $ACC1
226 vmovdqu 32*2-128($ap), $ACC2
227 vmovdqu 32*3-128($ap), $ACC3
228 vmovdqu 32*4-128($ap), $ACC4
229 vmovdqu 32*5-128($ap), $ACC5
230 vmovdqu 32*6-128($ap), $ACC6
231 vmovdqu 32*7-128($ap), $ACC7
232 vmovdqu 32*8-128($ap), $ACC8
233
234 lea 192(%rsp), $tp0 # 64+128=192
235 vpbroadcastq .Land_mask(%rip), $AND_MASK
236 jmp .LOOP_GRANDE_SQR_1024
237
238.align 32
239.LOOP_GRANDE_SQR_1024:
240 lea 32*18+128(%rsp), $aap # size optimization
241 lea 448(%rsp), $tp1 # 64+128+256=448
242
	# the squaring is performed as described in Variant B of
	# "Speeding up Big-Number Squaring", so start by calculating
	# the A*2=A+A vector
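	#
	# (Informally, with A = sum_i a_i*2^(29*i):
	#      A^2 = sum_i a_i^2*2^(58*i) + sum_(i<j) a_i*(2*a_j)*2^(29*(i+j))
	#  so the diagonal terms use the original digits, while each cross
	#  product is taken just once against the pre-doubled copy stored
	#  at aap.)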
246 vpaddq $ACC1, $ACC1, $ACC1
247 vpbroadcastq 32*0-128($ap), $B1
248 vpaddq $ACC2, $ACC2, $ACC2
249 vmovdqa $ACC1, 32*0-128($aap)
250 vpaddq $ACC3, $ACC3, $ACC3
251 vmovdqa $ACC2, 32*1-128($aap)
252 vpaddq $ACC4, $ACC4, $ACC4
253 vmovdqa $ACC3, 32*2-128($aap)
254 vpaddq $ACC5, $ACC5, $ACC5
255 vmovdqa $ACC4, 32*3-128($aap)
256 vpaddq $ACC6, $ACC6, $ACC6
257 vmovdqa $ACC5, 32*4-128($aap)
258 vpaddq $ACC7, $ACC7, $ACC7
259 vmovdqa $ACC6, 32*5-128($aap)
260 vpaddq $ACC8, $ACC8, $ACC8
261 vmovdqa $ACC7, 32*6-128($aap)
262 vpxor $ACC9, $ACC9, $ACC9
263 vmovdqa $ACC8, 32*7-128($aap)
264
265 vpmuludq 32*0-128($ap), $B1, $ACC0
266 vpbroadcastq 32*1-128($ap), $B2
267 vmovdqu $ACC9, 32*9-192($tp0) # zero upper half
268 vpmuludq $B1, $ACC1, $ACC1
269 vmovdqu $ACC9, 32*10-448($tp1)
270 vpmuludq $B1, $ACC2, $ACC2
271 vmovdqu $ACC9, 32*11-448($tp1)
272 vpmuludq $B1, $ACC3, $ACC3
273 vmovdqu $ACC9, 32*12-448($tp1)
274 vpmuludq $B1, $ACC4, $ACC4
275 vmovdqu $ACC9, 32*13-448($tp1)
276 vpmuludq $B1, $ACC5, $ACC5
277 vmovdqu $ACC9, 32*14-448($tp1)
278 vpmuludq $B1, $ACC6, $ACC6
279 vmovdqu $ACC9, 32*15-448($tp1)
280 vpmuludq $B1, $ACC7, $ACC7
281 vmovdqu $ACC9, 32*16-448($tp1)
282 vpmuludq $B1, $ACC8, $ACC8
283 vpbroadcastq 32*2-128($ap), $B1
284 vmovdqu $ACC9, 32*17-448($tp1)
285
286 mov $ap, $tpa
287 mov \$4, $i
288 jmp .Lsqr_entry_1024
289___
290$TEMP0=$Y1;
291$TEMP2=$Y2;
292$code.=<<___;
293.align 32
294.LOOP_SQR_1024:
295 vpbroadcastq 32*1-128($tpa), $B2
296 vpmuludq 32*0-128($ap), $B1, $ACC0
297 vpaddq 32*0-192($tp0), $ACC0, $ACC0
298 vpmuludq 32*0-128($aap), $B1, $ACC1
299 vpaddq 32*1-192($tp0), $ACC1, $ACC1
300 vpmuludq 32*1-128($aap), $B1, $ACC2
301 vpaddq 32*2-192($tp0), $ACC2, $ACC2
302 vpmuludq 32*2-128($aap), $B1, $ACC3
303 vpaddq 32*3-192($tp0), $ACC3, $ACC3
304 vpmuludq 32*3-128($aap), $B1, $ACC4
305 vpaddq 32*4-192($tp0), $ACC4, $ACC4
306 vpmuludq 32*4-128($aap), $B1, $ACC5
307 vpaddq 32*5-192($tp0), $ACC5, $ACC5
308 vpmuludq 32*5-128($aap), $B1, $ACC6
309 vpaddq 32*6-192($tp0), $ACC6, $ACC6
310 vpmuludq 32*6-128($aap), $B1, $ACC7
311 vpaddq 32*7-192($tp0), $ACC7, $ACC7
312 vpmuludq 32*7-128($aap), $B1, $ACC8
313 vpbroadcastq 32*2-128($tpa), $B1
314 vpaddq 32*8-192($tp0), $ACC8, $ACC8
315.Lsqr_entry_1024:
316 vmovdqu $ACC0, 32*0-192($tp0)
317 vmovdqu $ACC1, 32*1-192($tp0)
318
319 vpmuludq 32*1-128($ap), $B2, $TEMP0
320 vpaddq $TEMP0, $ACC2, $ACC2
321 vpmuludq 32*1-128($aap), $B2, $TEMP1
322 vpaddq $TEMP1, $ACC3, $ACC3
323 vpmuludq 32*2-128($aap), $B2, $TEMP2
324 vpaddq $TEMP2, $ACC4, $ACC4
325 vpmuludq 32*3-128($aap), $B2, $TEMP0
326 vpaddq $TEMP0, $ACC5, $ACC5
327 vpmuludq 32*4-128($aap), $B2, $TEMP1
328 vpaddq $TEMP1, $ACC6, $ACC6
329 vpmuludq 32*5-128($aap), $B2, $TEMP2
330 vpaddq $TEMP2, $ACC7, $ACC7
331 vpmuludq 32*6-128($aap), $B2, $TEMP0
332 vpaddq $TEMP0, $ACC8, $ACC8
333 vpmuludq 32*7-128($aap), $B2, $ACC0
334 vpbroadcastq 32*3-128($tpa), $B2
335 vpaddq 32*9-192($tp0), $ACC0, $ACC0
336
337 vmovdqu $ACC2, 32*2-192($tp0)
338 vmovdqu $ACC3, 32*3-192($tp0)
339
340 vpmuludq 32*2-128($ap), $B1, $TEMP2
341 vpaddq $TEMP2, $ACC4, $ACC4
342 vpmuludq 32*2-128($aap), $B1, $TEMP0
343 vpaddq $TEMP0, $ACC5, $ACC5
344 vpmuludq 32*3-128($aap), $B1, $TEMP1
345 vpaddq $TEMP1, $ACC6, $ACC6
346 vpmuludq 32*4-128($aap), $B1, $TEMP2
347 vpaddq $TEMP2, $ACC7, $ACC7
348 vpmuludq 32*5-128($aap), $B1, $TEMP0
349 vpaddq $TEMP0, $ACC8, $ACC8
350 vpmuludq 32*6-128($aap), $B1, $TEMP1
351 vpaddq $TEMP1, $ACC0, $ACC0
352 vpmuludq 32*7-128($aap), $B1, $ACC1
353 vpbroadcastq 32*4-128($tpa), $B1
354 vpaddq 32*10-448($tp1), $ACC1, $ACC1
355
356 vmovdqu $ACC4, 32*4-192($tp0)
357 vmovdqu $ACC5, 32*5-192($tp0)
358
359 vpmuludq 32*3-128($ap), $B2, $TEMP0
360 vpaddq $TEMP0, $ACC6, $ACC6
361 vpmuludq 32*3-128($aap), $B2, $TEMP1
362 vpaddq $TEMP1, $ACC7, $ACC7
363 vpmuludq 32*4-128($aap), $B2, $TEMP2
364 vpaddq $TEMP2, $ACC8, $ACC8
365 vpmuludq 32*5-128($aap), $B2, $TEMP0
366 vpaddq $TEMP0, $ACC0, $ACC0
367 vpmuludq 32*6-128($aap), $B2, $TEMP1
368 vpaddq $TEMP1, $ACC1, $ACC1
369 vpmuludq 32*7-128($aap), $B2, $ACC2
370 vpbroadcastq 32*5-128($tpa), $B2
Robert Sloana94fe052017-02-21 08:49:28 -0800371 vpaddq 32*11-448($tp1), $ACC2, $ACC2
Adam Langleyd9e397b2015-01-22 14:27:53 -0800372
373 vmovdqu $ACC6, 32*6-192($tp0)
374 vmovdqu $ACC7, 32*7-192($tp0)
375
376 vpmuludq 32*4-128($ap), $B1, $TEMP0
377 vpaddq $TEMP0, $ACC8, $ACC8
378 vpmuludq 32*4-128($aap), $B1, $TEMP1
379 vpaddq $TEMP1, $ACC0, $ACC0
380 vpmuludq 32*5-128($aap), $B1, $TEMP2
381 vpaddq $TEMP2, $ACC1, $ACC1
382 vpmuludq 32*6-128($aap), $B1, $TEMP0
383 vpaddq $TEMP0, $ACC2, $ACC2
384 vpmuludq 32*7-128($aap), $B1, $ACC3
385 vpbroadcastq 32*6-128($tpa), $B1
386 vpaddq 32*12-448($tp1), $ACC3, $ACC3
387
388 vmovdqu $ACC8, 32*8-192($tp0)
389 vmovdqu $ACC0, 32*9-192($tp0)
390 lea 8($tp0), $tp0
391
392 vpmuludq 32*5-128($ap), $B2, $TEMP2
393 vpaddq $TEMP2, $ACC1, $ACC1
394 vpmuludq 32*5-128($aap), $B2, $TEMP0
395 vpaddq $TEMP0, $ACC2, $ACC2
396 vpmuludq 32*6-128($aap), $B2, $TEMP1
397 vpaddq $TEMP1, $ACC3, $ACC3
398 vpmuludq 32*7-128($aap), $B2, $ACC4
399 vpbroadcastq 32*7-128($tpa), $B2
400 vpaddq 32*13-448($tp1), $ACC4, $ACC4
401
402 vmovdqu $ACC1, 32*10-448($tp1)
403 vmovdqu $ACC2, 32*11-448($tp1)
404
405 vpmuludq 32*6-128($ap), $B1, $TEMP0
406 vpaddq $TEMP0, $ACC3, $ACC3
407 vpmuludq 32*6-128($aap), $B1, $TEMP1
408 vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1
409 vpaddq $TEMP1, $ACC4, $ACC4
410 vpmuludq 32*7-128($aap), $B1, $ACC5
411 vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration
412 vpaddq 32*14-448($tp1), $ACC5, $ACC5
413
414 vmovdqu $ACC3, 32*12-448($tp1)
415 vmovdqu $ACC4, 32*13-448($tp1)
416 lea 8($tpa), $tpa
417
418 vpmuludq 32*7-128($ap), $B2, $TEMP0
419 vpaddq $TEMP0, $ACC5, $ACC5
420 vpmuludq 32*7-128($aap), $B2, $ACC6
421 vpaddq 32*15-448($tp1), $ACC6, $ACC6
422
423 vpmuludq 32*8-128($ap), $ACC0, $ACC7
424 vmovdqu $ACC5, 32*14-448($tp1)
425 vpaddq 32*16-448($tp1), $ACC7, $ACC7
426 vmovdqu $ACC6, 32*15-448($tp1)
427 vmovdqu $ACC7, 32*16-448($tp1)
428 lea 8($tp1), $tp1
429
Robert Sloana94fe052017-02-21 08:49:28 -0800430 dec $i
Adam Langleyd9e397b2015-01-22 14:27:53 -0800431 jnz .LOOP_SQR_1024
432___
433$ZERO = $ACC9;
434$TEMP0 = $B1;
435$TEMP2 = $B2;
436$TEMP3 = $Y1;
437$TEMP4 = $Y2;
438$code.=<<___;
David Benjamin4969cc92016-04-22 15:02:23 -0400439 # we need to fix indices 32-39 to avoid overflow
Adam Langleyd9e397b2015-01-22 14:27:53 -0800440 vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
441 vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
442 vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
443 lea 192(%rsp), $tp0 # 64+128=192
444
445 vpsrlq \$29, $ACC8, $TEMP1
446 vpand $AND_MASK, $ACC8, $ACC8
447 vpsrlq \$29, $ACC1, $TEMP2
448 vpand $AND_MASK, $ACC1, $ACC1
449
450 vpermq \$0x93, $TEMP1, $TEMP1
451 vpxor $ZERO, $ZERO, $ZERO
452 vpermq \$0x93, $TEMP2, $TEMP2
453
454 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
455 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
456 vpaddq $TEMP0, $ACC8, $ACC8
457 vpblendd \$3, $TEMP2, $ZERO, $TEMP2
458 vpaddq $TEMP1, $ACC1, $ACC1
459 vpaddq $TEMP2, $ACC2, $ACC2
460 vmovdqu $ACC1, 32*9-192($tp0)
461 vmovdqu $ACC2, 32*10-192($tp0)
462
463 mov (%rsp), %rax
464 mov 8(%rsp), $r1
465 mov 16(%rsp), $r2
466 mov 24(%rsp), $r3
467 vmovdqu 32*1(%rsp), $ACC1
468 vmovdqu 32*2-192($tp0), $ACC2
469 vmovdqu 32*3-192($tp0), $ACC3
470 vmovdqu 32*4-192($tp0), $ACC4
471 vmovdqu 32*5-192($tp0), $ACC5
472 vmovdqu 32*6-192($tp0), $ACC6
473 vmovdqu 32*7-192($tp0), $ACC7
474
475 mov %rax, $r0
476 imull $n0, %eax
477 and \$0x1fffffff, %eax
478 vmovd %eax, $Y1
479
480 mov %rax, %rdx
481 imulq -128($np), %rax
482 vpbroadcastq $Y1, $Y1
483 add %rax, $r0
484 mov %rdx, %rax
485 imulq 8-128($np), %rax
486 shr \$29, $r0
487 add %rax, $r1
488 mov %rdx, %rax
489 imulq 16-128($np), %rax
490 add $r0, $r1
491 add %rax, $r2
492 imulq 24-128($np), %rdx
493 add %rdx, $r3
494
495 mov $r1, %rax
496 imull $n0, %eax
497 and \$0x1fffffff, %eax
498
499 mov \$9, $i
500 jmp .LOOP_REDUCE_1024
501
502.align 32
503.LOOP_REDUCE_1024:
504 vmovd %eax, $Y2
505 vpbroadcastq $Y2, $Y2
506
507 vpmuludq 32*1-128($np), $Y1, $TEMP0
508 mov %rax, %rdx
509 imulq -128($np), %rax
510 vpaddq $TEMP0, $ACC1, $ACC1
511 add %rax, $r1
512 vpmuludq 32*2-128($np), $Y1, $TEMP1
513 mov %rdx, %rax
514 imulq 8-128($np), %rax
515 vpaddq $TEMP1, $ACC2, $ACC2
516 vpmuludq 32*3-128($np), $Y1, $TEMP2
517 .byte 0x67
518 add %rax, $r2
519 .byte 0x67
520 mov %rdx, %rax
521 imulq 16-128($np), %rax
522 shr \$29, $r1
523 vpaddq $TEMP2, $ACC3, $ACC3
524 vpmuludq 32*4-128($np), $Y1, $TEMP0
525 add %rax, $r3
526 add $r1, $r2
527 vpaddq $TEMP0, $ACC4, $ACC4
528 vpmuludq 32*5-128($np), $Y1, $TEMP1
529 mov $r2, %rax
530 imull $n0, %eax
531 vpaddq $TEMP1, $ACC5, $ACC5
532 vpmuludq 32*6-128($np), $Y1, $TEMP2
533 and \$0x1fffffff, %eax
534 vpaddq $TEMP2, $ACC6, $ACC6
535 vpmuludq 32*7-128($np), $Y1, $TEMP0
536 vpaddq $TEMP0, $ACC7, $ACC7
537 vpmuludq 32*8-128($np), $Y1, $TEMP1
538 vmovd %eax, $Y1
539 #vmovdqu 32*1-8-128($np), $TEMP2 # moved below
540 vpaddq $TEMP1, $ACC8, $ACC8
541 #vmovdqu 32*2-8-128($np), $TEMP0 # moved below
542 vpbroadcastq $Y1, $Y1
543
544 vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above
545 vmovdqu 32*3-8-128($np), $TEMP1
546 mov %rax, %rdx
547 imulq -128($np), %rax
548 vpaddq $TEMP2, $ACC1, $ACC1
549 vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above
550 vmovdqu 32*4-8-128($np), $TEMP2
551 add %rax, $r2
552 mov %rdx, %rax
553 imulq 8-128($np), %rax
554 vpaddq $TEMP0, $ACC2, $ACC2
555 add $r3, %rax
556 shr \$29, $r2
557 vpmuludq $Y2, $TEMP1, $TEMP1
558 vmovdqu 32*5-8-128($np), $TEMP0
559 add $r2, %rax
560 vpaddq $TEMP1, $ACC3, $ACC3
561 vpmuludq $Y2, $TEMP2, $TEMP2
562 vmovdqu 32*6-8-128($np), $TEMP1
563 .byte 0x67
564 mov %rax, $r3
565 imull $n0, %eax
566 vpaddq $TEMP2, $ACC4, $ACC4
567 vpmuludq $Y2, $TEMP0, $TEMP0
568 .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2
569 and \$0x1fffffff, %eax
570 vpaddq $TEMP0, $ACC5, $ACC5
571 vpmuludq $Y2, $TEMP1, $TEMP1
572 vmovdqu 32*8-8-128($np), $TEMP0
573 vpaddq $TEMP1, $ACC6, $ACC6
574 vpmuludq $Y2, $TEMP2, $TEMP2
575 vmovdqu 32*9-8-128($np), $ACC9
576 vmovd %eax, $ACC0 # borrow ACC0 for Y2
577 imulq -128($np), %rax
578 vpaddq $TEMP2, $ACC7, $ACC7
579 vpmuludq $Y2, $TEMP0, $TEMP0
580 vmovdqu 32*1-16-128($np), $TEMP1
581 vpbroadcastq $ACC0, $ACC0
582 vpaddq $TEMP0, $ACC8, $ACC8
583 vpmuludq $Y2, $ACC9, $ACC9
584 vmovdqu 32*2-16-128($np), $TEMP2
585 add %rax, $r3
586
587___
588($ACC0,$Y2)=($Y2,$ACC0);
589$code.=<<___;
590 vmovdqu 32*1-24-128($np), $ACC0
591 vpmuludq $Y1, $TEMP1, $TEMP1
592 vmovdqu 32*3-16-128($np), $TEMP0
593 vpaddq $TEMP1, $ACC1, $ACC1
594 vpmuludq $Y2, $ACC0, $ACC0
595 vpmuludq $Y1, $TEMP2, $TEMP2
596 .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1
597 vpaddq $ACC1, $ACC0, $ACC0
598 vpaddq $TEMP2, $ACC2, $ACC2
599 vpmuludq $Y1, $TEMP0, $TEMP0
600 vmovdqu 32*5-16-128($np), $TEMP2
601 .byte 0x67
602 vmovq $ACC0, %rax
603 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
604 vpaddq $TEMP0, $ACC3, $ACC3
605 vpmuludq $Y1, $TEMP1, $TEMP1
606 vmovdqu 32*6-16-128($np), $TEMP0
607 vpaddq $TEMP1, $ACC4, $ACC4
608 vpmuludq $Y1, $TEMP2, $TEMP2
609 vmovdqu 32*7-16-128($np), $TEMP1
610 vpaddq $TEMP2, $ACC5, $ACC5
611 vpmuludq $Y1, $TEMP0, $TEMP0
612 vmovdqu 32*8-16-128($np), $TEMP2
613 vpaddq $TEMP0, $ACC6, $ACC6
614 vpmuludq $Y1, $TEMP1, $TEMP1
615 shr \$29, $r3
616 vmovdqu 32*9-16-128($np), $TEMP0
617 add $r3, %rax
618 vpaddq $TEMP1, $ACC7, $ACC7
619 vpmuludq $Y1, $TEMP2, $TEMP2
620 #vmovdqu 32*2-24-128($np), $TEMP1 # moved below
621 mov %rax, $r0
622 imull $n0, %eax
623 vpaddq $TEMP2, $ACC8, $ACC8
624 vpmuludq $Y1, $TEMP0, $TEMP0
625 and \$0x1fffffff, %eax
626 vmovd %eax, $Y1
627 vmovdqu 32*3-24-128($np), $TEMP2
628 .byte 0x67
629 vpaddq $TEMP0, $ACC9, $ACC9
630 vpbroadcastq $Y1, $Y1
631
632 vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above
633 vmovdqu 32*4-24-128($np), $TEMP0
634 mov %rax, %rdx
635 imulq -128($np), %rax
636 mov 8(%rsp), $r1
637 vpaddq $TEMP1, $ACC2, $ACC1
638 vpmuludq $Y2, $TEMP2, $TEMP2
639 vmovdqu 32*5-24-128($np), $TEMP1
640 add %rax, $r0
641 mov %rdx, %rax
642 imulq 8-128($np), %rax
643 .byte 0x67
644 shr \$29, $r0
645 mov 16(%rsp), $r2
646 vpaddq $TEMP2, $ACC3, $ACC2
647 vpmuludq $Y2, $TEMP0, $TEMP0
648 vmovdqu 32*6-24-128($np), $TEMP2
649 add %rax, $r1
650 mov %rdx, %rax
651 imulq 16-128($np), %rax
652 vpaddq $TEMP0, $ACC4, $ACC3
653 vpmuludq $Y2, $TEMP1, $TEMP1
654 vmovdqu 32*7-24-128($np), $TEMP0
655 imulq 24-128($np), %rdx # future $r3
656 add %rax, $r2
657 lea ($r0,$r1), %rax
658 vpaddq $TEMP1, $ACC5, $ACC4
659 vpmuludq $Y2, $TEMP2, $TEMP2
660 vmovdqu 32*8-24-128($np), $TEMP1
661 mov %rax, $r1
662 imull $n0, %eax
663 vpmuludq $Y2, $TEMP0, $TEMP0
664 vpaddq $TEMP2, $ACC6, $ACC5
665 vmovdqu 32*9-24-128($np), $TEMP2
666 and \$0x1fffffff, %eax
667 vpaddq $TEMP0, $ACC7, $ACC6
668 vpmuludq $Y2, $TEMP1, $TEMP1
669 add 24(%rsp), %rdx
670 vpaddq $TEMP1, $ACC8, $ACC7
671 vpmuludq $Y2, $TEMP2, $TEMP2
672 vpaddq $TEMP2, $ACC9, $ACC8
673 vmovq $r3, $ACC9
674 mov %rdx, $r3
675
676 dec $i
677 jnz .LOOP_REDUCE_1024
678___
679($ACC0,$Y2)=($Y2,$ACC0);
680$code.=<<___;
681 lea 448(%rsp), $tp1 # size optimization
682 vpaddq $ACC9, $Y2, $ACC0
683 vpxor $ZERO, $ZERO, $ZERO
684
685 vpaddq 32*9-192($tp0), $ACC0, $ACC0
686 vpaddq 32*10-448($tp1), $ACC1, $ACC1
687 vpaddq 32*11-448($tp1), $ACC2, $ACC2
688 vpaddq 32*12-448($tp1), $ACC3, $ACC3
689 vpaddq 32*13-448($tp1), $ACC4, $ACC4
690 vpaddq 32*14-448($tp1), $ACC5, $ACC5
691 vpaddq 32*15-448($tp1), $ACC6, $ACC6
692 vpaddq 32*16-448($tp1), $ACC7, $ACC7
693 vpaddq 32*17-448($tp1), $ACC8, $ACC8
694
695 vpsrlq \$29, $ACC0, $TEMP1
696 vpand $AND_MASK, $ACC0, $ACC0
697 vpsrlq \$29, $ACC1, $TEMP2
698 vpand $AND_MASK, $ACC1, $ACC1
699 vpsrlq \$29, $ACC2, $TEMP3
700 vpermq \$0x93, $TEMP1, $TEMP1
701 vpand $AND_MASK, $ACC2, $ACC2
702 vpsrlq \$29, $ACC3, $TEMP4
703 vpermq \$0x93, $TEMP2, $TEMP2
704 vpand $AND_MASK, $ACC3, $ACC3
705 vpermq \$0x93, $TEMP3, $TEMP3
706
707 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
708 vpermq \$0x93, $TEMP4, $TEMP4
709 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
710 vpaddq $TEMP0, $ACC0, $ACC0
711 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
712 vpaddq $TEMP1, $ACC1, $ACC1
713 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
714 vpaddq $TEMP2, $ACC2, $ACC2
715 vpblendd \$3, $TEMP4, $ZERO, $TEMP4
716 vpaddq $TEMP3, $ACC3, $ACC3
717 vpaddq $TEMP4, $ACC4, $ACC4
718
719 vpsrlq \$29, $ACC0, $TEMP1
720 vpand $AND_MASK, $ACC0, $ACC0
721 vpsrlq \$29, $ACC1, $TEMP2
722 vpand $AND_MASK, $ACC1, $ACC1
723 vpsrlq \$29, $ACC2, $TEMP3
724 vpermq \$0x93, $TEMP1, $TEMP1
725 vpand $AND_MASK, $ACC2, $ACC2
726 vpsrlq \$29, $ACC3, $TEMP4
727 vpermq \$0x93, $TEMP2, $TEMP2
728 vpand $AND_MASK, $ACC3, $ACC3
729 vpermq \$0x93, $TEMP3, $TEMP3
730
731 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
732 vpermq \$0x93, $TEMP4, $TEMP4
733 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
734 vpaddq $TEMP0, $ACC0, $ACC0
735 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
736 vpaddq $TEMP1, $ACC1, $ACC1
737 vmovdqu $ACC0, 32*0-128($rp)
738 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
739 vpaddq $TEMP2, $ACC2, $ACC2
740 vmovdqu $ACC1, 32*1-128($rp)
741 vpblendd \$3, $TEMP4, $ZERO, $TEMP4
742 vpaddq $TEMP3, $ACC3, $ACC3
743 vmovdqu $ACC2, 32*2-128($rp)
744 vpaddq $TEMP4, $ACC4, $ACC4
745 vmovdqu $ACC3, 32*3-128($rp)
746___
747$TEMP5=$ACC0;
748$code.=<<___;
749 vpsrlq \$29, $ACC4, $TEMP1
750 vpand $AND_MASK, $ACC4, $ACC4
751 vpsrlq \$29, $ACC5, $TEMP2
752 vpand $AND_MASK, $ACC5, $ACC5
753 vpsrlq \$29, $ACC6, $TEMP3
754 vpermq \$0x93, $TEMP1, $TEMP1
755 vpand $AND_MASK, $ACC6, $ACC6
756 vpsrlq \$29, $ACC7, $TEMP4
757 vpermq \$0x93, $TEMP2, $TEMP2
758 vpand $AND_MASK, $ACC7, $ACC7
759 vpsrlq \$29, $ACC8, $TEMP5
760 vpermq \$0x93, $TEMP3, $TEMP3
761 vpand $AND_MASK, $ACC8, $ACC8
762 vpermq \$0x93, $TEMP4, $TEMP4
763
764 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
765 vpermq \$0x93, $TEMP5, $TEMP5
766 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
767 vpaddq $TEMP0, $ACC4, $ACC4
768 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
769 vpaddq $TEMP1, $ACC5, $ACC5
770 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
771 vpaddq $TEMP2, $ACC6, $ACC6
772 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
773 vpaddq $TEMP3, $ACC7, $ACC7
774 vpaddq $TEMP4, $ACC8, $ACC8
Robert Sloana94fe052017-02-21 08:49:28 -0800775
Adam Langleyd9e397b2015-01-22 14:27:53 -0800776 vpsrlq \$29, $ACC4, $TEMP1
777 vpand $AND_MASK, $ACC4, $ACC4
778 vpsrlq \$29, $ACC5, $TEMP2
779 vpand $AND_MASK, $ACC5, $ACC5
780 vpsrlq \$29, $ACC6, $TEMP3
781 vpermq \$0x93, $TEMP1, $TEMP1
782 vpand $AND_MASK, $ACC6, $ACC6
783 vpsrlq \$29, $ACC7, $TEMP4
784 vpermq \$0x93, $TEMP2, $TEMP2
785 vpand $AND_MASK, $ACC7, $ACC7
786 vpsrlq \$29, $ACC8, $TEMP5
787 vpermq \$0x93, $TEMP3, $TEMP3
788 vpand $AND_MASK, $ACC8, $ACC8
789 vpermq \$0x93, $TEMP4, $TEMP4
790
791 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
792 vpermq \$0x93, $TEMP5, $TEMP5
793 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
794 vpaddq $TEMP0, $ACC4, $ACC4
795 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
796 vpaddq $TEMP1, $ACC5, $ACC5
797 vmovdqu $ACC4, 32*4-128($rp)
798 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
799 vpaddq $TEMP2, $ACC6, $ACC6
800 vmovdqu $ACC5, 32*5-128($rp)
801 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
802 vpaddq $TEMP3, $ACC7, $ACC7
803 vmovdqu $ACC6, 32*6-128($rp)
804 vpaddq $TEMP4, $ACC8, $ACC8
805 vmovdqu $ACC7, 32*7-128($rp)
806 vmovdqu $ACC8, 32*8-128($rp)
807
808 mov $rp, $ap
809 dec $rep
810 jne .LOOP_GRANDE_SQR_1024
811
812 vzeroall
813 mov %rbp, %rax
Robert Sloana94fe052017-02-21 08:49:28 -0800814.cfi_def_cfa_register %rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800815___
816$code.=<<___ if ($win64);
Robert Sloan5d625782017-02-13 09:55:39 -0800817.Lsqr_1024_in_tail:
Adam Langleyd9e397b2015-01-22 14:27:53 -0800818 movaps -0xd8(%rax),%xmm6
819 movaps -0xc8(%rax),%xmm7
820 movaps -0xb8(%rax),%xmm8
821 movaps -0xa8(%rax),%xmm9
822 movaps -0x98(%rax),%xmm10
823 movaps -0x88(%rax),%xmm11
824 movaps -0x78(%rax),%xmm12
825 movaps -0x68(%rax),%xmm13
826 movaps -0x58(%rax),%xmm14
827 movaps -0x48(%rax),%xmm15
828___
829$code.=<<___;
830 mov -48(%rax),%r15
Robert Sloana94fe052017-02-21 08:49:28 -0800831.cfi_restore %r15
Adam Langleyd9e397b2015-01-22 14:27:53 -0800832 mov -40(%rax),%r14
Robert Sloana94fe052017-02-21 08:49:28 -0800833.cfi_restore %r14
Adam Langleyd9e397b2015-01-22 14:27:53 -0800834 mov -32(%rax),%r13
Robert Sloana94fe052017-02-21 08:49:28 -0800835.cfi_restore %r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800836 mov -24(%rax),%r12
Robert Sloana94fe052017-02-21 08:49:28 -0800837.cfi_restore %r12
Adam Langleyd9e397b2015-01-22 14:27:53 -0800838 mov -16(%rax),%rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800839.cfi_restore %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800840 mov -8(%rax),%rbx
Robert Sloana94fe052017-02-21 08:49:28 -0800841.cfi_restore %rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800842 lea (%rax),%rsp # restore %rsp
Robert Sloana94fe052017-02-21 08:49:28 -0800843.cfi_def_cfa_register %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800844.Lsqr_1024_epilogue:
845 ret
Robert Sloana94fe052017-02-21 08:49:28 -0800846.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -0800847.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
848___
849}
850
851{ # void AMM_WW(
852my $rp="%rdi"; # BN_ULONG *rp,
853my $ap="%rsi"; # const BN_ULONG *ap,
854my $bp="%rdx"; # const BN_ULONG *bp,
855my $np="%rcx"; # const BN_ULONG *np,
856my $n0="%r8d"; # unsigned int n0);
857
858# The registers that hold the accumulated redundant result
859# The AMM works on 1024 bit operands, and redundant word size is 29
860# Therefore: ceil(1024/29)/4 = 9
861my $ACC0="%ymm0";
862my $ACC1="%ymm1";
863my $ACC2="%ymm2";
864my $ACC3="%ymm3";
865my $ACC4="%ymm4";
866my $ACC5="%ymm5";
867my $ACC6="%ymm6";
868my $ACC7="%ymm7";
869my $ACC8="%ymm8";
870my $ACC9="%ymm9";
871
872# Registers that hold the broadcasted words of multiplier, currently used
873my $Bi="%ymm10";
874my $Yi="%ymm11";
875
876# Helper registers
877my $TEMP0=$ACC0;
878my $TEMP1="%ymm12";
879my $TEMP2="%ymm13";
880my $ZERO="%ymm14";
881my $AND_MASK="%ymm15";
882
883# alu registers that hold the first words of the ACC
884my $r0="%r9";
885my $r1="%r10";
886my $r2="%r11";
887my $r3="%r12";
888
889my $i="%r14d";
890my $tmp="%r15";
891
892$bp="%r13"; # reassigned argument
893
894$code.=<<___;
895.globl rsaz_1024_mul_avx2
896.type rsaz_1024_mul_avx2,\@function,5
897.align 64
898rsaz_1024_mul_avx2:
Robert Sloana94fe052017-02-21 08:49:28 -0800899.cfi_startproc
Adam Langleyd9e397b2015-01-22 14:27:53 -0800900 lea (%rsp), %rax
Robert Sloana94fe052017-02-21 08:49:28 -0800901.cfi_def_cfa_register %rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800902 push %rbx
Robert Sloana94fe052017-02-21 08:49:28 -0800903.cfi_push %rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800904 push %rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800905.cfi_push %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800906 push %r12
Robert Sloana94fe052017-02-21 08:49:28 -0800907.cfi_push %r12
Adam Langleyd9e397b2015-01-22 14:27:53 -0800908 push %r13
Robert Sloana94fe052017-02-21 08:49:28 -0800909.cfi_push %r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800910 push %r14
Robert Sloana94fe052017-02-21 08:49:28 -0800911.cfi_push %r14
Adam Langleyd9e397b2015-01-22 14:27:53 -0800912 push %r15
Robert Sloana94fe052017-02-21 08:49:28 -0800913.cfi_push %r15
Adam Langleyd9e397b2015-01-22 14:27:53 -0800914___
915$code.=<<___ if ($win64);
916 vzeroupper
917 lea -0xa8(%rsp),%rsp
918 vmovaps %xmm6,-0xd8(%rax)
919 vmovaps %xmm7,-0xc8(%rax)
920 vmovaps %xmm8,-0xb8(%rax)
921 vmovaps %xmm9,-0xa8(%rax)
922 vmovaps %xmm10,-0x98(%rax)
923 vmovaps %xmm11,-0x88(%rax)
924 vmovaps %xmm12,-0x78(%rax)
925 vmovaps %xmm13,-0x68(%rax)
926 vmovaps %xmm14,-0x58(%rax)
927 vmovaps %xmm15,-0x48(%rax)
928.Lmul_1024_body:
929___
930$code.=<<___;
931 mov %rax,%rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800932.cfi_def_cfa_register %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800933 vzeroall
934 mov %rdx, $bp # reassigned argument
935 sub \$64,%rsp
936
	# unaligned 256-bit load that crosses page boundary can
	# cause severe performance degradation here, so if $ap does
	# cross page boundary, swap it with $bp [meaning that caller
	# is advised to lay down $ap and $bp next to each other, so
	# that only one can cross page boundary].
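	#
	# (Aside: the swap preserves the result because Montgomery
	#  multiplication is symmetric in its two inputs, and bp is only
	#  ever read eight bytes at a time via vpbroadcastq, which is far
	#  less sensitive to a page split than the 256-bit vmovdqu loads
	#  from ap.)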
942 .byte 0x67,0x67
943 mov $ap, $tmp
944 and \$4095, $tmp
945 add \$32*10, $tmp
946 shr \$12, $tmp
947 mov $ap, $tmp
948 cmovnz $bp, $ap
949 cmovnz $tmp, $bp
950
951 mov $np, $tmp
952 sub \$-128,$ap # size optimization
953 sub \$-128,$np
954 sub \$-128,$rp
955
956 and \$4095, $tmp # see if $np crosses page
957 add \$32*10, $tmp
958 .byte 0x67,0x67
959 shr \$12, $tmp
960 jz .Lmul_1024_no_n_copy
961
962 # unaligned 256-bit load that crosses page boundary can
963 # cause severe performance degradation here, so if $np does
964 # cross page boundary, copy it to stack and make sure stack
965 # frame doesn't...
966 sub \$32*10,%rsp
967 vmovdqu 32*0-128($np), $ACC0
968 and \$-512, %rsp
969 vmovdqu 32*1-128($np), $ACC1
970 vmovdqu 32*2-128($np), $ACC2
971 vmovdqu 32*3-128($np), $ACC3
972 vmovdqu 32*4-128($np), $ACC4
973 vmovdqu 32*5-128($np), $ACC5
974 vmovdqu 32*6-128($np), $ACC6
975 vmovdqu 32*7-128($np), $ACC7
976 vmovdqu 32*8-128($np), $ACC8
977 lea 64+128(%rsp),$np
978 vmovdqu $ACC0, 32*0-128($np)
979 vpxor $ACC0, $ACC0, $ACC0
980 vmovdqu $ACC1, 32*1-128($np)
981 vpxor $ACC1, $ACC1, $ACC1
982 vmovdqu $ACC2, 32*2-128($np)
983 vpxor $ACC2, $ACC2, $ACC2
984 vmovdqu $ACC3, 32*3-128($np)
985 vpxor $ACC3, $ACC3, $ACC3
986 vmovdqu $ACC4, 32*4-128($np)
987 vpxor $ACC4, $ACC4, $ACC4
988 vmovdqu $ACC5, 32*5-128($np)
989 vpxor $ACC5, $ACC5, $ACC5
990 vmovdqu $ACC6, 32*6-128($np)
991 vpxor $ACC6, $ACC6, $ACC6
992 vmovdqu $ACC7, 32*7-128($np)
993 vpxor $ACC7, $ACC7, $ACC7
994 vmovdqu $ACC8, 32*8-128($np)
995 vmovdqa $ACC0, $ACC8
996 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall
997.Lmul_1024_no_n_copy:
998 and \$-64,%rsp
999
1000 mov ($bp), %rbx
1001 vpbroadcastq ($bp), $Bi
1002 vmovdqu $ACC0, (%rsp) # clear top of stack
1003 xor $r0, $r0
1004 .byte 0x67
1005 xor $r1, $r1
1006 xor $r2, $r2
1007 xor $r3, $r3
1008
1009 vmovdqu .Land_mask(%rip), $AND_MASK
1010 mov \$9, $i
1011 vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall
1012 jmp .Loop_mul_1024
1013
1014.align 32
1015.Loop_mul_1024:
1016 vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*)
1017 mov %rbx, %rax
1018 imulq -128($ap), %rax
1019 add $r0, %rax
1020 mov %rbx, $r1
1021 imulq 8-128($ap), $r1
1022 add 8(%rsp), $r1
1023
1024 mov %rax, $r0
1025 imull $n0, %eax
1026 and \$0x1fffffff, %eax
1027
1028 mov %rbx, $r2
1029 imulq 16-128($ap), $r2
1030 add 16(%rsp), $r2
1031
1032 mov %rbx, $r3
1033 imulq 24-128($ap), $r3
1034 add 24(%rsp), $r3
1035 vpmuludq 32*1-128($ap),$Bi,$TEMP0
1036 vmovd %eax, $Yi
1037 vpaddq $TEMP0,$ACC1,$ACC1
1038 vpmuludq 32*2-128($ap),$Bi,$TEMP1
1039 vpbroadcastq $Yi, $Yi
1040 vpaddq $TEMP1,$ACC2,$ACC2
1041 vpmuludq 32*3-128($ap),$Bi,$TEMP2
1042 vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3
1043 vpaddq $TEMP2,$ACC3,$ACC3
1044 vpmuludq 32*4-128($ap),$Bi,$TEMP0
1045 vpaddq $TEMP0,$ACC4,$ACC4
1046 vpmuludq 32*5-128($ap),$Bi,$TEMP1
1047 vpaddq $TEMP1,$ACC5,$ACC5
1048 vpmuludq 32*6-128($ap),$Bi,$TEMP2
1049 vpaddq $TEMP2,$ACC6,$ACC6
1050 vpmuludq 32*7-128($ap),$Bi,$TEMP0
1051 vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3
1052 vpaddq $TEMP0,$ACC7,$ACC7
1053 vpmuludq 32*8-128($ap),$Bi,$TEMP1
1054 vpbroadcastq 8($bp), $Bi
1055 vpaddq $TEMP1,$ACC8,$ACC8
1056
1057 mov %rax,%rdx
1058 imulq -128($np),%rax
1059 add %rax,$r0
1060 mov %rdx,%rax
1061 imulq 8-128($np),%rax
1062 add %rax,$r1
1063 mov %rdx,%rax
1064 imulq 16-128($np),%rax
1065 add %rax,$r2
1066 shr \$29, $r0
1067 imulq 24-128($np),%rdx
1068 add %rdx,$r3
1069 add $r0, $r1
1070
1071 vpmuludq 32*1-128($np),$Yi,$TEMP2
1072 vmovq $Bi, %rbx
1073 vpaddq $TEMP2,$ACC1,$ACC1
1074 vpmuludq 32*2-128($np),$Yi,$TEMP0
1075 vpaddq $TEMP0,$ACC2,$ACC2
1076 vpmuludq 32*3-128($np),$Yi,$TEMP1
1077 vpaddq $TEMP1,$ACC3,$ACC3
1078 vpmuludq 32*4-128($np),$Yi,$TEMP2
1079 vpaddq $TEMP2,$ACC4,$ACC4
1080 vpmuludq 32*5-128($np),$Yi,$TEMP0
1081 vpaddq $TEMP0,$ACC5,$ACC5
1082 vpmuludq 32*6-128($np),$Yi,$TEMP1
1083 vpaddq $TEMP1,$ACC6,$ACC6
1084 vpmuludq 32*7-128($np),$Yi,$TEMP2
1085 vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3
1086 vpaddq $TEMP2,$ACC7,$ACC7
1087 vpmuludq 32*8-128($np),$Yi,$TEMP0
1088 vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3
1089 vpaddq $TEMP0,$ACC8,$ACC8
1090
1091 mov %rbx, %rax
1092 imulq -128($ap),%rax
1093 add %rax,$r1
1094 vmovdqu -8+32*1-128($ap),$TEMP1
1095 mov %rbx, %rax
1096 imulq 8-128($ap),%rax
1097 add %rax,$r2
1098 vmovdqu -8+32*2-128($ap),$TEMP2
1099
1100 mov $r1, %rax
1101 imull $n0, %eax
1102 and \$0x1fffffff, %eax
1103
1104 imulq 16-128($ap),%rbx
1105 add %rbx,$r3
1106 vpmuludq $Bi,$TEMP1,$TEMP1
1107 vmovd %eax, $Yi
1108 vmovdqu -8+32*3-128($ap),$TEMP0
1109 vpaddq $TEMP1,$ACC1,$ACC1
1110 vpmuludq $Bi,$TEMP2,$TEMP2
1111 vpbroadcastq $Yi, $Yi
1112 vmovdqu -8+32*4-128($ap),$TEMP1
1113 vpaddq $TEMP2,$ACC2,$ACC2
1114 vpmuludq $Bi,$TEMP0,$TEMP0
1115 vmovdqu -8+32*5-128($ap),$TEMP2
1116 vpaddq $TEMP0,$ACC3,$ACC3
1117 vpmuludq $Bi,$TEMP1,$TEMP1
1118 vmovdqu -8+32*6-128($ap),$TEMP0
1119 vpaddq $TEMP1,$ACC4,$ACC4
1120 vpmuludq $Bi,$TEMP2,$TEMP2
1121 vmovdqu -8+32*7-128($ap),$TEMP1
1122 vpaddq $TEMP2,$ACC5,$ACC5
1123 vpmuludq $Bi,$TEMP0,$TEMP0
1124 vmovdqu -8+32*8-128($ap),$TEMP2
1125 vpaddq $TEMP0,$ACC6,$ACC6
1126 vpmuludq $Bi,$TEMP1,$TEMP1
1127 vmovdqu -8+32*9-128($ap),$ACC9
1128 vpaddq $TEMP1,$ACC7,$ACC7
1129 vpmuludq $Bi,$TEMP2,$TEMP2
1130 vpaddq $TEMP2,$ACC8,$ACC8
1131 vpmuludq $Bi,$ACC9,$ACC9
1132 vpbroadcastq 16($bp), $Bi
1133
1134 mov %rax,%rdx
1135 imulq -128($np),%rax
1136 add %rax,$r1
1137 vmovdqu -8+32*1-128($np),$TEMP0
1138 mov %rdx,%rax
1139 imulq 8-128($np),%rax
1140 add %rax,$r2
1141 vmovdqu -8+32*2-128($np),$TEMP1
1142 shr \$29, $r1
1143 imulq 16-128($np),%rdx
1144 add %rdx,$r3
1145 add $r1, $r2
1146
1147 vpmuludq $Yi,$TEMP0,$TEMP0
1148 vmovq $Bi, %rbx
1149 vmovdqu -8+32*3-128($np),$TEMP2
1150 vpaddq $TEMP0,$ACC1,$ACC1
1151 vpmuludq $Yi,$TEMP1,$TEMP1
1152 vmovdqu -8+32*4-128($np),$TEMP0
1153 vpaddq $TEMP1,$ACC2,$ACC2
1154 vpmuludq $Yi,$TEMP2,$TEMP2
1155 vmovdqu -8+32*5-128($np),$TEMP1
1156 vpaddq $TEMP2,$ACC3,$ACC3
1157 vpmuludq $Yi,$TEMP0,$TEMP0
1158 vmovdqu -8+32*6-128($np),$TEMP2
1159 vpaddq $TEMP0,$ACC4,$ACC4
1160 vpmuludq $Yi,$TEMP1,$TEMP1
1161 vmovdqu -8+32*7-128($np),$TEMP0
1162 vpaddq $TEMP1,$ACC5,$ACC5
1163 vpmuludq $Yi,$TEMP2,$TEMP2
1164 vmovdqu -8+32*8-128($np),$TEMP1
1165 vpaddq $TEMP2,$ACC6,$ACC6
1166 vpmuludq $Yi,$TEMP0,$TEMP0
1167 vmovdqu -8+32*9-128($np),$TEMP2
1168 vpaddq $TEMP0,$ACC7,$ACC7
1169 vpmuludq $Yi,$TEMP1,$TEMP1
1170 vpaddq $TEMP1,$ACC8,$ACC8
1171 vpmuludq $Yi,$TEMP2,$TEMP2
1172 vpaddq $TEMP2,$ACC9,$ACC9
1173
1174 vmovdqu -16+32*1-128($ap),$TEMP0
1175 mov %rbx,%rax
1176 imulq -128($ap),%rax
1177 add $r2,%rax
1178
1179 vmovdqu -16+32*2-128($ap),$TEMP1
1180 mov %rax,$r2
1181 imull $n0, %eax
1182 and \$0x1fffffff, %eax
1183
1184 imulq 8-128($ap),%rbx
1185 add %rbx,$r3
1186 vpmuludq $Bi,$TEMP0,$TEMP0
1187 vmovd %eax, $Yi
1188 vmovdqu -16+32*3-128($ap),$TEMP2
1189 vpaddq $TEMP0,$ACC1,$ACC1
1190 vpmuludq $Bi,$TEMP1,$TEMP1
1191 vpbroadcastq $Yi, $Yi
1192 vmovdqu -16+32*4-128($ap),$TEMP0
1193 vpaddq $TEMP1,$ACC2,$ACC2
1194 vpmuludq $Bi,$TEMP2,$TEMP2
1195 vmovdqu -16+32*5-128($ap),$TEMP1
1196 vpaddq $TEMP2,$ACC3,$ACC3
1197 vpmuludq $Bi,$TEMP0,$TEMP0
1198 vmovdqu -16+32*6-128($ap),$TEMP2
1199 vpaddq $TEMP0,$ACC4,$ACC4
1200 vpmuludq $Bi,$TEMP1,$TEMP1
1201 vmovdqu -16+32*7-128($ap),$TEMP0
1202 vpaddq $TEMP1,$ACC5,$ACC5
1203 vpmuludq $Bi,$TEMP2,$TEMP2
1204 vmovdqu -16+32*8-128($ap),$TEMP1
1205 vpaddq $TEMP2,$ACC6,$ACC6
1206 vpmuludq $Bi,$TEMP0,$TEMP0
1207 vmovdqu -16+32*9-128($ap),$TEMP2
1208 vpaddq $TEMP0,$ACC7,$ACC7
1209 vpmuludq $Bi,$TEMP1,$TEMP1
1210 vpaddq $TEMP1,$ACC8,$ACC8
1211 vpmuludq $Bi,$TEMP2,$TEMP2
1212 vpbroadcastq 24($bp), $Bi
1213 vpaddq $TEMP2,$ACC9,$ACC9
1214
1215 vmovdqu -16+32*1-128($np),$TEMP0
1216 mov %rax,%rdx
1217 imulq -128($np),%rax
1218 add %rax,$r2
1219 vmovdqu -16+32*2-128($np),$TEMP1
1220 imulq 8-128($np),%rdx
1221 add %rdx,$r3
1222 shr \$29, $r2
1223
1224 vpmuludq $Yi,$TEMP0,$TEMP0
1225 vmovq $Bi, %rbx
1226 vmovdqu -16+32*3-128($np),$TEMP2
1227 vpaddq $TEMP0,$ACC1,$ACC1
1228 vpmuludq $Yi,$TEMP1,$TEMP1
1229 vmovdqu -16+32*4-128($np),$TEMP0
1230 vpaddq $TEMP1,$ACC2,$ACC2
1231 vpmuludq $Yi,$TEMP2,$TEMP2
1232 vmovdqu -16+32*5-128($np),$TEMP1
1233 vpaddq $TEMP2,$ACC3,$ACC3
1234 vpmuludq $Yi,$TEMP0,$TEMP0
1235 vmovdqu -16+32*6-128($np),$TEMP2
1236 vpaddq $TEMP0,$ACC4,$ACC4
1237 vpmuludq $Yi,$TEMP1,$TEMP1
1238 vmovdqu -16+32*7-128($np),$TEMP0
1239 vpaddq $TEMP1,$ACC5,$ACC5
1240 vpmuludq $Yi,$TEMP2,$TEMP2
1241 vmovdqu -16+32*8-128($np),$TEMP1
1242 vpaddq $TEMP2,$ACC6,$ACC6
1243 vpmuludq $Yi,$TEMP0,$TEMP0
1244 vmovdqu -16+32*9-128($np),$TEMP2
1245 vpaddq $TEMP0,$ACC7,$ACC7
1246 vpmuludq $Yi,$TEMP1,$TEMP1
1247 vmovdqu -24+32*1-128($ap),$TEMP0
1248 vpaddq $TEMP1,$ACC8,$ACC8
1249 vpmuludq $Yi,$TEMP2,$TEMP2
1250 vmovdqu -24+32*2-128($ap),$TEMP1
1251 vpaddq $TEMP2,$ACC9,$ACC9
1252
1253 add $r2, $r3
1254 imulq -128($ap),%rbx
1255 add %rbx,$r3
1256
1257 mov $r3, %rax
1258 imull $n0, %eax
1259 and \$0x1fffffff, %eax
1260
1261 vpmuludq $Bi,$TEMP0,$TEMP0
1262 vmovd %eax, $Yi
1263 vmovdqu -24+32*3-128($ap),$TEMP2
1264 vpaddq $TEMP0,$ACC1,$ACC1
1265 vpmuludq $Bi,$TEMP1,$TEMP1
1266 vpbroadcastq $Yi, $Yi
1267 vmovdqu -24+32*4-128($ap),$TEMP0
1268 vpaddq $TEMP1,$ACC2,$ACC2
1269 vpmuludq $Bi,$TEMP2,$TEMP2
1270 vmovdqu -24+32*5-128($ap),$TEMP1
1271 vpaddq $TEMP2,$ACC3,$ACC3
1272 vpmuludq $Bi,$TEMP0,$TEMP0
1273 vmovdqu -24+32*6-128($ap),$TEMP2
1274 vpaddq $TEMP0,$ACC4,$ACC4
1275 vpmuludq $Bi,$TEMP1,$TEMP1
1276 vmovdqu -24+32*7-128($ap),$TEMP0
1277 vpaddq $TEMP1,$ACC5,$ACC5
1278 vpmuludq $Bi,$TEMP2,$TEMP2
1279 vmovdqu -24+32*8-128($ap),$TEMP1
1280 vpaddq $TEMP2,$ACC6,$ACC6
1281 vpmuludq $Bi,$TEMP0,$TEMP0
1282 vmovdqu -24+32*9-128($ap),$TEMP2
1283 vpaddq $TEMP0,$ACC7,$ACC7
1284 vpmuludq $Bi,$TEMP1,$TEMP1
1285 vpaddq $TEMP1,$ACC8,$ACC8
1286 vpmuludq $Bi,$TEMP2,$TEMP2
1287 vpbroadcastq 32($bp), $Bi
1288 vpaddq $TEMP2,$ACC9,$ACC9
1289 add \$32, $bp # $bp++
1290
1291 vmovdqu -24+32*1-128($np),$TEMP0
1292 imulq -128($np),%rax
1293 add %rax,$r3
1294 shr \$29, $r3
1295
1296 vmovdqu -24+32*2-128($np),$TEMP1
1297 vpmuludq $Yi,$TEMP0,$TEMP0
1298 vmovq $Bi, %rbx
1299 vmovdqu -24+32*3-128($np),$TEMP2
1300 vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0
1301 vpmuludq $Yi,$TEMP1,$TEMP1
1302 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
1303 vpaddq $TEMP1,$ACC2,$ACC1
1304 vmovdqu -24+32*4-128($np),$TEMP0
1305 vpmuludq $Yi,$TEMP2,$TEMP2
1306 vmovdqu -24+32*5-128($np),$TEMP1
1307 vpaddq $TEMP2,$ACC3,$ACC2
1308 vpmuludq $Yi,$TEMP0,$TEMP0
1309 vmovdqu -24+32*6-128($np),$TEMP2
1310 vpaddq $TEMP0,$ACC4,$ACC3
1311 vpmuludq $Yi,$TEMP1,$TEMP1
1312 vmovdqu -24+32*7-128($np),$TEMP0
1313 vpaddq $TEMP1,$ACC5,$ACC4
1314 vpmuludq $Yi,$TEMP2,$TEMP2
1315 vmovdqu -24+32*8-128($np),$TEMP1
1316 vpaddq $TEMP2,$ACC6,$ACC5
1317 vpmuludq $Yi,$TEMP0,$TEMP0
1318 vmovdqu -24+32*9-128($np),$TEMP2
1319 mov $r3, $r0
1320 vpaddq $TEMP0,$ACC7,$ACC6
1321 vpmuludq $Yi,$TEMP1,$TEMP1
1322 add (%rsp), $r0
1323 vpaddq $TEMP1,$ACC8,$ACC7
1324 vpmuludq $Yi,$TEMP2,$TEMP2
1325 vmovq $r3, $TEMP1
1326 vpaddq $TEMP2,$ACC9,$ACC8
1327
1328 dec $i
1329 jnz .Loop_mul_1024
1330___

# (*)	Original implementation was correcting ACC1-ACC3 for overflow
#	after 7 loop runs, or after 28 iterations, or 56 additions.
#	But as we underutilize resources, it's possible to correct in
#	each iteration with marginal performance loss. But then, as
#	we do it in each iteration, we can correct fewer digits, and
#	avoid performance penalties completely. Also note that we
#	correct only three digits out of four. This works because
#	the most significant digit is subjected to fewer additions.

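# (A rough sanity check on those numbers, stated informally: every digit is
#  below 2^29, so each vpmuludq product is below 2^58, and a 64-bit lane can
#  absorb on the order of 2^(64-58) = 64 such products before overflowing;
#  that is why a budget of about 56 additions was still safe, and why
#  correcting a little on every iteration removes the need to track the
#  budget at all.)
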
1341$TEMP0 = $ACC9;
1342$TEMP3 = $Bi;
1343$TEMP4 = $Yi;
1344$code.=<<___;
1345 vpermq \$0, $AND_MASK, $AND_MASK
1346 vpaddq (%rsp), $TEMP1, $ACC0
1347
1348 vpsrlq \$29, $ACC0, $TEMP1
1349 vpand $AND_MASK, $ACC0, $ACC0
1350 vpsrlq \$29, $ACC1, $TEMP2
1351 vpand $AND_MASK, $ACC1, $ACC1
1352 vpsrlq \$29, $ACC2, $TEMP3
1353 vpermq \$0x93, $TEMP1, $TEMP1
1354 vpand $AND_MASK, $ACC2, $ACC2
1355 vpsrlq \$29, $ACC3, $TEMP4
1356 vpermq \$0x93, $TEMP2, $TEMP2
1357 vpand $AND_MASK, $ACC3, $ACC3
1358
1359 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1360 vpermq \$0x93, $TEMP3, $TEMP3
1361 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1362 vpermq \$0x93, $TEMP4, $TEMP4
1363 vpaddq $TEMP0, $ACC0, $ACC0
1364 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1365 vpaddq $TEMP1, $ACC1, $ACC1
1366 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1367 vpaddq $TEMP2, $ACC2, $ACC2
1368 vpblendd \$3, $TEMP4, $ZERO, $TEMP4
1369 vpaddq $TEMP3, $ACC3, $ACC3
1370 vpaddq $TEMP4, $ACC4, $ACC4
1371
1372 vpsrlq \$29, $ACC0, $TEMP1
1373 vpand $AND_MASK, $ACC0, $ACC0
1374 vpsrlq \$29, $ACC1, $TEMP2
1375 vpand $AND_MASK, $ACC1, $ACC1
1376 vpsrlq \$29, $ACC2, $TEMP3
1377 vpermq \$0x93, $TEMP1, $TEMP1
1378 vpand $AND_MASK, $ACC2, $ACC2
1379 vpsrlq \$29, $ACC3, $TEMP4
1380 vpermq \$0x93, $TEMP2, $TEMP2
1381 vpand $AND_MASK, $ACC3, $ACC3
1382 vpermq \$0x93, $TEMP3, $TEMP3
1383
1384 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1385 vpermq \$0x93, $TEMP4, $TEMP4
1386 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1387 vpaddq $TEMP0, $ACC0, $ACC0
1388 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1389 vpaddq $TEMP1, $ACC1, $ACC1
1390 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1391 vpaddq $TEMP2, $ACC2, $ACC2
1392 vpblendd \$3, $TEMP4, $ZERO, $TEMP4
1393 vpaddq $TEMP3, $ACC3, $ACC3
1394 vpaddq $TEMP4, $ACC4, $ACC4
1395
1396 vmovdqu $ACC0, 0-128($rp)
1397 vmovdqu $ACC1, 32-128($rp)
1398 vmovdqu $ACC2, 64-128($rp)
1399 vmovdqu $ACC3, 96-128($rp)
1400___
1401
1402$TEMP5=$ACC0;
1403$code.=<<___;
1404 vpsrlq \$29, $ACC4, $TEMP1
1405 vpand $AND_MASK, $ACC4, $ACC4
1406 vpsrlq \$29, $ACC5, $TEMP2
1407 vpand $AND_MASK, $ACC5, $ACC5
1408 vpsrlq \$29, $ACC6, $TEMP3
1409 vpermq \$0x93, $TEMP1, $TEMP1
1410 vpand $AND_MASK, $ACC6, $ACC6
1411 vpsrlq \$29, $ACC7, $TEMP4
1412 vpermq \$0x93, $TEMP2, $TEMP2
1413 vpand $AND_MASK, $ACC7, $ACC7
1414 vpsrlq \$29, $ACC8, $TEMP5
1415 vpermq \$0x93, $TEMP3, $TEMP3
1416 vpand $AND_MASK, $ACC8, $ACC8
1417 vpermq \$0x93, $TEMP4, $TEMP4
1418
1419 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1420 vpermq \$0x93, $TEMP5, $TEMP5
1421 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1422 vpaddq $TEMP0, $ACC4, $ACC4
1423 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1424 vpaddq $TEMP1, $ACC5, $ACC5
1425 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1426 vpaddq $TEMP2, $ACC6, $ACC6
1427 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
1428 vpaddq $TEMP3, $ACC7, $ACC7
1429 vpaddq $TEMP4, $ACC8, $ACC8
1430
1431 vpsrlq \$29, $ACC4, $TEMP1
1432 vpand $AND_MASK, $ACC4, $ACC4
1433 vpsrlq \$29, $ACC5, $TEMP2
1434 vpand $AND_MASK, $ACC5, $ACC5
1435 vpsrlq \$29, $ACC6, $TEMP3
1436 vpermq \$0x93, $TEMP1, $TEMP1
1437 vpand $AND_MASK, $ACC6, $ACC6
1438 vpsrlq \$29, $ACC7, $TEMP4
1439 vpermq \$0x93, $TEMP2, $TEMP2
1440 vpand $AND_MASK, $ACC7, $ACC7
1441 vpsrlq \$29, $ACC8, $TEMP5
1442 vpermq \$0x93, $TEMP3, $TEMP3
1443 vpand $AND_MASK, $ACC8, $ACC8
1444 vpermq \$0x93, $TEMP4, $TEMP4
1445
1446 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1447 vpermq \$0x93, $TEMP5, $TEMP5
1448 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1449 vpaddq $TEMP0, $ACC4, $ACC4
1450 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1451 vpaddq $TEMP1, $ACC5, $ACC5
1452 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1453 vpaddq $TEMP2, $ACC6, $ACC6
1454 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
1455 vpaddq $TEMP3, $ACC7, $ACC7
1456 vpaddq $TEMP4, $ACC8, $ACC8
1457
1458 vmovdqu $ACC4, 128-128($rp)
Robert Sloana94fe052017-02-21 08:49:28 -08001459 vmovdqu $ACC5, 160-128($rp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001460 vmovdqu $ACC6, 192-128($rp)
1461 vmovdqu $ACC7, 224-128($rp)
1462 vmovdqu $ACC8, 256-128($rp)
1463 vzeroupper
1464
1465 mov %rbp, %rax
Robert Sloana94fe052017-02-21 08:49:28 -08001466.cfi_def_cfa_register %rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001467___
1468$code.=<<___ if ($win64);
Robert Sloan5d625782017-02-13 09:55:39 -08001469.Lmul_1024_in_tail:
Adam Langleyd9e397b2015-01-22 14:27:53 -08001470 movaps -0xd8(%rax),%xmm6
1471 movaps -0xc8(%rax),%xmm7
1472 movaps -0xb8(%rax),%xmm8
1473 movaps -0xa8(%rax),%xmm9
1474 movaps -0x98(%rax),%xmm10
1475 movaps -0x88(%rax),%xmm11
1476 movaps -0x78(%rax),%xmm12
1477 movaps -0x68(%rax),%xmm13
1478 movaps -0x58(%rax),%xmm14
1479 movaps -0x48(%rax),%xmm15
1480___
1481$code.=<<___;
1482 mov -48(%rax),%r15
Robert Sloana94fe052017-02-21 08:49:28 -08001483.cfi_restore %r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08001484 mov -40(%rax),%r14
Robert Sloana94fe052017-02-21 08:49:28 -08001485.cfi_restore %r14
Adam Langleyd9e397b2015-01-22 14:27:53 -08001486 mov -32(%rax),%r13
Robert Sloana94fe052017-02-21 08:49:28 -08001487.cfi_restore %r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08001488 mov -24(%rax),%r12
Robert Sloana94fe052017-02-21 08:49:28 -08001489.cfi_restore %r12
Adam Langleyd9e397b2015-01-22 14:27:53 -08001490 mov -16(%rax),%rbp
Robert Sloana94fe052017-02-21 08:49:28 -08001491.cfi_restore %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08001492 mov -8(%rax),%rbx
Robert Sloana94fe052017-02-21 08:49:28 -08001493.cfi_restore %rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -08001494 lea (%rax),%rsp # restore %rsp
Robert Sloana94fe052017-02-21 08:49:28 -08001495.cfi_def_cfa_register %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -08001496.Lmul_1024_epilogue:
1497 ret
Robert Sloana94fe052017-02-21 08:49:28 -08001498.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08001499.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
1500___
1501}
1502{
1503my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
1504my @T = map("%r$_",(8..11));
1505
1506$code.=<<___;
1507.globl rsaz_1024_red2norm_avx2
1508.type rsaz_1024_red2norm_avx2,\@abi-omnipotent
1509.align 32
1510rsaz_1024_red2norm_avx2:
1511 sub \$-128,$inp # size optimization
1512 xor %rax,%rax
1513___
1514
for ($j=0,$i=0; $i<16; $i++) {
    my $k=0;
    while (29*$j<64*($i+1)) {	# load data till boundary
	$code.="	mov	`8*$j-128`($inp), @T[0]\n";
	$j++; $k++; push(@T,shift(@T));
    }
    $l=$k;
    while ($k>1) {		# shift loaded data but last value
	$code.="	shl	\$`29*($j-$k)`,@T[-$k]\n";
	$k--;
    }
    $code.=<<___;		# shift last value
	mov	@T[-1], @T[0]
	shl	\$`29*($j-1)`, @T[-1]
	shr	\$`-29*($j-1)`, @T[0]
___
    while ($l) {		# accumulate all values
	$code.="	add	@T[-$l], %rax\n";
	$l--;
    }
    $code.=<<___;
	adc	\$0, @T[0]	# consume eventual carry
	mov	%rax, 8*$i($out)
	mov	@T[0], %rax
___
    push(@T,shift(@T));
}
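
# (For orientation, a rough pure-Perl model of what the unrolled code above
#  computes; purely illustrative, with a hypothetical @red holding the 36
#  29-bit digits of the redundant form:
#
#      use Math::BigInt;
#      my $x = Math::BigInt->bzero();
#      $x += Math::BigInt->new($red[$_]) << (29*$_) for (0..35);
#
#  the generated code then stores $x as 16 little-endian 64-bit words,
#  folding the inter-digit carries with adc as it goes.)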
1542$code.=<<___;
1543 ret
1544.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
1545
1546.globl rsaz_1024_norm2red_avx2
1547.type rsaz_1024_norm2red_avx2,\@abi-omnipotent
1548.align 32
1549rsaz_1024_norm2red_avx2:
1550 sub \$-128,$out # size optimization
1551 mov ($inp),@T[0]
1552 mov \$0x1fffffff,%eax
1553___
for ($j=0,$i=0; $i<16; $i++) {
    $code.="	mov	`8*($i+1)`($inp),@T[1]\n"	if ($i<15);
    $code.="	xor	@T[1],@T[1]\n"			if ($i==15);
    my $k=1;
    while (29*($j+1)<64*($i+1)) {
	$code.=<<___;
	mov	@T[0],@T[-$k]
	shr	\$`29*$j`,@T[-$k]
	and	%rax,@T[-$k]		# &0x1fffffff
	mov	@T[-$k],`8*$j-128`($out)
___
	$j++; $k++;
    }
    $code.=<<___;
	shrd	\$`29*$j`,@T[1],@T[0]
	and	%rax,@T[0]
	mov	@T[0],`8*$j-128`($out)
___
    $j++;
    push(@T,shift(@T));
}
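
# (In other words, a hedged summary of the loop above: for the 1024-bit
#  input X,
#      digit[i]  = (X >> (29*i)) & 0x1fffffff   for i = 0..34,
#      digit[35] =  X >> 1015                   (the remaining 9 bits),
#  and digits 36..39 are stored as zero below to pad the result to
#  32*10 bytes.)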
1575$code.=<<___;
1576 mov @T[0],`8*$j-128`($out) # zero
1577 mov @T[0],`8*($j+1)-128`($out)
1578 mov @T[0],`8*($j+2)-128`($out)
1579 mov @T[0],`8*($j+3)-128`($out)
1580 ret
1581.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
1582___
1583}
1584{
1585my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
1586
1587$code.=<<___;
1588.globl rsaz_1024_scatter5_avx2
1589.type rsaz_1024_scatter5_avx2,\@abi-omnipotent
1590.align 32
1591rsaz_1024_scatter5_avx2:
1592 vzeroupper
1593 vmovdqu .Lscatter_permd(%rip),%ymm5
1594 shl \$4,$power
1595 lea ($out,$power),$out
1596 mov \$9,%eax
1597 jmp .Loop_scatter_1024
1598
1599.align 32
1600.Loop_scatter_1024:
1601 vmovdqu ($inp),%ymm0
1602 lea 32($inp),$inp
1603 vpermd %ymm0,%ymm5,%ymm0
1604 vmovdqu %xmm0,($out)
1605 lea 16*32($out),$out
1606 dec %eax
1607 jnz .Loop_scatter_1024
1608
1609 vzeroupper
1610 ret
1611.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
1612
1613.globl rsaz_1024_gather5_avx2
1614.type rsaz_1024_gather5_avx2,\@abi-omnipotent
1615.align 32
1616rsaz_1024_gather5_avx2:
Robert Sloana94fe052017-02-21 08:49:28 -08001617.cfi_startproc
David Benjamin4969cc92016-04-22 15:02:23 -04001618 vzeroupper
1619 mov %rsp,%r11
Robert Sloana94fe052017-02-21 08:49:28 -08001620.cfi_def_cfa_register %r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08001621___
1622$code.=<<___ if ($win64);
1623 lea -0x88(%rsp),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001624.LSEH_begin_rsaz_1024_gather5:
1625 # I can't trust assembler to use specific encoding:-(
David Benjamin4969cc92016-04-22 15:02:23 -04001626 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp
1627 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax)
1628 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax)
1629 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax)
1630 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax)
1631 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax)
1632 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax)
1633 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax)
1634 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax)
1635 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax)
1636 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001637___
1638$code.=<<___;
David Benjamin4969cc92016-04-22 15:02:23 -04001639 lea -0x100(%rsp),%rsp
1640 and \$-32, %rsp
1641 lea .Linc(%rip), %r10
1642 lea -128(%rsp),%rax # control u-op density
Adam Langleyd9e397b2015-01-22 14:27:53 -08001643
David Benjamin4969cc92016-04-22 15:02:23 -04001644 vmovd $power, %xmm4
1645 vmovdqa (%r10),%ymm0
1646 vmovdqa 32(%r10),%ymm1
1647 vmovdqa 64(%r10),%ymm5
1648 vpbroadcastd %xmm4,%ymm4
Adam Langleyd9e397b2015-01-22 14:27:53 -08001649
David Benjamin4969cc92016-04-22 15:02:23 -04001650 vpaddd %ymm5, %ymm0, %ymm2
1651 vpcmpeqd %ymm4, %ymm0, %ymm0
1652 vpaddd %ymm5, %ymm1, %ymm3
1653 vpcmpeqd %ymm4, %ymm1, %ymm1
1654 vmovdqa %ymm0, 32*0+128(%rax)
1655 vpaddd %ymm5, %ymm2, %ymm0
1656 vpcmpeqd %ymm4, %ymm2, %ymm2
1657 vmovdqa %ymm1, 32*1+128(%rax)
1658 vpaddd %ymm5, %ymm3, %ymm1
1659 vpcmpeqd %ymm4, %ymm3, %ymm3
1660 vmovdqa %ymm2, 32*2+128(%rax)
1661 vpaddd %ymm5, %ymm0, %ymm2
1662 vpcmpeqd %ymm4, %ymm0, %ymm0
1663 vmovdqa %ymm3, 32*3+128(%rax)
1664 vpaddd %ymm5, %ymm1, %ymm3
1665 vpcmpeqd %ymm4, %ymm1, %ymm1
1666 vmovdqa %ymm0, 32*4+128(%rax)
1667 vpaddd %ymm5, %ymm2, %ymm8
1668 vpcmpeqd %ymm4, %ymm2, %ymm2
1669 vmovdqa %ymm1, 32*5+128(%rax)
1670 vpaddd %ymm5, %ymm3, %ymm9
1671 vpcmpeqd %ymm4, %ymm3, %ymm3
1672 vmovdqa %ymm2, 32*6+128(%rax)
1673 vpaddd %ymm5, %ymm8, %ymm10
1674 vpcmpeqd %ymm4, %ymm8, %ymm8
1675 vmovdqa %ymm3, 32*7+128(%rax)
1676 vpaddd %ymm5, %ymm9, %ymm11
1677 vpcmpeqd %ymm4, %ymm9, %ymm9
1678 vpaddd %ymm5, %ymm10, %ymm12
1679 vpcmpeqd %ymm4, %ymm10, %ymm10
1680 vpaddd %ymm5, %ymm11, %ymm13
1681 vpcmpeqd %ymm4, %ymm11, %ymm11
1682 vpaddd %ymm5, %ymm12, %ymm14
1683 vpcmpeqd %ymm4, %ymm12, %ymm12
1684 vpaddd %ymm5, %ymm13, %ymm15
1685 vpcmpeqd %ymm4, %ymm13, %ymm13
1686 vpcmpeqd %ymm4, %ymm14, %ymm14
1687 vpcmpeqd %ymm4, %ymm15, %ymm15
Adam Langleyd9e397b2015-01-22 14:27:53 -08001688
David Benjamin4969cc92016-04-22 15:02:23 -04001689 vmovdqa -32(%r10),%ymm7 # .Lgather_permd
1690 lea 128($inp), $inp
1691 mov \$9,$power
1692
.Loop_gather_1024:
	vmovdqa	32*0-128($inp),	%ymm0
	vmovdqa	32*1-128($inp),	%ymm1
	vmovdqa	32*2-128($inp),	%ymm2
	vmovdqa	32*3-128($inp),	%ymm3
	vpand	32*0+128(%rax),	%ymm0,	%ymm0
	vpand	32*1+128(%rax),	%ymm1,	%ymm1
	vpand	32*2+128(%rax),	%ymm2,	%ymm2
	vpor	%ymm0, %ymm1, %ymm4
	vpand	32*3+128(%rax),	%ymm3,	%ymm3
	vmovdqa	32*4-128($inp),	%ymm0
	vmovdqa	32*5-128($inp),	%ymm1
	vpor	%ymm2, %ymm3, %ymm5
	vmovdqa	32*6-128($inp),	%ymm2
	vmovdqa	32*7-128($inp),	%ymm3
	vpand	32*4+128(%rax),	%ymm0,	%ymm0
	vpand	32*5+128(%rax),	%ymm1,	%ymm1
	vpand	32*6+128(%rax),	%ymm2,	%ymm2
	vpor	%ymm0, %ymm4, %ymm4
	vpand	32*7+128(%rax),	%ymm3,	%ymm3
	vpand	32*8-128($inp),	%ymm8,	%ymm0
	vpor	%ymm1, %ymm5, %ymm5
	vpand	32*9-128($inp),	%ymm9,	%ymm1
	vpor	%ymm2, %ymm4, %ymm4
	vpand	32*10-128($inp),%ymm10,	%ymm2
	vpor	%ymm3, %ymm5, %ymm5
	vpand	32*11-128($inp),%ymm11,	%ymm3
	vpor	%ymm0, %ymm4, %ymm4
	vpand	32*12-128($inp),%ymm12,	%ymm0
	vpor	%ymm1, %ymm5, %ymm5
	vpand	32*13-128($inp),%ymm13,	%ymm1
	vpor	%ymm2, %ymm4, %ymm4
	vpand	32*14-128($inp),%ymm14,	%ymm2
	vpor	%ymm3, %ymm5, %ymm5
	vpand	32*15-128($inp),%ymm15,	%ymm3
	lea	32*16($inp), $inp
	vpor	%ymm0, %ymm4, %ymm4
	vpor	%ymm1, %ymm5, %ymm5
	vpor	%ymm2, %ymm4, %ymm4
	vpor	%ymm3, %ymm5, %ymm5

	vpor	%ymm5, %ymm4, %ymm4
	vextracti128	\$1, %ymm4, %xmm5	# upper half is cleared
	vpor	%xmm4, %xmm5, %xmm5
	vpermd	%ymm5,%ymm7,%ymm5
	vmovdqu	%ymm5,($out)
	lea	32($out),$out
	dec	$power
	jnz	.Loop_gather_1024

	vpxor	%ymm0,%ymm0,%ymm0
	vmovdqu	%ymm0,($out)
	vzeroupper
___
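# On Win64, %xmm6-%xmm15 are non-volatile, so restore them from the save
# area established by the prologue (addressed via %r11, which holds the
# incoming stack pointer) before tearing down the frame.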
$code.=<<___ if ($win64);
	movaps	-0xa8(%r11),%xmm6
	movaps	-0x98(%r11),%xmm7
	movaps	-0x88(%r11),%xmm8
	movaps	-0x78(%r11),%xmm9
	movaps	-0x68(%r11),%xmm10
	movaps	-0x58(%r11),%xmm11
	movaps	-0x48(%r11),%xmm12
	movaps	-0x38(%r11),%xmm13
	movaps	-0x28(%r11),%xmm14
	movaps	-0x18(%r11),%xmm15
___
$code.=<<___;
	lea	(%r11),%rsp
.cfi_def_cfa_register	%rsp
	ret
.cfi_endproc
.LSEH_end_rsaz_1024_gather5:
.size	rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
___
}

$code.=<<___;
.extern	OPENSSL_ia32cap_P
.globl	rsaz_avx2_eligible
.type	rsaz_avx2_eligible,\@abi-omnipotent
.align	32
rsaz_avx2_eligible:
	leaq	OPENSSL_ia32cap_P(%rip),%rax
	mov	8(%rax),%eax
___
$code.=<<___ if ($addx);
	mov	\$`1<<8|1<<19`,%ecx
	mov	\$0,%edx
	and	%eax,%ecx
	cmp	\$`1<<8|1<<19`,%ecx	# check for BMI2+AD*X
	cmove	%edx,%eax
___
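# The dword at offset 8 of OPENSSL_ia32cap_P mirrors CPUID.7:EBX, where
# bit 5 is AVX2, bit 8 is BMI2 and bit 19 is ADX.  When both BMI2 and ADX
# are available, the block above zeroes %eax so the function reports "not
# eligible", presumably so that callers use the scalar AD*X code path
# instead; the block below then returns 1 only if the AVX2 bit is set.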
$code.=<<___;
	and	\$`1<<5`,%eax
	shr	\$5,%eax
	ret
.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible

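# Constant data: .Land_mask looks like the 29-bit limb mask (2^29-1) used by
# the redundant-radix arithmetic, with an all-ones top lane; .Lscatter_permd
# and .Lgather_permd are vpermd index patterns for packing and unpacking
# table entries; .Linc supplies the starting indices and increment used
# above to build the gather selection masks.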
.align	64
.Land_mask:
	.quad	0x1fffffff,0x1fffffff,0x1fffffff,-1
.Lscatter_permd:
	.long	0,2,4,6,7,7,7,7
.Lgather_permd:
	.long	0,7,1,7,2,7,3,7
.Linc:
	.long	0,0,0,0, 1,1,1,1
	.long	2,2,2,2, 3,3,3,3
	.long	4,4,4,4, 4,4,4,4
.align	64
___

if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
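# rsaz_se_handler is the Win64 structured-exception handler shared by the
# entries registered in .pdata below.  Based on where the faulting RIP lies
# relative to the prologue/epilogue/"in tail" labels it recovers the frame
# pointer, copies the saved non-volatile GPRs and %xmm6-%xmm15 back into the
# CONTEXT record, and then hands off to RtlVirtualUnwind.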

$code.=<<___
.extern	__imp_RtlVirtualUnwind
.type	rsaz_se_handler,\@abi-omnipotent
.align	16
rsaz_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	160($context),%rbp	# pull context->Rbp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# "in tail" label
	cmp	%r10,%rbx		# context->Rip>="in tail" label
	cmovc	%rbp,%rax

	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	mov	%r15,240($context)
	mov	%r14,232($context)
	mov	%r13,224($context)
	mov	%r12,216($context)
	mov	%rbp,160($context)
	mov	%rbx,144($context)

	lea	-0xd8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	rsaz_se_handler,.-rsaz_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_1024_sqr_avx2
	.rva	.LSEH_end_rsaz_1024_sqr_avx2
	.rva	.LSEH_info_rsaz_1024_sqr_avx2

	.rva	.LSEH_begin_rsaz_1024_mul_avx2
	.rva	.LSEH_end_rsaz_1024_mul_avx2
	.rva	.LSEH_info_rsaz_1024_mul_avx2

	.rva	.LSEH_begin_rsaz_1024_gather5
	.rva	.LSEH_end_rsaz_1024_gather5
	.rva	.LSEH_info_rsaz_1024_gather5
.section	.xdata
.align	8
.LSEH_info_rsaz_1024_sqr_avx2:
	.byte	9,0,0,0
	.rva	rsaz_se_handler
	.rva	.Lsqr_1024_body,.Lsqr_1024_epilogue,.Lsqr_1024_in_tail
	.long	0
.LSEH_info_rsaz_1024_mul_avx2:
	.byte	9,0,0,0
	.rva	rsaz_se_handler
	.rva	.Lmul_1024_body,.Lmul_1024_epilogue,.Lmul_1024_in_tail
	.long	0
.LSEH_info_rsaz_1024_gather5:
	.byte	0x01,0x36,0x17,0x0b
	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	# sub rsp,0xa8
	.byte	0x00,0xb3,0x00,0x00	# set_frame r11
___
}

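# Final fix-up pass over the generated text: backtick expressions are
# evaluated, shift immediates are reduced modulo 64, and instructions that
# only take an %xmm operand (vmovd/vmovq, vpinsrd/q, vpextrd/q,
# vpbroadcastd/q, plus the "%x%ymmN" spelling of vmovdqu) have their %ymm
# register names rewritten to %xmm before printing.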
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
	print $_,"\n";
}

}}} else {{{
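# Fallback for assemblers too old to know AVX2: rsaz_avx2_eligible always
# returns 0, and the remaining entry points are ud2 stubs that should never
# be reached as long as callers check rsaz_avx2_eligible() first.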
print <<___;	# assembler is too old
.text

.globl	rsaz_avx2_eligible
.type	rsaz_avx2_eligible,\@abi-omnipotent
rsaz_avx2_eligible:
	xor	%eax,%eax
	ret
.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible

.globl	rsaz_1024_sqr_avx2
.globl	rsaz_1024_mul_avx2
.globl	rsaz_1024_norm2red_avx2
.globl	rsaz_1024_red2norm_avx2
.globl	rsaz_1024_scatter5_avx2
.globl	rsaz_1024_gather5_avx2
.type	rsaz_1024_sqr_avx2,\@abi-omnipotent
rsaz_1024_sqr_avx2:
rsaz_1024_mul_avx2:
rsaz_1024_norm2red_avx2:
rsaz_1024_red2norm_avx2:
rsaz_1024_scatter5_avx2:
rsaz_1024_gather5_avx2:
	.byte	0x0f,0x0b	# ud2
	ret
.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
___
}}}

close STDOUT;