#!/usr/bin/env perl

##############################################################################
# #
# Copyright (c) 2012, Intel Corporation #
# #
# All rights reserved. #
# #
# Redistribution and use in source and binary forms, with or without #
# modification, are permitted provided that the following conditions are #
# met: #
# #
# * Redistributions of source code must retain the above copyright #
# notice, this list of conditions and the following disclaimer. #
# #
# * Redistributions in binary form must reproduce the above copyright #
# notice, this list of conditions and the following disclaimer in the #
# documentation and/or other materials provided with the #
# distribution. #
# #
# * Neither the name of the Intel Corporation nor the names of its #
# contributors may be used to endorse or promote products derived from #
# this software without specific prior written permission. #
# #
# #
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
# #
##############################################################################
# Developers and authors: #
# Shay Gueron (1, 2), and Vlad Krasnov (1) #
# (1) Intel Corporation, Israel Development Center, Haifa, Israel #
# (2) University of Haifa, Israel #
##############################################################################
# Reference: #
# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular #
# Exponentiation, Using Advanced Vector Instructions Architectures", #
# F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, #
# pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012 #
# [2] S. Gueron: "Efficient Software Implementations of Modular #
# Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). #
# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE #
# Proceedings of 9th International Conference on Information Technology: #
# New Generations (ITNG 2012), pp. 821-823 (2012) #
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
# resistant 1024-bit modular exponentiation, for optimizing RSA2048 #
# on AVX2 capable x86_64 platforms", #
# http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
##############################################################################
#
# +13% improvement over original submission by <appro@openssl.org>
#
# rsa2048 sign/sec	OpenSSL 1.0.1	scalar(*)	this
# 2.3GHz Haswell	621		765/+23%	1113/+79%
# 2.3GHz Broadwell(**)	688		1200(***)/+74%	1120/+63%
#
# (*)	if system doesn't support AVX2, for reference purposes;
# (**)	scaled to 2.3GHz to simplify comparison;
# (***)	scalar AD*X code is faster than AVX2 and is preferred code
#	path for Broadwell;

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Set $addx to one once build problems are resolved.
$avx = 2;
$addx = 0;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT = *OUT;

if ($avx>1) {{{
{ # void AMS_WW(
my $rp="%rdi";	# BN_ULONG *rp,
my $ap="%rsi";	# const BN_ULONG *ap,
my $np="%rdx";	# const BN_ULONG *np,
my $n0="%ecx";	# const BN_ULONG n0,
my $rep="%r8d";	# int repeat);

# The registers that hold the accumulated redundant result
# The AMM works on 1024 bit operands, and redundant word size is 29
# Therefore: ceil(1024/29)/4 = 9
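# (Added note: 1024/29 = 35.3..., so ceil(1024/29) = 36 digits of 29 bits
# each; one digit per 64-bit lane and four lanes per ymm register gives
# 36/4 = 9 accumulator registers, $ACC0..$ACC8. $ACC9 below serves as an
# extra zero/scratch register in this routine.)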
104my $ACC0="%ymm0";
105my $ACC1="%ymm1";
106my $ACC2="%ymm2";
107my $ACC3="%ymm3";
108my $ACC4="%ymm4";
109my $ACC5="%ymm5";
110my $ACC6="%ymm6";
111my $ACC7="%ymm7";
112my $ACC8="%ymm8";
113my $ACC9="%ymm9";
114# Registers that hold the broadcasted words of bp, currently used
115my $B1="%ymm10";
116my $B2="%ymm11";
117# Registers that hold the broadcasted words of Y, currently used
118my $Y1="%ymm12";
119my $Y2="%ymm13";
120# Helper registers
121my $TEMP1="%ymm14";
122my $AND_MASK="%ymm15";
123# alu registers that hold the first words of the ACC
124my $r0="%r9";
125my $r1="%r10";
126my $r2="%r11";
127my $r3="%r12";
128
129my $i="%r14d"; # loop counter
130my $tmp = "%r15";
131
132my $FrameSize=32*18+32*8; # place for A^2 and 2*A
133
134my $aap=$r0;
135my $tp0="%rbx";
136my $tp1=$r3;
137my $tpa=$tmp;
138
139$np="%r13"; # reassigned argument
140
141$code.=<<___;
142.text
143
144.globl rsaz_1024_sqr_avx2
145.type rsaz_1024_sqr_avx2,\@function,5
146.align 64
147rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2
Robert Sloana94fe052017-02-21 08:49:28 -0800148.cfi_startproc
Adam Langleyd9e397b2015-01-22 14:27:53 -0800149 lea (%rsp), %rax
Robert Sloana94fe052017-02-21 08:49:28 -0800150.cfi_def_cfa_register %rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800151 push %rbx
Robert Sloana94fe052017-02-21 08:49:28 -0800152.cfi_push %rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800153 push %rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800154.cfi_push %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800155 push %r12
Robert Sloana94fe052017-02-21 08:49:28 -0800156.cfi_push %r12
Adam Langleyd9e397b2015-01-22 14:27:53 -0800157 push %r13
Robert Sloana94fe052017-02-21 08:49:28 -0800158.cfi_push %r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800159 push %r14
Robert Sloana94fe052017-02-21 08:49:28 -0800160.cfi_push %r14
Adam Langleyd9e397b2015-01-22 14:27:53 -0800161 push %r15
Robert Sloana94fe052017-02-21 08:49:28 -0800162.cfi_push %r15
Adam Langleyd9e397b2015-01-22 14:27:53 -0800163 vzeroupper
164___
165$code.=<<___ if ($win64);
166 lea -0xa8(%rsp),%rsp
167 vmovaps %xmm6,-0xd8(%rax)
168 vmovaps %xmm7,-0xc8(%rax)
169 vmovaps %xmm8,-0xb8(%rax)
170 vmovaps %xmm9,-0xa8(%rax)
171 vmovaps %xmm10,-0x98(%rax)
172 vmovaps %xmm11,-0x88(%rax)
173 vmovaps %xmm12,-0x78(%rax)
174 vmovaps %xmm13,-0x68(%rax)
175 vmovaps %xmm14,-0x58(%rax)
176 vmovaps %xmm15,-0x48(%rax)
177.Lsqr_1024_body:
178___
179$code.=<<___;
180 mov %rax,%rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800181.cfi_def_cfa_register %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800182 mov %rdx, $np # reassigned argument
183 sub \$$FrameSize, %rsp
184 mov $np, $tmp
185 sub \$-128, $rp # size optimization
186 sub \$-128, $ap
187 sub \$-128, $np
188
189 and \$4095, $tmp # see if $np crosses page
190 add \$32*10, $tmp
191 shr \$12, $tmp
192 vpxor $ACC9,$ACC9,$ACC9
193 jz .Lsqr_1024_no_n_copy
194
	# unaligned 256-bit load that crosses page boundary can
	# cause >2x performance degradation here, so if $np does
	# cross page boundary, copy it to stack and make sure stack
	# frame doesn't...
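	# (Added note: the check above computes (($np & 4095) + 32*10) >> 12,
	# which is non-zero when the 320-byte window at $np reaches into the
	# next 4KB page, i.e. when one of the ten 256-bit loads below could
	# straddle a page boundary.)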
199 sub \$32*10,%rsp
200 vmovdqu 32*0-128($np), $ACC0
201 and \$-2048, %rsp
202 vmovdqu 32*1-128($np), $ACC1
203 vmovdqu 32*2-128($np), $ACC2
204 vmovdqu 32*3-128($np), $ACC3
205 vmovdqu 32*4-128($np), $ACC4
206 vmovdqu 32*5-128($np), $ACC5
207 vmovdqu 32*6-128($np), $ACC6
208 vmovdqu 32*7-128($np), $ACC7
209 vmovdqu 32*8-128($np), $ACC8
210 lea $FrameSize+128(%rsp),$np
211 vmovdqu $ACC0, 32*0-128($np)
212 vmovdqu $ACC1, 32*1-128($np)
213 vmovdqu $ACC2, 32*2-128($np)
214 vmovdqu $ACC3, 32*3-128($np)
215 vmovdqu $ACC4, 32*4-128($np)
216 vmovdqu $ACC5, 32*5-128($np)
217 vmovdqu $ACC6, 32*6-128($np)
218 vmovdqu $ACC7, 32*7-128($np)
219 vmovdqu $ACC8, 32*8-128($np)
220 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero
221
222.Lsqr_1024_no_n_copy:
223 and \$-1024, %rsp
224
225 vmovdqu 32*1-128($ap), $ACC1
226 vmovdqu 32*2-128($ap), $ACC2
227 vmovdqu 32*3-128($ap), $ACC3
228 vmovdqu 32*4-128($ap), $ACC4
229 vmovdqu 32*5-128($ap), $ACC5
230 vmovdqu 32*6-128($ap), $ACC6
231 vmovdqu 32*7-128($ap), $ACC7
232 vmovdqu 32*8-128($ap), $ACC8
233
234 lea 192(%rsp), $tp0 # 64+128=192
Robert Sloancd79cde2017-12-11 09:06:12 -0800235 vmovdqu .Land_mask(%rip), $AND_MASK
Adam Langleyd9e397b2015-01-22 14:27:53 -0800236 jmp .LOOP_GRANDE_SQR_1024
237
238.align 32
239.LOOP_GRANDE_SQR_1024:
240 lea 32*18+128(%rsp), $aap # size optimization
241 lea 448(%rsp), $tp1 # 64+128+256=448
242
	# the squaring is performed as described in Variant B of
	# "Speeding up Big-Number Squaring", so start by calculating
	# the A*2=A+A vector
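	# (Added note: with the doubled vector every cross product a[i]*a[j],
	# i<j, is formed as a[i] times the pre-doubled digit 2*a[j], so the
	# factor of two in (sum a[i])^2 = sum a[i]^2 + 2*sum_{i<j} a[i]*a[j]
	# comes for free; only the diagonal terms a[i]^2 use the original,
	# undoubled $ap.)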
246 vpaddq $ACC1, $ACC1, $ACC1
247 vpbroadcastq 32*0-128($ap), $B1
248 vpaddq $ACC2, $ACC2, $ACC2
249 vmovdqa $ACC1, 32*0-128($aap)
250 vpaddq $ACC3, $ACC3, $ACC3
251 vmovdqa $ACC2, 32*1-128($aap)
252 vpaddq $ACC4, $ACC4, $ACC4
253 vmovdqa $ACC3, 32*2-128($aap)
254 vpaddq $ACC5, $ACC5, $ACC5
255 vmovdqa $ACC4, 32*3-128($aap)
256 vpaddq $ACC6, $ACC6, $ACC6
257 vmovdqa $ACC5, 32*4-128($aap)
258 vpaddq $ACC7, $ACC7, $ACC7
259 vmovdqa $ACC6, 32*5-128($aap)
260 vpaddq $ACC8, $ACC8, $ACC8
261 vmovdqa $ACC7, 32*6-128($aap)
262 vpxor $ACC9, $ACC9, $ACC9
263 vmovdqa $ACC8, 32*7-128($aap)
264
265 vpmuludq 32*0-128($ap), $B1, $ACC0
266 vpbroadcastq 32*1-128($ap), $B2
267 vmovdqu $ACC9, 32*9-192($tp0) # zero upper half
268 vpmuludq $B1, $ACC1, $ACC1
269 vmovdqu $ACC9, 32*10-448($tp1)
270 vpmuludq $B1, $ACC2, $ACC2
271 vmovdqu $ACC9, 32*11-448($tp1)
272 vpmuludq $B1, $ACC3, $ACC3
273 vmovdqu $ACC9, 32*12-448($tp1)
274 vpmuludq $B1, $ACC4, $ACC4
275 vmovdqu $ACC9, 32*13-448($tp1)
276 vpmuludq $B1, $ACC5, $ACC5
277 vmovdqu $ACC9, 32*14-448($tp1)
278 vpmuludq $B1, $ACC6, $ACC6
279 vmovdqu $ACC9, 32*15-448($tp1)
280 vpmuludq $B1, $ACC7, $ACC7
281 vmovdqu $ACC9, 32*16-448($tp1)
282 vpmuludq $B1, $ACC8, $ACC8
283 vpbroadcastq 32*2-128($ap), $B1
284 vmovdqu $ACC9, 32*17-448($tp1)
285
286 mov $ap, $tpa
287 mov \$4, $i
288 jmp .Lsqr_entry_1024
289___
290$TEMP0=$Y1;
291$TEMP2=$Y2;
292$code.=<<___;
293.align 32
294.LOOP_SQR_1024:
295 vpbroadcastq 32*1-128($tpa), $B2
296 vpmuludq 32*0-128($ap), $B1, $ACC0
297 vpaddq 32*0-192($tp0), $ACC0, $ACC0
298 vpmuludq 32*0-128($aap), $B1, $ACC1
299 vpaddq 32*1-192($tp0), $ACC1, $ACC1
300 vpmuludq 32*1-128($aap), $B1, $ACC2
301 vpaddq 32*2-192($tp0), $ACC2, $ACC2
302 vpmuludq 32*2-128($aap), $B1, $ACC3
303 vpaddq 32*3-192($tp0), $ACC3, $ACC3
304 vpmuludq 32*3-128($aap), $B1, $ACC4
305 vpaddq 32*4-192($tp0), $ACC4, $ACC4
306 vpmuludq 32*4-128($aap), $B1, $ACC5
307 vpaddq 32*5-192($tp0), $ACC5, $ACC5
308 vpmuludq 32*5-128($aap), $B1, $ACC6
309 vpaddq 32*6-192($tp0), $ACC6, $ACC6
310 vpmuludq 32*6-128($aap), $B1, $ACC7
311 vpaddq 32*7-192($tp0), $ACC7, $ACC7
312 vpmuludq 32*7-128($aap), $B1, $ACC8
313 vpbroadcastq 32*2-128($tpa), $B1
314 vpaddq 32*8-192($tp0), $ACC8, $ACC8
315.Lsqr_entry_1024:
316 vmovdqu $ACC0, 32*0-192($tp0)
317 vmovdqu $ACC1, 32*1-192($tp0)
318
319 vpmuludq 32*1-128($ap), $B2, $TEMP0
320 vpaddq $TEMP0, $ACC2, $ACC2
321 vpmuludq 32*1-128($aap), $B2, $TEMP1
322 vpaddq $TEMP1, $ACC3, $ACC3
323 vpmuludq 32*2-128($aap), $B2, $TEMP2
324 vpaddq $TEMP2, $ACC4, $ACC4
325 vpmuludq 32*3-128($aap), $B2, $TEMP0
326 vpaddq $TEMP0, $ACC5, $ACC5
327 vpmuludq 32*4-128($aap), $B2, $TEMP1
328 vpaddq $TEMP1, $ACC6, $ACC6
329 vpmuludq 32*5-128($aap), $B2, $TEMP2
330 vpaddq $TEMP2, $ACC7, $ACC7
331 vpmuludq 32*6-128($aap), $B2, $TEMP0
332 vpaddq $TEMP0, $ACC8, $ACC8
333 vpmuludq 32*7-128($aap), $B2, $ACC0
334 vpbroadcastq 32*3-128($tpa), $B2
335 vpaddq 32*9-192($tp0), $ACC0, $ACC0
336
337 vmovdqu $ACC2, 32*2-192($tp0)
338 vmovdqu $ACC3, 32*3-192($tp0)
339
340 vpmuludq 32*2-128($ap), $B1, $TEMP2
341 vpaddq $TEMP2, $ACC4, $ACC4
342 vpmuludq 32*2-128($aap), $B1, $TEMP0
343 vpaddq $TEMP0, $ACC5, $ACC5
344 vpmuludq 32*3-128($aap), $B1, $TEMP1
345 vpaddq $TEMP1, $ACC6, $ACC6
346 vpmuludq 32*4-128($aap), $B1, $TEMP2
347 vpaddq $TEMP2, $ACC7, $ACC7
348 vpmuludq 32*5-128($aap), $B1, $TEMP0
349 vpaddq $TEMP0, $ACC8, $ACC8
350 vpmuludq 32*6-128($aap), $B1, $TEMP1
351 vpaddq $TEMP1, $ACC0, $ACC0
352 vpmuludq 32*7-128($aap), $B1, $ACC1
353 vpbroadcastq 32*4-128($tpa), $B1
354 vpaddq 32*10-448($tp1), $ACC1, $ACC1
355
356 vmovdqu $ACC4, 32*4-192($tp0)
357 vmovdqu $ACC5, 32*5-192($tp0)
358
359 vpmuludq 32*3-128($ap), $B2, $TEMP0
360 vpaddq $TEMP0, $ACC6, $ACC6
361 vpmuludq 32*3-128($aap), $B2, $TEMP1
362 vpaddq $TEMP1, $ACC7, $ACC7
363 vpmuludq 32*4-128($aap), $B2, $TEMP2
364 vpaddq $TEMP2, $ACC8, $ACC8
365 vpmuludq 32*5-128($aap), $B2, $TEMP0
366 vpaddq $TEMP0, $ACC0, $ACC0
367 vpmuludq 32*6-128($aap), $B2, $TEMP1
368 vpaddq $TEMP1, $ACC1, $ACC1
369 vpmuludq 32*7-128($aap), $B2, $ACC2
370 vpbroadcastq 32*5-128($tpa), $B2
Robert Sloana94fe052017-02-21 08:49:28 -0800371 vpaddq 32*11-448($tp1), $ACC2, $ACC2
Adam Langleyd9e397b2015-01-22 14:27:53 -0800372
373 vmovdqu $ACC6, 32*6-192($tp0)
374 vmovdqu $ACC7, 32*7-192($tp0)
375
376 vpmuludq 32*4-128($ap), $B1, $TEMP0
377 vpaddq $TEMP0, $ACC8, $ACC8
378 vpmuludq 32*4-128($aap), $B1, $TEMP1
379 vpaddq $TEMP1, $ACC0, $ACC0
380 vpmuludq 32*5-128($aap), $B1, $TEMP2
381 vpaddq $TEMP2, $ACC1, $ACC1
382 vpmuludq 32*6-128($aap), $B1, $TEMP0
383 vpaddq $TEMP0, $ACC2, $ACC2
384 vpmuludq 32*7-128($aap), $B1, $ACC3
385 vpbroadcastq 32*6-128($tpa), $B1
386 vpaddq 32*12-448($tp1), $ACC3, $ACC3
387
388 vmovdqu $ACC8, 32*8-192($tp0)
389 vmovdqu $ACC0, 32*9-192($tp0)
390 lea 8($tp0), $tp0
391
392 vpmuludq 32*5-128($ap), $B2, $TEMP2
393 vpaddq $TEMP2, $ACC1, $ACC1
394 vpmuludq 32*5-128($aap), $B2, $TEMP0
395 vpaddq $TEMP0, $ACC2, $ACC2
396 vpmuludq 32*6-128($aap), $B2, $TEMP1
397 vpaddq $TEMP1, $ACC3, $ACC3
398 vpmuludq 32*7-128($aap), $B2, $ACC4
399 vpbroadcastq 32*7-128($tpa), $B2
400 vpaddq 32*13-448($tp1), $ACC4, $ACC4
401
402 vmovdqu $ACC1, 32*10-448($tp1)
403 vmovdqu $ACC2, 32*11-448($tp1)
404
405 vpmuludq 32*6-128($ap), $B1, $TEMP0
406 vpaddq $TEMP0, $ACC3, $ACC3
407 vpmuludq 32*6-128($aap), $B1, $TEMP1
408 vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1
409 vpaddq $TEMP1, $ACC4, $ACC4
410 vpmuludq 32*7-128($aap), $B1, $ACC5
411 vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration
412 vpaddq 32*14-448($tp1), $ACC5, $ACC5
413
414 vmovdqu $ACC3, 32*12-448($tp1)
415 vmovdqu $ACC4, 32*13-448($tp1)
416 lea 8($tpa), $tpa
417
418 vpmuludq 32*7-128($ap), $B2, $TEMP0
419 vpaddq $TEMP0, $ACC5, $ACC5
420 vpmuludq 32*7-128($aap), $B2, $ACC6
421 vpaddq 32*15-448($tp1), $ACC6, $ACC6
422
423 vpmuludq 32*8-128($ap), $ACC0, $ACC7
424 vmovdqu $ACC5, 32*14-448($tp1)
425 vpaddq 32*16-448($tp1), $ACC7, $ACC7
426 vmovdqu $ACC6, 32*15-448($tp1)
427 vmovdqu $ACC7, 32*16-448($tp1)
428 lea 8($tp1), $tp1
429
Robert Sloana94fe052017-02-21 08:49:28 -0800430 dec $i
Adam Langleyd9e397b2015-01-22 14:27:53 -0800431 jnz .LOOP_SQR_1024
432___
433$ZERO = $ACC9;
434$TEMP0 = $B1;
435$TEMP2 = $B2;
436$TEMP3 = $Y1;
437$TEMP4 = $Y2;
438$code.=<<___;
# we need to fix indices 32-39 to avoid overflow
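# (Added note: every lane holds a 29-bit digit in a 64-bit slot, so after the
# squaring loop a digit can exceed 2^29. The code below shifts the excess out
# (vpsrlq by 29), masks the digit back to 29 bits (vpand), rotates the four
# carries one lane up (vpermq), and blends the carry that wraps out of the
# top lane into the following 256-bit accumulator (vpblendd).)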
Adam Langleyd9e397b2015-01-22 14:27:53 -0800440 vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
441 vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
442 vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
443 lea 192(%rsp), $tp0 # 64+128=192
444
445 vpsrlq \$29, $ACC8, $TEMP1
446 vpand $AND_MASK, $ACC8, $ACC8
447 vpsrlq \$29, $ACC1, $TEMP2
448 vpand $AND_MASK, $ACC1, $ACC1
449
450 vpermq \$0x93, $TEMP1, $TEMP1
451 vpxor $ZERO, $ZERO, $ZERO
452 vpermq \$0x93, $TEMP2, $TEMP2
453
454 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
455 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
456 vpaddq $TEMP0, $ACC8, $ACC8
457 vpblendd \$3, $TEMP2, $ZERO, $TEMP2
458 vpaddq $TEMP1, $ACC1, $ACC1
459 vpaddq $TEMP2, $ACC2, $ACC2
460 vmovdqu $ACC1, 32*9-192($tp0)
461 vmovdqu $ACC2, 32*10-192($tp0)
462
463 mov (%rsp), %rax
464 mov 8(%rsp), $r1
465 mov 16(%rsp), $r2
466 mov 24(%rsp), $r3
467 vmovdqu 32*1(%rsp), $ACC1
468 vmovdqu 32*2-192($tp0), $ACC2
469 vmovdqu 32*3-192($tp0), $ACC3
470 vmovdqu 32*4-192($tp0), $ACC4
471 vmovdqu 32*5-192($tp0), $ACC5
472 vmovdqu 32*6-192($tp0), $ACC6
473 vmovdqu 32*7-192($tp0), $ACC7
474
475 mov %rax, $r0
476 imull $n0, %eax
477 and \$0x1fffffff, %eax
478 vmovd %eax, $Y1
479
480 mov %rax, %rdx
481 imulq -128($np), %rax
482 vpbroadcastq $Y1, $Y1
483 add %rax, $r0
484 mov %rdx, %rax
485 imulq 8-128($np), %rax
486 shr \$29, $r0
487 add %rax, $r1
488 mov %rdx, %rax
489 imulq 16-128($np), %rax
490 add $r0, $r1
491 add %rax, $r2
492 imulq 24-128($np), %rdx
493 add %rdx, $r3
494
495 mov $r1, %rax
496 imull $n0, %eax
497 and \$0x1fffffff, %eax
498
499 mov \$9, $i
500 jmp .LOOP_REDUCE_1024
501
502.align 32
503.LOOP_REDUCE_1024:
504 vmovd %eax, $Y2
505 vpbroadcastq $Y2, $Y2
506
507 vpmuludq 32*1-128($np), $Y1, $TEMP0
508 mov %rax, %rdx
509 imulq -128($np), %rax
510 vpaddq $TEMP0, $ACC1, $ACC1
511 add %rax, $r1
512 vpmuludq 32*2-128($np), $Y1, $TEMP1
513 mov %rdx, %rax
514 imulq 8-128($np), %rax
515 vpaddq $TEMP1, $ACC2, $ACC2
516 vpmuludq 32*3-128($np), $Y1, $TEMP2
517 .byte 0x67
518 add %rax, $r2
519 .byte 0x67
520 mov %rdx, %rax
521 imulq 16-128($np), %rax
522 shr \$29, $r1
523 vpaddq $TEMP2, $ACC3, $ACC3
524 vpmuludq 32*4-128($np), $Y1, $TEMP0
525 add %rax, $r3
526 add $r1, $r2
527 vpaddq $TEMP0, $ACC4, $ACC4
528 vpmuludq 32*5-128($np), $Y1, $TEMP1
529 mov $r2, %rax
530 imull $n0, %eax
531 vpaddq $TEMP1, $ACC5, $ACC5
532 vpmuludq 32*6-128($np), $Y1, $TEMP2
533 and \$0x1fffffff, %eax
534 vpaddq $TEMP2, $ACC6, $ACC6
535 vpmuludq 32*7-128($np), $Y1, $TEMP0
536 vpaddq $TEMP0, $ACC7, $ACC7
537 vpmuludq 32*8-128($np), $Y1, $TEMP1
538 vmovd %eax, $Y1
539 #vmovdqu 32*1-8-128($np), $TEMP2 # moved below
540 vpaddq $TEMP1, $ACC8, $ACC8
541 #vmovdqu 32*2-8-128($np), $TEMP0 # moved below
542 vpbroadcastq $Y1, $Y1
543
544 vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above
545 vmovdqu 32*3-8-128($np), $TEMP1
546 mov %rax, %rdx
547 imulq -128($np), %rax
548 vpaddq $TEMP2, $ACC1, $ACC1
549 vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above
550 vmovdqu 32*4-8-128($np), $TEMP2
551 add %rax, $r2
552 mov %rdx, %rax
553 imulq 8-128($np), %rax
554 vpaddq $TEMP0, $ACC2, $ACC2
555 add $r3, %rax
556 shr \$29, $r2
557 vpmuludq $Y2, $TEMP1, $TEMP1
558 vmovdqu 32*5-8-128($np), $TEMP0
559 add $r2, %rax
560 vpaddq $TEMP1, $ACC3, $ACC3
561 vpmuludq $Y2, $TEMP2, $TEMP2
562 vmovdqu 32*6-8-128($np), $TEMP1
563 .byte 0x67
564 mov %rax, $r3
565 imull $n0, %eax
566 vpaddq $TEMP2, $ACC4, $ACC4
567 vpmuludq $Y2, $TEMP0, $TEMP0
568 .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2
569 and \$0x1fffffff, %eax
570 vpaddq $TEMP0, $ACC5, $ACC5
571 vpmuludq $Y2, $TEMP1, $TEMP1
572 vmovdqu 32*8-8-128($np), $TEMP0
573 vpaddq $TEMP1, $ACC6, $ACC6
574 vpmuludq $Y2, $TEMP2, $TEMP2
575 vmovdqu 32*9-8-128($np), $ACC9
576 vmovd %eax, $ACC0 # borrow ACC0 for Y2
577 imulq -128($np), %rax
578 vpaddq $TEMP2, $ACC7, $ACC7
579 vpmuludq $Y2, $TEMP0, $TEMP0
580 vmovdqu 32*1-16-128($np), $TEMP1
581 vpbroadcastq $ACC0, $ACC0
582 vpaddq $TEMP0, $ACC8, $ACC8
583 vpmuludq $Y2, $ACC9, $ACC9
584 vmovdqu 32*2-16-128($np), $TEMP2
585 add %rax, $r3
586
587___
588($ACC0,$Y2)=($Y2,$ACC0);
589$code.=<<___;
590 vmovdqu 32*1-24-128($np), $ACC0
591 vpmuludq $Y1, $TEMP1, $TEMP1
592 vmovdqu 32*3-16-128($np), $TEMP0
593 vpaddq $TEMP1, $ACC1, $ACC1
594 vpmuludq $Y2, $ACC0, $ACC0
595 vpmuludq $Y1, $TEMP2, $TEMP2
596 .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1
597 vpaddq $ACC1, $ACC0, $ACC0
598 vpaddq $TEMP2, $ACC2, $ACC2
599 vpmuludq $Y1, $TEMP0, $TEMP0
600 vmovdqu 32*5-16-128($np), $TEMP2
601 .byte 0x67
602 vmovq $ACC0, %rax
603 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
604 vpaddq $TEMP0, $ACC3, $ACC3
605 vpmuludq $Y1, $TEMP1, $TEMP1
606 vmovdqu 32*6-16-128($np), $TEMP0
607 vpaddq $TEMP1, $ACC4, $ACC4
608 vpmuludq $Y1, $TEMP2, $TEMP2
609 vmovdqu 32*7-16-128($np), $TEMP1
610 vpaddq $TEMP2, $ACC5, $ACC5
611 vpmuludq $Y1, $TEMP0, $TEMP0
612 vmovdqu 32*8-16-128($np), $TEMP2
613 vpaddq $TEMP0, $ACC6, $ACC6
614 vpmuludq $Y1, $TEMP1, $TEMP1
615 shr \$29, $r3
616 vmovdqu 32*9-16-128($np), $TEMP0
617 add $r3, %rax
618 vpaddq $TEMP1, $ACC7, $ACC7
619 vpmuludq $Y1, $TEMP2, $TEMP2
620 #vmovdqu 32*2-24-128($np), $TEMP1 # moved below
621 mov %rax, $r0
622 imull $n0, %eax
623 vpaddq $TEMP2, $ACC8, $ACC8
624 vpmuludq $Y1, $TEMP0, $TEMP0
625 and \$0x1fffffff, %eax
626 vmovd %eax, $Y1
627 vmovdqu 32*3-24-128($np), $TEMP2
628 .byte 0x67
629 vpaddq $TEMP0, $ACC9, $ACC9
630 vpbroadcastq $Y1, $Y1
631
632 vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above
633 vmovdqu 32*4-24-128($np), $TEMP0
634 mov %rax, %rdx
635 imulq -128($np), %rax
636 mov 8(%rsp), $r1
637 vpaddq $TEMP1, $ACC2, $ACC1
638 vpmuludq $Y2, $TEMP2, $TEMP2
639 vmovdqu 32*5-24-128($np), $TEMP1
640 add %rax, $r0
641 mov %rdx, %rax
642 imulq 8-128($np), %rax
643 .byte 0x67
644 shr \$29, $r0
645 mov 16(%rsp), $r2
646 vpaddq $TEMP2, $ACC3, $ACC2
647 vpmuludq $Y2, $TEMP0, $TEMP0
648 vmovdqu 32*6-24-128($np), $TEMP2
649 add %rax, $r1
650 mov %rdx, %rax
651 imulq 16-128($np), %rax
652 vpaddq $TEMP0, $ACC4, $ACC3
653 vpmuludq $Y2, $TEMP1, $TEMP1
654 vmovdqu 32*7-24-128($np), $TEMP0
655 imulq 24-128($np), %rdx # future $r3
656 add %rax, $r2
657 lea ($r0,$r1), %rax
658 vpaddq $TEMP1, $ACC5, $ACC4
659 vpmuludq $Y2, $TEMP2, $TEMP2
660 vmovdqu 32*8-24-128($np), $TEMP1
661 mov %rax, $r1
662 imull $n0, %eax
663 vpmuludq $Y2, $TEMP0, $TEMP0
664 vpaddq $TEMP2, $ACC6, $ACC5
665 vmovdqu 32*9-24-128($np), $TEMP2
666 and \$0x1fffffff, %eax
667 vpaddq $TEMP0, $ACC7, $ACC6
668 vpmuludq $Y2, $TEMP1, $TEMP1
669 add 24(%rsp), %rdx
670 vpaddq $TEMP1, $ACC8, $ACC7
671 vpmuludq $Y2, $TEMP2, $TEMP2
672 vpaddq $TEMP2, $ACC9, $ACC8
673 vmovq $r3, $ACC9
674 mov %rdx, $r3
675
676 dec $i
677 jnz .LOOP_REDUCE_1024
678___
679($ACC0,$Y2)=($Y2,$ACC0);
680$code.=<<___;
681 lea 448(%rsp), $tp1 # size optimization
682 vpaddq $ACC9, $Y2, $ACC0
683 vpxor $ZERO, $ZERO, $ZERO
684
685 vpaddq 32*9-192($tp0), $ACC0, $ACC0
686 vpaddq 32*10-448($tp1), $ACC1, $ACC1
687 vpaddq 32*11-448($tp1), $ACC2, $ACC2
688 vpaddq 32*12-448($tp1), $ACC3, $ACC3
689 vpaddq 32*13-448($tp1), $ACC4, $ACC4
690 vpaddq 32*14-448($tp1), $ACC5, $ACC5
691 vpaddq 32*15-448($tp1), $ACC6, $ACC6
692 vpaddq 32*16-448($tp1), $ACC7, $ACC7
693 vpaddq 32*17-448($tp1), $ACC8, $ACC8
694
695 vpsrlq \$29, $ACC0, $TEMP1
696 vpand $AND_MASK, $ACC0, $ACC0
697 vpsrlq \$29, $ACC1, $TEMP2
698 vpand $AND_MASK, $ACC1, $ACC1
699 vpsrlq \$29, $ACC2, $TEMP3
700 vpermq \$0x93, $TEMP1, $TEMP1
701 vpand $AND_MASK, $ACC2, $ACC2
702 vpsrlq \$29, $ACC3, $TEMP4
703 vpermq \$0x93, $TEMP2, $TEMP2
704 vpand $AND_MASK, $ACC3, $ACC3
705 vpermq \$0x93, $TEMP3, $TEMP3
706
707 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
708 vpermq \$0x93, $TEMP4, $TEMP4
709 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
710 vpaddq $TEMP0, $ACC0, $ACC0
711 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
712 vpaddq $TEMP1, $ACC1, $ACC1
713 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
714 vpaddq $TEMP2, $ACC2, $ACC2
715 vpblendd \$3, $TEMP4, $ZERO, $TEMP4
716 vpaddq $TEMP3, $ACC3, $ACC3
717 vpaddq $TEMP4, $ACC4, $ACC4
718
719 vpsrlq \$29, $ACC0, $TEMP1
720 vpand $AND_MASK, $ACC0, $ACC0
721 vpsrlq \$29, $ACC1, $TEMP2
722 vpand $AND_MASK, $ACC1, $ACC1
723 vpsrlq \$29, $ACC2, $TEMP3
724 vpermq \$0x93, $TEMP1, $TEMP1
725 vpand $AND_MASK, $ACC2, $ACC2
726 vpsrlq \$29, $ACC3, $TEMP4
727 vpermq \$0x93, $TEMP2, $TEMP2
728 vpand $AND_MASK, $ACC3, $ACC3
729 vpermq \$0x93, $TEMP3, $TEMP3
730
731 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
732 vpermq \$0x93, $TEMP4, $TEMP4
733 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
734 vpaddq $TEMP0, $ACC0, $ACC0
735 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
736 vpaddq $TEMP1, $ACC1, $ACC1
737 vmovdqu $ACC0, 32*0-128($rp)
738 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
739 vpaddq $TEMP2, $ACC2, $ACC2
740 vmovdqu $ACC1, 32*1-128($rp)
741 vpblendd \$3, $TEMP4, $ZERO, $TEMP4
742 vpaddq $TEMP3, $ACC3, $ACC3
743 vmovdqu $ACC2, 32*2-128($rp)
744 vpaddq $TEMP4, $ACC4, $ACC4
745 vmovdqu $ACC3, 32*3-128($rp)
746___
747$TEMP5=$ACC0;
748$code.=<<___;
749 vpsrlq \$29, $ACC4, $TEMP1
750 vpand $AND_MASK, $ACC4, $ACC4
751 vpsrlq \$29, $ACC5, $TEMP2
752 vpand $AND_MASK, $ACC5, $ACC5
753 vpsrlq \$29, $ACC6, $TEMP3
754 vpermq \$0x93, $TEMP1, $TEMP1
755 vpand $AND_MASK, $ACC6, $ACC6
756 vpsrlq \$29, $ACC7, $TEMP4
757 vpermq \$0x93, $TEMP2, $TEMP2
758 vpand $AND_MASK, $ACC7, $ACC7
759 vpsrlq \$29, $ACC8, $TEMP5
760 vpermq \$0x93, $TEMP3, $TEMP3
761 vpand $AND_MASK, $ACC8, $ACC8
762 vpermq \$0x93, $TEMP4, $TEMP4
763
764 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
765 vpermq \$0x93, $TEMP5, $TEMP5
766 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
767 vpaddq $TEMP0, $ACC4, $ACC4
768 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
769 vpaddq $TEMP1, $ACC5, $ACC5
770 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
771 vpaddq $TEMP2, $ACC6, $ACC6
772 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
773 vpaddq $TEMP3, $ACC7, $ACC7
774 vpaddq $TEMP4, $ACC8, $ACC8
Robert Sloana94fe052017-02-21 08:49:28 -0800775
Adam Langleyd9e397b2015-01-22 14:27:53 -0800776 vpsrlq \$29, $ACC4, $TEMP1
777 vpand $AND_MASK, $ACC4, $ACC4
778 vpsrlq \$29, $ACC5, $TEMP2
779 vpand $AND_MASK, $ACC5, $ACC5
780 vpsrlq \$29, $ACC6, $TEMP3
781 vpermq \$0x93, $TEMP1, $TEMP1
782 vpand $AND_MASK, $ACC6, $ACC6
783 vpsrlq \$29, $ACC7, $TEMP4
784 vpermq \$0x93, $TEMP2, $TEMP2
785 vpand $AND_MASK, $ACC7, $ACC7
786 vpsrlq \$29, $ACC8, $TEMP5
787 vpermq \$0x93, $TEMP3, $TEMP3
788 vpand $AND_MASK, $ACC8, $ACC8
789 vpermq \$0x93, $TEMP4, $TEMP4
790
791 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
792 vpermq \$0x93, $TEMP5, $TEMP5
793 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
794 vpaddq $TEMP0, $ACC4, $ACC4
795 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
796 vpaddq $TEMP1, $ACC5, $ACC5
797 vmovdqu $ACC4, 32*4-128($rp)
798 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
799 vpaddq $TEMP2, $ACC6, $ACC6
800 vmovdqu $ACC5, 32*5-128($rp)
801 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
802 vpaddq $TEMP3, $ACC7, $ACC7
803 vmovdqu $ACC6, 32*6-128($rp)
804 vpaddq $TEMP4, $ACC8, $ACC8
805 vmovdqu $ACC7, 32*7-128($rp)
806 vmovdqu $ACC8, 32*8-128($rp)
807
808 mov $rp, $ap
809 dec $rep
810 jne .LOOP_GRANDE_SQR_1024
811
812 vzeroall
813 mov %rbp, %rax
Robert Sloana94fe052017-02-21 08:49:28 -0800814.cfi_def_cfa_register %rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800815___
816$code.=<<___ if ($win64);
Robert Sloan5d625782017-02-13 09:55:39 -0800817.Lsqr_1024_in_tail:
Adam Langleyd9e397b2015-01-22 14:27:53 -0800818 movaps -0xd8(%rax),%xmm6
819 movaps -0xc8(%rax),%xmm7
820 movaps -0xb8(%rax),%xmm8
821 movaps -0xa8(%rax),%xmm9
822 movaps -0x98(%rax),%xmm10
823 movaps -0x88(%rax),%xmm11
824 movaps -0x78(%rax),%xmm12
825 movaps -0x68(%rax),%xmm13
826 movaps -0x58(%rax),%xmm14
827 movaps -0x48(%rax),%xmm15
828___
829$code.=<<___;
830 mov -48(%rax),%r15
Robert Sloana94fe052017-02-21 08:49:28 -0800831.cfi_restore %r15
Adam Langleyd9e397b2015-01-22 14:27:53 -0800832 mov -40(%rax),%r14
Robert Sloana94fe052017-02-21 08:49:28 -0800833.cfi_restore %r14
Adam Langleyd9e397b2015-01-22 14:27:53 -0800834 mov -32(%rax),%r13
Robert Sloana94fe052017-02-21 08:49:28 -0800835.cfi_restore %r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800836 mov -24(%rax),%r12
Robert Sloana94fe052017-02-21 08:49:28 -0800837.cfi_restore %r12
Adam Langleyd9e397b2015-01-22 14:27:53 -0800838 mov -16(%rax),%rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800839.cfi_restore %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800840 mov -8(%rax),%rbx
Robert Sloana94fe052017-02-21 08:49:28 -0800841.cfi_restore %rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800842 lea (%rax),%rsp # restore %rsp
Robert Sloana94fe052017-02-21 08:49:28 -0800843.cfi_def_cfa_register %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800844.Lsqr_1024_epilogue:
845 ret
Robert Sloana94fe052017-02-21 08:49:28 -0800846.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -0800847.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
848___
849}
850
851{ # void AMM_WW(
852my $rp="%rdi"; # BN_ULONG *rp,
853my $ap="%rsi"; # const BN_ULONG *ap,
854my $bp="%rdx"; # const BN_ULONG *bp,
855my $np="%rcx"; # const BN_ULONG *np,
856my $n0="%r8d"; # unsigned int n0);
857
858# The registers that hold the accumulated redundant result
859# The AMM works on 1024 bit operands, and redundant word size is 29
860# Therefore: ceil(1024/29)/4 = 9
861my $ACC0="%ymm0";
862my $ACC1="%ymm1";
863my $ACC2="%ymm2";
864my $ACC3="%ymm3";
865my $ACC4="%ymm4";
866my $ACC5="%ymm5";
867my $ACC6="%ymm6";
868my $ACC7="%ymm7";
869my $ACC8="%ymm8";
870my $ACC9="%ymm9";
871
872# Registers that hold the broadcasted words of multiplier, currently used
873my $Bi="%ymm10";
874my $Yi="%ymm11";
875
876# Helper registers
877my $TEMP0=$ACC0;
878my $TEMP1="%ymm12";
879my $TEMP2="%ymm13";
880my $ZERO="%ymm14";
881my $AND_MASK="%ymm15";
882
883# alu registers that hold the first words of the ACC
884my $r0="%r9";
885my $r1="%r10";
886my $r2="%r11";
887my $r3="%r12";
888
889my $i="%r14d";
890my $tmp="%r15";
891
892$bp="%r13"; # reassigned argument
893
894$code.=<<___;
895.globl rsaz_1024_mul_avx2
896.type rsaz_1024_mul_avx2,\@function,5
897.align 64
898rsaz_1024_mul_avx2:
Robert Sloana94fe052017-02-21 08:49:28 -0800899.cfi_startproc
Adam Langleyd9e397b2015-01-22 14:27:53 -0800900 lea (%rsp), %rax
Robert Sloana94fe052017-02-21 08:49:28 -0800901.cfi_def_cfa_register %rax
Adam Langleyd9e397b2015-01-22 14:27:53 -0800902 push %rbx
Robert Sloana94fe052017-02-21 08:49:28 -0800903.cfi_push %rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -0800904 push %rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800905.cfi_push %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800906 push %r12
Robert Sloana94fe052017-02-21 08:49:28 -0800907.cfi_push %r12
Adam Langleyd9e397b2015-01-22 14:27:53 -0800908 push %r13
Robert Sloana94fe052017-02-21 08:49:28 -0800909.cfi_push %r13
Adam Langleyd9e397b2015-01-22 14:27:53 -0800910 push %r14
Robert Sloana94fe052017-02-21 08:49:28 -0800911.cfi_push %r14
Adam Langleyd9e397b2015-01-22 14:27:53 -0800912 push %r15
Robert Sloana94fe052017-02-21 08:49:28 -0800913.cfi_push %r15
Adam Langleyd9e397b2015-01-22 14:27:53 -0800914___
915$code.=<<___ if ($win64);
916 vzeroupper
917 lea -0xa8(%rsp),%rsp
918 vmovaps %xmm6,-0xd8(%rax)
919 vmovaps %xmm7,-0xc8(%rax)
920 vmovaps %xmm8,-0xb8(%rax)
921 vmovaps %xmm9,-0xa8(%rax)
922 vmovaps %xmm10,-0x98(%rax)
923 vmovaps %xmm11,-0x88(%rax)
924 vmovaps %xmm12,-0x78(%rax)
925 vmovaps %xmm13,-0x68(%rax)
926 vmovaps %xmm14,-0x58(%rax)
927 vmovaps %xmm15,-0x48(%rax)
928.Lmul_1024_body:
929___
930$code.=<<___;
931 mov %rax,%rbp
Robert Sloana94fe052017-02-21 08:49:28 -0800932.cfi_def_cfa_register %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -0800933 vzeroall
934 mov %rdx, $bp # reassigned argument
935 sub \$64,%rsp
936
937 # unaligned 256-bit load that crosses page boundary can
938 # cause severe performance degradation here, so if $ap does
939 # cross page boundary, swap it with $bp [meaning that caller
940 # is advised to lay down $ap and $bp next to each other, so
941 # that only one can cross page boundary].
942 .byte 0x67,0x67
943 mov $ap, $tmp
944 and \$4095, $tmp
945 add \$32*10, $tmp
946 shr \$12, $tmp
947 mov $ap, $tmp
948 cmovnz $bp, $ap
949 cmovnz $tmp, $bp
950
951 mov $np, $tmp
952 sub \$-128,$ap # size optimization
953 sub \$-128,$np
954 sub \$-128,$rp
955
956 and \$4095, $tmp # see if $np crosses page
957 add \$32*10, $tmp
958 .byte 0x67,0x67
959 shr \$12, $tmp
960 jz .Lmul_1024_no_n_copy
961
962 # unaligned 256-bit load that crosses page boundary can
963 # cause severe performance degradation here, so if $np does
964 # cross page boundary, copy it to stack and make sure stack
965 # frame doesn't...
966 sub \$32*10,%rsp
967 vmovdqu 32*0-128($np), $ACC0
968 and \$-512, %rsp
969 vmovdqu 32*1-128($np), $ACC1
970 vmovdqu 32*2-128($np), $ACC2
971 vmovdqu 32*3-128($np), $ACC3
972 vmovdqu 32*4-128($np), $ACC4
973 vmovdqu 32*5-128($np), $ACC5
974 vmovdqu 32*6-128($np), $ACC6
975 vmovdqu 32*7-128($np), $ACC7
976 vmovdqu 32*8-128($np), $ACC8
977 lea 64+128(%rsp),$np
978 vmovdqu $ACC0, 32*0-128($np)
979 vpxor $ACC0, $ACC0, $ACC0
980 vmovdqu $ACC1, 32*1-128($np)
981 vpxor $ACC1, $ACC1, $ACC1
982 vmovdqu $ACC2, 32*2-128($np)
983 vpxor $ACC2, $ACC2, $ACC2
984 vmovdqu $ACC3, 32*3-128($np)
985 vpxor $ACC3, $ACC3, $ACC3
986 vmovdqu $ACC4, 32*4-128($np)
987 vpxor $ACC4, $ACC4, $ACC4
988 vmovdqu $ACC5, 32*5-128($np)
989 vpxor $ACC5, $ACC5, $ACC5
990 vmovdqu $ACC6, 32*6-128($np)
991 vpxor $ACC6, $ACC6, $ACC6
992 vmovdqu $ACC7, 32*7-128($np)
993 vpxor $ACC7, $ACC7, $ACC7
994 vmovdqu $ACC8, 32*8-128($np)
995 vmovdqa $ACC0, $ACC8
996 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall
997.Lmul_1024_no_n_copy:
998 and \$-64,%rsp
999
1000 mov ($bp), %rbx
1001 vpbroadcastq ($bp), $Bi
1002 vmovdqu $ACC0, (%rsp) # clear top of stack
1003 xor $r0, $r0
1004 .byte 0x67
1005 xor $r1, $r1
1006 xor $r2, $r2
1007 xor $r3, $r3
1008
1009 vmovdqu .Land_mask(%rip), $AND_MASK
1010 mov \$9, $i
1011 vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall
1012 jmp .Loop_mul_1024
1013
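	# (Added note: each pass of .Loop_mul_1024 consumes four 29-bit digits
	# of $bp, one per quadword, which is why 32 is added to $bp once per
	# pass; nine passes cover all 36 digits. The scalar registers $r0-$r3
	# shadow the low accumulator words, and from them each new reduction
	# multiplier is derived as (word * $n0) mod 2^29 before being
	# broadcast into $Yi.)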
1014.align 32
1015.Loop_mul_1024:
1016 vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*)
1017 mov %rbx, %rax
1018 imulq -128($ap), %rax
1019 add $r0, %rax
1020 mov %rbx, $r1
1021 imulq 8-128($ap), $r1
1022 add 8(%rsp), $r1
1023
1024 mov %rax, $r0
1025 imull $n0, %eax
1026 and \$0x1fffffff, %eax
1027
1028 mov %rbx, $r2
1029 imulq 16-128($ap), $r2
1030 add 16(%rsp), $r2
1031
1032 mov %rbx, $r3
1033 imulq 24-128($ap), $r3
1034 add 24(%rsp), $r3
1035 vpmuludq 32*1-128($ap),$Bi,$TEMP0
1036 vmovd %eax, $Yi
1037 vpaddq $TEMP0,$ACC1,$ACC1
1038 vpmuludq 32*2-128($ap),$Bi,$TEMP1
1039 vpbroadcastq $Yi, $Yi
1040 vpaddq $TEMP1,$ACC2,$ACC2
1041 vpmuludq 32*3-128($ap),$Bi,$TEMP2
1042 vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3
1043 vpaddq $TEMP2,$ACC3,$ACC3
1044 vpmuludq 32*4-128($ap),$Bi,$TEMP0
1045 vpaddq $TEMP0,$ACC4,$ACC4
1046 vpmuludq 32*5-128($ap),$Bi,$TEMP1
1047 vpaddq $TEMP1,$ACC5,$ACC5
1048 vpmuludq 32*6-128($ap),$Bi,$TEMP2
1049 vpaddq $TEMP2,$ACC6,$ACC6
1050 vpmuludq 32*7-128($ap),$Bi,$TEMP0
1051 vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3
1052 vpaddq $TEMP0,$ACC7,$ACC7
1053 vpmuludq 32*8-128($ap),$Bi,$TEMP1
1054 vpbroadcastq 8($bp), $Bi
1055 vpaddq $TEMP1,$ACC8,$ACC8
1056
1057 mov %rax,%rdx
1058 imulq -128($np),%rax
1059 add %rax,$r0
1060 mov %rdx,%rax
1061 imulq 8-128($np),%rax
1062 add %rax,$r1
1063 mov %rdx,%rax
1064 imulq 16-128($np),%rax
1065 add %rax,$r2
1066 shr \$29, $r0
1067 imulq 24-128($np),%rdx
1068 add %rdx,$r3
1069 add $r0, $r1
1070
1071 vpmuludq 32*1-128($np),$Yi,$TEMP2
1072 vmovq $Bi, %rbx
1073 vpaddq $TEMP2,$ACC1,$ACC1
1074 vpmuludq 32*2-128($np),$Yi,$TEMP0
1075 vpaddq $TEMP0,$ACC2,$ACC2
1076 vpmuludq 32*3-128($np),$Yi,$TEMP1
1077 vpaddq $TEMP1,$ACC3,$ACC3
1078 vpmuludq 32*4-128($np),$Yi,$TEMP2
1079 vpaddq $TEMP2,$ACC4,$ACC4
1080 vpmuludq 32*5-128($np),$Yi,$TEMP0
1081 vpaddq $TEMP0,$ACC5,$ACC5
1082 vpmuludq 32*6-128($np),$Yi,$TEMP1
1083 vpaddq $TEMP1,$ACC6,$ACC6
1084 vpmuludq 32*7-128($np),$Yi,$TEMP2
Robert Sloancd79cde2017-12-11 09:06:12 -08001085 vpblendd \$3, $ZERO, $ACC9, $TEMP1 # correct $ACC3
Adam Langleyd9e397b2015-01-22 14:27:53 -08001086 vpaddq $TEMP2,$ACC7,$ACC7
1087 vpmuludq 32*8-128($np),$Yi,$TEMP0
Robert Sloancd79cde2017-12-11 09:06:12 -08001088 vpaddq $TEMP1, $ACC3, $ACC3 # correct $ACC3
Adam Langleyd9e397b2015-01-22 14:27:53 -08001089 vpaddq $TEMP0,$ACC8,$ACC8
1090
1091 mov %rbx, %rax
1092 imulq -128($ap),%rax
1093 add %rax,$r1
1094 vmovdqu -8+32*1-128($ap),$TEMP1
1095 mov %rbx, %rax
1096 imulq 8-128($ap),%rax
1097 add %rax,$r2
1098 vmovdqu -8+32*2-128($ap),$TEMP2
1099
1100 mov $r1, %rax
Robert Sloancd79cde2017-12-11 09:06:12 -08001101 vpblendd \$0xfc, $ZERO, $ACC9, $ACC9 # correct $ACC3
Adam Langleyd9e397b2015-01-22 14:27:53 -08001102 imull $n0, %eax
Robert Sloancd79cde2017-12-11 09:06:12 -08001103 vpaddq $ACC9,$ACC4,$ACC4 # correct $ACC3
Adam Langleyd9e397b2015-01-22 14:27:53 -08001104 and \$0x1fffffff, %eax
1105
1106 imulq 16-128($ap),%rbx
1107 add %rbx,$r3
1108 vpmuludq $Bi,$TEMP1,$TEMP1
1109 vmovd %eax, $Yi
1110 vmovdqu -8+32*3-128($ap),$TEMP0
1111 vpaddq $TEMP1,$ACC1,$ACC1
1112 vpmuludq $Bi,$TEMP2,$TEMP2
1113 vpbroadcastq $Yi, $Yi
1114 vmovdqu -8+32*4-128($ap),$TEMP1
1115 vpaddq $TEMP2,$ACC2,$ACC2
1116 vpmuludq $Bi,$TEMP0,$TEMP0
1117 vmovdqu -8+32*5-128($ap),$TEMP2
1118 vpaddq $TEMP0,$ACC3,$ACC3
1119 vpmuludq $Bi,$TEMP1,$TEMP1
1120 vmovdqu -8+32*6-128($ap),$TEMP0
1121 vpaddq $TEMP1,$ACC4,$ACC4
1122 vpmuludq $Bi,$TEMP2,$TEMP2
1123 vmovdqu -8+32*7-128($ap),$TEMP1
1124 vpaddq $TEMP2,$ACC5,$ACC5
1125 vpmuludq $Bi,$TEMP0,$TEMP0
1126 vmovdqu -8+32*8-128($ap),$TEMP2
1127 vpaddq $TEMP0,$ACC6,$ACC6
1128 vpmuludq $Bi,$TEMP1,$TEMP1
1129 vmovdqu -8+32*9-128($ap),$ACC9
1130 vpaddq $TEMP1,$ACC7,$ACC7
1131 vpmuludq $Bi,$TEMP2,$TEMP2
1132 vpaddq $TEMP2,$ACC8,$ACC8
1133 vpmuludq $Bi,$ACC9,$ACC9
1134 vpbroadcastq 16($bp), $Bi
1135
1136 mov %rax,%rdx
1137 imulq -128($np),%rax
1138 add %rax,$r1
1139 vmovdqu -8+32*1-128($np),$TEMP0
1140 mov %rdx,%rax
1141 imulq 8-128($np),%rax
1142 add %rax,$r2
1143 vmovdqu -8+32*2-128($np),$TEMP1
1144 shr \$29, $r1
1145 imulq 16-128($np),%rdx
1146 add %rdx,$r3
1147 add $r1, $r2
1148
1149 vpmuludq $Yi,$TEMP0,$TEMP0
1150 vmovq $Bi, %rbx
1151 vmovdqu -8+32*3-128($np),$TEMP2
1152 vpaddq $TEMP0,$ACC1,$ACC1
1153 vpmuludq $Yi,$TEMP1,$TEMP1
1154 vmovdqu -8+32*4-128($np),$TEMP0
1155 vpaddq $TEMP1,$ACC2,$ACC2
1156 vpmuludq $Yi,$TEMP2,$TEMP2
1157 vmovdqu -8+32*5-128($np),$TEMP1
1158 vpaddq $TEMP2,$ACC3,$ACC3
1159 vpmuludq $Yi,$TEMP0,$TEMP0
1160 vmovdqu -8+32*6-128($np),$TEMP2
1161 vpaddq $TEMP0,$ACC4,$ACC4
1162 vpmuludq $Yi,$TEMP1,$TEMP1
1163 vmovdqu -8+32*7-128($np),$TEMP0
1164 vpaddq $TEMP1,$ACC5,$ACC5
1165 vpmuludq $Yi,$TEMP2,$TEMP2
1166 vmovdqu -8+32*8-128($np),$TEMP1
1167 vpaddq $TEMP2,$ACC6,$ACC6
1168 vpmuludq $Yi,$TEMP0,$TEMP0
1169 vmovdqu -8+32*9-128($np),$TEMP2
1170 vpaddq $TEMP0,$ACC7,$ACC7
1171 vpmuludq $Yi,$TEMP1,$TEMP1
1172 vpaddq $TEMP1,$ACC8,$ACC8
1173 vpmuludq $Yi,$TEMP2,$TEMP2
1174 vpaddq $TEMP2,$ACC9,$ACC9
1175
1176 vmovdqu -16+32*1-128($ap),$TEMP0
1177 mov %rbx,%rax
1178 imulq -128($ap),%rax
1179 add $r2,%rax
1180
1181 vmovdqu -16+32*2-128($ap),$TEMP1
1182 mov %rax,$r2
1183 imull $n0, %eax
1184 and \$0x1fffffff, %eax
1185
1186 imulq 8-128($ap),%rbx
1187 add %rbx,$r3
1188 vpmuludq $Bi,$TEMP0,$TEMP0
1189 vmovd %eax, $Yi
1190 vmovdqu -16+32*3-128($ap),$TEMP2
1191 vpaddq $TEMP0,$ACC1,$ACC1
1192 vpmuludq $Bi,$TEMP1,$TEMP1
1193 vpbroadcastq $Yi, $Yi
1194 vmovdqu -16+32*4-128($ap),$TEMP0
1195 vpaddq $TEMP1,$ACC2,$ACC2
1196 vpmuludq $Bi,$TEMP2,$TEMP2
1197 vmovdqu -16+32*5-128($ap),$TEMP1
1198 vpaddq $TEMP2,$ACC3,$ACC3
1199 vpmuludq $Bi,$TEMP0,$TEMP0
1200 vmovdqu -16+32*6-128($ap),$TEMP2
1201 vpaddq $TEMP0,$ACC4,$ACC4
1202 vpmuludq $Bi,$TEMP1,$TEMP1
1203 vmovdqu -16+32*7-128($ap),$TEMP0
1204 vpaddq $TEMP1,$ACC5,$ACC5
1205 vpmuludq $Bi,$TEMP2,$TEMP2
1206 vmovdqu -16+32*8-128($ap),$TEMP1
1207 vpaddq $TEMP2,$ACC6,$ACC6
1208 vpmuludq $Bi,$TEMP0,$TEMP0
1209 vmovdqu -16+32*9-128($ap),$TEMP2
1210 vpaddq $TEMP0,$ACC7,$ACC7
1211 vpmuludq $Bi,$TEMP1,$TEMP1
1212 vpaddq $TEMP1,$ACC8,$ACC8
1213 vpmuludq $Bi,$TEMP2,$TEMP2
1214 vpbroadcastq 24($bp), $Bi
1215 vpaddq $TEMP2,$ACC9,$ACC9
1216
1217 vmovdqu -16+32*1-128($np),$TEMP0
1218 mov %rax,%rdx
1219 imulq -128($np),%rax
1220 add %rax,$r2
1221 vmovdqu -16+32*2-128($np),$TEMP1
1222 imulq 8-128($np),%rdx
1223 add %rdx,$r3
1224 shr \$29, $r2
1225
1226 vpmuludq $Yi,$TEMP0,$TEMP0
1227 vmovq $Bi, %rbx
1228 vmovdqu -16+32*3-128($np),$TEMP2
1229 vpaddq $TEMP0,$ACC1,$ACC1
1230 vpmuludq $Yi,$TEMP1,$TEMP1
1231 vmovdqu -16+32*4-128($np),$TEMP0
1232 vpaddq $TEMP1,$ACC2,$ACC2
1233 vpmuludq $Yi,$TEMP2,$TEMP2
1234 vmovdqu -16+32*5-128($np),$TEMP1
1235 vpaddq $TEMP2,$ACC3,$ACC3
1236 vpmuludq $Yi,$TEMP0,$TEMP0
1237 vmovdqu -16+32*6-128($np),$TEMP2
1238 vpaddq $TEMP0,$ACC4,$ACC4
1239 vpmuludq $Yi,$TEMP1,$TEMP1
1240 vmovdqu -16+32*7-128($np),$TEMP0
1241 vpaddq $TEMP1,$ACC5,$ACC5
1242 vpmuludq $Yi,$TEMP2,$TEMP2
1243 vmovdqu -16+32*8-128($np),$TEMP1
1244 vpaddq $TEMP2,$ACC6,$ACC6
1245 vpmuludq $Yi,$TEMP0,$TEMP0
1246 vmovdqu -16+32*9-128($np),$TEMP2
1247 vpaddq $TEMP0,$ACC7,$ACC7
1248 vpmuludq $Yi,$TEMP1,$TEMP1
1249 vmovdqu -24+32*1-128($ap),$TEMP0
1250 vpaddq $TEMP1,$ACC8,$ACC8
1251 vpmuludq $Yi,$TEMP2,$TEMP2
1252 vmovdqu -24+32*2-128($ap),$TEMP1
1253 vpaddq $TEMP2,$ACC9,$ACC9
1254
1255 add $r2, $r3
1256 imulq -128($ap),%rbx
1257 add %rbx,$r3
1258
1259 mov $r3, %rax
1260 imull $n0, %eax
1261 and \$0x1fffffff, %eax
1262
1263 vpmuludq $Bi,$TEMP0,$TEMP0
1264 vmovd %eax, $Yi
1265 vmovdqu -24+32*3-128($ap),$TEMP2
1266 vpaddq $TEMP0,$ACC1,$ACC1
1267 vpmuludq $Bi,$TEMP1,$TEMP1
1268 vpbroadcastq $Yi, $Yi
1269 vmovdqu -24+32*4-128($ap),$TEMP0
1270 vpaddq $TEMP1,$ACC2,$ACC2
1271 vpmuludq $Bi,$TEMP2,$TEMP2
1272 vmovdqu -24+32*5-128($ap),$TEMP1
1273 vpaddq $TEMP2,$ACC3,$ACC3
1274 vpmuludq $Bi,$TEMP0,$TEMP0
1275 vmovdqu -24+32*6-128($ap),$TEMP2
1276 vpaddq $TEMP0,$ACC4,$ACC4
1277 vpmuludq $Bi,$TEMP1,$TEMP1
1278 vmovdqu -24+32*7-128($ap),$TEMP0
1279 vpaddq $TEMP1,$ACC5,$ACC5
1280 vpmuludq $Bi,$TEMP2,$TEMP2
1281 vmovdqu -24+32*8-128($ap),$TEMP1
1282 vpaddq $TEMP2,$ACC6,$ACC6
1283 vpmuludq $Bi,$TEMP0,$TEMP0
1284 vmovdqu -24+32*9-128($ap),$TEMP2
1285 vpaddq $TEMP0,$ACC7,$ACC7
1286 vpmuludq $Bi,$TEMP1,$TEMP1
1287 vpaddq $TEMP1,$ACC8,$ACC8
1288 vpmuludq $Bi,$TEMP2,$TEMP2
1289 vpbroadcastq 32($bp), $Bi
1290 vpaddq $TEMP2,$ACC9,$ACC9
1291 add \$32, $bp # $bp++
1292
1293 vmovdqu -24+32*1-128($np),$TEMP0
1294 imulq -128($np),%rax
1295 add %rax,$r3
1296 shr \$29, $r3
1297
1298 vmovdqu -24+32*2-128($np),$TEMP1
1299 vpmuludq $Yi,$TEMP0,$TEMP0
1300 vmovq $Bi, %rbx
1301 vmovdqu -24+32*3-128($np),$TEMP2
1302 vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0
1303 vpmuludq $Yi,$TEMP1,$TEMP1
1304 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
1305 vpaddq $TEMP1,$ACC2,$ACC1
1306 vmovdqu -24+32*4-128($np),$TEMP0
1307 vpmuludq $Yi,$TEMP2,$TEMP2
1308 vmovdqu -24+32*5-128($np),$TEMP1
1309 vpaddq $TEMP2,$ACC3,$ACC2
1310 vpmuludq $Yi,$TEMP0,$TEMP0
1311 vmovdqu -24+32*6-128($np),$TEMP2
1312 vpaddq $TEMP0,$ACC4,$ACC3
1313 vpmuludq $Yi,$TEMP1,$TEMP1
1314 vmovdqu -24+32*7-128($np),$TEMP0
1315 vpaddq $TEMP1,$ACC5,$ACC4
1316 vpmuludq $Yi,$TEMP2,$TEMP2
1317 vmovdqu -24+32*8-128($np),$TEMP1
1318 vpaddq $TEMP2,$ACC6,$ACC5
1319 vpmuludq $Yi,$TEMP0,$TEMP0
1320 vmovdqu -24+32*9-128($np),$TEMP2
1321 mov $r3, $r0
1322 vpaddq $TEMP0,$ACC7,$ACC6
1323 vpmuludq $Yi,$TEMP1,$TEMP1
1324 add (%rsp), $r0
1325 vpaddq $TEMP1,$ACC8,$ACC7
1326 vpmuludq $Yi,$TEMP2,$TEMP2
1327 vmovq $r3, $TEMP1
1328 vpaddq $TEMP2,$ACC9,$ACC8
1329
1330 dec $i
1331 jnz .Loop_mul_1024
1332___
1333
# (*) The original implementation corrected ACC1-ACC3 for overflow only
#	after 7 loop runs, i.e. after 28 iterations, or 56 additions.
#	Since the loop underutilizes execution resources anyway, the
#	correction can be done in every iteration at marginal cost; and
#	because it then runs much more often, it only has to correct a
#	few digits at a time, which avoids the performance penalty
#	altogether.

$TEMP0 = $ACC9;
$TEMP3 = $Bi;
$TEMP4 = $Yi;
$code.=<<___;
Adam Langleyd9e397b2015-01-22 14:27:53 -08001345 vpaddq (%rsp), $TEMP1, $ACC0
1346
1347 vpsrlq \$29, $ACC0, $TEMP1
1348 vpand $AND_MASK, $ACC0, $ACC0
1349 vpsrlq \$29, $ACC1, $TEMP2
1350 vpand $AND_MASK, $ACC1, $ACC1
1351 vpsrlq \$29, $ACC2, $TEMP3
1352 vpermq \$0x93, $TEMP1, $TEMP1
1353 vpand $AND_MASK, $ACC2, $ACC2
1354 vpsrlq \$29, $ACC3, $TEMP4
1355 vpermq \$0x93, $TEMP2, $TEMP2
1356 vpand $AND_MASK, $ACC3, $ACC3
1357
1358 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1359 vpermq \$0x93, $TEMP3, $TEMP3
1360 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1361 vpermq \$0x93, $TEMP4, $TEMP4
1362 vpaddq $TEMP0, $ACC0, $ACC0
1363 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1364 vpaddq $TEMP1, $ACC1, $ACC1
1365 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1366 vpaddq $TEMP2, $ACC2, $ACC2
1367 vpblendd \$3, $TEMP4, $ZERO, $TEMP4
1368 vpaddq $TEMP3, $ACC3, $ACC3
1369 vpaddq $TEMP4, $ACC4, $ACC4
1370
1371 vpsrlq \$29, $ACC0, $TEMP1
1372 vpand $AND_MASK, $ACC0, $ACC0
1373 vpsrlq \$29, $ACC1, $TEMP2
1374 vpand $AND_MASK, $ACC1, $ACC1
1375 vpsrlq \$29, $ACC2, $TEMP3
1376 vpermq \$0x93, $TEMP1, $TEMP1
1377 vpand $AND_MASK, $ACC2, $ACC2
1378 vpsrlq \$29, $ACC3, $TEMP4
1379 vpermq \$0x93, $TEMP2, $TEMP2
1380 vpand $AND_MASK, $ACC3, $ACC3
1381 vpermq \$0x93, $TEMP3, $TEMP3
1382
1383 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1384 vpermq \$0x93, $TEMP4, $TEMP4
1385 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1386 vpaddq $TEMP0, $ACC0, $ACC0
1387 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1388 vpaddq $TEMP1, $ACC1, $ACC1
1389 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1390 vpaddq $TEMP2, $ACC2, $ACC2
1391 vpblendd \$3, $TEMP4, $ZERO, $TEMP4
1392 vpaddq $TEMP3, $ACC3, $ACC3
1393 vpaddq $TEMP4, $ACC4, $ACC4
1394
1395 vmovdqu $ACC0, 0-128($rp)
1396 vmovdqu $ACC1, 32-128($rp)
1397 vmovdqu $ACC2, 64-128($rp)
1398 vmovdqu $ACC3, 96-128($rp)
1399___
1400
1401$TEMP5=$ACC0;
1402$code.=<<___;
1403 vpsrlq \$29, $ACC4, $TEMP1
1404 vpand $AND_MASK, $ACC4, $ACC4
1405 vpsrlq \$29, $ACC5, $TEMP2
1406 vpand $AND_MASK, $ACC5, $ACC5
1407 vpsrlq \$29, $ACC6, $TEMP3
1408 vpermq \$0x93, $TEMP1, $TEMP1
1409 vpand $AND_MASK, $ACC6, $ACC6
1410 vpsrlq \$29, $ACC7, $TEMP4
1411 vpermq \$0x93, $TEMP2, $TEMP2
1412 vpand $AND_MASK, $ACC7, $ACC7
1413 vpsrlq \$29, $ACC8, $TEMP5
1414 vpermq \$0x93, $TEMP3, $TEMP3
1415 vpand $AND_MASK, $ACC8, $ACC8
1416 vpermq \$0x93, $TEMP4, $TEMP4
1417
1418 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1419 vpermq \$0x93, $TEMP5, $TEMP5
1420 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1421 vpaddq $TEMP0, $ACC4, $ACC4
1422 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1423 vpaddq $TEMP1, $ACC5, $ACC5
1424 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1425 vpaddq $TEMP2, $ACC6, $ACC6
1426 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
1427 vpaddq $TEMP3, $ACC7, $ACC7
1428 vpaddq $TEMP4, $ACC8, $ACC8
1429
1430 vpsrlq \$29, $ACC4, $TEMP1
1431 vpand $AND_MASK, $ACC4, $ACC4
1432 vpsrlq \$29, $ACC5, $TEMP2
1433 vpand $AND_MASK, $ACC5, $ACC5
1434 vpsrlq \$29, $ACC6, $TEMP3
1435 vpermq \$0x93, $TEMP1, $TEMP1
1436 vpand $AND_MASK, $ACC6, $ACC6
1437 vpsrlq \$29, $ACC7, $TEMP4
1438 vpermq \$0x93, $TEMP2, $TEMP2
1439 vpand $AND_MASK, $ACC7, $ACC7
1440 vpsrlq \$29, $ACC8, $TEMP5
1441 vpermq \$0x93, $TEMP3, $TEMP3
1442 vpand $AND_MASK, $ACC8, $ACC8
1443 vpermq \$0x93, $TEMP4, $TEMP4
1444
1445 vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1446 vpermq \$0x93, $TEMP5, $TEMP5
1447 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1448 vpaddq $TEMP0, $ACC4, $ACC4
1449 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1450 vpaddq $TEMP1, $ACC5, $ACC5
1451 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1452 vpaddq $TEMP2, $ACC6, $ACC6
1453 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
1454 vpaddq $TEMP3, $ACC7, $ACC7
1455 vpaddq $TEMP4, $ACC8, $ACC8
1456
1457 vmovdqu $ACC4, 128-128($rp)
Robert Sloana94fe052017-02-21 08:49:28 -08001458 vmovdqu $ACC5, 160-128($rp)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001459 vmovdqu $ACC6, 192-128($rp)
1460 vmovdqu $ACC7, 224-128($rp)
1461 vmovdqu $ACC8, 256-128($rp)
1462 vzeroupper
1463
1464 mov %rbp, %rax
Robert Sloana94fe052017-02-21 08:49:28 -08001465.cfi_def_cfa_register %rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001466___
1467$code.=<<___ if ($win64);
Robert Sloan5d625782017-02-13 09:55:39 -08001468.Lmul_1024_in_tail:
Adam Langleyd9e397b2015-01-22 14:27:53 -08001469 movaps -0xd8(%rax),%xmm6
1470 movaps -0xc8(%rax),%xmm7
1471 movaps -0xb8(%rax),%xmm8
1472 movaps -0xa8(%rax),%xmm9
1473 movaps -0x98(%rax),%xmm10
1474 movaps -0x88(%rax),%xmm11
1475 movaps -0x78(%rax),%xmm12
1476 movaps -0x68(%rax),%xmm13
1477 movaps -0x58(%rax),%xmm14
1478 movaps -0x48(%rax),%xmm15
1479___
1480$code.=<<___;
1481 mov -48(%rax),%r15
Robert Sloana94fe052017-02-21 08:49:28 -08001482.cfi_restore %r15
Adam Langleyd9e397b2015-01-22 14:27:53 -08001483 mov -40(%rax),%r14
Robert Sloana94fe052017-02-21 08:49:28 -08001484.cfi_restore %r14
Adam Langleyd9e397b2015-01-22 14:27:53 -08001485 mov -32(%rax),%r13
Robert Sloana94fe052017-02-21 08:49:28 -08001486.cfi_restore %r13
Adam Langleyd9e397b2015-01-22 14:27:53 -08001487 mov -24(%rax),%r12
Robert Sloana94fe052017-02-21 08:49:28 -08001488.cfi_restore %r12
Adam Langleyd9e397b2015-01-22 14:27:53 -08001489 mov -16(%rax),%rbp
Robert Sloana94fe052017-02-21 08:49:28 -08001490.cfi_restore %rbp
Adam Langleyd9e397b2015-01-22 14:27:53 -08001491 mov -8(%rax),%rbx
Robert Sloana94fe052017-02-21 08:49:28 -08001492.cfi_restore %rbx
Adam Langleyd9e397b2015-01-22 14:27:53 -08001493 lea (%rax),%rsp # restore %rsp
Robert Sloana94fe052017-02-21 08:49:28 -08001494.cfi_def_cfa_register %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -08001495.Lmul_1024_epilogue:
1496 ret
Robert Sloana94fe052017-02-21 08:49:28 -08001497.cfi_endproc
Adam Langleyd9e397b2015-01-22 14:27:53 -08001498.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
1499___
1500}
{
my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
my @T = map("%r$_",(8..11));

$code.=<<___;
.globl	rsaz_1024_red2norm_avx2
.type	rsaz_1024_red2norm_avx2,\@abi-omnipotent
.align	32
rsaz_1024_red2norm_avx2:
	sub	\$-128,$inp	# size optimization
	xor	%rax,%rax
___

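# A rough pure-Perl reference for what the generated code computes
# (illustrative only, never executed by this script; assumes Math::BigInt):
#
#   sub red2norm_ref {
#       my @d = @_;                     # 29-bit digits, little-endian
#       my $n = Math::BigInt->bzero();
#       $n += Math::BigInt->new($d[$_]) << (29*$_) for 0..$#d;
#       return $n;                      # value == sum d[j]*2^(29*j)
#   }
#
# The loop below emits code that packs that value into 16 little-endian
# 64-bit words, folding the inter-word carries with adc as it goes.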
for ($j=0,$i=0; $i<16; $i++) {
    my $k=0;
    while (29*$j<64*($i+1)) {	# load data till boundary
	$code.="	mov	`8*$j-128`($inp), @T[0]\n";
	$j++; $k++; push(@T,shift(@T));
    }
    $l=$k;
    while ($k>1) {		# shift loaded data but last value
	$code.="	shl	\$`29*($j-$k)`,@T[-$k]\n";
	$k--;
    }
    $code.=<<___;	# shift last value
	mov	@T[-1], @T[0]
	shl	\$`29*($j-1)`, @T[-1]
	shr	\$`-29*($j-1)`, @T[0]
___
    while ($l) {		# accumulate all values
	$code.="	add	@T[-$l], %rax\n";
	$l--;
    }
    $code.=<<___;
	adc	\$0, @T[0]	# consume eventual carry
	mov	%rax, 8*$i($out)
	mov	@T[0], %rax
___
    push(@T,shift(@T));
}
$code.=<<___;
	ret
.size	rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2

.globl	rsaz_1024_norm2red_avx2
.type	rsaz_1024_norm2red_avx2,\@abi-omnipotent
.align	32
rsaz_1024_norm2red_avx2:
	sub	\$-128,$out	# size optimization
	mov	($inp),@T[0]
	mov	\$0x1fffffff,%eax
___
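# The inverse direction, again as a rough pure-Perl reference (illustrative
# only, never executed; assumes a Math::BigInt input):
#
#   sub norm2red_ref {
#       my $n = shift;                  # 1024-bit value
#       return map { ($n >> (29*$_)) & 0x1fffffff } 0..35;
#   }
#
# The loop below does the same 29-bit digit extraction with shr/shrd on the
# 16 input quadwords and pads the result out to 40 stored quadwords.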
for ($j=0,$i=0; $i<16; $i++) {
    $code.="	mov	`8*($i+1)`($inp),@T[1]\n"	if ($i<15);
    $code.="	xor	@T[1],@T[1]\n"			if ($i==15);
    my $k=1;
    while (29*($j+1)<64*($i+1)) {
	$code.=<<___;
	mov	@T[0],@T[-$k]
	shr	\$`29*$j`,@T[-$k]
	and	%rax,@T[-$k]		# &0x1fffffff
	mov	@T[-$k],`8*$j-128`($out)
___
	$j++; $k++;
    }
    $code.=<<___;
	shrd	\$`29*$j`,@T[1],@T[0]
	and	%rax,@T[0]
	mov	@T[0],`8*$j-128`($out)
___
    $j++;
    push(@T,shift(@T));
}
$code.=<<___;
	mov	@T[0],`8*$j-128`($out)		# zero
	mov	@T[0],`8*($j+1)-128`($out)
	mov	@T[0],`8*($j+2)-128`($out)
	mov	@T[0],`8*($j+3)-128`($out)
	ret
.size	rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
___
}
1583{
1584my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
1585
1586$code.=<<___;
1587.globl rsaz_1024_scatter5_avx2
1588.type rsaz_1024_scatter5_avx2,\@abi-omnipotent
1589.align 32
1590rsaz_1024_scatter5_avx2:
1591 vzeroupper
1592 vmovdqu .Lscatter_permd(%rip),%ymm5
1593 shl \$4,$power
1594 lea ($out,$power),$out
1595 mov \$9,%eax
1596 jmp .Loop_scatter_1024
1597
1598.align 32
1599.Loop_scatter_1024:
1600 vmovdqu ($inp),%ymm0
1601 lea 32($inp),$inp
1602 vpermd %ymm0,%ymm5,%ymm0
1603 vmovdqu %xmm0,($out)
1604 lea 16*32($out),$out
1605 dec %eax
1606 jnz .Loop_scatter_1024
1607
1608 vzeroupper
1609 ret
1610.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
1611
1612.globl rsaz_1024_gather5_avx2
1613.type rsaz_1024_gather5_avx2,\@abi-omnipotent
1614.align 32
1615rsaz_1024_gather5_avx2:
Robert Sloana94fe052017-02-21 08:49:28 -08001616.cfi_startproc
David Benjamin4969cc92016-04-22 15:02:23 -04001617 vzeroupper
1618 mov %rsp,%r11
Robert Sloana94fe052017-02-21 08:49:28 -08001619.cfi_def_cfa_register %r11
Adam Langleyd9e397b2015-01-22 14:27:53 -08001620___
1621$code.=<<___ if ($win64);
1622 lea -0x88(%rsp),%rax
Adam Langleyd9e397b2015-01-22 14:27:53 -08001623.LSEH_begin_rsaz_1024_gather5:
1624 # I can't trust assembler to use specific encoding:-(
David Benjamin4969cc92016-04-22 15:02:23 -04001625 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp
1626 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax)
1627 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax)
1628 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax)
1629 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax)
1630 .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax)
1631 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax)
1632 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax)
1633 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax)
1634 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax)
1635 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001636___
1637$code.=<<___;
David Benjamin4969cc92016-04-22 15:02:23 -04001638 lea -0x100(%rsp),%rsp
1639 and \$-32, %rsp
1640 lea .Linc(%rip), %r10
1641 lea -128(%rsp),%rax # control u-op density
Adam Langleyd9e397b2015-01-22 14:27:53 -08001642
David Benjamin4969cc92016-04-22 15:02:23 -04001643 vmovd $power, %xmm4
1644 vmovdqa (%r10),%ymm0
1645 vmovdqa 32(%r10),%ymm1
1646 vmovdqa 64(%r10),%ymm5
1647 vpbroadcastd %xmm4,%ymm4
Adam Langleyd9e397b2015-01-22 14:27:53 -08001648
David Benjamin4969cc92016-04-22 15:02:23 -04001649 vpaddd %ymm5, %ymm0, %ymm2
1650 vpcmpeqd %ymm4, %ymm0, %ymm0
1651 vpaddd %ymm5, %ymm1, %ymm3
1652 vpcmpeqd %ymm4, %ymm1, %ymm1
1653 vmovdqa %ymm0, 32*0+128(%rax)
1654 vpaddd %ymm5, %ymm2, %ymm0
1655 vpcmpeqd %ymm4, %ymm2, %ymm2
1656 vmovdqa %ymm1, 32*1+128(%rax)
1657 vpaddd %ymm5, %ymm3, %ymm1
1658 vpcmpeqd %ymm4, %ymm3, %ymm3
1659 vmovdqa %ymm2, 32*2+128(%rax)
1660 vpaddd %ymm5, %ymm0, %ymm2
1661 vpcmpeqd %ymm4, %ymm0, %ymm0
1662 vmovdqa %ymm3, 32*3+128(%rax)
1663 vpaddd %ymm5, %ymm1, %ymm3
1664 vpcmpeqd %ymm4, %ymm1, %ymm1
1665 vmovdqa %ymm0, 32*4+128(%rax)
1666 vpaddd %ymm5, %ymm2, %ymm8
1667 vpcmpeqd %ymm4, %ymm2, %ymm2
1668 vmovdqa %ymm1, 32*5+128(%rax)
1669 vpaddd %ymm5, %ymm3, %ymm9
1670 vpcmpeqd %ymm4, %ymm3, %ymm3
1671 vmovdqa %ymm2, 32*6+128(%rax)
1672 vpaddd %ymm5, %ymm8, %ymm10
1673 vpcmpeqd %ymm4, %ymm8, %ymm8
1674 vmovdqa %ymm3, 32*7+128(%rax)
1675 vpaddd %ymm5, %ymm9, %ymm11
1676 vpcmpeqd %ymm4, %ymm9, %ymm9
1677 vpaddd %ymm5, %ymm10, %ymm12
1678 vpcmpeqd %ymm4, %ymm10, %ymm10
1679 vpaddd %ymm5, %ymm11, %ymm13
1680 vpcmpeqd %ymm4, %ymm11, %ymm11
1681 vpaddd %ymm5, %ymm12, %ymm14
1682 vpcmpeqd %ymm4, %ymm12, %ymm12
1683 vpaddd %ymm5, %ymm13, %ymm15
1684 vpcmpeqd %ymm4, %ymm13, %ymm13
1685 vpcmpeqd %ymm4, %ymm14, %ymm14
1686 vpcmpeqd %ymm4, %ymm15, %ymm15
Adam Langleyd9e397b2015-01-22 14:27:53 -08001687
David Benjamin4969cc92016-04-22 15:02:23 -04001688 vmovdqa -32(%r10),%ymm7 # .Lgather_permd
1689 lea 128($inp), $inp
1690 mov \$9,$power
1691
Adam Langleyd9e397b2015-01-22 14:27:53 -08001692.Loop_gather_1024:
David Benjamin4969cc92016-04-22 15:02:23 -04001693 vmovdqa 32*0-128($inp), %ymm0
1694 vmovdqa 32*1-128($inp), %ymm1
1695 vmovdqa 32*2-128($inp), %ymm2
1696 vmovdqa 32*3-128($inp), %ymm3
1697 vpand 32*0+128(%rax), %ymm0, %ymm0
1698 vpand 32*1+128(%rax), %ymm1, %ymm1
1699 vpand 32*2+128(%rax), %ymm2, %ymm2
1700 vpor %ymm0, %ymm1, %ymm4
1701 vpand 32*3+128(%rax), %ymm3, %ymm3
1702 vmovdqa 32*4-128($inp), %ymm0
1703 vmovdqa 32*5-128($inp), %ymm1
1704 vpor %ymm2, %ymm3, %ymm5
1705 vmovdqa 32*6-128($inp), %ymm2
1706 vmovdqa 32*7-128($inp), %ymm3
1707 vpand 32*4+128(%rax), %ymm0, %ymm0
1708 vpand 32*5+128(%rax), %ymm1, %ymm1
1709 vpand 32*6+128(%rax), %ymm2, %ymm2
1710 vpor %ymm0, %ymm4, %ymm4
1711 vpand 32*7+128(%rax), %ymm3, %ymm3
1712 vpand 32*8-128($inp), %ymm8, %ymm0
1713 vpor %ymm1, %ymm5, %ymm5
1714 vpand 32*9-128($inp), %ymm9, %ymm1
1715 vpor %ymm2, %ymm4, %ymm4
1716 vpand 32*10-128($inp),%ymm10, %ymm2
1717 vpor %ymm3, %ymm5, %ymm5
1718 vpand 32*11-128($inp),%ymm11, %ymm3
1719 vpor %ymm0, %ymm4, %ymm4
1720 vpand 32*12-128($inp),%ymm12, %ymm0
1721 vpor %ymm1, %ymm5, %ymm5
1722 vpand 32*13-128($inp),%ymm13, %ymm1
1723 vpor %ymm2, %ymm4, %ymm4
1724 vpand 32*14-128($inp),%ymm14, %ymm2
1725 vpor %ymm3, %ymm5, %ymm5
1726 vpand 32*15-128($inp),%ymm15, %ymm3
1727 lea 32*16($inp), $inp
1728 vpor %ymm0, %ymm4, %ymm4
1729 vpor %ymm1, %ymm5, %ymm5
1730 vpor %ymm2, %ymm4, %ymm4
1731 vpor %ymm3, %ymm5, %ymm5
1732
1733 vpor %ymm5, %ymm4, %ymm4
1734 vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared
1735 vpor %xmm4, %xmm5, %xmm5
1736 vpermd %ymm5,%ymm7,%ymm5
1737 vmovdqu %ymm5,($out)
Adam Langleyd9e397b2015-01-22 14:27:53 -08001738 lea 32($out),$out
David Benjamin4969cc92016-04-22 15:02:23 -04001739 dec $power
Adam Langleyd9e397b2015-01-22 14:27:53 -08001740 jnz .Loop_gather_1024
1741
1742 vpxor %ymm0,%ymm0,%ymm0
1743 vmovdqu %ymm0,($out)
1744 vzeroupper
1745___
1746$code.=<<___ if ($win64);
David Benjamin4969cc92016-04-22 15:02:23 -04001747 movaps -0xa8(%r11),%xmm6
1748 movaps -0x98(%r11),%xmm7
1749 movaps -0x88(%r11),%xmm8
1750 movaps -0x78(%r11),%xmm9
1751 movaps -0x68(%r11),%xmm10
1752 movaps -0x58(%r11),%xmm11
1753 movaps -0x48(%r11),%xmm12
1754 movaps -0x38(%r11),%xmm13
1755 movaps -0x28(%r11),%xmm14
1756 movaps -0x18(%r11),%xmm15
Adam Langleyd9e397b2015-01-22 14:27:53 -08001757___
1758$code.=<<___;
David Benjamin4969cc92016-04-22 15:02:23 -04001759 lea (%r11),%rsp
Robert Sloana94fe052017-02-21 08:49:28 -08001760.cfi_def_cfa_register %rsp
Adam Langleyd9e397b2015-01-22 14:27:53 -08001761 ret
Robert Sloana94fe052017-02-21 08:49:28 -08001762.cfi_endproc
1763.LSEH_end_rsaz_1024_gather5:
Adam Langleyd9e397b2015-01-22 14:27:53 -08001764.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
1765___
1766}
1767
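# rsaz_avx2_eligible() is the run-time dispatcher hook: it returns the
# AVX2 bit (bit 5 of the third dword of OPENSSL_ia32cap_P), and when
# built with AD*X support ($addx) it forces the result to 0 on CPUs that
# have both BMI2 and ADX, since the scalar AD*X code path is preferred
# on such processors.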
$code.=<<___;
.extern OPENSSL_ia32cap_P
.globl rsaz_avx2_eligible
.type rsaz_avx2_eligible,\@abi-omnipotent
.align 32
rsaz_avx2_eligible:
	leaq OPENSSL_ia32cap_P(%rip),%rax
	mov 8(%rax),%eax
___
$code.=<<___ if ($addx);
	mov \$`1<<8|1<<19`,%ecx
	mov \$0,%edx
	and %eax,%ecx
	cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X
	cmove %edx,%eax
___
$code.=<<___;
	and \$`1<<5`,%eax
	shr \$5,%eax
	ret
.size rsaz_avx2_eligible,.-rsaz_avx2_eligible

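# Constants: .Land_mask keeps the low 29 bits of each 64-bit digit of the
# redundant representation; .Lscatter_permd and .Lgather_permd are vpermd
# index vectors for the scatter/gather routines; .Linc seeds the index
# counters for the constant-time gather above.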
.align 64
.Land_mask:
	.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
.Lscatter_permd:
	.long 0,2,4,6,7,7,7,7
.Lgather_permd:
	.long 0,7,1,7,2,7,3,7
.Linc:
	.long 0,0,0,0, 1,1,1,1
	.long 2,2,2,2, 3,3,3,3
	.long 4,4,4,4, 4,4,4,4
.align 64
___

if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

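# rsaz_se_handler is the Win64 structured-exception handler shared by the
# 1024-bit sqr and mul entry points.  Using the prologue/epilogue/"in tail"
# labels in HandlerData it works out how much of the frame exists at the
# faulting RIP, recovers the original stack pointer, copies the saved
# non-volatile GPRs and %xmm6-%xmm15 back into the CONTEXT record, and
# then hands the unwind over to RtlVirtualUnwind.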
$code.=<<___
.extern __imp_RtlVirtualUnwind
.type rsaz_se_handler,\@abi-omnipotent
.align 16
rsaz_se_handler:
	push %rsi
	push %rdi
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
	pushfq
	sub \$64,%rsp

	mov 120($context),%rax # pull context->Rax
	mov 248($context),%rbx # pull context->Rip

	mov 8($disp),%rsi # disp->ImageBase
	mov 56($disp),%r11 # disp->HandlerData

	mov 0(%r11),%r10d # HandlerData[0]
	lea (%rsi,%r10),%r10 # prologue label
	cmp %r10,%rbx # context->Rip<prologue label
	jb .Lcommon_seh_tail

	mov 4(%r11),%r10d # HandlerData[1]
	lea (%rsi,%r10),%r10 # epilogue label
	cmp %r10,%rbx # context->Rip>=epilogue label
	jae .Lcommon_seh_tail

	mov 160($context),%rbp # pull context->Rbp

	mov 8(%r11),%r10d # HandlerData[2]
	lea (%rsi,%r10),%r10 # "in tail" label
	cmp %r10,%rbx # context->Rip>="in tail" label
	cmovc %rbp,%rax

	mov -48(%rax),%r15
	mov -40(%rax),%r14
	mov -32(%rax),%r13
	mov -24(%rax),%r12
	mov -16(%rax),%rbp
	mov -8(%rax),%rbx
	mov %r15,240($context)
	mov %r14,232($context)
	mov %r13,224($context)
	mov %r12,216($context)
	mov %rbp,160($context)
	mov %rbx,144($context)

	lea -0xd8(%rax),%rsi # %xmm save area
	lea 512($context),%rdi # & context.Xmm6
	mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
	.long 0xa548f3fc # cld; rep movsq

.Lcommon_seh_tail:
	mov 8(%rax),%rdi
	mov 16(%rax),%rsi
	mov %rax,152($context) # restore context->Rsp
	mov %rsi,168($context) # restore context->Rsi
	mov %rdi,176($context) # restore context->Rdi

	mov 40($disp),%rdi # disp->ContextRecord
	mov $context,%rsi # context
	mov \$154,%ecx # sizeof(CONTEXT)
	.long 0xa548f3fc # cld; rep movsq

	mov $disp,%rsi
	xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
	mov 8(%rsi),%rdx # arg2, disp->ImageBase
	mov 0(%rsi),%r8 # arg3, disp->ControlPc
	mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
	mov 40(%rsi),%r10 # disp->ContextRecord
	lea 56(%rsi),%r11 # &disp->HandlerData
	lea 24(%rsi),%r12 # &disp->EstablisherFrame
	mov %r10,32(%rsp) # arg5
	mov %r11,40(%rsp) # arg6
	mov %r12,48(%rsp) # arg7
	mov %rcx,56(%rsp) # arg8, (NULL)
	call *__imp_RtlVirtualUnwind(%rip)

	mov \$1,%eax # ExceptionContinueSearch
	add \$64,%rsp
	popfq
	pop %r15
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	pop %rdi
	pop %rsi
	ret
.size rsaz_se_handler,.-rsaz_se_handler

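# SEH tables: .pdata maps each exported entry point to its unwind info;
# the sqr/mul entries route through rsaz_se_handler, while the gather5
# entry is described with raw version-1 unwind opcodes.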
.section .pdata
.align 4
	.rva .LSEH_begin_rsaz_1024_sqr_avx2
	.rva .LSEH_end_rsaz_1024_sqr_avx2
	.rva .LSEH_info_rsaz_1024_sqr_avx2

	.rva .LSEH_begin_rsaz_1024_mul_avx2
	.rva .LSEH_end_rsaz_1024_mul_avx2
	.rva .LSEH_info_rsaz_1024_mul_avx2

	.rva .LSEH_begin_rsaz_1024_gather5
	.rva .LSEH_end_rsaz_1024_gather5
	.rva .LSEH_info_rsaz_1024_gather5
.section .xdata
.align 8
.LSEH_info_rsaz_1024_sqr_avx2:
	.byte 9,0,0,0
	.rva rsaz_se_handler
	.rva .Lsqr_1024_body,.Lsqr_1024_epilogue,.Lsqr_1024_in_tail
	.long 0
.LSEH_info_rsaz_1024_mul_avx2:
	.byte 9,0,0,0
	.rva rsaz_se_handler
	.rva .Lmul_1024_body,.Lmul_1024_epilogue,.Lmul_1024_in_tail
	.long 0
.LSEH_info_rsaz_1024_gather5:
	.byte 0x01,0x36,0x17,0x0b
	.byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
	.byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
	.byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
	.byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
	.byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
	.byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
	.byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
	.byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
	.byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
	.byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
	.byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8
	.byte 0x00,0xb3,0x00,0x00 # set_frame r11
___
}

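# Post-process the generated code: evaluate `...` expressions, reduce
# shift counts modulo 64, and rewrite instructions that only touch the
# low 128 bits (vmovd/vmovq, vmovdqu with a %x hint, vpinsr/vpextr,
# vpbroadcast) to use %xmm instead of %ymm registers.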
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
	print $_,"\n";
}

}}} else {{{
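# Fallback when the assembler cannot encode AVX2: rsaz_avx2_eligible
# reports "not eligible", and the remaining entry points are emitted as
# ud2 traps that must never be reached.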
print <<___; # assembler is too old
.text

.globl rsaz_avx2_eligible
.type rsaz_avx2_eligible,\@abi-omnipotent
rsaz_avx2_eligible:
	xor %eax,%eax
	ret
.size rsaz_avx2_eligible,.-rsaz_avx2_eligible

.globl rsaz_1024_sqr_avx2
.globl rsaz_1024_mul_avx2
.globl rsaz_1024_norm2red_avx2
.globl rsaz_1024_red2norm_avx2
.globl rsaz_1024_scatter5_avx2
.globl rsaz_1024_gather5_avx2
.type rsaz_1024_sqr_avx2,\@abi-omnipotent
rsaz_1024_sqr_avx2:
rsaz_1024_mul_avx2:
rsaz_1024_norm2red_avx2:
rsaz_1024_red2norm_avx2:
rsaz_1024_scatter5_avx2:
rsaz_1024_gather5_avx2:
	.byte 0x0f,0x0b # ud2
	ret
.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
___
}}}

close STDOUT;