blob: 2a202fcdc9be906f5a19316dde055ef725e2746b [file] [log] [blame]
#!/usr/bin/env perl
#
# Driver: generates 32-bit x86 assembly for the bignum word primitives
# (bn_mul_add_words, bn_mul_words, bn_sqr_words, bn_div_words, bn_add_words,
# bn_sub_words, bn_sub_part_words) through the x86asm.pl perlasm layer.
#
# Command line: the LAST argument is the output file; the remaining arguments
# are left in @ARGV, the first of which is handed to asm_init().  Passing
# -DOPENSSL_IA32_SSE2 anywhere among them enables the SSE2 code paths.

# Locate x86asm.pl relative to this script's own directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../../perlasm");
require "x86asm.pl";

# All generated text goes to STDOUT, which is redirected to the output file.
# Fail loudly if it cannot be opened instead of silently emitting nothing.
$output = pop;
open STDOUT,">$output" or die "can't open $output: $!";

&asm_init($ARGV[0]);

$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

# The SSE2 paths consult OPENSSL_ia32cap_P at run time.
&external_label("OPENSSL_ia32cap_P") if ($sse2);

&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");

&asm_finish();

# A failed close means buffered output was not flushed (e.g. disk full);
# treat that as a hard error so a truncated assembly file is never accepted.
close STDOUT or die "error closing STDOUT: $!";
28
# bn_mul_add_words($name): emit a routine computing r[i] += a[i]*w for
# i in [0,num) and returning the final carry word in eax.
#
# Stack arguments of the generated routine, as read via wparam():
#   wparam(0) = r, wparam(1) = a, wparam(2) = num, wparam(3) = w.
#
# When $sse2 is set, the routine tests bit 26 of the first dword of
# OPENSSL_ia32cap_P at run time and uses the MMX/SSE2 (pmuludq/paddq) path if
# it is set; otherwise it falls through to the scalar mul/adc path.
#
# Both paths return 0 without touching memory when num == 0.  (The SSE2 path
# previously entered its do-while tail loop with a zero counter, which wrapped
# and walked far past the arrays.)
sub bn_mul_add_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("maw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry_in
		&test($c,$c);			# num == 0: nothing to do,
		&jz(&label("maw_sse2_exit"));	# return carry 0
		&jmp(&label("maw_sse2_entry"));

	# Eight words per iteration; loads, multiplies and carry propagation
	# are interleaved, so the instruction order below is deliberate.
	&set_label("maw_sse2_unrolled",16);
		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
		&movd(&DWP(0,$r,"",0),"mm1");
		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
		&psrlq("mm1",32);		# mm1 = carry0
		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
		&movd(&DWP(4,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry1
		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
		&add($a,32);
		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
		&movd(&DWP(8,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry2
		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
		&movd(&DWP(12,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry3
		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
		&movd(&DWP(16,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry4
		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
		&movd(&DWP(20,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry5
		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
		&movd(&DWP(24,$r,"",0),"mm1");
		&psrlq("mm1",32);		# mm1 = carry6
		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
		&movd(&DWP(28,$r,"",0),"mm1");
		&lea($r,&DWP(32,$r));
		&psrlq("mm1",32);		# mm1 = carry_out

		&sub($c,8);
		&jz(&label("maw_sse2_exit"));
	&set_label("maw_sse2_entry");
		&test($c,0xfffffff8);		# at least 8 words left?
		&jnz(&label("maw_sse2_unrolled"));

	# One word per iteration for the remaining 1..7 words.  This is a
	# do-while loop: it must only be entered with a non-zero counter.
	&set_label("maw_sse2_loop",4);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm3");		# carry += r[i]
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));		# lea leaves the flags from sub intact
		&jnz(&label("maw_sse2_loop"));
	&set_label("maw_sse2_exit");
		&movd("eax","mm1");		# c = carry_out
		&emms();
		&ret();

	&set_label("maw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ebp";
	$r="edi";
	$c="esi";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	#

	&mov("ecx",&wparam(2));	#
	&mov($a,&wparam(1));	#

	&and("ecx",0xfffffff8);	# num / 8
	&mov($w,&wparam(3));	#

	&push("ecx");		# Up the stack for a tmp variable

	# mov and push do not modify flags, so this still tests the result of
	# the "and" above.
	&jz(&label("maw_finish"));

	&set_label("maw_loop",16);

	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		 &mov("eax",&DWP($i,$a)); 	# *a
		 &mul($w);			# *a * w
		 &add("eax",$c);		# L(t)+= c
		 &adc("edx",0);			# H(t)+=carry
		 &add("eax",&DWP($i,$r));	# L(t)+= *r
		 &adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i,$r),"eax");	# *r= L(t);
		 &mov($c,"edx");		# c= H(t);
		}

	&comment("");
	&sub("ecx",8);
	&lea($a,&DWP(32,$a));
	&lea($r,&DWP(32,$r));
	&jnz(&label("maw_loop"));

	&set_label("maw_finish",0);
	&mov("ecx",&wparam(2));	# get num
	&and("ecx",7);
	&jnz(&label("maw_finish2"));	# helps branch prediction
	&jmp(&label("maw_end"));

	# Up to seven leftover words, fully unrolled with an early exit after
	# each one.
	&set_label("maw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		 &mov("eax",&DWP($i*4,$a));	# *a
		 &mul($w);			# *a * w
		 &add("eax",$c);		# L(t)+=c
		 &adc("edx",0);			# H(t)+=carry
		 &add("eax",&DWP($i*4,$r));	# L(t)+= *r
		 &adc("edx",0);			# H(t)+=carry
		 &dec("ecx") if ($i != 7-1);	# flags consumed by the jz below
		 &mov(&DWP($i*4,$r),"eax");	# *r= L(t);
		 &mov($c,"edx");		# c= H(t);
		 &jz(&label("maw_end")) if ($i != 7-1);
		}
	&set_label("maw_end",0);
	&mov("eax",$c);

	&pop("ecx");	# drop the tmp slot pushed above

	&function_end($name);
	}
214
# bn_mul_words($name): emit a routine computing r[i] = a[i]*w (plus running
# carry) for i in [0,num) and returning the final carry word in eax.
#
# Stack arguments of the generated routine, as read via wparam():
#   wparam(0) = r, wparam(1) = a, wparam(2) = num, wparam(3) = w.
#
# When $sse2 is set, bit 26 of the first dword of OPENSSL_ia32cap_P selects
# the MMX/SSE2 path at run time; otherwise the scalar path is used.
#
# Both paths return 0 without touching memory when num == 0.  (The SSE2 path
# is a do-while loop and previously ran with a wrapped counter in that case.)
sub bn_mul_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("mw_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&movd("mm0",&wparam(3));	# mm0 = w
		&pxor("mm1","mm1");		# mm1 = carry = 0
		&test($c,$c);			# num == 0: nothing to do,
		&jz(&label("mw_sse2_exit"));	# return carry 0

	&set_label("mw_sse2_loop",16);
		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
		&pmuludq("mm2","mm0");		# a[i] *= w
		&lea($a,&DWP(4,$a));
		&paddq("mm1","mm2");		# carry += a[i]*w
		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
		&sub($c,1);
		&psrlq("mm1",32);		# carry = carry_high
		&lea($r,&DWP(4,$r));		# lea leaves the flags from sub intact
		&jnz(&label("mw_sse2_loop"));

	&set_label("mw_sse2_exit");
		&movd("eax","mm1");		# return carry
		&emms();
		&ret();
	&set_label("mw_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$Low="eax";
	$High="edx";
	$a="ebx";
	$w="ecx";
	$r="edi";
	$c="esi";
	$num="ebp";

	&xor($c,$c);		# clear carry
	&mov($r,&wparam(0));	#
	&mov($a,&wparam(1));	#
	&mov($num,&wparam(2));	#
	&mov($w,&wparam(3));	#

	&and($num,0xfffffff8);	# num / 8
	&jz(&label("mw_finish"));

	# Eight words per iteration.
	&set_label("mw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");

		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
		 &mul($w);			# *a * w
		 &add("eax",$c);		# L(t)+=c

		 &adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);

		 &mov($c,"edx");		# c= H(t);
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jz(&label("mw_finish"));
	&jmp(&label("mw_loop"));

	&set_label("mw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);
	&jnz(&label("mw_finish2"));
	&jmp(&label("mw_end"));

	# Up to seven leftover words, fully unrolled with an early exit after
	# each one.
	&set_label("mw_finish2",1);
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		 &mov("eax",&DWP($i*4,$a,"",0));# *a
		 &mul($w);			# *a * w
		 &add("eax",$c);		# L(t)+=c
		 &adc("edx",0);			# H(t)+=carry
		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
		 &mov($c,"edx");		# c= H(t);
		 &dec($num) if ($i != 7-1);
		 &jz(&label("mw_end")) if ($i != 7-1);
		}
	&set_label("mw_end",0);
	&mov("eax",$c);

	&function_end($name);
	}
325
# bn_sqr_words($name): emit a routine storing the 64-bit square of each input
# word: r[2*i] (low) and r[2*i+1] (high) = a[i]*a[i], for i in [0,num).
#
# Stack arguments of the generated routine, as read via wparam():
#   wparam(0) = r, wparam(1) = a, wparam(2) = num.
#
# When $sse2 is set, bit 26 of the first dword of OPENSSL_ia32cap_P selects
# the MMX/SSE2 path at run time; otherwise the scalar path is used.
#
# Both paths write nothing when num == 0.  (The SSE2 path is a do-while loop
# and previously ran with a wrapped counter in that case.)
sub bn_sqr_words
	{
	local($name)=@_;

	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");

	$r="eax";
	$a="edx";
	$c="ecx";

	if ($sse2) {
		&picmeup("eax","OPENSSL_ia32cap_P");
		&bt(&DWP(0,"eax"),26);
		&jnc(&label("sqr_non_sse2"));

		&mov($r,&wparam(0));
		&mov($a,&wparam(1));
		&mov($c,&wparam(2));
		&test($c,$c);			# num == 0: nothing to do
		&jz(&label("sqr_sse2_exit"));

	&set_label("sqr_sse2_loop",16);
		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
		&pmuludq("mm0","mm0");		# a[i] *= a[i]
		&lea($a,&DWP(4,$a));		# a++
		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
		&sub($c,1);
		&lea($r,&DWP(8,$r));		# r += 2
		&jnz(&label("sqr_sse2_loop"));

		&emms();
	# The num == 0 shortcut lands here: no MMX register was touched on
	# that path, so emms is not needed.
	&set_label("sqr_sse2_exit");
		&ret();
	&set_label("sqr_non_sse2",16);
	}

	# function_begin prologue
	&push("ebp");
	&push("ebx");
	&push("esi");
	&push("edi");

	&comment("");
	$r="esi";
	$a="edi";
	$num="ebx";

	&mov($r,&wparam(0));	#
	&mov($a,&wparam(1));	#
	&mov($num,&wparam(2));	#

	&and($num,0xfffffff8);	# num / 8
	&jz(&label("sw_finish"));

	# Eight input words (sixteen output words) per iteration.
	&set_label("sw_loop",0);
	for ($i=0; $i<32; $i+=4)
		{
		&comment("Round $i");
		&mov("eax",&DWP($i,$a,"",0)); 	# *a
		&mul("eax");			# *a * *a
		&mov(&DWP($i*2,$r,"",0),"eax");	# low word
		&mov(&DWP($i*2+4,$r,"",0),"edx");# high word
		}

	&comment("");
	&add($a,32);
	&add($r,64);
	&sub($num,8);
	&jnz(&label("sw_loop"));

	&set_label("sw_finish",0);
	&mov($num,&wparam(2));	# get num
	&and($num,7);
	&jz(&label("sw_end"));

	# Up to seven leftover words, fully unrolled with an early exit after
	# each one.
	for ($i=0; $i<7; $i++)
		{
		&comment("Tail Round $i");
		&mov("eax",&DWP($i*4,$a,"",0));	# *a
		&mul("eax");			# *a * *a
		&mov(&DWP($i*8,$r,"",0),"eax");	# low word
		&dec($num) if ($i != 7-1);	# flags consumed by the jz below
		&mov(&DWP($i*8+4,$r,"",0),"edx");# high word
		 &jz(&label("sw_end")) if ($i != 7-1);
		}
	&set_label("sw_end",0);

	&function_end($name);
	}
414
# bn_div_words($name): emit a routine that divides the 64-bit value formed by
# its first two stack arguments (high word, then low word) by its third using
# a single x86 DIV, returning whatever DIV leaves in eax (the quotient).
sub bn_div_words
	{
	local($name)=@_;

	&function_begin_B($name,"");

	# Argument k is loaded into $arg_regs[k]: edx:eax is the dividend pair
	# consumed by DIV and ecx holds the divisor.
	my @arg_regs=("edx","eax","ecx");
	foreach my $slot (0..$#arg_regs)
		{
		&mov($arg_regs[$slot],&wparam($slot));
		}

	&div("ecx");
	&ret();
	&function_end_B($name);
	}
427
# bn_add_words($name): emit a routine computing r[i] = a[i] + b[i] with carry
# propagation for i in [0,num); the final carry is left in eax.
#
# Stack arguments of the generated routine, as read via wparam():
#   wparam(0) = r, wparam(1) = a, wparam(2) = b, wparam(3) = num.
sub bn_add_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	# Register allocation; $c is eax, so the carry is already the return
	# value when the routine ends.
	($a,$b,$c,$r,$tmp1,$tmp2,$num)=("esi","edi","eax","ebx","ecx","edx","ebp");

	# Emits one word of the carry chain at byte offset $off.  With
	# $count_down set, a "dec num" is scheduled before the store and a
	# "jz aw_end" after it (mov does not disturb the flags in between).
	my $emit_word = sub
		{
		my ($off,$count_down)=@_;

		&mov($tmp1,&DWP($off,$a,"",0));	# *a
		&mov($tmp2,&DWP($off,$b,"",0));	# *b
		&add($tmp1,$c);			# add incoming carry
		&mov($c,0);
		&adc($c,$c);			# carry out of the first add
		&add($tmp1,$tmp2);
		&adc($c,0);			# plus carry out of the second
		&dec($num) if ($count_down);
		&mov(&DWP($off,$r,"",0),$tmp1);	# *r
		&jz(&label("aw_end")) if ($count_down);
		};

	&mov($r,&wparam(0));	# get r
	&mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	&mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	&and($num,0xfffffff8);	# num / 8

	&jz(&label("aw_finish"));

	# Main loop: eight words per iteration.
	&set_label("aw_loop",0);
	foreach my $word (0..7)
		{
		&comment("Round $word");
		$emit_word->(4*$word,0);
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	&jz(&label("aw_end"));

	# Tail: up to seven words; every round but the last counts down and
	# may exit early.
	foreach my $word (0..6)
		{
		&comment("Tail Round $word");
		$emit_word->(4*$word,$word != 6);
		}
	&set_label("aw_end",0);

	&function_end($name);
	}
499
# bn_sub_words($name): emit a routine computing r[i] = a[i] - b[i] with borrow
# propagation for i in [0,num); the final borrow is left in eax.
#
# Stack arguments of the generated routine, as read via wparam():
#   wparam(0) = r, wparam(1) = a, wparam(2) = b, wparam(3) = num.
sub bn_sub_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	# Register allocation; $c is eax, so the borrow is already the return
	# value when the routine ends.
	($a,$b,$c,$r,$tmp1,$tmp2,$num)=("esi","edi","eax","ebx","ecx","edx","ebp");

	# Emits one word of the borrow chain at byte offset $off.  With
	# $count_down set, a "dec num" is scheduled before the store and a
	# "jz aw_end" after it (mov does not disturb the flags in between).
	my $emit_word = sub
		{
		my ($off,$count_down)=@_;

		&mov($tmp1,&DWP($off,$a,"",0));	# *a
		&mov($tmp2,&DWP($off,$b,"",0));	# *b
		&sub($tmp1,$c);			# subtract incoming borrow
		&mov($c,0);
		&adc($c,$c);			# borrow out of the first sub
		&sub($tmp1,$tmp2);
		&adc($c,0);			# plus borrow out of the second
		&dec($num) if ($count_down);
		&mov(&DWP($off,$r,"",0),$tmp1);	# *r
		&jz(&label("aw_end")) if ($count_down);
		};

	&mov($r,&wparam(0));	# get r
	&mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	&mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	&and($num,0xfffffff8);	# num / 8

	&jz(&label("aw_finish"));

	# Main loop: eight words per iteration.
	&set_label("aw_loop",0);
	foreach my $word (0..7)
		{
		&comment("Round $word");
		$emit_word->(4*$word,0);
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	&jz(&label("aw_end"));

	# Tail: up to seven words; every round but the last counts down and
	# may exit early.
	foreach my $word (0..6)
		{
		&comment("Tail Round $word");
		$emit_word->(4*$word,$word != 6);
		}
	&set_label("aw_end",0);

	&function_end($name);
	}
571
# bn_sub_part_words($name): emit a routine that subtracts two word arrays whose
# lengths may differ.
#
# Stack arguments of the generated routine, as read via wparam():
#   wparam(0) = r, wparam(1) = a, wparam(2) = b,
#   wparam(3) = number of words present in both a and b,
#   wparam(4) = dl, a signed count of extra words.
#
# Phase 1 computes r[i] = a[i] - b[i] - borrow over the common words.
# Phase 2 depends on the sign of dl:
#   dl == 0: done.
#   dl <  0: -dl further words are read from b only: r[i] = 0 - b[i] - borrow.
#   dl >  0: dl further words are read from a only: r[i] = a[i] - borrow;
#            as soon as a word produces no borrow, control jumps into a plain
#            copy loop for the rest and the result borrow is 0.
# The final borrow is left in eax.
sub bn_sub_part_words
	{
	local($name)=@_;

	&function_begin($name,"");

	&comment("");
	# Register allocation; $c is eax, so the borrow is already the return
	# value when the routine ends.
	($a,$b,$c,$r,$tmp1,$tmp2,$num)=("esi","edi","eax","ebx","ecx","edx","ebp");

	# tmp1 = tmp1 - borrow - tmp2, with the outgoing borrow collected in $c.
	my $borrow_chain = sub
		{
		&sub($tmp1,$c);
		&mov($c,0);
		&adc($c,$c);
		&sub($tmp1,$tmp2);
		&adc($c,0);
		};

	&mov($r,&wparam(0));	# get r
	&mov($a,&wparam(1));	# get a
	&mov($b,&wparam(2));	# get b
	&mov($num,&wparam(3));	# get num
	&xor($c,$c);		# clear carry
	&and($num,0xfffffff8);	# num / 8

	&jz(&label("aw_finish"));

	#
	# Phase 1: common words, eight per iteration.
	#
	&set_label("aw_loop",0);
	foreach my $k (0..7)
		{
		my $off=4*$k;
		&comment("Round $k");

		&mov($tmp1,&DWP($off,$a,"",0));	# *a
		&mov($tmp2,&DWP($off,$b,"",0));	# *b
		$borrow_chain->();
		&mov(&DWP($off,$r,"",0),$tmp1);	# *r
		}

	&comment("");
	&add($a,32);
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("aw_loop"));

	&set_label("aw_finish",0);
	&mov($num,&wparam(3));	# get num
	&and($num,7);
	&jz(&label("aw_end"));

	# Phase 1 tail.  Unlike the other tails, this one advances the three
	# pointers word by word so that they point at the extra words when
	# phase 2 starts.
	foreach my $k (0..6)
		{
		&comment("Tail Round $k");
		&mov($tmp1,&DWP(0,$a,"",0));	# *a
		&mov($tmp2,&DWP(0,$b,"",0));	# *b
		$borrow_chain->();
		&mov(&DWP(0,$r,"",0),$tmp1);	# *r
		&add($a, 4);
		&add($b, 4);
		&add($r, 4);
		next if ($k == 6);
		&dec($num);
		&jz(&label("aw_end"));
		}
	&set_label("aw_end",0);

	#
	# Phase 2: dispatch on dl.
	#
	&cmp(&wparam(4),0);
	&je(&label("pw_end"));

	&mov($num,&wparam(4));	# get dl
	&cmp($num,0);
	&je(&label("pw_end"));
	&jge(&label("pw_pos"));

	#
	# dl < 0: r = 0 - b - borrow over -dl words.
	#
	&comment("pw_neg");
	&mov($tmp2,0);
	&sub($tmp2,$num);	# tmp2 = -dl
	&mov($num,$tmp2);
	&and($num,0xfffffff8);	# num / 8
	&jz(&label("pw_neg_finish"));

	&set_label("pw_neg_loop",0);
	foreach my $k (0..7)
		{
		my $off=4*$k;
		&comment("dl<0 Round $k");

		&mov($tmp1,0);
		&mov($tmp2,&DWP($off,$b,"",0));	# *b
		$borrow_chain->();
		&mov(&DWP($off,$r,"",0),$tmp1);	# *r
		}

	&comment("");
	&add($b,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_neg_loop"));

	&set_label("pw_neg_finish",0);
	&mov($tmp2,&wparam(4));	# get dl
	&mov($num,0);
	&sub($num,$tmp2);	# num = -dl
	&and($num,7);
	&jz(&label("pw_end"));

	foreach my $k (0..6)
		{
		my $off=4*$k;
		&comment("dl<0 Tail Round $k");
		&mov($tmp1,0);
		&mov($tmp2,&DWP($off,$b,"",0));	# *b
		$borrow_chain->();
		&dec($num) unless ($k == 6);	# flags consumed by the jz below
		&mov(&DWP($off,$r,"",0),$tmp1);	# *r
		&jz(&label("pw_end")) unless ($k == 6);
		}

	&jmp(&label("pw_end"));

	#
	# dl > 0: r = a - borrow over dl words, while a borrow is pending.
	#
	&set_label("pw_pos",0);

	&and($num,0xfffffff8);	# num / 8
	&jz(&label("pw_pos_finish"));

	&set_label("pw_pos_loop",0);

	foreach my $k (0..7)
		{
		my $off=4*$k;
		&comment("dl>0 Round $k");

		&mov($tmp1,&DWP($off,$a,"",0));	# *a
		&sub($tmp1,$c);
		&mov(&DWP($off,$r,"",0),$tmp1);	# *r
		# No borrow out: continue in the copy loop right after word $k.
		&jnc(&label("pw_nc".$k));
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_pos_loop"));

	&set_label("pw_pos_finish",0);
	&mov($num,&wparam(4));	# get dl
	&and($num,7);
	&jz(&label("pw_end"));

	foreach my $k (0..6)
		{
		my $off=4*$k;
		&comment("dl>0 Tail Round $k");
		&mov($tmp1,&DWP($off,$a,"",0));	# *a
		&sub($tmp1,$c);
		&mov(&DWP($off,$r,"",0),$tmp1);	# *r
		&jnc(&label("pw_tail_nc".$k));
		next if ($k == 6);
		&dec($num);
		&jz(&label("pw_end"));
		}
	&mov($c,1);		# every tail word borrowed
	&jmp(&label("pw_end"));

	#
	# dl > 0, borrow resolved: plain copy of the remaining words of a.
	# The pw_nc<k> / pw_tail_nc<k> labels are the landing points for the
	# jnc instructions emitted above.
	#
	&set_label("pw_nc_loop",0);
	foreach my $k (0..7)
		{
		my $off=4*$k;
		&mov($tmp1,&DWP($off,$a,"",0));	# *a
		&mov(&DWP($off,$r,"",0),$tmp1);	# *r
		&set_label("pw_nc".$k,0);
		}

	&comment("");
	&add($a,32);
	&add($r,32);
	&sub($num,8);
	&jnz(&label("pw_nc_loop"));

	&mov($num,&wparam(4));	# get dl
	&and($num,7);
	&jz(&label("pw_nc_end"));

	foreach my $k (0..6)
		{
		my $off=4*$k;
		&mov($tmp1,&DWP($off,$a,"",0));	# *a
		&mov(&DWP($off,$r,"",0),$tmp1);	# *r
		&set_label("pw_tail_nc".$k,0);
		next if ($k == 6);
		&dec($num);
		&jz(&label("pw_nc_end"));
		}

	&set_label("pw_nc_end",0);
	&mov($c,0);

	&set_label("pw_end",0);

	&function_end($name);
	}