external/boringssl: Sync to 6d50f475e319de153a43e1dba5a1beca95948c63.
This includes the following changes:
https://boringssl.googlesource.com/boringssl/+log/0726fb76ebe7f422e3c4fb2e25a0064926975770..6d50f475e319de153a43e1dba5a1beca95948c63
This also updates the build files to add the new GTest-based targets and
work with the C++ file in libssl.
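The synced changes include a new truncated-tag test for the AEAD interface
(TestTruncatedTags in aead_test.cc below). For reference, the sketch here is
illustrative only and not part of the sync: a caller requests a shortened tag
by passing the desired tag length to EVP_AEAD_CTX_init from <openssl/aead.h>,
roughly as follows.

  #include <openssl/aead.h>
  #include <stdint.h>

  // Illustrative sketch: seal one byte with a one-byte (truncated) tag using
  // ChaCha20-Poly1305. The all-zero key and nonce are placeholders.
  static bool SealWithTruncatedTag(void) {
    const EVP_AEAD *aead = EVP_aead_chacha20_poly1305();
    uint8_t key[EVP_AEAD_MAX_KEY_LENGTH] = {0};
    uint8_t nonce[EVP_AEAD_MAX_NONCE_LENGTH] = {0};

    EVP_AEAD_CTX ctx;
    // The fifth argument is the tag length; 1 asks for a truncated tag.
    if (!EVP_AEAD_CTX_init(&ctx, aead, key, EVP_AEAD_key_length(aead),
                           1 /* tag length */, NULL /* ENGINE */)) {
      return false;
    }

    const uint8_t plaintext[1] = {'A'};
    uint8_t ciphertext[64];
    size_t ciphertext_len;
    // On success, ciphertext_len is sizeof(plaintext) + 1: the payload plus
    // the one-byte tag.
    int ok = EVP_AEAD_CTX_seal(&ctx, ciphertext, &ciphertext_len,
                               sizeof(ciphertext), nonce,
                               EVP_AEAD_nonce_length(aead), plaintext,
                               sizeof(plaintext), NULL /* ad */, 0);
    EVP_AEAD_CTX_cleanup(&ctx);
    return ok == 1;
  }
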
Test: cts-tradefed run cts -m CtsLibcoreOkHttpTestCases -a arm64-v8a
Test: cts-tradefed run cts -m CtsLibcoreTestCases -a arm64-v8a
Change-Id: I99718d51c901fe2e2e1e0398fc61fe1e76ccdb3f
diff --git a/src/crypto/CMakeLists.txt b/src/crypto/CMakeLists.txt
index 97fea5f..36224fc 100644
--- a/src/crypto/CMakeLists.txt
+++ b/src/crypto/CMakeLists.txt
@@ -207,3 +207,17 @@
target_link_libraries(refcount_test crypto)
add_dependencies(all_tests refcount_test)
+
+# TODO(davidben): Convert the remaining tests to GTest.
+add_executable(
+ crypto_test
+
+ dh/dh_test.cc
+ dsa/dsa_test.cc
+
+ $<TARGET_OBJECTS:gtest_main>
+ $<TARGET_OBJECTS:test_support>
+)
+
+target_link_libraries(crypto_test crypto gtest)
+add_dependencies(all_tests crypto_test)
diff --git a/src/crypto/cipher/CMakeLists.txt b/src/crypto/cipher/CMakeLists.txt
index 52b87b6..db46c4b 100644
--- a/src/crypto/cipher/CMakeLists.txt
+++ b/src/crypto/cipher/CMakeLists.txt
@@ -1,5 +1,13 @@
include_directories(../../include)
+if (${ARCH} STREQUAL "x86_64")
+ set(
+ CIPHER_ARCH_SOURCES
+
+ chacha20_poly1305_x86_64.${ASM_EXT}
+ )
+endif()
+
add_library(
cipher
@@ -19,6 +27,8 @@
tls_cbc.c
e_tls.c
e_ssl3.c
+
+ ${CIPHER_ARCH_SOURCES}
)
add_executable(
@@ -35,6 +45,8 @@
$<TARGET_OBJECTS:test_support>
)
+perlasm(chacha20_poly1305_x86_64.${ASM_EXT} asm/chacha20_poly1305_x86_64.pl)
+
target_link_libraries(cipher_test crypto)
target_link_libraries(aead_test crypto)
add_dependencies(all_tests cipher_test aead_test)
diff --git a/src/crypto/cipher/aead_test.cc b/src/crypto/cipher/aead_test.cc
index 0c95fb4..fb5200e 100644
--- a/src/crypto/cipher/aead_test.cc
+++ b/src/crypto/cipher/aead_test.cc
@@ -12,6 +12,7 @@
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+#include <assert.h>
#include <stdint.h>
#include <string.h>
@@ -168,16 +169,12 @@
}
static int TestCleanupAfterInitFailure(const EVP_AEAD *aead) {
- EVP_AEAD_CTX ctx;
- uint8_t key[128];
-
+ uint8_t key[EVP_AEAD_MAX_KEY_LENGTH];
OPENSSL_memset(key, 0, sizeof(key));
const size_t key_len = EVP_AEAD_key_length(aead);
- if (key_len > sizeof(key)) {
- fprintf(stderr, "Key length of AEAD too long.\n");
- return 0;
- }
+ assert(sizeof(key) >= key_len);
+ EVP_AEAD_CTX ctx;
if (EVP_AEAD_CTX_init(&ctx, aead, key, key_len,
9999 /* a silly tag length to trigger an error */,
NULL /* ENGINE */) != 0) {
@@ -201,6 +198,80 @@
return 1;
}
+static int TestTruncatedTags(const EVP_AEAD *aead) {
+ uint8_t key[EVP_AEAD_MAX_KEY_LENGTH];
+ OPENSSL_memset(key, 0, sizeof(key));
+ const size_t key_len = EVP_AEAD_key_length(aead);
+ assert(sizeof(key) >= key_len);
+
+ uint8_t nonce[EVP_AEAD_MAX_NONCE_LENGTH];
+ OPENSSL_memset(nonce, 0, sizeof(nonce));
+ const size_t nonce_len = EVP_AEAD_nonce_length(aead);
+ assert(sizeof(nonce) >= nonce_len);
+
+ bssl::ScopedEVP_AEAD_CTX ctx;
+ if (!EVP_AEAD_CTX_init(ctx.get(), aead, key, key_len, 1 /* one byte tag */,
+ NULL /* ENGINE */)) {
+ fprintf(stderr, "Couldn't initialise AEAD with truncated tag.\n");
+ return 1;
+ }
+
+ const uint8_t plaintext[1] = {'A'};
+
+ uint8_t ciphertext[128];
+ size_t ciphertext_len;
+ constexpr uint8_t kSentinel = 42;
+ OPENSSL_memset(ciphertext, kSentinel, sizeof(ciphertext));
+
+ if (!EVP_AEAD_CTX_seal(ctx.get(), ciphertext, &ciphertext_len,
+ sizeof(ciphertext), nonce, nonce_len, plaintext,
+ sizeof(plaintext), nullptr /* ad */, 0)) {
+ fprintf(stderr, "Sealing with truncated tag didn't work.\n");
+ return 0;
+ }
+
+ for (size_t i = ciphertext_len; i < sizeof(ciphertext); i++) {
+ // Sealing must not write past where it said it did.
+ if (ciphertext[i] != kSentinel) {
+ fprintf(stderr, "Sealing wrote off the end of the buffer.\n");
+ return 0;
+ }
+ }
+
+ const size_t overhead_used = ciphertext_len - sizeof(plaintext);
+ if (overhead_used != 1) {
+ fprintf(stderr, "AEAD is probably ignoring request to truncate tags.\n");
+ return 0;
+ }
+
+ uint8_t plaintext2[sizeof(plaintext) + 16];
+ OPENSSL_memset(plaintext2, kSentinel, sizeof(plaintext2));
+
+ size_t plaintext2_len;
+ if (!EVP_AEAD_CTX_open(ctx.get(), plaintext2, &plaintext2_len,
+ sizeof(plaintext2), nonce, nonce_len, ciphertext,
+ ciphertext_len, nullptr /* ad */, 0)) {
+ fprintf(stderr, "Opening with truncated tag didn't work.\n");
+ return 0;
+ }
+
+ for (size_t i = plaintext2_len; i < sizeof(plaintext2); i++) {
+ // Likewise, opening should also stay within bounds.
+ if (plaintext2[i] != kSentinel) {
+ fprintf(stderr, "Opening wrote off the end of the buffer.\n");
+ return 0;
+ }
+ }
+
+ if (plaintext2_len != sizeof(plaintext) ||
+ OPENSSL_memcmp(plaintext2, plaintext, sizeof(plaintext)) != 0) {
+ fprintf(stderr, "Opening with truncated tag gave wrong result.\n");
+ return 0;
+ }
+
+ return 1;
+}
+
static bool TestWithAliasedBuffers(const EVP_AEAD *aead) {
const size_t key_len = EVP_AEAD_key_length(aead);
const size_t nonce_len = EVP_AEAD_nonce_length(aead);
@@ -306,30 +377,32 @@
// handle inputs that are a multiple of eight bytes in length and the
// SSLv3/TLS AEADs have the concept of “direction”.
bool limited_implementation;
+ // truncated_tags is true if the AEAD supports truncating tags to arbitrary
+ // lengths.
+ bool truncated_tags;
};
static const struct KnownAEAD kAEADs[] = {
- { "aes-128-gcm", EVP_aead_aes_128_gcm, false },
- { "aes-256-gcm", EVP_aead_aes_256_gcm, false },
- { "aes-128-gcm-siv", EVP_aead_aes_128_gcm_siv, false },
- { "aes-256-gcm-siv", EVP_aead_aes_256_gcm_siv, false },
- { "chacha20-poly1305", EVP_aead_chacha20_poly1305, false },
- { "chacha20-poly1305-old", EVP_aead_chacha20_poly1305_old, false },
- { "aes-128-cbc-sha1-tls", EVP_aead_aes_128_cbc_sha1_tls, true },
- { "aes-128-cbc-sha1-tls-implicit-iv", EVP_aead_aes_128_cbc_sha1_tls_implicit_iv, true },
- { "aes-128-cbc-sha256-tls", EVP_aead_aes_128_cbc_sha256_tls, true },
- { "aes-256-cbc-sha1-tls", EVP_aead_aes_256_cbc_sha1_tls, true },
- { "aes-256-cbc-sha1-tls-implicit-iv", EVP_aead_aes_256_cbc_sha1_tls_implicit_iv, true },
- { "aes-256-cbc-sha256-tls", EVP_aead_aes_256_cbc_sha256_tls, true },
- { "aes-256-cbc-sha384-tls", EVP_aead_aes_256_cbc_sha384_tls, true },
- { "des-ede3-cbc-sha1-tls", EVP_aead_des_ede3_cbc_sha1_tls, true },
- { "des-ede3-cbc-sha1-tls-implicit-iv", EVP_aead_des_ede3_cbc_sha1_tls_implicit_iv, true },
- { "aes-128-cbc-sha1-ssl3", EVP_aead_aes_128_cbc_sha1_ssl3, true },
- { "aes-256-cbc-sha1-ssl3", EVP_aead_aes_256_cbc_sha1_ssl3, true },
- { "des-ede3-cbc-sha1-ssl3", EVP_aead_des_ede3_cbc_sha1_ssl3, true },
- { "aes-128-ctr-hmac-sha256", EVP_aead_aes_128_ctr_hmac_sha256, false },
- { "aes-256-ctr-hmac-sha256", EVP_aead_aes_256_ctr_hmac_sha256, false },
- { "", NULL, false },
+ { "aes-128-gcm", EVP_aead_aes_128_gcm, false, true },
+ { "aes-256-gcm", EVP_aead_aes_256_gcm, false, true },
+ { "aes-128-gcm-siv", EVP_aead_aes_128_gcm_siv, false, false },
+ { "aes-256-gcm-siv", EVP_aead_aes_256_gcm_siv, false, false },
+ { "chacha20-poly1305", EVP_aead_chacha20_poly1305, false, true },
+ { "aes-128-cbc-sha1-tls", EVP_aead_aes_128_cbc_sha1_tls, true, false },
+ { "aes-128-cbc-sha1-tls-implicit-iv", EVP_aead_aes_128_cbc_sha1_tls_implicit_iv, true, false },
+ { "aes-128-cbc-sha256-tls", EVP_aead_aes_128_cbc_sha256_tls, true, false },
+ { "aes-256-cbc-sha1-tls", EVP_aead_aes_256_cbc_sha1_tls, true, false },
+ { "aes-256-cbc-sha1-tls-implicit-iv", EVP_aead_aes_256_cbc_sha1_tls_implicit_iv, true, false },
+ { "aes-256-cbc-sha256-tls", EVP_aead_aes_256_cbc_sha256_tls, true, false },
+ { "aes-256-cbc-sha384-tls", EVP_aead_aes_256_cbc_sha384_tls, true, false },
+ { "des-ede3-cbc-sha1-tls", EVP_aead_des_ede3_cbc_sha1_tls, true, false },
+ { "des-ede3-cbc-sha1-tls-implicit-iv", EVP_aead_des_ede3_cbc_sha1_tls_implicit_iv, true, false },
+ { "aes-128-cbc-sha1-ssl3", EVP_aead_aes_128_cbc_sha1_ssl3, true, false },
+ { "aes-256-cbc-sha1-ssl3", EVP_aead_aes_256_cbc_sha1_ssl3, true, false },
+ { "des-ede3-cbc-sha1-ssl3", EVP_aead_des_ede3_cbc_sha1_ssl3, true, false },
+ { "aes-128-ctr-hmac-sha256", EVP_aead_aes_128_ctr_hmac_sha256, false, true },
+ { "aes-256-ctr-hmac-sha256", EVP_aead_aes_256_ctr_hmac_sha256, false, true },
+ { "", NULL, false, false },
};
int main(int argc, char **argv) {
@@ -363,6 +436,11 @@
return 1;
}
+ if (known_aead->truncated_tags && !TestTruncatedTags(aead)) {
+ fprintf(stderr, "Truncated tags test failed for %s.\n", known_aead->name);
+ return 1;
+ }
+
if (!known_aead->limited_implementation && !TestWithAliasedBuffers(aead)) {
fprintf(stderr, "Aliased buffers test failed for %s.\n", known_aead->name);
return 1;
diff --git a/src/crypto/cipher/asm/chacha20_poly1305_x86_64.pl b/src/crypto/cipher/asm/chacha20_poly1305_x86_64.pl
new file mode 100644
index 0000000..c3f3e0b
--- /dev/null
+++ b/src/crypto/cipher/asm/chacha20_poly1305_x86_64.pl
@@ -0,0 +1,2379 @@
+#!/usr/bin/env perl
+
+# Copyright (c) 2015, CloudFlare Ltd.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+##############################################################################
+# #
+# Author: Vlad Krasnov #
+# #
+##############################################################################
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$avx = 2;
+
+$code.=<<___;
+.text
+.extern OPENSSL_ia32cap_P
+.align 64
+.chacha20_consts:
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.rol8:
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.rol16:
+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.avx2_init:
+.long 0,0,0,0
+.sse_inc:
+.long 1,0,0,0
+.avx2_inc:
+.long 2,0,0,0,2,0,0,0
+.clamp:
+.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
+.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
+.align 16
+.and_masks:
+.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
+___
+
+my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8");
+my ($acc0,$acc1,$acc2)=map("%r$_",(10..12));
+my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9");
+my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15));
+my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
+my $r_store="0*16(%rbp)";
+my $s_store="1*16(%rbp)";
+my $len_store="2*16(%rbp)";
+my $state1_store="3*16(%rbp)";
+my $state2_store="4*16(%rbp)";
+my $tmp_store="5*16(%rbp)";
+my $ctr0_store="6*16(%rbp)";
+my $ctr1_store="7*16(%rbp)";
+my $ctr2_store="8*16(%rbp)";
+my $ctr3_store="9*16(%rbp)";
+
+sub chacha_qr {
+my ($a,$b,$c,$d,$t,$dir)=@_;
+$code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/);
+$code.="paddd $b, $a
+ pxor $a, $d
+ pshufb .rol16(%rip), $d
+ paddd $d, $c
+ pxor $c, $b
+ movdqa $b, $t
+ pslld \$12, $t
+ psrld \$20, $b
+ pxor $t, $b
+ paddd $b, $a
+ pxor $a, $d
+ pshufb .rol8(%rip), $d
+ paddd $d, $c
+ pxor $c, $b
+ movdqa $b, $t
+ pslld \$7, $t
+ psrld \$25, $b
+ pxor $t, $b\n";
+$code.="palignr \$4, $b, $b
+ palignr \$8, $c, $c
+ palignr \$12, $d, $d\n" if ($dir =~ /left/);
+$code.="palignr \$12, $b, $b
+ palignr \$8, $c, $c
+ palignr \$4, $d, $d\n" if ($dir =~ /right/);
+$code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/);
+}
+
+sub poly_add {
+my ($src)=@_;
+$code.="add $src, $acc0
+ adc 8+$src, $acc1
+ adc \$1, $acc2\n";
+}
+
+sub poly_stage1 {
+$code.="mov 0+$r_store, %rax
+ mov %rax, $t2
+ mul $acc0
+ mov %rax, $t0
+ mov %rdx, $t1
+ mov 0+$r_store, %rax
+ mul $acc1
+ imul $acc2, $t2
+ add %rax, $t1
+ adc %rdx, $t2\n";
+}
+
+sub poly_stage2 {
+$code.="mov 8+$r_store, %rax
+ mov %rax, $t3
+ mul $acc0
+ add %rax, $t1
+ adc \$0, %rdx
+ mov %rdx, $acc0
+ mov 8+$r_store, %rax
+ mul $acc1
+ add %rax, $t2
+ adc \$0, %rdx\n";
+}
+
+sub poly_stage3 {
+$code.="imul $acc2, $t3
+ add $acc0, $t2
+ adc %rdx, $t3\n";
+}
+
+sub poly_reduce_stage {
+$code.="mov $t0, $acc0
+ mov $t1, $acc1
+ mov $t2, $acc2
+ and \$3, $acc2
+ mov $t2, $t0
+ and \$-4, $t0
+ mov $t3, $t1
+ shrd \$2, $t3, $t2
+ shr \$2, $t3
+ add $t0, $acc0
+ adc $t1, $acc1
+ adc \$0, $acc2
+ add $t2, $acc0
+ adc $t3, $acc1
+ adc \$0, $acc2\n";
+}
+
+sub poly_mul {
+ &poly_stage1();
+ &poly_stage2();
+ &poly_stage3();
+ &poly_reduce_stage();
+}
+
+sub prep_state {
+my ($n)=@_;
+$code.="movdqa .chacha20_consts(%rip), $A0
+ movdqa $state1_store, $B0
+ movdqa $state2_store, $C0\n";
+$code.="movdqa $A0, $A1
+ movdqa $B0, $B1
+ movdqa $C0, $C1\n" if ($n ge 2);
+$code.="movdqa $A0, $A2
+ movdqa $B0, $B2
+ movdqa $C0, $C2\n" if ($n ge 3);
+$code.="movdqa $A0, $A3
+ movdqa $B0, $B3
+ movdqa $C0, $C3\n" if ($n ge 4);
+$code.="movdqa $ctr0_store, $D0
+ paddd .sse_inc(%rip), $D0
+ movdqa $D0, $ctr0_store\n" if ($n eq 1);
+$code.="movdqa $ctr0_store, $D1
+ paddd .sse_inc(%rip), $D1
+ movdqa $D1, $D0
+ paddd .sse_inc(%rip), $D0
+ movdqa $D0, $ctr0_store
+ movdqa $D1, $ctr1_store\n" if ($n eq 2);
+$code.="movdqa $ctr0_store, $D2
+ paddd .sse_inc(%rip), $D2
+ movdqa $D2, $D1
+ paddd .sse_inc(%rip), $D1
+ movdqa $D1, $D0
+ paddd .sse_inc(%rip), $D0
+ movdqa $D0, $ctr0_store
+ movdqa $D1, $ctr1_store
+ movdqa $D2, $ctr2_store\n" if ($n eq 3);
+$code.="movdqa $ctr0_store, $D3
+ paddd .sse_inc(%rip), $D3
+ movdqa $D3, $D2
+ paddd .sse_inc(%rip), $D2
+ movdqa $D2, $D1
+ paddd .sse_inc(%rip), $D1
+ movdqa $D1, $D0
+ paddd .sse_inc(%rip), $D0
+ movdqa $D0, $ctr0_store
+ movdqa $D1, $ctr1_store
+ movdqa $D2, $ctr2_store
+ movdqa $D3, $ctr3_store\n" if ($n eq 4);
+}
+
+sub finalize_state {
+my ($n)=@_;
+$code.="paddd .chacha20_consts(%rip), $A3
+ paddd $state1_store, $B3
+ paddd $state2_store, $C3
+ paddd $ctr3_store, $D3\n" if ($n eq 4);
+$code.="paddd .chacha20_consts(%rip), $A2
+ paddd $state1_store, $B2
+ paddd $state2_store, $C2
+ paddd $ctr2_store, $D2\n" if ($n ge 3);
+$code.="paddd .chacha20_consts(%rip), $A1
+ paddd $state1_store, $B1
+ paddd $state2_store, $C1
+ paddd $ctr1_store, $D1\n" if ($n ge 2);
+$code.="paddd .chacha20_consts(%rip), $A0
+ paddd $state1_store, $B0
+ paddd $state2_store, $C0
+ paddd $ctr0_store, $D0\n";
+}
+
+sub xor_stream {
+my ($A, $B, $C, $D, $offset)=@_;
+$code.="movdqu 0*16 + $offset($inp), $A3
+ movdqu 1*16 + $offset($inp), $B3
+ movdqu 2*16 + $offset($inp), $C3
+ movdqu 3*16 + $offset($inp), $D3
+ pxor $A3, $A
+ pxor $B3, $B
+ pxor $C3, $C
+ pxor $D, $D3
+ movdqu $A, 0*16 + $offset($oup)
+ movdqu $B, 1*16 + $offset($oup)
+ movdqu $C, 2*16 + $offset($oup)
+ movdqu $D3, 3*16 + $offset($oup)\n";
+}
+
+sub xor_stream_using_temp {
+my ($A, $B, $C, $D, $offset, $temp)=@_;
+$code.="movdqa $temp, $tmp_store
+ movdqu 0*16 + $offset($inp), $temp
+ pxor $A, $temp
+ movdqu $temp, 0*16 + $offset($oup)
+ movdqu 1*16 + $offset($inp), $temp
+ pxor $B, $temp
+ movdqu $temp, 1*16 + $offset($oup)
+ movdqu 2*16 + $offset($inp), $temp
+ pxor $C, $temp
+ movdqu $temp, 2*16 + $offset($oup)
+ movdqu 3*16 + $offset($inp), $temp
+ pxor $D, $temp
+ movdqu $temp, 3*16 + $offset($oup)\n";
+}
+
+sub gen_chacha_round {
+my ($rot1, $rot2, $shift)=@_;
+my $round="";
+$round.="movdqa $C0, $tmp_store\n" if ($rot1 eq 20);
+$round.="movdqa $rot2, $C0
+ paddd $B3, $A3
+ paddd $B2, $A2
+ paddd $B1, $A1
+ paddd $B0, $A0
+ pxor $A3, $D3
+ pxor $A2, $D2
+ pxor $A1, $D1
+ pxor $A0, $D0
+ pshufb $C0, $D3
+ pshufb $C0, $D2
+ pshufb $C0, $D1
+ pshufb $C0, $D0
+ movdqa $tmp_store, $C0
+ paddd $D3, $C3
+ paddd $D2, $C2
+ paddd $D1, $C1
+ paddd $D0, $C0
+ pxor $C3, $B3
+ pxor $C2, $B2
+ pxor $C1, $B1
+ pxor $C0, $B0
+ movdqa $C0, $tmp_store
+ movdqa $B3, $C0
+ psrld \$$rot1, $C0
+ pslld \$32-$rot1, $B3
+ pxor $C0, $B3
+ movdqa $B2, $C0
+ psrld \$$rot1, $C0
+ pslld \$32-$rot1, $B2
+ pxor $C0, $B2
+ movdqa $B1, $C0
+ psrld \$$rot1, $C0
+ pslld \$32-$rot1, $B1
+ pxor $C0, $B1
+ movdqa $B0, $C0
+ psrld \$$rot1, $C0
+ pslld \$32-$rot1, $B0
+ pxor $C0, $B0\n";
+($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
+($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
+$round.="movdqa $tmp_store, $C0
+ palignr \$$s1, $B3, $B3
+ palignr \$$s2, $C3, $C3
+ palignr \$$s3, $D3, $D3
+ palignr \$$s1, $B2, $B2
+ palignr \$$s2, $C2, $C2
+ palignr \$$s3, $D2, $D2
+ palignr \$$s1, $B1, $B1
+ palignr \$$s2, $C1, $C1
+ palignr \$$s3, $D1, $D1
+ palignr \$$s1, $B0, $B0
+ palignr \$$s2, $C0, $C0
+ palignr \$$s3, $D0, $D0\n"
+if (($shift =~ /left/) || ($shift =~ /right/));
+return $round;
+};
+
+$chacha_body = &gen_chacha_round(20, ".rol16(%rip)") .
+ &gen_chacha_round(25, ".rol8(%rip)", "left") .
+ &gen_chacha_round(20, ".rol16(%rip)") .
+ &gen_chacha_round(25, ".rol8(%rip)", "right");
+
+my @loop_body = split /\n/, $chacha_body;
+
+sub emit_body {
+my ($n)=@_;
+ for (my $i=0; $i < $n; $i++) {
+ $code=$code.shift(@loop_body)."\n";
+ };
+}
+
+{
+################################################################################
+# void poly_hash_ad_internal();
+$code.="
+.type poly_hash_ad_internal,\@function,2
+.align 64
+poly_hash_ad_internal:
+.cfi_startproc
+ xor $acc0, $acc0
+ xor $acc1, $acc1
+ xor $acc2, $acc2
+ cmp \$13, $itr2
+ jne hash_ad_loop
+poly_fast_tls_ad:
+ # Special treatment for the TLS case of 13 bytes
+ mov ($adp), $acc0
+ mov 5($adp), $acc1
+ shr \$24, $acc1
+ mov \$1, $acc2\n";
+ &poly_mul(); $code.="
+ ret
+hash_ad_loop:
+ # Hash in 16 byte chunk
+ cmp \$16, $itr2
+ jb hash_ad_tail\n";
+ &poly_add("0($adp)");
+ &poly_mul(); $code.="
+ lea (1*16)($adp), $adp
+ sub \$16, $itr2
+ jmp hash_ad_loop
+hash_ad_tail:
+ cmp \$0, $itr2
+ je 1f
+ # Hash last < 16 byte tail
+ xor $t0, $t0
+ xor $t1, $t1
+ xor $t2, $t2
+ add $itr2, $adp
+hash_ad_tail_loop:
+ shld \$8, $t0, $t1
+ shl \$8, $t0
+ movzxb -1($adp), $t2
+ xor $t2, $t0
+ dec $adp
+ dec $itr2
+ jne hash_ad_tail_loop
+
+ add $t0, $acc0
+ adc $t1, $acc1
+ adc \$1, $acc2\n";
+ &poly_mul(); $code.="
+ # Finished AD
+1:
+ ret
+.cfi_endproc
+.size poly_hash_ad_internal, .-poly_hash_ad_internal\n";
+}
+
+{
+################################################################################
+# void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
+$code.="
+.globl chacha20_poly1305_open
+.type chacha20_poly1305_open,\@function,2
+.align 64
+chacha20_poly1305_open:
+.cfi_startproc
+ push %rbp
+.cfi_adjust_cfa_offset 8
+ push %rbx
+.cfi_adjust_cfa_offset 8
+ push %r12
+.cfi_adjust_cfa_offset 8
+ push %r13
+.cfi_adjust_cfa_offset 8
+ push %r14
+.cfi_adjust_cfa_offset 8
+ push %r15
+.cfi_adjust_cfa_offset 8
+ # We write the calculated authenticator back to keyp at the end, so save
+ # the pointer on the stack too.
+ push $keyp
+.cfi_adjust_cfa_offset 8
+ sub \$288 + 32, %rsp
+.cfi_adjust_cfa_offset 288 + 32
+.cfi_offset rbp, -16
+.cfi_offset rbx, -24
+.cfi_offset r12, -32
+.cfi_offset r13, -40
+.cfi_offset r14, -48
+.cfi_offset r15, -56
+.cfi_offset $keyp, -64
+ lea 32(%rsp), %rbp
+ and \$-32, %rbp
+ mov %rdx, 8+$len_store
+ mov %r8, 0+$len_store
+ mov %rdx, $inl\n"; $code.="
+ mov OPENSSL_ia32cap_P+8(%rip), %eax
+ and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
+ xor \$`(1<<5) + (1<<8)`, %eax
+ jz chacha20_poly1305_open_avx2\n" if ($avx>1);
+$code.="
+1:
+ cmp \$128, $inl
+ jbe open_sse_128
+ # For long buffers, prepare the poly key first
+ movdqa .chacha20_consts(%rip), $A0
+ movdqu 0*16($keyp), $B0
+ movdqu 1*16($keyp), $C0
+ movdqu 2*16($keyp), $D0
+ movdqa $D0, $T1
+ # Store on stack, to free keyp
+ movdqa $B0, $state1_store
+ movdqa $C0, $state2_store
+ movdqa $D0, $ctr0_store
+ mov \$10, $acc0
+1: \n";
+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
+ dec $acc0
+ jne 1b
+ # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
+ paddd .chacha20_consts(%rip), $A0
+ paddd $state1_store, $B0
+ # Clamp and store the key
+ pand .clamp(%rip), $A0
+ movdqa $A0, $r_store
+ movdqa $B0, $s_store
+ # Hash
+ mov %r8, $itr2
+ call poly_hash_ad_internal
+open_sse_main_loop:
+ cmp \$16*16, $inl
+ jb 2f
+ # Load state, increment counter blocks\n";
+ &prep_state(4); $code.="
+ # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we
+ # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
+ mov \$4, $itr1
+ mov $inp, $itr2
+1: \n";
+ &emit_body(20);
+ &poly_add("0($itr2)"); $code.="
+ lea 2*8($itr2), $itr2\n";
+ &emit_body(20);
+ &poly_stage1();
+ &emit_body(20);
+ &poly_stage2();
+ &emit_body(20);
+ &poly_stage3();
+ &emit_body(20);
+ &poly_reduce_stage();
+ foreach $l (@loop_body) {$code.=$l."\n";}
+ @loop_body = split /\n/, $chacha_body; $code.="
+ dec $itr1
+ jge 1b\n";
+ &poly_add("0($itr2)");
+ &poly_mul(); $code.="
+ lea 2*8($itr2), $itr2
+ cmp \$-6, $itr1
+ jg 1b\n";
+ &finalize_state(4);
+ &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
+ &xor_stream($A2, $B2, $C2, $D2, "4*16");
+ &xor_stream($A1, $B1, $C1, $D1, "8*16");
+ &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.="
+ lea 16*16($inp), $inp
+ lea 16*16($oup), $oup
+ sub \$16*16, $inl
+ jmp open_sse_main_loop
+2:
+ # Handle the various tail sizes efficiently
+ test $inl, $inl
+ jz open_sse_finalize
+ cmp \$4*16, $inl
+ ja 3f\n";
+###############################################################################
+ # At most 64 bytes are left
+ &prep_state(1); $code.="
+ xor $itr2, $itr2
+ mov $inl, $itr1
+ cmp \$16, $itr1
+ jb 2f
+1: \n";
+ &poly_add("0($inp, $itr2)");
+ &poly_mul(); $code.="
+ sub \$16, $itr1
+2:
+ add \$16, $itr2\n";
+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
+ cmp \$16, $itr1
+ jae 1b
+ cmp \$10*16, $itr2
+ jne 2b\n";
+ &finalize_state(1); $code.="
+ jmp open_sse_tail_64_dec_loop
+3:
+ cmp \$8*16, $inl
+ ja 3f\n";
+###############################################################################
+ # 65 - 128 bytes are left
+ &prep_state(2); $code.="
+ mov $inl, $itr1
+ and \$-16, $itr1
+ xor $itr2, $itr2
+1: \n";
+ &poly_add("0($inp, $itr2)");
+ &poly_mul(); $code.="
+2:
+ add \$16, $itr2\n";
+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.="
+ cmp $itr1, $itr2
+ jb 1b
+ cmp \$10*16, $itr2
+ jne 2b\n";
+ &finalize_state(2);
+ &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.="
+ sub \$4*16, $inl
+ lea 4*16($inp), $inp
+ lea 4*16($oup), $oup
+ jmp open_sse_tail_64_dec_loop
+3:
+ cmp \$12*16, $inl
+ ja 3f\n";
+###############################################################################
+ # 129 - 192 bytes are left
+ &prep_state(3); $code.="
+ mov $inl, $itr1
+ mov \$10*16, $itr2
+ cmp \$10*16, $itr1
+ cmovg $itr2, $itr1
+ and \$-16, $itr1
+ xor $itr2, $itr2
+1: \n";
+ &poly_add("0($inp, $itr2)");
+ &poly_mul(); $code.="
+2:
+ add \$16, $itr2\n";
+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
+ &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
+ &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
+ cmp $itr1, $itr2
+ jb 1b
+ cmp \$10*16, $itr2
+ jne 2b
+ cmp \$11*16, $inl
+ jb 1f\n";
+ &poly_add("10*16($inp)");
+ &poly_mul(); $code.="
+ cmp \$12*16, $inl
+ jb 1f\n";
+ &poly_add("11*16($inp)");
+ &poly_mul(); $code.="
+1: \n";
+ &finalize_state(3);
+ &xor_stream($A2, $B2, $C2, $D2, "0*16");
+ &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.="
+ sub \$8*16, $inl
+ lea 8*16($inp), $inp
+ lea 8*16($oup), $oup
+ jmp open_sse_tail_64_dec_loop
+3:
+###############################################################################\n";
+ # 193 - 255 bytes are left
+ &prep_state(4); $code.="
+ xor $itr2, $itr2
+1: \n";
+ &poly_add("0($inp, $itr2)");
+ &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left");
+ &chacha_qr($A1,$B1,$C1,$D1,$C3,"left");
+ &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load");
+ &poly_stage1();
+ &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load");
+ &poly_stage2();
+ &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right");
+ &chacha_qr($A1,$B1,$C1,$D1,$C3,"right");
+ &poly_stage3();
+ &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load");
+ &poly_reduce_stage();
+ &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.="
+ add \$16, $itr2
+ cmp \$10*16, $itr2
+ jb 1b
+ mov $inl, $itr1
+ and \$-16, $itr1
+1: \n";
+ &poly_add("0($inp, $itr2)");
+ &poly_mul(); $code.="
+ add \$16, $itr2
+ cmp $itr1, $itr2
+ jb 1b\n";
+ &finalize_state(4);
+ &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
+ &xor_stream($A2, $B2, $C2, $D2, "4*16");
+ &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.="
+ movdqa $tmp_store, $D0
+ sub \$12*16, $inl
+ lea 12*16($inp), $inp
+ lea 12*16($oup), $oup
+###############################################################################
+ # Decrypt the remaining data, 16B at a time, using existing stream
+open_sse_tail_64_dec_loop:
+ cmp \$16, $inl
+ jb 1f
+ sub \$16, $inl
+ movdqu ($inp), $T0
+ pxor $T0, $A0
+ movdqu $A0, ($oup)
+ lea 16($inp), $inp
+ lea 16($oup), $oup
+ movdqa $B0, $A0
+ movdqa $C0, $B0
+ movdqa $D0, $C0
+ jmp open_sse_tail_64_dec_loop
+1:
+ movdqa $A0, $A1
+
+ # Decrypt up to 16 bytes at the end.
+open_sse_tail_16:
+ test $inl, $inl
+ jz open_sse_finalize
+
+ # Read the final bytes into $T0. They need to be read in reverse order so
+ # that they end up in the correct order in $T0.
+ pxor $T0, $T0
+ lea -1($inp, $inl), $inp
+ movq $inl, $itr2
+2:
+ pslldq \$1, $T0
+ pinsrb \$0, ($inp), $T0
+ sub \$1, $inp
+ sub \$1, $itr2
+ jnz 2b
+
+3:
+ movq $T0, $t0
+ pextrq \$1, $T0, $t1
+ # The final bytes of keystream are in $A1.
+ pxor $A1, $T0
+
+ # Copy the plaintext bytes out.
+2:
+ pextrb \$0, $T0, ($oup)
+ psrldq \$1, $T0
+ add \$1, $oup
+ sub \$1, $inl
+ jne 2b
+
+ add $t0, $acc0
+ adc $t1, $acc1
+ adc \$1, $acc2\n";
+ &poly_mul(); $code.="
+
+open_sse_finalize:\n";
+ &poly_add($len_store);
+ &poly_mul(); $code.="
+ # Final reduce
+ mov $acc0, $t0
+ mov $acc1, $t1
+ mov $acc2, $t2
+ sub \$-5, $acc0
+ sbb \$-1, $acc1
+ sbb \$3, $acc2
+ cmovc $t0, $acc0
+ cmovc $t1, $acc1
+ cmovc $t2, $acc2
+ # Add in s part of the key
+ add 0+$s_store, $acc0
+ adc 8+$s_store, $acc1
+
+ add \$288 + 32, %rsp
+.cfi_adjust_cfa_offset -(288 + 32)
+ pop $keyp
+.cfi_adjust_cfa_offset -8
+ movq $acc0, ($keyp)
+ movq $acc1, 8($keyp)
+
+ pop %r15
+.cfi_adjust_cfa_offset -8
+ pop %r14
+.cfi_adjust_cfa_offset -8
+ pop %r13
+.cfi_adjust_cfa_offset -8
+ pop %r12
+.cfi_adjust_cfa_offset -8
+ pop %rbx
+.cfi_adjust_cfa_offset -8
+ pop %rbp
+.cfi_adjust_cfa_offset -8
+ ret
+.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
+###############################################################################
+open_sse_128:
+ movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
+ movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
+ movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
+ movdqu 2*16($keyp), $D0
+ movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
+ movdqa $D1, $D2\npaddd .sse_inc(%rip), $D2
+ movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3
+ mov \$10, $acc0
+1: \n";
+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
+ &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
+ &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
+ dec $acc0
+ jnz 1b
+ paddd .chacha20_consts(%rip), $A0
+ paddd .chacha20_consts(%rip), $A1
+ paddd .chacha20_consts(%rip), $A2
+ paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
+ paddd $T2, $C1\npaddd $T2, $C2
+ paddd $T3, $D1
+ paddd .sse_inc(%rip), $T3
+ paddd $T3, $D2
+ # Clamp and store the key
+ pand .clamp(%rip), $A0
+ movdqa $A0, $r_store
+ movdqa $B0, $s_store
+ # Hash
+ mov %r8, $itr2
+ call poly_hash_ad_internal
+1:
+ cmp \$16, $inl
+ jb open_sse_tail_16
+ sub \$16, $inl\n";
+ # Load for hashing
+ &poly_add("0*8($inp)"); $code.="
+ # Load for decryption
+ movdqu 0*16($inp), $T0
+ pxor $T0, $A1
+ movdqu $A1, 0*16($oup)
+ lea 1*16($inp), $inp
+ lea 1*16($oup), $oup\n";
+ &poly_mul(); $code.="
+ # Shift the stream left
+ movdqa $B1, $A1
+ movdqa $C1, $B1
+ movdqa $D1, $C1
+ movdqa $A2, $D1
+ movdqa $B2, $A2
+ movdqa $C2, $B2
+ movdqa $D2, $C2
+ jmp 1b
+ jmp open_sse_tail_16
+.size chacha20_poly1305_open, .-chacha20_poly1305_open
+.cfi_endproc
+
+################################################################################
+################################################################################
+# void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
+.globl chacha20_poly1305_seal
+.type chacha20_poly1305_seal,\@function,2
+.align 64
+chacha20_poly1305_seal:
+.cfi_startproc
+ push %rbp
+.cfi_adjust_cfa_offset 8
+ push %rbx
+.cfi_adjust_cfa_offset 8
+ push %r12
+.cfi_adjust_cfa_offset 8
+ push %r13
+.cfi_adjust_cfa_offset 8
+ push %r14
+.cfi_adjust_cfa_offset 8
+ push %r15
+.cfi_adjust_cfa_offset 8
+ # We write the calculated authenticator back to keyp at the end, so save
+ # the pointer on the stack too.
+ push $keyp
+.cfi_adjust_cfa_offset 8
+ sub \$288 + 32, %rsp
+.cfi_adjust_cfa_offset 288 + 32
+.cfi_offset rbp, -16
+.cfi_offset rbx, -24
+.cfi_offset r12, -32
+.cfi_offset r13, -40
+.cfi_offset r14, -48
+.cfi_offset r15, -56
+.cfi_offset $keyp, -64
+ lea 32(%rsp), %rbp
+ and \$-32, %rbp
+ mov %rdx, 8+$len_store
+ mov %r8, 0+$len_store
+ mov %rdx, $inl\n"; $code.="
+ mov OPENSSL_ia32cap_P+8(%rip), %eax
+ and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
+ xor \$`(1<<5) + (1<<8)`, %eax
+ jz chacha20_poly1305_seal_avx2\n" if ($avx>1);
+$code.="
+ cmp \$128, $inl
+ jbe seal_sse_128
+ # For longer buffers, prepare the poly key + some stream
+ movdqa .chacha20_consts(%rip), $A0
+ movdqu 0*16($keyp), $B0
+ movdqu 1*16($keyp), $C0
+ movdqu 2*16($keyp), $D0
+ movdqa $A0, $A1
+ movdqa $A0, $A2
+ movdqa $A0, $A3
+ movdqa $B0, $B1
+ movdqa $B0, $B2
+ movdqa $B0, $B3
+ movdqa $C0, $C1
+ movdqa $C0, $C2
+ movdqa $C0, $C3
+ movdqa $D0, $D3
+ paddd .sse_inc(%rip), $D0
+ movdqa $D0, $D2
+ paddd .sse_inc(%rip), $D0
+ movdqa $D0, $D1
+ paddd .sse_inc(%rip), $D0
+ # Store on stack
+ movdqa $B0, $state1_store
+ movdqa $C0, $state2_store
+ movdqa $D0, $ctr0_store
+ movdqa $D1, $ctr1_store
+ movdqa $D2, $ctr2_store
+ movdqa $D3, $ctr3_store
+ mov \$10, $acc0
+1: \n";
+ foreach $l (@loop_body) {$code.=$l."\n";}
+ @loop_body = split /\n/, $chacha_body; $code.="
+ dec $acc0
+ jnz 1b\n";
+ &finalize_state(4); $code.="
+ # Clamp and store the key
+ pand .clamp(%rip), $A3
+ movdqa $A3, $r_store
+ movdqa $B3, $s_store
+ # Hash
+ mov %r8, $itr2
+ call poly_hash_ad_internal\n";
+ &xor_stream($A2,$B2,$C2,$D2,"0*16");
+ &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.="
+ cmp \$12*16, $inl
+ ja 1f
+ mov \$8*16, $itr1
+ sub \$8*16, $inl
+ lea 8*16($inp), $inp
+ jmp seal_sse_128_seal_hash
+1: \n";
+ &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.="
+ mov \$12*16, $itr1
+ sub \$12*16, $inl
+ lea 12*16($inp), $inp
+ mov \$2, $itr1
+ mov \$8, $itr2
+ cmp \$4*16, $inl
+ jbe seal_sse_tail_64
+ cmp \$8*16, $inl
+ jbe seal_sse_tail_128
+ cmp \$12*16, $inl
+ jbe seal_sse_tail_192
+
+1: \n";
+ # The main loop
+ &prep_state(4); $code.="
+2: \n";
+ &emit_body(20);
+ &poly_add("0($oup)");
+ &emit_body(20);
+ &poly_stage1();
+ &emit_body(20);
+ &poly_stage2();
+ &emit_body(20);
+ &poly_stage3();
+ &emit_body(20);
+ &poly_reduce_stage();
+ foreach $l (@loop_body) {$code.=$l."\n";}
+ @loop_body = split /\n/, $chacha_body; $code.="
+ lea 16($oup), $oup
+ dec $itr2
+ jge 2b\n";
+ &poly_add("0*8($oup)");
+ &poly_mul(); $code.="
+ lea 16($oup), $oup
+ dec $itr1
+ jg 2b\n";
+
+ &finalize_state(4);$code.="
+ movdqa $D2, $tmp_store\n";
+ &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.="
+ movdqa $tmp_store, $D2\n";
+ &xor_stream($A2,$B2,$C2,$D2, 4*16);
+ &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.="
+ cmp \$16*16, $inl
+ ja 3f
+
+ mov \$12*16, $itr1
+ sub \$12*16, $inl
+ lea 12*16($inp), $inp
+ jmp seal_sse_128_seal_hash
+3: \n";
+ &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.="
+ lea 16*16($inp), $inp
+ sub \$16*16, $inl
+ mov \$6, $itr1
+ mov \$4, $itr2
+ cmp \$12*16, $inl
+ jg 1b
+ mov $inl, $itr1
+ test $inl, $inl
+ je seal_sse_128_seal_hash
+ mov \$6, $itr1
+ cmp \$4*16, $inl
+ jg 3f
+###############################################################################
+seal_sse_tail_64:\n";
+ &prep_state(1); $code.="
+1: \n";
+ &poly_add("0($oup)");
+ &poly_mul(); $code.="
+ lea 16($oup), $oup
+2: \n";
+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
+ &poly_add("0($oup)");
+ &poly_mul(); $code.="
+ lea 16($oup), $oup
+ dec $itr1
+ jg 1b
+ dec $itr2
+ jge 2b\n";
+ &finalize_state(1); $code.="
+ jmp seal_sse_128_seal
+3:
+ cmp \$8*16, $inl
+ jg 3f
+###############################################################################
+seal_sse_tail_128:\n";
+ &prep_state(2); $code.="
+1: \n";
+ &poly_add("0($oup)");
+ &poly_mul(); $code.="
+ lea 16($oup), $oup
+2: \n";
+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
+ &poly_add("0($oup)");
+ &poly_mul();
+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.="
+ lea 16($oup), $oup
+ dec $itr1
+ jg 1b
+ dec $itr2
+ jge 2b\n";
+ &finalize_state(2);
+ &xor_stream($A1,$B1,$C1,$D1,0*16); $code.="
+ mov \$4*16, $itr1
+ sub \$4*16, $inl
+ lea 4*16($inp), $inp
+ jmp seal_sse_128_seal_hash
+3:
+###############################################################################
+seal_sse_tail_192:\n";
+ &prep_state(3); $code.="
+1: \n";
+ &poly_add("0($oup)");
+ &poly_mul(); $code.="
+ lea 16($oup), $oup
+2: \n";
+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
+ &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
+ &poly_add("0($oup)");
+ &poly_mul();
+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
+ &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
+ lea 16($oup), $oup
+ dec $itr1
+ jg 1b
+ dec $itr2
+ jge 2b\n";
+ &finalize_state(3);
+ &xor_stream($A2,$B2,$C2,$D2,0*16);
+ &xor_stream($A1,$B1,$C1,$D1,4*16); $code.="
+ mov \$8*16, $itr1
+ sub \$8*16, $inl
+ lea 8*16($inp), $inp
+###############################################################################
+seal_sse_128_seal_hash:
+ cmp \$16, $itr1
+ jb seal_sse_128_seal\n";
+ &poly_add("0($oup)");
+ &poly_mul(); $code.="
+ sub \$16, $itr1
+ lea 16($oup), $oup
+ jmp seal_sse_128_seal_hash
+
+seal_sse_128_seal:
+ cmp \$16, $inl
+ jb seal_sse_tail_16
+ sub \$16, $inl
+ # Load for decryption
+ movdqu 0*16($inp), $T0
+ pxor $T0, $A0
+ movdqu $A0, 0*16($oup)
+ # Then hash
+ add 0*8($oup), $acc0
+ adc 1*8($oup), $acc1
+ adc \$1, $acc2
+ lea 1*16($inp), $inp
+ lea 1*16($oup), $oup\n";
+ &poly_mul(); $code.="
+ # Shift the stream left
+ movdqa $B0, $A0
+ movdqa $C0, $B0
+ movdqa $D0, $C0
+ movdqa $A1, $D0
+ movdqa $B1, $A1
+ movdqa $C1, $B1
+ movdqa $D1, $C1
+ jmp seal_sse_128_seal
+
+seal_sse_tail_16:
+ test $inl, $inl
+ jz seal_sse_finalize
+ # We can only load the PT one byte at a time to avoid buffer overread
+ mov $inl, $itr2
+ shl \$4, $itr2
+ lea .and_masks(%rip), $t0
+ mov $inl, $itr1
+ lea -1($inp, $inl), $inp
+ pxor $T3, $T3
+1:
+ pslldq \$1, $T3
+ pinsrb \$0, ($inp), $T3
+ lea -1($inp), $inp
+ dec $itr1
+ jne 1b
+
+ # XOR the keystream with the plaintext.
+ pxor $A0, $T3
+
+ # Write ciphertext out, byte-by-byte.
+ movq $inl, $itr1
+ movdqu $T3, $A0
+2:
+ pextrb \$0, $A0, ($oup)
+ psrldq \$1, $A0
+ add \$1, $oup
+ sub \$1, $itr1
+ jnz 2b
+
+ pand -16($t0, $itr2), $T3
+ movq $T3, $t0
+ pextrq \$1, $T3, $t1
+ add $t0, $acc0
+ adc $t1, $acc1
+ adc \$1, $acc2\n";
+ &poly_mul(); $code.="
+seal_sse_finalize:\n";
+ &poly_add($len_store);
+ &poly_mul(); $code.="
+ # Final reduce
+ mov $acc0, $t0
+ mov $acc1, $t1
+ mov $acc2, $t2
+ sub \$-5, $acc0
+ sbb \$-1, $acc1
+ sbb \$3, $acc2
+ cmovc $t0, $acc0
+ cmovc $t1, $acc1
+ cmovc $t2, $acc2
+ # Add in s part of the key
+ add 0+$s_store, $acc0
+ adc 8+$s_store, $acc1
+
+ add \$288 + 32, %rsp
+.cfi_adjust_cfa_offset -(288 + 32)
+ pop $keyp
+.cfi_adjust_cfa_offset -8
+ mov $acc0, 0*8($keyp)
+ mov $acc1, 1*8($keyp)
+
+ pop %r15
+.cfi_adjust_cfa_offset -8
+ pop %r14
+.cfi_adjust_cfa_offset -8
+ pop %r13
+.cfi_adjust_cfa_offset -8
+ pop %r12
+.cfi_adjust_cfa_offset -8
+ pop %rbx
+.cfi_adjust_cfa_offset -8
+ pop %rbp
+.cfi_adjust_cfa_offset -8
+ ret
+.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
+################################################################################
+seal_sse_128:
+ movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
+ movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
+ movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
+ movdqu 2*16($keyp), $D2
+ movdqa $D2, $D0\npaddd .sse_inc(%rip), $D0
+ movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
+ movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3
+ mov \$10, $acc0
+1:\n";
+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
+ &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
+ &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
+ &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
+ &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
+ dec $acc0
+ jnz 1b
+ paddd .chacha20_consts(%rip), $A0
+ paddd .chacha20_consts(%rip), $A1
+ paddd .chacha20_consts(%rip), $A2
+ paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
+ paddd $T2, $C0\npaddd $T2, $C1
+ paddd $T3, $D0
+ paddd .sse_inc(%rip), $T3
+ paddd $T3, $D1
+ # Clamp and store the key
+ pand .clamp(%rip), $A2
+ movdqa $A2, $r_store
+ movdqa $B2, $s_store
+ # Hash
+ mov %r8, $itr2
+ call poly_hash_ad_internal
+ jmp seal_sse_128_seal
+.size chacha20_poly1305_seal, .-chacha20_poly1305_seal\n";
+}
+
+# There should have been a cfi_endproc at the end of that function, but the two
+# following blocks of code are jumped to without a stack frame and the CFI
+# context which they are used in happens to match the CFI context at the end of
+# the previous function. So the CFI table is just extended to the end of them.
+
+if ($avx>1) {
+
+($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15));
+my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15));
+($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
+$state1_store="2*32(%rbp)";
+$state2_store="3*32(%rbp)";
+$tmp_store="4*32(%rbp)";
+$ctr0_store="5*32(%rbp)";
+$ctr1_store="6*32(%rbp)";
+$ctr2_store="7*32(%rbp)";
+$ctr3_store="8*32(%rbp)";
+
+sub chacha_qr_avx2 {
+my ($a,$b,$c,$d,$t,$dir)=@_;
+$code.=<<___ if ($dir =~ /store/);
+ vmovdqa $t, $tmp_store
+___
+$code.=<<___;
+ vpaddd $b, $a, $a
+ vpxor $a, $d, $d
+ vpshufb .rol16(%rip), $d, $d
+ vpaddd $d, $c, $c
+ vpxor $c, $b, $b
+ vpsrld \$20, $b, $t
+ vpslld \$12, $b, $b
+ vpxor $t, $b, $b
+ vpaddd $b, $a, $a
+ vpxor $a, $d, $d
+ vpshufb .rol8(%rip), $d, $d
+ vpaddd $d, $c, $c
+ vpxor $c, $b, $b
+ vpslld \$7, $b, $t
+ vpsrld \$25, $b, $b
+ vpxor $t, $b, $b
+___
+$code.=<<___ if ($dir =~ /left/);
+ vpalignr \$12, $d, $d, $d
+ vpalignr \$8, $c, $c, $c
+ vpalignr \$4, $b, $b, $b
+___
+$code.=<<___ if ($dir =~ /right/);
+ vpalignr \$4, $d, $d, $d
+ vpalignr \$8, $c, $c, $c
+ vpalignr \$12, $b, $b, $b
+___
+$code.=<<___ if ($dir =~ /load/);
+ vmovdqa $tmp_store, $t
+___
+}
+
+sub prep_state_avx2 {
+my ($n)=@_;
+$code.=<<___;
+ vmovdqa .chacha20_consts(%rip), $A0
+ vmovdqa $state1_store, $B0
+ vmovdqa $state2_store, $C0
+___
+$code.=<<___ if ($n ge 2);
+ vmovdqa $A0, $A1
+ vmovdqa $B0, $B1
+ vmovdqa $C0, $C1
+___
+$code.=<<___ if ($n ge 3);
+ vmovdqa $A0, $A2
+ vmovdqa $B0, $B2
+ vmovdqa $C0, $C2
+___
+$code.=<<___ if ($n ge 4);
+ vmovdqa $A0, $A3
+ vmovdqa $B0, $B3
+ vmovdqa $C0, $C3
+___
+$code.=<<___ if ($n eq 1);
+ vmovdqa .avx2_inc(%rip), $D0
+ vpaddd $ctr0_store, $D0, $D0
+ vmovdqa $D0, $ctr0_store
+___
+$code.=<<___ if ($n eq 2);
+ vmovdqa .avx2_inc(%rip), $D0
+ vpaddd $ctr0_store, $D0, $D1
+ vpaddd $D1, $D0, $D0
+ vmovdqa $D0, $ctr0_store
+ vmovdqa $D1, $ctr1_store
+___
+$code.=<<___ if ($n eq 3);
+ vmovdqa .avx2_inc(%rip), $D0
+ vpaddd $ctr0_store, $D0, $D2
+ vpaddd $D2, $D0, $D1
+ vpaddd $D1, $D0, $D0
+ vmovdqa $D0, $ctr0_store
+ vmovdqa $D1, $ctr1_store
+ vmovdqa $D2, $ctr2_store
+___
+$code.=<<___ if ($n eq 4);
+ vmovdqa .avx2_inc(%rip), $D0
+ vpaddd $ctr0_store, $D0, $D3
+ vpaddd $D3, $D0, $D2
+ vpaddd $D2, $D0, $D1
+ vpaddd $D1, $D0, $D0
+ vmovdqa $D3, $ctr3_store
+ vmovdqa $D2, $ctr2_store
+ vmovdqa $D1, $ctr1_store
+ vmovdqa $D0, $ctr0_store
+___
+}
+
+sub finalize_state_avx2 {
+my ($n)=@_;
+$code.=<<___ if ($n eq 4);
+ vpaddd .chacha20_consts(%rip), $A3, $A3
+ vpaddd $state1_store, $B3, $B3
+ vpaddd $state2_store, $C3, $C3
+ vpaddd $ctr3_store, $D3, $D3
+___
+$code.=<<___ if ($n ge 3);
+ vpaddd .chacha20_consts(%rip), $A2, $A2
+ vpaddd $state1_store, $B2, $B2
+ vpaddd $state2_store, $C2, $C2
+ vpaddd $ctr2_store, $D2, $D2
+___
+$code.=<<___ if ($n ge 2);
+ vpaddd .chacha20_consts(%rip), $A1, $A1
+ vpaddd $state1_store, $B1, $B1
+ vpaddd $state2_store, $C1, $C1
+ vpaddd $ctr1_store, $D1, $D1
+___
+$code.=<<___;
+ vpaddd .chacha20_consts(%rip), $A0, $A0
+ vpaddd $state1_store, $B0, $B0
+ vpaddd $state2_store, $C0, $C0
+ vpaddd $ctr0_store, $D0, $D0
+___
+}
+
+sub xor_stream_avx2 {
+my ($A, $B, $C, $D, $offset, $hlp)=@_;
+$code.=<<___;
+ vperm2i128 \$0x02, $A, $B, $hlp
+ vperm2i128 \$0x13, $A, $B, $B
+ vperm2i128 \$0x02, $C, $D, $A
+ vperm2i128 \$0x13, $C, $D, $C
+ vpxor 0*32+$offset($inp), $hlp, $hlp
+ vpxor 1*32+$offset($inp), $A, $A
+ vpxor 2*32+$offset($inp), $B, $B
+ vpxor 3*32+$offset($inp), $C, $C
+ vmovdqu $hlp, 0*32+$offset($oup)
+ vmovdqu $A, 1*32+$offset($oup)
+ vmovdqu $B, 2*32+$offset($oup)
+ vmovdqu $C, 3*32+$offset($oup)
+___
+}
+
+sub finish_stream_avx2 {
+my ($A, $B, $C, $D, $hlp)=@_;
+$code.=<<___;
+ vperm2i128 \$0x13, $A, $B, $hlp
+ vperm2i128 \$0x02, $A, $B, $A
+ vperm2i128 \$0x02, $C, $D, $B
+ vperm2i128 \$0x13, $C, $D, $D
+ vmovdqa $hlp, $C
+___
+}
+
+sub poly_stage1_mulx {
+$code.=<<___;
+ mov 0+$r_store, %rdx
+ mov %rdx, $t2
+ mulx $acc0, $t0, $t1
+ mulx $acc1, %rax, %rdx
+ imul $acc2, $t2
+ add %rax, $t1
+ adc %rdx, $t2
+___
+}
+
+sub poly_stage2_mulx {
+$code.=<<___;
+ mov 8+$r_store, %rdx
+ mulx $acc0, $acc0, %rax
+ add $acc0, $t1
+ mulx $acc1, $acc1, $t3
+ adc $acc1, $t2
+ adc \$0, $t3
+ imul $acc2, %rdx
+___
+}
+
+sub poly_stage3_mulx {
+$code.=<<___;
+ add %rax, $t2
+ adc %rdx, $t3
+___
+}
+
+sub poly_mul_mulx {
+ &poly_stage1_mulx();
+ &poly_stage2_mulx();
+ &poly_stage3_mulx();
+ &poly_reduce_stage();
+}
+
+sub gen_chacha_round_avx2 {
+my ($rot1, $rot2, $shift)=@_;
+my $round="";
+$round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 eq 20);
+$round=$round ."vmovdqa $rot2, $C0
+ vpaddd $B3, $A3, $A3
+ vpaddd $B2, $A2, $A2
+ vpaddd $B1, $A1, $A1
+ vpaddd $B0, $A0, $A0
+ vpxor $A3, $D3, $D3
+ vpxor $A2, $D2, $D2
+ vpxor $A1, $D1, $D1
+ vpxor $A0, $D0, $D0
+ vpshufb $C0, $D3, $D3
+ vpshufb $C0, $D2, $D2
+ vpshufb $C0, $D1, $D1
+ vpshufb $C0, $D0, $D0
+ vmovdqa $tmp_store, $C0
+ vpaddd $D3, $C3, $C3
+ vpaddd $D2, $C2, $C2
+ vpaddd $D1, $C1, $C1
+ vpaddd $D0, $C0, $C0
+ vpxor $C3, $B3, $B3
+ vpxor $C2, $B2, $B2
+ vpxor $C1, $B1, $B1
+ vpxor $C0, $B0, $B0
+ vmovdqa $C0, $tmp_store
+ vpsrld \$$rot1, $B3, $C0
+ vpslld \$32-$rot1, $B3, $B3
+ vpxor $C0, $B3, $B3
+ vpsrld \$$rot1, $B2, $C0
+ vpslld \$32-$rot1, $B2, $B2
+ vpxor $C0, $B2, $B2
+ vpsrld \$$rot1, $B1, $C0
+ vpslld \$32-$rot1, $B1, $B1
+ vpxor $C0, $B1, $B1
+ vpsrld \$$rot1, $B0, $C0
+ vpslld \$32-$rot1, $B0, $B0
+ vpxor $C0, $B0, $B0\n";
+($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
+($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
+$round=$round ."vmovdqa $tmp_store, $C0
+ vpalignr \$$s1, $B3, $B3, $B3
+ vpalignr \$$s2, $C3, $C3, $C3
+ vpalignr \$$s3, $D3, $D3, $D3
+ vpalignr \$$s1, $B2, $B2, $B2
+ vpalignr \$$s2, $C2, $C2, $C2
+ vpalignr \$$s3, $D2, $D2, $D2
+ vpalignr \$$s1, $B1, $B1, $B1
+ vpalignr \$$s2, $C1, $C1, $C1
+ vpalignr \$$s3, $D1, $D1, $D1
+ vpalignr \$$s1, $B0, $B0, $B0
+ vpalignr \$$s2, $C0, $C0, $C0
+ vpalignr \$$s3, $D0, $D0, $D0\n"
+if (($shift =~ /left/) || ($shift =~ /right/));
+return $round;
+};
+
+$chacha_body = &gen_chacha_round_avx2(20, ".rol16(%rip)") .
+ &gen_chacha_round_avx2(25, ".rol8(%rip)", "left") .
+ &gen_chacha_round_avx2(20, ".rol16(%rip)") .
+ &gen_chacha_round_avx2(25, ".rol8(%rip)", "right");
+
+@loop_body = split /\n/, $chacha_body;
+
+$code.="
+###############################################################################
+.type chacha20_poly1305_open_avx2,\@function,2
+.align 64
+chacha20_poly1305_open_avx2:
+ vzeroupper
+ vmovdqa .chacha20_consts(%rip), $A0
+ vbroadcasti128 0*16($keyp), $B0
+ vbroadcasti128 1*16($keyp), $C0
+ vbroadcasti128 2*16($keyp), $D0
+ vpaddd .avx2_init(%rip), $D0, $D0
+ cmp \$6*32, $inl
+ jbe open_avx2_192
+ cmp \$10*32, $inl
+ jbe open_avx2_320
+
+ vmovdqa $B0, $state1_store
+ vmovdqa $C0, $state2_store
+ vmovdqa $D0, $ctr0_store
+ mov \$10, $acc0
+1: \n";
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
+ dec $acc0
+ jne 1b
+ vpaddd .chacha20_consts(%rip), $A0, $A0
+ vpaddd $state1_store, $B0, $B0
+ vpaddd $state2_store, $C0, $C0
+ vpaddd $ctr0_store, $D0, $D0
+
+ vperm2i128 \$0x02, $A0, $B0, $T0
+ # Clamp and store key
+ vpand .clamp(%rip), $T0, $T0
+ vmovdqa $T0, $r_store
+ # Stream for the first 64 bytes
+ vperm2i128 \$0x13, $A0, $B0, $A0
+ vperm2i128 \$0x13, $C0, $D0, $B0
+ # Hash AD + first 64 bytes
+ mov %r8, $itr2
+ call poly_hash_ad_internal
+ xor $itr1, $itr1
+ # Hash first 64 bytes
+1: \n";
+ &poly_add("0($inp, $itr1)");
+ &poly_mul(); $code.="
+ add \$16, $itr1
+ cmp \$2*32, $itr1
+ jne 1b
+ # Decrypt first 64 bytes
+ vpxor 0*32($inp), $A0, $A0
+ vpxor 1*32($inp), $B0, $B0
+ vmovdqu $A0, 0*32($oup)
+ vmovdqu $B0, 1*32($oup)
+ lea 2*32($inp), $inp
+ lea 2*32($oup), $oup
+ sub \$2*32, $inl
+1:
+ # Hash and decrypt 512 bytes each iteration
+ cmp \$16*32, $inl
+ jb 3f\n";
+ &prep_state_avx2(4); $code.="
+ xor $itr1, $itr1
+2: \n";
+ &poly_add("0*8($inp, $itr1)");
+ &emit_body(10);
+ &poly_stage1_mulx();
+ &emit_body(9);
+ &poly_stage2_mulx();
+ &emit_body(12);
+ &poly_stage3_mulx();
+ &emit_body(10);
+ &poly_reduce_stage();
+ &emit_body(9);
+ &poly_add("2*8($inp, $itr1)");
+ &emit_body(8);
+ &poly_stage1_mulx();
+ &emit_body(18);
+ &poly_stage2_mulx();
+ &emit_body(18);
+ &poly_stage3_mulx();
+ &emit_body(9);
+ &poly_reduce_stage();
+ &emit_body(8);
+ &poly_add("4*8($inp, $itr1)"); $code.="
+ lea 6*8($itr1), $itr1\n";
+ &emit_body(18);
+ &poly_stage1_mulx();
+ &emit_body(8);
+ &poly_stage2_mulx();
+ &emit_body(8);
+ &poly_stage3_mulx();
+ &emit_body(18);
+ &poly_reduce_stage();
+ foreach $l (@loop_body) {$code.=$l."\n";}
+ @loop_body = split /\n/, $chacha_body; $code.="
+ cmp \$10*6*8, $itr1
+ jne 2b\n";
+ &finalize_state_avx2(4); $code.="
+ vmovdqa $A0, $tmp_store\n";
+ &poly_add("10*6*8($inp)");
+ &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
+ vmovdqa $tmp_store, $A0\n";
+ &poly_mul();
+ &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
+ &poly_add("10*6*8+2*8($inp)");
+ &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
+ &poly_mul();
+ &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
+ lea 16*32($inp), $inp
+ lea 16*32($oup), $oup
+ sub \$16*32, $inl
+ jmp 1b
+3:
+ test $inl, $inl
+ vzeroupper
+ je open_sse_finalize
+3:
+ cmp \$4*32, $inl
+ ja 3f\n";
+###############################################################################
+ # 1-128 bytes left
+ &prep_state_avx2(1); $code.="
+ xor $itr2, $itr2
+ mov $inl, $itr1
+ and \$-16, $itr1
+ test $itr1, $itr1
+ je 2f
+1: \n";
+ &poly_add("0*8($inp, $itr2)");
+ &poly_mul(); $code.="
+2:
+ add \$16, $itr2\n";
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
+ cmp $itr1, $itr2
+ jb 1b
+ cmp \$160, $itr2
+ jne 2b\n";
+ &finalize_state_avx2(1);
+ &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
+ jmp open_avx2_tail_loop
+3:
+ cmp \$8*32, $inl
+ ja 3f\n";
+###############################################################################
+ # 129-256 bytes left
+ &prep_state_avx2(2); $code.="
+ mov $inl, $tmp_store
+ mov $inl, $itr1
+ sub \$4*32, $itr1
+ shr \$4, $itr1
+ mov \$10, $itr2
+ cmp \$10, $itr1
+ cmovg $itr2, $itr1
+ mov $inp, $inl
+ xor $itr2, $itr2
+1: \n";
+ &poly_add("0*8($inl)");
+ &poly_mul_mulx(); $code.="
+ lea 16($inl), $inl
+2: \n";
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.="
+ inc $itr2\n";
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
+ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
+ cmp $itr1, $itr2
+ jb 1b
+ cmp \$10, $itr2
+ jne 2b
+ mov $inl, $itr2
+ sub $inp, $inl
+ mov $inl, $itr1
+ mov $tmp_store, $inl
+1:
+ add \$16, $itr1
+ cmp $inl, $itr1
+ jg 1f\n";
+ &poly_add("0*8($itr2)");
+ &poly_mul_mulx(); $code.="
+ lea 16($itr2), $itr2
+ jmp 1b
+1: \n";
+ &finalize_state_avx2(2);
+ &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0);
+ &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
+ lea 4*32($inp), $inp
+ lea 4*32($oup), $oup
+ sub \$4*32, $inl
+ jmp open_avx2_tail_loop
+3:
+ cmp \$12*32, $inl
+ ja 3f\n";
+###############################################################################
+ # 257-383 bytes left
+ &prep_state_avx2(3); $code.="
+ mov $inl, $tmp_store
+ mov $inl, $itr1
+ sub \$8*32, $itr1
+ shr \$4, $itr1
+ add \$6, $itr1
+ mov \$10, $itr2
+ cmp \$10, $itr1
+ cmovg $itr2, $itr1
+ mov $inp, $inl
+ xor $itr2, $itr2
+1: \n";
+ &poly_add("0*8($inl)");
+ &poly_mul_mulx(); $code.="
+ lea 16($inl), $inl
+2: \n";
+ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+ &poly_add("0*8($inl)");
+ &poly_mul(); $code.="
+ lea 16($inl), $inl
+ inc $itr2\n";
+ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right");
+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
+ cmp $itr1, $itr2
+ jb 1b
+ cmp \$10, $itr2
+ jne 2b
+ mov $inl, $itr2
+ sub $inp, $inl
+ mov $inl, $itr1
+ mov $tmp_store, $inl
+1:
+ add \$16, $itr1
+ cmp $inl, $itr1
+ jg 1f\n";
+ &poly_add("0*8($itr2)");
+ &poly_mul_mulx(); $code.="
+ lea 16($itr2), $itr2
+ jmp 1b
+1: \n";
+ &finalize_state_avx2(3);
+ &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0);
+ &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0);
+ &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
+ lea 8*32($inp), $inp
+ lea 8*32($oup), $oup
+ sub \$8*32, $inl
+ jmp open_avx2_tail_loop
+3: \n";
+###############################################################################
+ # 384-512 bytes left
+ &prep_state_avx2(4); $code.="
+ xor $itr1, $itr1
+ mov $inp, $itr2
+1: \n";
+ &poly_add("0*8($itr2)");
+ &poly_mul(); $code.="
+ lea 2*8($itr2), $itr2
+2: \n";
+ &emit_body(37);
+ &poly_add("0*8($itr2)");
+ &poly_mul_mulx();
+ &emit_body(48);
+ &poly_add("2*8($itr2)");
+ &poly_mul_mulx(); $code.="
+ lea 4*8($itr2), $itr2\n";
+ foreach $l (@loop_body) {$code.=$l."\n";}
+ @loop_body = split /\n/, $chacha_body; $code.="
+ inc $itr1
+ cmp \$4, $itr1
+ jl 1b
+ cmp \$10, $itr1
+ jne 2b
+ mov $inl, $itr1
+ sub \$12*32, $itr1
+ and \$-16, $itr1
+1:
+ test $itr1, $itr1
+ je 1f\n";
+ &poly_add("0*8($itr2)");
+ &poly_mul_mulx(); $code.="
+ lea 2*8($itr2), $itr2
+ sub \$2*8, $itr1
+ jmp 1b
+1: \n";
+ &finalize_state_avx2(4); $code.="
+ vmovdqa $A0, $tmp_store\n";
+ &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
+ vmovdqa $tmp_store, $A0\n";
+ &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
+ &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
+ &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.="
+ lea 12*32($inp), $inp
+ lea 12*32($oup), $oup
+ sub \$12*32, $inl
+open_avx2_tail_loop:
+ cmp \$32, $inl
+ jb open_avx2_tail
+ sub \$32, $inl
+ vpxor ($inp), $A0, $A0
+ vmovdqu $A0, ($oup)
+ lea 1*32($inp), $inp
+ lea 1*32($oup), $oup
+ vmovdqa $B0, $A0
+ vmovdqa $C0, $B0
+ vmovdqa $D0, $C0
+ jmp open_avx2_tail_loop
+open_avx2_tail:
+ cmp \$16, $inl
+ vmovdqa $A0x, $A1x
+ jb 1f
+ sub \$16, $inl
+ #load for decryption
+ vpxor ($inp), $A0x, $A1x
+ vmovdqu $A1x, ($oup)
+ lea 1*16($inp), $inp
+ lea 1*16($oup), $oup
+ vperm2i128 \$0x11, $A0, $A0, $A0
+ vmovdqa $A0x, $A1x
+1:
+ vzeroupper
+ jmp open_sse_tail_16
+###############################################################################
+open_avx2_192:
+ vmovdqa $A0, $A1
+ vmovdqa $A0, $A2
+ vmovdqa $B0, $B1
+ vmovdqa $B0, $B2
+ vmovdqa $C0, $C1
+ vmovdqa $C0, $C2
+ vpaddd .avx2_inc(%rip), $D0, $D1
+ vmovdqa $D0, $T2
+ vmovdqa $D1, $T3
+ mov \$10, $acc0
+1: \n";
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
+ dec $acc0
+ jne 1b
+ vpaddd $A2, $A0, $A0
+ vpaddd $A2, $A1, $A1
+ vpaddd $B2, $B0, $B0
+ vpaddd $B2, $B1, $B1
+ vpaddd $C2, $C0, $C0
+ vpaddd $C2, $C1, $C1
+ vpaddd $T2, $D0, $D0
+ vpaddd $T3, $D1, $D1
+ vperm2i128 \$0x02, $A0, $B0, $T0
+ # Clamp and store the key
+ vpand .clamp(%rip), $T0, $T0
+ vmovdqa $T0, $r_store
+ # Stream for up to 192 bytes
+ vperm2i128 \$0x13, $A0, $B0, $A0
+ vperm2i128 \$0x13, $C0, $D0, $B0
+ vperm2i128 \$0x02, $A1, $B1, $C0
+ vperm2i128 \$0x02, $C1, $D1, $D0
+ vperm2i128 \$0x13, $A1, $B1, $A1
+ vperm2i128 \$0x13, $C1, $D1, $B1
+open_avx2_short:
+ mov %r8, $itr2
+ call poly_hash_ad_internal
+open_avx2_hash_and_xor_loop:
+ cmp \$32, $inl
+ jb open_avx2_short_tail_32
+ sub \$32, $inl\n";
+ # Load + hash
+ &poly_add("0*8($inp)");
+ &poly_mul();
+ &poly_add("2*8($inp)");
+ &poly_mul(); $code.="
+ # Load + decrypt
+ vpxor ($inp), $A0, $A0
+ vmovdqu $A0, ($oup)
+ lea 1*32($inp), $inp
+ lea 1*32($oup), $oup
+ # Shift stream
+ vmovdqa $B0, $A0
+ vmovdqa $C0, $B0
+ vmovdqa $D0, $C0
+ vmovdqa $A1, $D0
+ vmovdqa $B1, $A1
+ vmovdqa $C1, $B1
+ vmovdqa $D1, $C1
+ vmovdqa $A2, $D1
+ vmovdqa $B2, $A2
+ jmp open_avx2_hash_and_xor_loop
+open_avx2_short_tail_32:
+ cmp \$16, $inl
+ vmovdqa $A0x, $A1x
+ jb 1f
+ sub \$16, $inl\n";
+ &poly_add("0*8($inp)");
+ &poly_mul(); $code.="
+ vpxor ($inp), $A0x, $A3x
+ vmovdqu $A3x, ($oup)
+ lea 1*16($inp), $inp
+ lea 1*16($oup), $oup
+ vextracti128 \$1, $A0, $A1x
+1:
+ vzeroupper
+ jmp open_sse_tail_16
+###############################################################################
+open_avx2_320:
+ vmovdqa $A0, $A1
+ vmovdqa $A0, $A2
+ vmovdqa $B0, $B1
+ vmovdqa $B0, $B2
+ vmovdqa $C0, $C1
+ vmovdqa $C0, $C2
+ vpaddd .avx2_inc(%rip), $D0, $D1
+ vpaddd .avx2_inc(%rip), $D1, $D2
+ vmovdqa $B0, $T1
+ vmovdqa $C0, $T2
+ vmovdqa $D0, $ctr0_store
+ vmovdqa $D1, $ctr1_store
+ vmovdqa $D2, $ctr2_store
+ mov \$10, $acc0
+1: \n";
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
+ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
+ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
+ dec $acc0
+ jne 1b
+ vpaddd .chacha20_consts(%rip), $A0, $A0
+ vpaddd .chacha20_consts(%rip), $A1, $A1
+ vpaddd .chacha20_consts(%rip), $A2, $A2
+ vpaddd $T1, $B0, $B0
+ vpaddd $T1, $B1, $B1
+ vpaddd $T1, $B2, $B2
+ vpaddd $T2, $C0, $C0
+ vpaddd $T2, $C1, $C1
+ vpaddd $T2, $C2, $C2
+ vpaddd $ctr0_store, $D0, $D0
+ vpaddd $ctr1_store, $D1, $D1
+ vpaddd $ctr2_store, $D2, $D2
+ vperm2i128 \$0x02, $A0, $B0, $T0
+ # Clamp and store the key
+ vpand .clamp(%rip), $T0, $T0
+ vmovdqa $T0, $r_store
+ # Stream for up to 320 bytes
+ vperm2i128 \$0x13, $A0, $B0, $A0
+ vperm2i128 \$0x13, $C0, $D0, $B0
+ vperm2i128 \$0x02, $A1, $B1, $C0
+ vperm2i128 \$0x02, $C1, $D1, $D0
+ vperm2i128 \$0x13, $A1, $B1, $A1
+ vperm2i128 \$0x13, $C1, $D1, $B1
+ vperm2i128 \$0x02, $A2, $B2, $C1
+ vperm2i128 \$0x02, $C2, $D2, $D1
+ vperm2i128 \$0x13, $A2, $B2, $A2
+ vperm2i128 \$0x13, $C2, $D2, $B2
+ jmp open_avx2_short
+.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
+###############################################################################
+###############################################################################
+.type chacha20_poly1305_seal_avx2,\@function,2
+.align 64
+chacha20_poly1305_seal_avx2:
+ vzeroupper
+ vmovdqa .chacha20_consts(%rip), $A0
+ vbroadcasti128 0*16($keyp), $B0
+ vbroadcasti128 1*16($keyp), $C0
+ vbroadcasti128 2*16($keyp), $D0
+ vpaddd .avx2_init(%rip), $D0, $D0
+ cmp \$6*32, $inl
+ jbe seal_avx2_192
+ cmp \$10*32, $inl
+ jbe seal_avx2_320
+ vmovdqa $A0, $A1
+ vmovdqa $A0, $A2
+ vmovdqa $A0, $A3
+ vmovdqa $B0, $B1
+ vmovdqa $B0, $B2
+ vmovdqa $B0, $B3
+ vmovdqa $B0, $state1_store
+ vmovdqa $C0, $C1
+ vmovdqa $C0, $C2
+ vmovdqa $C0, $C3
+ vmovdqa $C0, $state2_store
+ vmovdqa $D0, $D3
+ vpaddd .avx2_inc(%rip), $D3, $D2
+ vpaddd .avx2_inc(%rip), $D2, $D1
+ vpaddd .avx2_inc(%rip), $D1, $D0
+ vmovdqa $D0, $ctr0_store
+ vmovdqa $D1, $ctr1_store
+ vmovdqa $D2, $ctr2_store
+ vmovdqa $D3, $ctr3_store
+ mov \$10, $acc0
+1: \n";
+ foreach $l (@loop_body) {$code.=$l."\n";}
+ @loop_body = split /\n/, $chacha_body; $code.="
+ dec $acc0
+ jnz 1b\n";
+ &finalize_state_avx2(4); $code.="
+ vperm2i128 \$0x13, $C3, $D3, $C3
+ vperm2i128 \$0x02, $A3, $B3, $D3
+ vperm2i128 \$0x13, $A3, $B3, $A3
+ vpand .clamp(%rip), $D3, $D3
+ vmovdqa $D3, $r_store
+ mov %r8, $itr2
+ call poly_hash_ad_internal
+	# Safely store 320 bytes (otherwise this would be handled with an optimized call)
+ vpxor 0*32($inp), $A3, $A3
+ vpxor 1*32($inp), $C3, $C3
+ vmovdqu $A3, 0*32($oup)
+ vmovdqu $C3, 1*32($oup)\n";
+ &xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3);
+ &xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3);
+ &finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.="
+ lea 10*32($inp), $inp
+ sub \$10*32, $inl
+ mov \$10*32, $itr1
+ cmp \$4*32, $inl
+ jbe seal_avx2_hash
+ vpxor 0*32($inp), $A0, $A0
+ vpxor 1*32($inp), $B0, $B0
+ vpxor 2*32($inp), $C0, $C0
+ vpxor 3*32($inp), $D0, $D0
+ vmovdqu $A0, 10*32($oup)
+ vmovdqu $B0, 11*32($oup)
+ vmovdqu $C0, 12*32($oup)
+ vmovdqu $D0, 13*32($oup)
+ lea 4*32($inp), $inp
+ sub \$4*32, $inl
+ mov \$8, $itr1
+ mov \$2, $itr2
+ cmp \$4*32, $inl
+ jbe seal_avx2_tail_128
+ cmp \$8*32, $inl
+ jbe seal_avx2_tail_256
+ cmp \$12*32, $inl
+ jbe seal_avx2_tail_384
+ cmp \$16*32, $inl
+ jbe seal_avx2_tail_512\n";
+	# We have 448 bytes to hash, but the main loop hashes 512 bytes at a time, so perform some rounds before entering the main loop
+ &prep_state_avx2(4);
+ foreach $l (@loop_body) {$code.=$l."\n";}
+ @loop_body = split /\n/, $chacha_body;
+ &emit_body(41);
+ @loop_body = split /\n/, $chacha_body; $code.="
+ sub \$16, $oup
+ mov \$9, $itr1
+ jmp 4f
+1: \n";
+ &prep_state_avx2(4); $code.="
+ mov \$10, $itr1
+2: \n";
+ &poly_add("0*8($oup)");
+ &emit_body(10);
+ &poly_stage1_mulx();
+ &emit_body(9);
+ &poly_stage2_mulx();
+ &emit_body(12);
+ &poly_stage3_mulx();
+ &emit_body(10);
+ &poly_reduce_stage(); $code.="
+4: \n";
+ &emit_body(9);
+ &poly_add("2*8($oup)");
+ &emit_body(8);
+ &poly_stage1_mulx();
+ &emit_body(18);
+ &poly_stage2_mulx();
+ &emit_body(18);
+ &poly_stage3_mulx();
+ &emit_body(9);
+ &poly_reduce_stage();
+ &emit_body(8);
+ &poly_add("4*8($oup)"); $code.="
+ lea 6*8($oup), $oup\n";
+ &emit_body(18);
+ &poly_stage1_mulx();
+ &emit_body(8);
+ &poly_stage2_mulx();
+ &emit_body(8);
+ &poly_stage3_mulx();
+ &emit_body(18);
+ &poly_reduce_stage();
+ foreach $l (@loop_body) {$code.=$l."\n";}
+ @loop_body = split /\n/, $chacha_body; $code.="
+ dec $itr1
+ jne 2b\n";
+ &finalize_state_avx2(4); $code.="
+ lea 4*8($oup), $oup
+ vmovdqa $A0, $tmp_store\n";
+ &poly_add("-4*8($oup)");
+ &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
+ vmovdqa $tmp_store, $A0\n";
+ &poly_mul();
+ &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
+ &poly_add("-2*8($oup)");
+ &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
+ &poly_mul();
+ &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
+ lea 16*32($inp), $inp
+ sub \$16*32, $inl
+ cmp \$16*32, $inl
+ jg 1b\n";
+ &poly_add("0*8($oup)");
+ &poly_mul();
+ &poly_add("2*8($oup)");
+ &poly_mul(); $code.="
+ lea 4*8($oup), $oup
+ mov \$10, $itr1
+ xor $itr2, $itr2
+ cmp \$4*32, $inl
+ ja 3f
+###############################################################################
+seal_avx2_tail_128:\n";
+ &prep_state_avx2(1); $code.="
+1: \n";
+ &poly_add("0($oup)");
+ &poly_mul(); $code.="
+ lea 2*8($oup), $oup
+2: \n";
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+ &poly_add("0*8($oup)");
+ &poly_mul();
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
+ &poly_add("2*8($oup)");
+ &poly_mul(); $code.="
+ lea 4*8($oup), $oup
+ dec $itr1
+ jg 1b
+ dec $itr2
+ jge 2b\n";
+ &finalize_state_avx2(1);
+ &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
+ jmp seal_avx2_short_loop
+3:
+ cmp \$8*32, $inl
+ ja 3f
+###############################################################################
+seal_avx2_tail_256:\n";
+ &prep_state_avx2(2); $code.="
+1: \n";
+ &poly_add("0($oup)");
+ &poly_mul(); $code.="
+ lea 2*8($oup), $oup
+2: \n";
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
+ &poly_add("0*8($oup)");
+ &poly_mul();
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
+ &poly_add("2*8($oup)");
+ &poly_mul(); $code.="
+ lea 4*8($oup), $oup
+ dec $itr1
+ jg 1b
+ dec $itr2
+ jge 2b\n";
+ &finalize_state_avx2(2);
+ &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0);
+ &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
+ mov \$4*32, $itr1
+ lea 4*32($inp), $inp
+ sub \$4*32, $inl
+ jmp seal_avx2_hash
+3:
+ cmp \$12*32, $inl
+ ja seal_avx2_tail_512
+###############################################################################
+seal_avx2_tail_384:\n";
+ &prep_state_avx2(3); $code.="
+1: \n";
+ &poly_add("0($oup)");
+ &poly_mul(); $code.="
+ lea 2*8($oup), $oup
+2: \n";
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
+ &poly_add("0*8($oup)");
+ &poly_mul();
+ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
+ &poly_add("2*8($oup)");
+ &poly_mul();
+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
+ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
+ lea 4*8($oup), $oup
+ dec $itr1
+ jg 1b
+ dec $itr2
+ jge 2b\n";
+ &finalize_state_avx2(3);
+ &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0);
+ &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0);
+ &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
+ mov \$8*32, $itr1
+ lea 8*32($inp), $inp
+ sub \$8*32, $inl
+ jmp seal_avx2_hash
+###############################################################################
+seal_avx2_tail_512:\n";
+ &prep_state_avx2(4); $code.="
+1: \n";
+ &poly_add("0($oup)");
+ &poly_mul_mulx(); $code.="
+ lea 2*8($oup), $oup
+2: \n";
+ &emit_body(20);
+ &poly_add("0*8($oup)");
+ &emit_body(20);
+ &poly_stage1_mulx();
+ &emit_body(20);
+ &poly_stage2_mulx();
+ &emit_body(20);
+ &poly_stage3_mulx();
+ &emit_body(20);
+ &poly_reduce_stage();
+ &emit_body(20);
+ &poly_add("2*8($oup)");
+ &emit_body(20);
+ &poly_stage1_mulx();
+ &emit_body(20);
+ &poly_stage2_mulx();
+ &emit_body(20);
+ &poly_stage3_mulx();
+ &emit_body(20);
+ &poly_reduce_stage();
+ foreach $l (@loop_body) {$code.=$l."\n";}
+ @loop_body = split /\n/, $chacha_body; $code.="
+ lea 4*8($oup), $oup
+ dec $itr1
+ jg 1b
+ dec $itr2
+ jge 2b\n";
+ &finalize_state_avx2(4); $code.="
+ vmovdqa $A0, $tmp_store\n";
+ &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
+ vmovdqa $tmp_store, $A0\n";
+ &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
+ &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
+ &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
+ mov \$12*32, $itr1
+ lea 12*32($inp), $inp
+ sub \$12*32, $inl
+ jmp seal_avx2_hash
+################################################################################
+seal_avx2_320:
+ vmovdqa $A0, $A1
+ vmovdqa $A0, $A2
+ vmovdqa $B0, $B1
+ vmovdqa $B0, $B2
+ vmovdqa $C0, $C1
+ vmovdqa $C0, $C2
+ vpaddd .avx2_inc(%rip), $D0, $D1
+ vpaddd .avx2_inc(%rip), $D1, $D2
+ vmovdqa $B0, $T1
+ vmovdqa $C0, $T2
+ vmovdqa $D0, $ctr0_store
+ vmovdqa $D1, $ctr1_store
+ vmovdqa $D2, $ctr2_store
+ mov \$10, $acc0
+1: \n";
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
+ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
+ &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
+ dec $acc0
+ jne 1b
+ vpaddd .chacha20_consts(%rip), $A0, $A0
+ vpaddd .chacha20_consts(%rip), $A1, $A1
+ vpaddd .chacha20_consts(%rip), $A2, $A2
+ vpaddd $T1, $B0, $B0
+ vpaddd $T1, $B1, $B1
+ vpaddd $T1, $B2, $B2
+ vpaddd $T2, $C0, $C0
+ vpaddd $T2, $C1, $C1
+ vpaddd $T2, $C2, $C2
+ vpaddd $ctr0_store, $D0, $D0
+ vpaddd $ctr1_store, $D1, $D1
+ vpaddd $ctr2_store, $D2, $D2
+ vperm2i128 \$0x02, $A0, $B0, $T0
+ # Clamp and store the key
+ vpand .clamp(%rip), $T0, $T0
+ vmovdqa $T0, $r_store
+ # Stream for up to 320 bytes
+ vperm2i128 \$0x13, $A0, $B0, $A0
+ vperm2i128 \$0x13, $C0, $D0, $B0
+ vperm2i128 \$0x02, $A1, $B1, $C0
+ vperm2i128 \$0x02, $C1, $D1, $D0
+ vperm2i128 \$0x13, $A1, $B1, $A1
+ vperm2i128 \$0x13, $C1, $D1, $B1
+ vperm2i128 \$0x02, $A2, $B2, $C1
+ vperm2i128 \$0x02, $C2, $D2, $D1
+ vperm2i128 \$0x13, $A2, $B2, $A2
+ vperm2i128 \$0x13, $C2, $D2, $B2
+ jmp seal_avx2_short
+################################################################################
+seal_avx2_192:
+ vmovdqa $A0, $A1
+ vmovdqa $A0, $A2
+ vmovdqa $B0, $B1
+ vmovdqa $B0, $B2
+ vmovdqa $C0, $C1
+ vmovdqa $C0, $C2
+ vpaddd .avx2_inc(%rip), $D0, $D1
+ vmovdqa $D0, $T2
+ vmovdqa $D1, $T3
+ mov \$10, $acc0
+1: \n";
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
+ &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
+ &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
+ dec $acc0
+ jne 1b
+ vpaddd $A2, $A0, $A0
+ vpaddd $A2, $A1, $A1
+ vpaddd $B2, $B0, $B0
+ vpaddd $B2, $B1, $B1
+ vpaddd $C2, $C0, $C0
+ vpaddd $C2, $C1, $C1
+ vpaddd $T2, $D0, $D0
+ vpaddd $T3, $D1, $D1
+ vperm2i128 \$0x02, $A0, $B0, $T0
+ # Clamp and store the key
+ vpand .clamp(%rip), $T0, $T0
+ vmovdqa $T0, $r_store
+ # Stream for up to 192 bytes
+ vperm2i128 \$0x13, $A0, $B0, $A0
+ vperm2i128 \$0x13, $C0, $D0, $B0
+ vperm2i128 \$0x02, $A1, $B1, $C0
+ vperm2i128 \$0x02, $C1, $D1, $D0
+ vperm2i128 \$0x13, $A1, $B1, $A1
+ vperm2i128 \$0x13, $C1, $D1, $B1
+seal_avx2_short:
+ mov %r8, $itr2
+ call poly_hash_ad_internal
+ xor $itr1, $itr1
+seal_avx2_hash:
+ cmp \$16, $itr1
+ jb seal_avx2_short_loop\n";
+ &poly_add("0($oup)");
+ &poly_mul(); $code.="
+ sub \$16, $itr1
+ add \$16, $oup
+ jmp seal_avx2_hash
+seal_avx2_short_loop:
+ cmp \$32, $inl
+ jb seal_avx2_short_tail
+ sub \$32, $inl
+ # Encrypt
+ vpxor ($inp), $A0, $A0
+ vmovdqu $A0, ($oup)
+ lea 1*32($inp), $inp
+ # Load + hash\n";
+ &poly_add("0*8($oup)");
+ &poly_mul();
+ &poly_add("2*8($oup)");
+ &poly_mul(); $code.="
+ lea 1*32($oup), $oup
+ # Shift stream
+ vmovdqa $B0, $A0
+ vmovdqa $C0, $B0
+ vmovdqa $D0, $C0
+ vmovdqa $A1, $D0
+ vmovdqa $B1, $A1
+ vmovdqa $C1, $B1
+ vmovdqa $D1, $C1
+ vmovdqa $A2, $D1
+ vmovdqa $B2, $A2
+ jmp seal_avx2_short_loop
+seal_avx2_short_tail:
+ cmp \$16, $inl
+ jb 1f
+ sub \$16, $inl
+ vpxor ($inp), $A0x, $A3x
+ vmovdqu $A3x, ($oup)
+ lea 1*16($inp), $inp\n";
+ &poly_add("0*8($oup)");
+ &poly_mul(); $code.="
+ lea 1*16($oup), $oup
+ vextracti128 \$1, $A0, $A0x
+1:
+ vzeroupper
+ jmp seal_sse_tail_16
+.cfi_endproc
+";
+}
+
+if (!$win64) {
+ $code =~ s/\`([^\`]*)\`/eval $1/gem;
+ print $code;
+} else {
+ print <<___;
+.globl dummy_chacha20_poly1305_asm
+.type dummy_chacha20_poly1305_asm,\@abi-omnipotent
+dummy_chacha20_poly1305_asm:
+ ret
+___
+}
+
+close STDOUT;
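Note on the primitives driven by the generator above: chacha_qr_avx2 vectorises the standard ChaCha20 quarter round (the "left"/"right" pair issued once per loop iteration corresponds to the column and diagonal halves of one double round, so the \$10 loop counters give the usual 20 rounds), and the .clamp mask applies the standard Poly1305 clamping to the r half of the Poly1305 key. A plain C sketch of both, for reference only and not part of the change:

    #include <stdint.h>

    static uint32_t rotl32(uint32_t v, int n) { return (v << n) | (v >> (32 - n)); }

    /* Reference ChaCha20 quarter round (RFC 7539); the AVX2 code above performs
     * the same update across vector lanes. */
    static void chacha_quarter_round(uint32_t *a, uint32_t *b, uint32_t *c,
                                     uint32_t *d) {
      *a += *b; *d ^= *a; *d = rotl32(*d, 16);
      *c += *d; *b ^= *c; *b = rotl32(*b, 12);
      *a += *b; *d ^= *a; *d = rotl32(*d, 8);
      *c += *d; *b ^= *c; *b = rotl32(*b, 7);
    }

    /* Poly1305 key clamping as applied to r by the .clamp mask: clear the top
     * four bits of r bytes 3, 7, 11 and 15 and the low two bits of bytes 4, 8
     * and 12, expressed here on the two little-endian 64-bit halves of r. */
    static void poly1305_clamp(uint64_t r[2]) {
      r[0] &= 0x0ffffffc0fffffffULL;
      r[1] &= 0x0ffffffc0ffffffcULL;
    }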
diff --git a/src/crypto/cipher/e_chacha20poly1305.c b/src/crypto/cipher/e_chacha20poly1305.c
index ed0d74c..34d094b 100644
--- a/src/crypto/cipher/e_chacha20poly1305.c
+++ b/src/crypto/cipher/e_chacha20poly1305.c
@@ -33,6 +33,42 @@
unsigned char tag_len;
};
+#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) && \
+ !defined(OPENSSL_WINDOWS)
+static const int kHaveAsm = 1;
+// chacha20_poly1305_open is defined in chacha20_poly1305_x86_64.pl. It
+// decrypts |plaintext_len| bytes from |ciphertext| and writes them to
+// |out_plaintext|. On entry, |aead_data| must contain the final 48 bytes of
+// the initial ChaCha20 block, i.e. the key, followed by four zeros, followed
+// by the nonce. On exit, it will contain the calculated tag value, which the
+// caller must check.
+void chacha20_poly1305_open(uint8_t *out_plaintext, const uint8_t *ciphertext,
+ size_t plaintext_len, const uint8_t *ad,
+ size_t ad_len, uint8_t *aead_data);
+
+// chacha20_poly1305_seal is defined in chacha20_poly1305_x86_64.pl. It
+// encrypts |plaintext_len| bytes from |plaintext| and writes them to
+// |out_ciphertext|. On entry, |aead_data| must contain the final 48 bytes of
+// the initial ChaCha20 block, i.e. the key, followed by four zeros, followed
+// by the nonce. On exit, it will contain the calculated tag value, which the
+// caller must append to the ciphertext.
+void chacha20_poly1305_seal(uint8_t *out_ciphertext, const uint8_t *plaintext,
+ size_t plaintext_len, const uint8_t *ad,
+ size_t ad_len, uint8_t *aead_data);
+#else
+static const int kHaveAsm = 0;
+
+static void chacha20_poly1305_open(uint8_t *out_plaintext,
+ const uint8_t *ciphertext,
+ size_t plaintext_len, const uint8_t *ad,
+ size_t ad_len, uint8_t *aead_data) {}
+
+static void chacha20_poly1305_seal(uint8_t *out_ciphertext,
+ const uint8_t *plaintext,
+ size_t plaintext_len, const uint8_t *ad,
+ size_t ad_len, uint8_t *aead_data) {}
+#endif
+
static int aead_chacha20_poly1305_init(EVP_AEAD_CTX *ctx, const uint8_t *key,
size_t key_len, size_t tag_len) {
struct aead_chacha20_poly1305_ctx *c20_ctx;
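The |aead_data| layout described in the comments above is easy to restate in code; the sketch below mirrors what the seal and open paths later in this file do before calling into the assembly, with |key| and |nonce| as placeholder names:

    #include <stdint.h>
    #include <string.h>

    /* Sketch only: fill the 48-byte |aead_data| buffer expected by the asm
     * entry points. After the asm routine returns, the Poly1305 tag is read
     * back from the start of this same buffer. */
    static void layout_aead_data(uint8_t aead_data[48], const uint8_t key[32],
                                 const uint8_t nonce[12]) {
      memcpy(aead_data, key, 32);        /* bytes  0..31: ChaCha20 key        */
      memset(aead_data + 32, 0, 4);      /* bytes 32..35: zeroed block counter */
      memcpy(aead_data + 36, nonce, 12); /* bytes 36..47: 96-bit nonce        */
    }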
@@ -70,9 +106,8 @@
static void poly1305_update_length(poly1305_state *poly1305, size_t data_len) {
uint8_t length_bytes[8];
- unsigned i;
- for (i = 0; i < sizeof(length_bytes); i++) {
+ for (unsigned i = 0; i < sizeof(length_bytes); i++) {
length_bytes[i] = data_len;
data_len >>= 8;
}
@@ -80,37 +115,49 @@
CRYPTO_poly1305_update(poly1305, length_bytes, sizeof(length_bytes));
}
-typedef void (*aead_poly1305_update)(poly1305_state *ctx, const uint8_t *ad,
- size_t ad_len, const uint8_t *ciphertext,
- size_t ciphertext_len);
+static void poly1305_update_padded_16(poly1305_state *poly1305,
+ const uint8_t *data, size_t data_len) {
+ static const uint8_t padding[16] = { 0 }; /* Padding is all zeros. */
-/* aead_poly1305 fills |tag| with the authentication tag for the given
- * inputs, using |update| to control the order and format that the inputs are
- * signed/authenticated. */
-static void aead_poly1305(aead_poly1305_update update,
- uint8_t tag[POLY1305_TAG_LEN],
- const struct aead_chacha20_poly1305_ctx *c20_ctx,
- const uint8_t nonce[12], const uint8_t *ad,
- size_t ad_len, const uint8_t *ciphertext,
- size_t ciphertext_len) {
+ CRYPTO_poly1305_update(poly1305, data, data_len);
+ if (data_len % 16 != 0) {
+ CRYPTO_poly1305_update(poly1305, padding,
+ sizeof(padding) - (data_len % 16));
+ }
+}
+
+/* calc_tag fills |tag| with the authentication tag for the given inputs. */
+static void calc_tag(uint8_t tag[POLY1305_TAG_LEN],
+ const struct aead_chacha20_poly1305_ctx *c20_ctx,
+ const uint8_t nonce[12], const uint8_t *ad, size_t ad_len,
+ const uint8_t *ciphertext, size_t ciphertext_len) {
alignas(16) uint8_t poly1305_key[32];
OPENSSL_memset(poly1305_key, 0, sizeof(poly1305_key));
CRYPTO_chacha_20(poly1305_key, poly1305_key, sizeof(poly1305_key),
c20_ctx->key, nonce, 0);
+
poly1305_state ctx;
CRYPTO_poly1305_init(&ctx, poly1305_key);
- update(&ctx, ad, ad_len, ciphertext, ciphertext_len);
+ poly1305_update_padded_16(&ctx, ad, ad_len);
+ poly1305_update_padded_16(&ctx, ciphertext, ciphertext_len);
+ poly1305_update_length(&ctx, ad_len);
+ poly1305_update_length(&ctx, ciphertext_len);
CRYPTO_poly1305_finish(&ctx, tag);
}
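calc_tag above follows the RFC 7539 MAC layout: the AD and the ciphertext are each zero-padded to a 16-byte boundary, then their lengths are appended as 64-bit little-endian integers. As a worked illustration (not part of the change), the total number of bytes fed to Poly1305 is:

    #include <stddef.h>

    /* Illustration only: bytes hashed by Poly1305 for the RFC 7539 AEAD
     * construction, given the AD and ciphertext lengths. */
    static size_t poly1305_input_len(size_t ad_len, size_t ct_len) {
      size_t padded_ad = ad_len + ((16 - (ad_len % 16)) % 16);
      size_t padded_ct = ct_len + ((16 - (ct_len % 16)) % 16);
      return padded_ad + padded_ct + 8 /* len(AD) */ + 8 /* len(ciphertext) */;
    }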
-static int seal_impl(aead_poly1305_update poly1305_update,
- const EVP_AEAD_CTX *ctx, uint8_t *out, size_t *out_len,
- size_t max_out_len, const uint8_t nonce[12],
- const uint8_t *in, size_t in_len, const uint8_t *ad,
- size_t ad_len) {
+static int aead_chacha20_poly1305_seal(const EVP_AEAD_CTX *ctx, uint8_t *out,
+ size_t *out_len, size_t max_out_len,
+ const uint8_t *nonce, size_t nonce_len,
+ const uint8_t *in, size_t in_len,
+ const uint8_t *ad, size_t ad_len) {
const struct aead_chacha20_poly1305_ctx *c20_ctx = ctx->aead_state;
const uint64_t in_len_64 = in_len;
+ if (nonce_len != 12) {
+ OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
+ return 0;
+ }
+
/* |CRYPTO_chacha_20| uses a 32-bit block counter. Therefore we disallow
* individual operations that work on more than 256GB at a time.
* |in_len_64| is needed because, on 32-bit platforms, size_t is only
@@ -132,25 +179,37 @@
return 0;
}
- CRYPTO_chacha_20(out, in, in_len, c20_ctx->key, nonce, 1);
+ alignas(16) uint8_t tag[48];
- alignas(16) uint8_t tag[POLY1305_TAG_LEN];
- aead_poly1305(poly1305_update, tag, c20_ctx, nonce, ad, ad_len, out, in_len);
+ if (kHaveAsm) {
+ OPENSSL_memcpy(tag, c20_ctx->key, 32);
+ OPENSSL_memset(tag + 32, 0, 4);
+ OPENSSL_memcpy(tag + 32 + 4, nonce, 12);
+ chacha20_poly1305_seal(out, in, in_len, ad, ad_len, tag);
+ } else {
+ CRYPTO_chacha_20(out, in, in_len, c20_ctx->key, nonce, 1);
+ calc_tag(tag, c20_ctx, nonce, ad, ad_len, out, in_len);
+ }
OPENSSL_memcpy(out + in_len, tag, c20_ctx->tag_len);
*out_len = in_len + c20_ctx->tag_len;
return 1;
}
-static int open_impl(aead_poly1305_update poly1305_update,
- const EVP_AEAD_CTX *ctx, uint8_t *out, size_t *out_len,
- size_t max_out_len, const uint8_t nonce[12],
- const uint8_t *in, size_t in_len, const uint8_t *ad,
- size_t ad_len) {
+static int aead_chacha20_poly1305_open(const EVP_AEAD_CTX *ctx, uint8_t *out,
+ size_t *out_len, size_t max_out_len,
+ const uint8_t *nonce, size_t nonce_len,
+ const uint8_t *in, size_t in_len,
+ const uint8_t *ad, size_t ad_len) {
const struct aead_chacha20_poly1305_ctx *c20_ctx = ctx->aead_state;
size_t plaintext_len;
const uint64_t in_len_64 = in_len;
+ if (nonce_len != 12) {
+ OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
+ return 0;
+ }
+
if (in_len < c20_ctx->tag_len) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT);
return 0;
@@ -168,64 +227,27 @@
}
plaintext_len = in_len - c20_ctx->tag_len;
- alignas(16) uint8_t tag[POLY1305_TAG_LEN];
- aead_poly1305(poly1305_update, tag, c20_ctx, nonce, ad, ad_len, in,
- plaintext_len);
+ alignas(16) uint8_t tag[48];
+
+ if (kHaveAsm) {
+ OPENSSL_memcpy(tag, c20_ctx->key, 32);
+ OPENSSL_memset(tag + 32, 0, 4);
+ OPENSSL_memcpy(tag + 32 + 4, nonce, 12);
+ chacha20_poly1305_open(out, in, plaintext_len, ad, ad_len, tag);
+ } else {
+ calc_tag(tag, c20_ctx, nonce, ad, ad_len, in, plaintext_len);
+ CRYPTO_chacha_20(out, in, plaintext_len, c20_ctx->key, nonce, 1);
+ }
+
if (CRYPTO_memcmp(tag, in + plaintext_len, c20_ctx->tag_len) != 0) {
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT);
return 0;
}
- CRYPTO_chacha_20(out, in, plaintext_len, c20_ctx->key, nonce, 1);
*out_len = plaintext_len;
return 1;
}
-static void poly1305_update_padded_16(poly1305_state *poly1305,
- const uint8_t *data, size_t data_len) {
- static const uint8_t padding[16] = { 0 }; /* Padding is all zeros. */
-
- CRYPTO_poly1305_update(poly1305, data, data_len);
- if (data_len % 16 != 0) {
- CRYPTO_poly1305_update(poly1305, padding, sizeof(padding) - (data_len % 16));
- }
-}
-
-static void poly1305_update(poly1305_state *ctx, const uint8_t *ad,
- size_t ad_len, const uint8_t *ciphertext,
- size_t ciphertext_len) {
- poly1305_update_padded_16(ctx, ad, ad_len);
- poly1305_update_padded_16(ctx, ciphertext, ciphertext_len);
- poly1305_update_length(ctx, ad_len);
- poly1305_update_length(ctx, ciphertext_len);
-}
-
-static int aead_chacha20_poly1305_seal(const EVP_AEAD_CTX *ctx, uint8_t *out,
- size_t *out_len, size_t max_out_len,
- const uint8_t *nonce, size_t nonce_len,
- const uint8_t *in, size_t in_len,
- const uint8_t *ad, size_t ad_len) {
- if (nonce_len != 12) {
- OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
- return 0;
- }
- return seal_impl(poly1305_update, ctx, out, out_len, max_out_len, nonce, in,
- in_len, ad, ad_len);
-}
-
-static int aead_chacha20_poly1305_open(const EVP_AEAD_CTX *ctx, uint8_t *out,
- size_t *out_len, size_t max_out_len,
- const uint8_t *nonce, size_t nonce_len,
- const uint8_t *in, size_t in_len,
- const uint8_t *ad, size_t ad_len) {
- if (nonce_len != 12) {
- OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
- return 0;
- }
- return open_impl(poly1305_update, ctx, out, out_len, max_out_len, nonce, in,
- in_len, ad, ad_len);
-}
-
static const EVP_AEAD aead_chacha20_poly1305 = {
32, /* key len */
12, /* nonce len */
@@ -242,59 +264,3 @@
const EVP_AEAD *EVP_aead_chacha20_poly1305(void) {
return &aead_chacha20_poly1305;
}
-
-static void poly1305_update_old(poly1305_state *ctx, const uint8_t *ad,
- size_t ad_len, const uint8_t *ciphertext,
- size_t ciphertext_len) {
- CRYPTO_poly1305_update(ctx, ad, ad_len);
- poly1305_update_length(ctx, ad_len);
- CRYPTO_poly1305_update(ctx, ciphertext, ciphertext_len);
- poly1305_update_length(ctx, ciphertext_len);
-}
-
-static int aead_chacha20_poly1305_old_seal(
- const EVP_AEAD_CTX *ctx, uint8_t *out, size_t *out_len, size_t max_out_len,
- const uint8_t *nonce, size_t nonce_len, const uint8_t *in, size_t in_len,
- const uint8_t *ad, size_t ad_len) {
- if (nonce_len != 8) {
- OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
- return 0;
- }
- uint8_t nonce_96[12];
- OPENSSL_memset(nonce_96, 0, 4);
- OPENSSL_memcpy(nonce_96 + 4, nonce, 8);
- return seal_impl(poly1305_update_old, ctx, out, out_len, max_out_len,
- nonce_96, in, in_len, ad, ad_len);
-}
-
-static int aead_chacha20_poly1305_old_open(
- const EVP_AEAD_CTX *ctx, uint8_t *out, size_t *out_len, size_t max_out_len,
- const uint8_t *nonce, size_t nonce_len, const uint8_t *in, size_t in_len,
- const uint8_t *ad, size_t ad_len) {
- if (nonce_len != 8) {
- OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
- return 0;
- }
- uint8_t nonce_96[12];
- OPENSSL_memset(nonce_96, 0, 4);
- OPENSSL_memcpy(nonce_96 + 4, nonce, 8);
- return open_impl(poly1305_update_old, ctx, out, out_len, max_out_len,
- nonce_96, in, in_len, ad, ad_len);
-}
-
-static const EVP_AEAD aead_chacha20_poly1305_old = {
- 32, /* key len */
- 8, /* nonce len */
- POLY1305_TAG_LEN, /* overhead */
- POLY1305_TAG_LEN, /* max tag length */
- aead_chacha20_poly1305_init,
- NULL, /* init_with_direction */
- aead_chacha20_poly1305_cleanup,
- aead_chacha20_poly1305_old_seal,
- aead_chacha20_poly1305_old_open,
- NULL, /* get_iv */
-};
-
-const EVP_AEAD *EVP_aead_chacha20_poly1305_old(void) {
- return &aead_chacha20_poly1305_old;
-}
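Both the assembly and the portable paths above sit behind the same EVP_AEAD interface, so callers do not need to know which one runs. A minimal usage sketch against the public API (assuming the standard <openssl/aead.h> entry points; error handling trimmed):

    #include <openssl/aead.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Sketch: seal |in| with ChaCha20-Poly1305. |key| must be 32 bytes and
     * |nonce| 12 bytes; |out| needs room for in_len plus the 16-byte tag. */
    static int seal_example(uint8_t *out, size_t *out_len, size_t max_out_len,
                            const uint8_t key[32], const uint8_t nonce[12],
                            const uint8_t *in, size_t in_len,
                            const uint8_t *ad, size_t ad_len) {
      EVP_AEAD_CTX ctx;
      if (!EVP_AEAD_CTX_init(&ctx, EVP_aead_chacha20_poly1305(), key, 32,
                             EVP_AEAD_DEFAULT_TAG_LENGTH, NULL)) {
        return 0;
      }
      int ok = EVP_AEAD_CTX_seal(&ctx, out, out_len, max_out_len, nonce, 12,
                                 in, in_len, ad, ad_len);
      EVP_AEAD_CTX_cleanup(&ctx);
      return ok;
    }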
diff --git a/src/crypto/cipher/test/chacha20_poly1305_tests.txt b/src/crypto/cipher/test/chacha20_poly1305_tests.txt
index 103c196..018eb56 100644
--- a/src/crypto/cipher/test/chacha20_poly1305_tests.txt
+++ b/src/crypto/cipher/test/chacha20_poly1305_tests.txt
@@ -47,9 +47,6 @@
CT: e275aeb341e1fc9a70c4fd4496fc7cdb
TAG: 41acd0560ea6843d3e5d4e5babf6e946
-# Test vectors from chacha20_poly1305_old_tests.txt, modified for the RFC 7539
-# AEAD construction.
-
KEY: 9a97f65b9b4c721b960a672145fca8d4e32e67f9111ea979ce9c4826806aeee6
NONCE: 000000003de9c0da2bd7f91e
IN: ""
diff --git a/src/crypto/dh/CMakeLists.txt b/src/crypto/dh/CMakeLists.txt
index f1e8616..83ae6d4 100644
--- a/src/crypto/dh/CMakeLists.txt
+++ b/src/crypto/dh/CMakeLists.txt
@@ -10,14 +10,3 @@
check.c
dh_asn1.c
)
-
-add_executable(
- dh_test
-
- dh_test.cc
-
- $<TARGET_OBJECTS:test_support>
-)
-
-target_link_libraries(dh_test crypto)
-add_dependencies(all_tests dh_test)
diff --git a/src/crypto/dh/dh_test.cc b/src/crypto/dh/dh_test.cc
index 8165c1a..9cde679 100644
--- a/src/crypto/dh/dh_test.cc
+++ b/src/crypto/dh/dh_test.cc
@@ -61,6 +61,8 @@
#include <vector>
+#include <gtest/gtest.h>
+
#include <openssl/bn.h>
#include <openssl/bytestring.h>
#include <openssl/crypto.h>
@@ -77,20 +79,16 @@
static bool TestASN1();
static bool TestRFC3526();
-int main() {
- CRYPTO_library_init();
-
+// TODO(davidben): Convert this file to GTest properly.
+TEST(DHTest, AllTests) {
if (!RunBasicTests() ||
!RunRFC5114Tests() ||
!TestBadY() ||
!TestASN1() ||
!TestRFC3526()) {
ERR_print_errors_fp(stderr);
- return 1;
+ ADD_FAILURE() << "Tests failed.";
}
-
- printf("PASS\n");
- return 0;
}
static int GenerateCallback(int p, int n, BN_GENCB *arg) {
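The TODO above marks this as an interim conversion. A fuller GTest conversion would typically give each boolean helper its own test case so failures are attributed individually; a hedged sketch of that shape, reusing the helpers already declared in this file:

    #include <gtest/gtest.h>

    // Sketch only: one test case per existing helper instead of a single
    // catch-all ADD_FAILURE().
    TEST(DHTest, Basic) { EXPECT_TRUE(RunBasicTests()); }
    TEST(DHTest, RFC5114) { EXPECT_TRUE(RunRFC5114Tests()); }
    TEST(DHTest, BadY) { EXPECT_TRUE(TestBadY()); }
    TEST(DHTest, ASN1) { EXPECT_TRUE(TestASN1()); }
    TEST(DHTest, RFC3526) { EXPECT_TRUE(TestRFC3526()); }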
diff --git a/src/crypto/dsa/CMakeLists.txt b/src/crypto/dsa/CMakeLists.txt
index 4d66136..d3c12f5 100644
--- a/src/crypto/dsa/CMakeLists.txt
+++ b/src/crypto/dsa/CMakeLists.txt
@@ -8,14 +8,3 @@
dsa.c
dsa_asn1.c
)
-
-add_executable(
- dsa_test
-
- dsa_test.cc
-
- $<TARGET_OBJECTS:test_support>
-)
-
-target_link_libraries(dsa_test crypto)
-add_dependencies(all_tests dsa_test)
diff --git a/src/crypto/dsa/dsa_test.cc b/src/crypto/dsa/dsa_test.cc
index 5fee6aa..d2cd33e 100644
--- a/src/crypto/dsa/dsa_test.cc
+++ b/src/crypto/dsa/dsa_test.cc
@@ -62,6 +62,8 @@
#include <stdio.h>
#include <string.h>
+#include <gtest/gtest.h>
+
#include <openssl/bn.h>
#include <openssl/crypto.h>
#include <openssl/err.h>
@@ -302,9 +304,8 @@
return true;
}
-int main(int argc, char **argv) {
- CRYPTO_library_init();
-
+// TODO(davidben): Convert this file to GTest properly.
+TEST(DSATest, AllTests) {
if (!TestGenerate(stdout) ||
!TestVerify(fips_sig, sizeof(fips_sig), 1) ||
!TestVerify(fips_sig_negative, sizeof(fips_sig_negative), -1) ||
@@ -312,9 +313,6 @@
!TestVerify(fips_sig_bad_length, sizeof(fips_sig_bad_length), -1) ||
!TestVerify(fips_sig_bad_r, sizeof(fips_sig_bad_r), 0)) {
ERR_print_errors_fp(stderr);
- return 1;
+ ADD_FAILURE() << "Tests failed";
}
-
- printf("PASS\n");
- return 0;
}
diff --git a/src/crypto/poly1305/asm/poly1305-armv4.pl b/src/crypto/poly1305/asm/poly1305-armv4.pl
deleted file mode 100755
index 8d35e28..0000000
--- a/src/crypto/poly1305/asm/poly1305-armv4.pl
+++ /dev/null
@@ -1,1216 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# IALU(*)/gcc-4.4 NEON
-#
-# ARM11xx(ARMv6) 7.78/+100% -
-# Cortex-A5 6.30/+130% 2.96
-# Cortex-A8 6.25/+115% 2.36
-# Cortex-A9 5.10/+95% 2.55
-# Cortex-A15 3.79/+85% 1.25(**)
-# Snapdragon S4 5.70/+100% 1.48(**)
-#
-# (*) this is for -march=armv6, i.e. with bunch of ldrb loading data;
-# (**) these are trade-off results, they can be improved by ~8% but at
-# the cost of 15/12% regression on Cortex-A5/A7, it's even possible
-# to improve Cortex-A9 result, but then A5/A7 loose more than 20%;
-
-$flavour = shift;
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
-
-if ($flavour && $flavour ne "void") {
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
- open STDOUT,">$output";
-}
-
-($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
-
-$code.=<<___;
-#include <openssl/arm_arch.h>
-
-.text
-#if defined(__thumb2__)
-.syntax unified
-.thumb
-#else
-.code 32
-#endif
-
-.globl poly1305_emit
-.globl poly1305_blocks
-.globl poly1305_init
-.type poly1305_init,%function
-.align 5
-poly1305_init:
-.Lpoly1305_init:
- stmdb sp!,{r4-r11}
-
- eor r3,r3,r3
- cmp $inp,#0
- str r3,[$ctx,#0] @ zero hash value
- str r3,[$ctx,#4]
- str r3,[$ctx,#8]
- str r3,[$ctx,#12]
- str r3,[$ctx,#16]
- str r3,[$ctx,#36] @ is_base2_26
- add $ctx,$ctx,#20
-
-#ifdef __thumb2__
- it eq
-#endif
- moveq r0,#0
- beq .Lno_key
-
-#if __ARM_MAX_ARCH__>=7
- adr r11,.Lpoly1305_init
- ldr r12,.LOPENSSL_armcap
-#endif
- ldrb r4,[$inp,#0]
- mov r10,#0x0fffffff
- ldrb r5,[$inp,#1]
- and r3,r10,#-4 @ 0x0ffffffc
- ldrb r6,[$inp,#2]
- ldrb r7,[$inp,#3]
- orr r4,r4,r5,lsl#8
- ldrb r5,[$inp,#4]
- orr r4,r4,r6,lsl#16
- ldrb r6,[$inp,#5]
- orr r4,r4,r7,lsl#24
- ldrb r7,[$inp,#6]
- and r4,r4,r10
-
-#if __ARM_MAX_ARCH__>=7
- ldr r12,[r11,r12] @ OPENSSL_armcap_P
-# ifdef __APPLE__
- ldr r12,[r12]
-# endif
-#endif
- ldrb r8,[$inp,#7]
- orr r5,r5,r6,lsl#8
- ldrb r6,[$inp,#8]
- orr r5,r5,r7,lsl#16
- ldrb r7,[$inp,#9]
- orr r5,r5,r8,lsl#24
- ldrb r8,[$inp,#10]
- and r5,r5,r3
-
-#if __ARM_MAX_ARCH__>=7
- tst r12,#ARMV7_NEON @ check for NEON
-# ifdef __APPLE__
- adr r9,poly1305_blocks_neon
- adr r11,poly1305_blocks
-# ifdef __thumb2__
- it ne
-# endif
- movne r11,r9
- adr r12,poly1305_emit
- adr r10,poly1305_emit_neon
-# ifdef __thumb2__
- it ne
-# endif
- movne r12,r10
-# else
-# ifdef __thumb2__
- itete eq
-# endif
- addeq r12,r11,#(poly1305_emit-.Lpoly1305_init)
- addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
- addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init)
- addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
-# endif
-# ifdef __thumb2__
- orr r12,r12,#1 @ thumb-ify address
- orr r11,r11,#1
-# endif
-#endif
- ldrb r9,[$inp,#11]
- orr r6,r6,r7,lsl#8
- ldrb r7,[$inp,#12]
- orr r6,r6,r8,lsl#16
- ldrb r8,[$inp,#13]
- orr r6,r6,r9,lsl#24
- ldrb r9,[$inp,#14]
- and r6,r6,r3
-
- ldrb r10,[$inp,#15]
- orr r7,r7,r8,lsl#8
- str r4,[$ctx,#0]
- orr r7,r7,r9,lsl#16
- str r5,[$ctx,#4]
- orr r7,r7,r10,lsl#24
- str r6,[$ctx,#8]
- and r7,r7,r3
- str r7,[$ctx,#12]
-#if __ARM_MAX_ARCH__>=7
- stmia r2,{r11,r12} @ fill functions table
- mov r0,#1
-#else
- mov r0,#0
-#endif
-.Lno_key:
- ldmia sp!,{r4-r11}
-#if __ARM_ARCH__>=5
- ret @ bx lr
-#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_init,.-poly1305_init
-___
-{
-my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
-my ($s1,$s2,$s3)=($r1,$r2,$r3);
-
-$code.=<<___;
-.type poly1305_blocks,%function
-.align 5
-poly1305_blocks:
- stmdb sp!,{r3-r11,lr}
-
- ands $len,$len,#-16
- beq .Lno_data
-
- cmp $padbit,#0
- add $len,$len,$inp @ end pointer
- sub sp,sp,#32
-
- ldmia $ctx,{$h0-$r3} @ load context
-
- str $ctx,[sp,#12] @ offload stuff
- mov lr,$inp
- str $len,[sp,#16]
- str $r1,[sp,#20]
- str $r2,[sp,#24]
- str $r3,[sp,#28]
- b .Loop
-
-.Loop:
-#if __ARM_ARCH__<7
- ldrb r0,[lr],#16 @ load input
-# ifdef __thumb2__
- it hi
-# endif
- addhi $h4,$h4,#1 @ 1<<128
- ldrb r1,[lr,#-15]
- ldrb r2,[lr,#-14]
- ldrb r3,[lr,#-13]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-12]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-11]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-10]
- adds $h0,$h0,r3 @ accumulate input
-
- ldrb r3,[lr,#-9]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-8]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-7]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-6]
- adcs $h1,$h1,r3
-
- ldrb r3,[lr,#-5]
- orr r1,r0,r1,lsl#8
- ldrb r0,[lr,#-4]
- orr r2,r1,r2,lsl#16
- ldrb r1,[lr,#-3]
- orr r3,r2,r3,lsl#24
- ldrb r2,[lr,#-2]
- adcs $h2,$h2,r3
-
- ldrb r3,[lr,#-1]
- orr r1,r0,r1,lsl#8
- str lr,[sp,#8] @ offload input pointer
- orr r2,r1,r2,lsl#16
- add $s1,$r1,$r1,lsr#2
- orr r3,r2,r3,lsl#24
-#else
- ldr r0,[lr],#16 @ load input
-# ifdef __thumb2__
- it hi
-# endif
- addhi $h4,$h4,#1 @ padbit
- ldr r1,[lr,#-12]
- ldr r2,[lr,#-8]
- ldr r3,[lr,#-4]
-# ifdef __ARMEB__
- rev r0,r0
- rev r1,r1
- rev r2,r2
- rev r3,r3
-# endif
- adds $h0,$h0,r0 @ accumulate input
- str lr,[sp,#8] @ offload input pointer
- adcs $h1,$h1,r1
- add $s1,$r1,$r1,lsr#2
- adcs $h2,$h2,r2
-#endif
- add $s2,$r2,$r2,lsr#2
- adcs $h3,$h3,r3
- add $s3,$r3,$r3,lsr#2
-
- umull r2,r3,$h1,$r0
- adc $h4,$h4,#0
- umull r0,r1,$h0,$r0
- umlal r2,r3,$h4,$s1
- umlal r0,r1,$h3,$s1
- ldr $r1,[sp,#20] @ reload $r1
- umlal r2,r3,$h2,$s3
- umlal r0,r1,$h1,$s3
- umlal r2,r3,$h3,$s2
- umlal r0,r1,$h2,$s2
- umlal r2,r3,$h0,$r1
- str r0,[sp,#0] @ future $h0
- mul r0,$s2,$h4
- ldr $r2,[sp,#24] @ reload $r2
- adds r2,r2,r1 @ d1+=d0>>32
- eor r1,r1,r1
- adc lr,r3,#0 @ future $h2
- str r2,[sp,#4] @ future $h1
-
- mul r2,$s3,$h4
- eor r3,r3,r3
- umlal r0,r1,$h3,$s3
- ldr $r3,[sp,#28] @ reload $r3
- umlal r2,r3,$h3,$r0
- umlal r0,r1,$h2,$r0
- umlal r2,r3,$h2,$r1
- umlal r0,r1,$h1,$r1
- umlal r2,r3,$h1,$r2
- umlal r0,r1,$h0,$r2
- umlal r2,r3,$h0,$r3
- ldr $h0,[sp,#0]
- mul $h4,$r0,$h4
- ldr $h1,[sp,#4]
-
- adds $h2,lr,r0 @ d2+=d1>>32
- ldr lr,[sp,#8] @ reload input pointer
- adc r1,r1,#0
- adds $h3,r2,r1 @ d3+=d2>>32
- ldr r0,[sp,#16] @ reload end pointer
- adc r3,r3,#0
- add $h4,$h4,r3 @ h4+=d3>>32
-
- and r1,$h4,#-4
- and $h4,$h4,#3
- add r1,r1,r1,lsr#2 @ *=5
- adds $h0,$h0,r1
- adcs $h1,$h1,#0
- adcs $h2,$h2,#0
- adc $h3,$h3,#0
-
- cmp r0,lr @ done yet?
- bhi .Loop
-
- ldr $ctx,[sp,#12]
- add sp,sp,#32
- stmia $ctx,{$h0-$h4} @ store the result
-
-.Lno_data:
-#if __ARM_ARCH__>=5
- ldmia sp!,{r3-r11,pc}
-#else
- ldmia sp!,{r3-r11,lr}
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_blocks,.-poly1305_blocks
-___
-}
-{
-my ($ctx,$mac,$nonce)=map("r$_",(0..2));
-my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
-my $g4=$h4;
-
-$code.=<<___;
-.type poly1305_emit,%function
-.align 5
-poly1305_emit:
- stmdb sp!,{r4-r11}
-.Lpoly1305_emit_enter:
-
- ldmia $ctx,{$h0-$h4}
- adds $g0,$h0,#5 @ compare to modulus
- adcs $g1,$h1,#0
- adcs $g2,$h2,#0
- adcs $g3,$h3,#0
- adc $g4,$h4,#0
- tst $g4,#4 @ did it carry/borrow?
-
-#ifdef __thumb2__
- it ne
-#endif
- movne $h0,$g0
- ldr $g0,[$nonce,#0]
-#ifdef __thumb2__
- it ne
-#endif
- movne $h1,$g1
- ldr $g1,[$nonce,#4]
-#ifdef __thumb2__
- it ne
-#endif
- movne $h2,$g2
- ldr $g2,[$nonce,#8]
-#ifdef __thumb2__
- it ne
-#endif
- movne $h3,$g3
- ldr $g3,[$nonce,#12]
-
- adds $h0,$h0,$g0
- adcs $h1,$h1,$g1
- adcs $h2,$h2,$g2
- adc $h3,$h3,$g3
-
-#if __ARM_ARCH__>=7
-# ifdef __ARMEB__
- rev $h0,$h0
- rev $h1,$h1
- rev $h2,$h2
- rev $h3,$h3
-# endif
- str $h0,[$mac,#0]
- str $h1,[$mac,#4]
- str $h2,[$mac,#8]
- str $h3,[$mac,#12]
-#else
- strb $h0,[$mac,#0]
- mov $h0,$h0,lsr#8
- strb $h1,[$mac,#4]
- mov $h1,$h1,lsr#8
- strb $h2,[$mac,#8]
- mov $h2,$h2,lsr#8
- strb $h3,[$mac,#12]
- mov $h3,$h3,lsr#8
-
- strb $h0,[$mac,#1]
- mov $h0,$h0,lsr#8
- strb $h1,[$mac,#5]
- mov $h1,$h1,lsr#8
- strb $h2,[$mac,#9]
- mov $h2,$h2,lsr#8
- strb $h3,[$mac,#13]
- mov $h3,$h3,lsr#8
-
- strb $h0,[$mac,#2]
- mov $h0,$h0,lsr#8
- strb $h1,[$mac,#6]
- mov $h1,$h1,lsr#8
- strb $h2,[$mac,#10]
- mov $h2,$h2,lsr#8
- strb $h3,[$mac,#14]
- mov $h3,$h3,lsr#8
-
- strb $h0,[$mac,#3]
- strb $h1,[$mac,#7]
- strb $h2,[$mac,#11]
- strb $h3,[$mac,#15]
-#endif
- ldmia sp!,{r4-r11}
-#if __ARM_ARCH__>=5
- ret @ bx lr
-#else
- tst lr,#1
- moveq pc,lr @ be binary compatible with V4, yet
- bx lr @ interoperable with Thumb ISA:-)
-#endif
-.size poly1305_emit,.-poly1305_emit
-___
-{
-my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
-my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
-my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
-
-my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
-
-$code.=<<___;
-#if __ARM_MAX_ARCH__>=7
-.fpu neon
-
-.type poly1305_init_neon,%function
-.align 5
-poly1305_init_neon:
- ldr r4,[$ctx,#20] @ load key base 2^32
- ldr r5,[$ctx,#24]
- ldr r6,[$ctx,#28]
- ldr r7,[$ctx,#32]
-
- and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
- mov r3,r4,lsr#26
- mov r4,r5,lsr#20
- orr r3,r3,r5,lsl#6
- mov r5,r6,lsr#14
- orr r4,r4,r6,lsl#12
- mov r6,r7,lsr#8
- orr r5,r5,r7,lsl#18
- and r3,r3,#0x03ffffff
- and r4,r4,#0x03ffffff
- and r5,r5,#0x03ffffff
-
- vdup.32 $R0,r2 @ r^1 in both lanes
- add r2,r3,r3,lsl#2 @ *5
- vdup.32 $R1,r3
- add r3,r4,r4,lsl#2
- vdup.32 $S1,r2
- vdup.32 $R2,r4
- add r4,r5,r5,lsl#2
- vdup.32 $S2,r3
- vdup.32 $R3,r5
- add r5,r6,r6,lsl#2
- vdup.32 $S3,r4
- vdup.32 $R4,r6
- vdup.32 $S4,r5
-
- mov $zeros,#2 @ counter
-
-.Lsquare_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
-
- vmull.u32 $D0,$R0,${R0}[1]
- vmull.u32 $D1,$R1,${R0}[1]
- vmull.u32 $D2,$R2,${R0}[1]
- vmull.u32 $D3,$R3,${R0}[1]
- vmull.u32 $D4,$R4,${R0}[1]
-
- vmlal.u32 $D0,$R4,${S1}[1]
- vmlal.u32 $D1,$R0,${R1}[1]
- vmlal.u32 $D2,$R1,${R1}[1]
- vmlal.u32 $D3,$R2,${R1}[1]
- vmlal.u32 $D4,$R3,${R1}[1]
-
- vmlal.u32 $D0,$R3,${S2}[1]
- vmlal.u32 $D1,$R4,${S2}[1]
- vmlal.u32 $D3,$R1,${R2}[1]
- vmlal.u32 $D2,$R0,${R2}[1]
- vmlal.u32 $D4,$R2,${R2}[1]
-
- vmlal.u32 $D0,$R2,${S3}[1]
- vmlal.u32 $D3,$R0,${R3}[1]
- vmlal.u32 $D1,$R3,${S3}[1]
- vmlal.u32 $D2,$R4,${S3}[1]
- vmlal.u32 $D4,$R1,${R3}[1]
-
- vmlal.u32 $D3,$R4,${S4}[1]
- vmlal.u32 $D0,$R1,${S4}[1]
- vmlal.u32 $D1,$R2,${S4}[1]
- vmlal.u32 $D2,$R3,${S4}[1]
- vmlal.u32 $D4,$R0,${R4}[1]
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- @ and P. Schwabe
-
- vshr.u64 $T0,$D3,#26
- vmovn.i64 $D3#lo,$D3
- vshr.u64 $T1,$D0,#26
- vmovn.i64 $D0#lo,$D0
- vadd.i64 $D4,$D4,$T0 @ h3 -> h4
- vbic.i32 $D3#lo,#0xfc000000 @ &=0x03ffffff
- vadd.i64 $D1,$D1,$T1 @ h0 -> h1
- vbic.i32 $D0#lo,#0xfc000000
-
- vshrn.u64 $T0#lo,$D4,#26
- vmovn.i64 $D4#lo,$D4
- vshr.u64 $T1,$D1,#26
- vmovn.i64 $D1#lo,$D1
- vadd.i64 $D2,$D2,$T1 @ h1 -> h2
- vbic.i32 $D4#lo,#0xfc000000
- vbic.i32 $D1#lo,#0xfc000000
-
- vadd.i32 $D0#lo,$D0#lo,$T0#lo
- vshl.u32 $T0#lo,$T0#lo,#2
- vshrn.u64 $T1#lo,$D2,#26
- vmovn.i64 $D2#lo,$D2
- vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
- vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
- vbic.i32 $D2#lo,#0xfc000000
-
- vshr.u32 $T0#lo,$D0#lo,#26
- vbic.i32 $D0#lo,#0xfc000000
- vshr.u32 $T1#lo,$D3#lo,#26
- vbic.i32 $D3#lo,#0xfc000000
- vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
- vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
-
- subs $zeros,$zeros,#1
- beq .Lsquare_break_neon
-
- add $tbl0,$ctx,#(48+0*9*4)
- add $tbl1,$ctx,#(48+1*9*4)
-
- vtrn.32 $R0,$D0#lo @ r^2:r^1
- vtrn.32 $R2,$D2#lo
- vtrn.32 $R3,$D3#lo
- vtrn.32 $R1,$D1#lo
- vtrn.32 $R4,$D4#lo
-
- vshl.u32 $S2,$R2,#2 @ *5
- vshl.u32 $S3,$R3,#2
- vshl.u32 $S1,$R1,#2
- vshl.u32 $S4,$R4,#2
- vadd.i32 $S2,$S2,$R2
- vadd.i32 $S1,$S1,$R1
- vadd.i32 $S3,$S3,$R3
- vadd.i32 $S4,$S4,$R4
-
- vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
- vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
- vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vst1.32 {${S4}[0]},[$tbl0,:32]
- vst1.32 {${S4}[1]},[$tbl1,:32]
-
- b .Lsquare_neon
-
-.align 4
-.Lsquare_break_neon:
- add $tbl0,$ctx,#(48+2*4*9)
- add $tbl1,$ctx,#(48+3*4*9)
-
- vmov $R0,$D0#lo @ r^4:r^3
- vshl.u32 $S1,$D1#lo,#2 @ *5
- vmov $R1,$D1#lo
- vshl.u32 $S2,$D2#lo,#2
- vmov $R2,$D2#lo
- vshl.u32 $S3,$D3#lo,#2
- vmov $R3,$D3#lo
- vshl.u32 $S4,$D4#lo,#2
- vmov $R4,$D4#lo
- vadd.i32 $S1,$S1,$D1#lo
- vadd.i32 $S2,$S2,$D2#lo
- vadd.i32 $S3,$S3,$D3#lo
- vadd.i32 $S4,$S4,$D4#lo
-
- vst4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
- vst4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
- vst4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vst4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vst1.32 {${S4}[0]},[$tbl0]
- vst1.32 {${S4}[1]},[$tbl1]
-
- ret @ bx lr
-.size poly1305_init_neon,.-poly1305_init_neon
-
-.type poly1305_blocks_neon,%function
-.align 5
-poly1305_blocks_neon:
- ldr ip,[$ctx,#36] @ is_base2_26
- ands $len,$len,#-16
- beq .Lno_data_neon
-
- cmp $len,#64
- bhs .Lenter_neon
- tst ip,ip @ is_base2_26?
- beq poly1305_blocks
-
-.Lenter_neon:
- stmdb sp!,{r4-r7}
- vstmdb sp!,{d8-d15} @ ABI specification says so
-
- tst ip,ip @ is_base2_26?
- bne .Lbase2_26_neon
-
- stmdb sp!,{r1-r3,lr}
- bl poly1305_init_neon
-
- ldr r4,[$ctx,#0] @ load hash value base 2^32
- ldr r5,[$ctx,#4]
- ldr r6,[$ctx,#8]
- ldr r7,[$ctx,#12]
- ldr ip,[$ctx,#16]
-
- and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
- mov r3,r4,lsr#26
- veor $D0#lo,$D0#lo,$D0#lo
- mov r4,r5,lsr#20
- orr r3,r3,r5,lsl#6
- veor $D1#lo,$D1#lo,$D1#lo
- mov r5,r6,lsr#14
- orr r4,r4,r6,lsl#12
- veor $D2#lo,$D2#lo,$D2#lo
- mov r6,r7,lsr#8
- orr r5,r5,r7,lsl#18
- veor $D3#lo,$D3#lo,$D3#lo
- and r3,r3,#0x03ffffff
- orr r6,r6,ip,lsl#24
- veor $D4#lo,$D4#lo,$D4#lo
- and r4,r4,#0x03ffffff
- mov r1,#1
- and r5,r5,#0x03ffffff
- str r1,[$ctx,#36] @ is_base2_26
-
- vmov.32 $D0#lo[0],r2
- vmov.32 $D1#lo[0],r3
- vmov.32 $D2#lo[0],r4
- vmov.32 $D3#lo[0],r5
- vmov.32 $D4#lo[0],r6
- adr $zeros,.Lzeros
-
- ldmia sp!,{r1-r3,lr}
- b .Lbase2_32_neon
-
-.align 4
-.Lbase2_26_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ load hash value
-
- veor $D0#lo,$D0#lo,$D0#lo
- veor $D1#lo,$D1#lo,$D1#lo
- veor $D2#lo,$D2#lo,$D2#lo
- veor $D3#lo,$D3#lo,$D3#lo
- veor $D4#lo,$D4#lo,$D4#lo
- vld4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
- adr $zeros,.Lzeros
- vld1.32 {$D4#lo[0]},[$ctx]
- sub $ctx,$ctx,#16 @ rewind
-
-.Lbase2_32_neon:
- add $in2,$inp,#32
- mov $padbit,$padbit,lsl#24
- tst $len,#31
- beq .Leven
-
- vld4.32 {$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
- vmov.32 $H4#lo[0],$padbit
- sub $len,$len,#16
- add $in2,$inp,#32
-
-# ifdef __ARMEB__
- vrev32.8 $H0,$H0
- vrev32.8 $H3,$H3
- vrev32.8 $H1,$H1
- vrev32.8 $H2,$H2
-# endif
- vsri.u32 $H4#lo,$H3#lo,#8 @ base 2^32 -> base 2^26
- vshl.u32 $H3#lo,$H3#lo,#18
-
- vsri.u32 $H3#lo,$H2#lo,#14
- vshl.u32 $H2#lo,$H2#lo,#12
- vadd.i32 $H4#hi,$H4#lo,$D4#lo @ add hash value and move to #hi
-
- vbic.i32 $H3#lo,#0xfc000000
- vsri.u32 $H2#lo,$H1#lo,#20
- vshl.u32 $H1#lo,$H1#lo,#6
-
- vbic.i32 $H2#lo,#0xfc000000
- vsri.u32 $H1#lo,$H0#lo,#26
- vadd.i32 $H3#hi,$H3#lo,$D3#lo
-
- vbic.i32 $H0#lo,#0xfc000000
- vbic.i32 $H1#lo,#0xfc000000
- vadd.i32 $H2#hi,$H2#lo,$D2#lo
-
- vadd.i32 $H0#hi,$H0#lo,$D0#lo
- vadd.i32 $H1#hi,$H1#lo,$D1#lo
-
- mov $tbl1,$zeros
- add $tbl0,$ctx,#48
-
- cmp $len,$len
- b .Long_tail
-
-.align 4
-.Leven:
- subs $len,$len,#64
-# ifdef __thumb2__
- it lo
-# endif
- movlo $in2,$zeros
-
- vmov.i32 $H4,#1<<24 @ padbit, yes, always
- vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
- add $inp,$inp,#64
- vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
- add $in2,$in2,#64
-# ifdef __thumb2__
- itt hi
-# endif
- addhi $tbl1,$ctx,#(48+1*9*4)
- addhi $tbl0,$ctx,#(48+3*9*4)
-
-# ifdef __ARMEB__
- vrev32.8 $H0,$H0
- vrev32.8 $H3,$H3
- vrev32.8 $H1,$H1
- vrev32.8 $H2,$H2
-# endif
- vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
- vshl.u32 $H3,$H3,#18
-
- vsri.u32 $H3,$H2,#14
- vshl.u32 $H2,$H2,#12
-
- vbic.i32 $H3,#0xfc000000
- vsri.u32 $H2,$H1,#20
- vshl.u32 $H1,$H1,#6
-
- vbic.i32 $H2,#0xfc000000
- vsri.u32 $H1,$H0,#26
-
- vbic.i32 $H0,#0xfc000000
- vbic.i32 $H1,#0xfc000000
-
- bls .Lskip_loop
-
- vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^2
- vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
- vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- b .Loop_neon
-
-.align 5
-.Loop_neon:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- @ \___________________/
- @ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- @ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- @ \___________________/ \____________________/
- @
- @ Note that we start with inp[2:3]*r^2. This is because it
- @ doesn't depend on reduction in previous iteration.
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- @ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- @ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- @ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- @ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ inp[2:3]*r^2
-
- vadd.i32 $H2#lo,$H2#lo,$D2#lo @ accumulate inp[0:1]
- vmull.u32 $D2,$H2#hi,${R0}[1]
- vadd.i32 $H0#lo,$H0#lo,$D0#lo
- vmull.u32 $D0,$H0#hi,${R0}[1]
- vadd.i32 $H3#lo,$H3#lo,$D3#lo
- vmull.u32 $D3,$H3#hi,${R0}[1]
- vmlal.u32 $D2,$H1#hi,${R1}[1]
- vadd.i32 $H1#lo,$H1#lo,$D1#lo
- vmull.u32 $D1,$H1#hi,${R0}[1]
-
- vadd.i32 $H4#lo,$H4#lo,$D4#lo
- vmull.u32 $D4,$H4#hi,${R0}[1]
- subs $len,$len,#64
- vmlal.u32 $D0,$H4#hi,${S1}[1]
-# ifdef __thumb2__
- it lo
-# endif
- movlo $in2,$zeros
- vmlal.u32 $D3,$H2#hi,${R1}[1]
- vld1.32 ${S4}[1],[$tbl1,:32]
- vmlal.u32 $D1,$H0#hi,${R1}[1]
- vmlal.u32 $D4,$H3#hi,${R1}[1]
-
- vmlal.u32 $D0,$H3#hi,${S2}[1]
- vmlal.u32 $D3,$H1#hi,${R2}[1]
- vmlal.u32 $D4,$H2#hi,${R2}[1]
- vmlal.u32 $D1,$H4#hi,${S2}[1]
- vmlal.u32 $D2,$H0#hi,${R2}[1]
-
- vmlal.u32 $D3,$H0#hi,${R3}[1]
- vmlal.u32 $D0,$H2#hi,${S3}[1]
- vmlal.u32 $D4,$H1#hi,${R3}[1]
- vmlal.u32 $D1,$H3#hi,${S3}[1]
- vmlal.u32 $D2,$H4#hi,${S3}[1]
-
- vmlal.u32 $D3,$H4#hi,${S4}[1]
- vmlal.u32 $D0,$H1#hi,${S4}[1]
- vmlal.u32 $D4,$H0#hi,${R4}[1]
- vmlal.u32 $D1,$H2#hi,${S4}[1]
- vmlal.u32 $D2,$H3#hi,${S4}[1]
-
- vld4.32 {$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2] @ inp[2:3] (or 0)
- add $in2,$in2,#64
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ (hash+inp[0:1])*r^4 and accumulate
-
- vmlal.u32 $D3,$H3#lo,${R0}[0]
- vmlal.u32 $D0,$H0#lo,${R0}[0]
- vmlal.u32 $D4,$H4#lo,${R0}[0]
- vmlal.u32 $D1,$H1#lo,${R0}[0]
- vmlal.u32 $D2,$H2#lo,${R0}[0]
- vld1.32 ${S4}[0],[$tbl0,:32]
-
- vmlal.u32 $D3,$H2#lo,${R1}[0]
- vmlal.u32 $D0,$H4#lo,${S1}[0]
- vmlal.u32 $D4,$H3#lo,${R1}[0]
- vmlal.u32 $D1,$H0#lo,${R1}[0]
- vmlal.u32 $D2,$H1#lo,${R1}[0]
-
- vmlal.u32 $D3,$H1#lo,${R2}[0]
- vmlal.u32 $D0,$H3#lo,${S2}[0]
- vmlal.u32 $D4,$H2#lo,${R2}[0]
- vmlal.u32 $D1,$H4#lo,${S2}[0]
- vmlal.u32 $D2,$H0#lo,${R2}[0]
-
- vmlal.u32 $D3,$H0#lo,${R3}[0]
- vmlal.u32 $D0,$H2#lo,${S3}[0]
- vmlal.u32 $D4,$H1#lo,${R3}[0]
- vmlal.u32 $D1,$H3#lo,${S3}[0]
- vmlal.u32 $D3,$H4#lo,${S4}[0]
-
- vmlal.u32 $D2,$H4#lo,${S3}[0]
- vmlal.u32 $D0,$H1#lo,${S4}[0]
- vmlal.u32 $D4,$H0#lo,${R4}[0]
- vmov.i32 $H4,#1<<24 @ padbit, yes, always
- vmlal.u32 $D1,$H2#lo,${S4}[0]
- vmlal.u32 $D2,$H3#lo,${S4}[0]
-
- vld4.32 {$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp] @ inp[0:1]
- add $inp,$inp,#64
-# ifdef __ARMEB__
- vrev32.8 $H0,$H0
- vrev32.8 $H1,$H1
- vrev32.8 $H2,$H2
- vrev32.8 $H3,$H3
-# endif
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction interleaved with base 2^32 -> base 2^26
-
- vshr.u64 $T0,$D3,#26
- vmovn.i64 $D3#lo,$D3
- vshr.u64 $T1,$D0,#26
- vmovn.i64 $D0#lo,$D0
- vadd.i64 $D4,$D4,$T0 @ h3 -> h4
- vbic.i32 $D3#lo,#0xfc000000
- vsri.u32 $H4,$H3,#8 @ base 2^32 -> base 2^26
- vadd.i64 $D1,$D1,$T1 @ h0 -> h1
- vshl.u32 $H3,$H3,#18
- vbic.i32 $D0#lo,#0xfc000000
-
- vshrn.u64 $T0#lo,$D4,#26
- vmovn.i64 $D4#lo,$D4
- vshr.u64 $T1,$D1,#26
- vmovn.i64 $D1#lo,$D1
- vadd.i64 $D2,$D2,$T1 @ h1 -> h2
- vsri.u32 $H3,$H2,#14
- vbic.i32 $D4#lo,#0xfc000000
- vshl.u32 $H2,$H2,#12
- vbic.i32 $D1#lo,#0xfc000000
-
- vadd.i32 $D0#lo,$D0#lo,$T0#lo
- vshl.u32 $T0#lo,$T0#lo,#2
- vbic.i32 $H3,#0xfc000000
- vshrn.u64 $T1#lo,$D2,#26
- vmovn.i64 $D2#lo,$D2
- vadd.i32 $D0#lo,$D0#lo,$T0#lo @ h4 -> h0
- vsri.u32 $H2,$H1,#20
- vadd.i32 $D3#lo,$D3#lo,$T1#lo @ h2 -> h3
- vshl.u32 $H1,$H1,#6
- vbic.i32 $D2#lo,#0xfc000000
- vbic.i32 $H2,#0xfc000000
-
- vshr.u32 $T0#lo,$D0#lo,#26
- vbic.i32 $D0#lo,#0xfc000000
- vsri.u32 $H1,$H0,#26
- vbic.i32 $H0,#0xfc000000
- vshr.u32 $T1#lo,$D3#lo,#26
- vbic.i32 $D3#lo,#0xfc000000
- vadd.i32 $D1#lo,$D1#lo,$T0#lo @ h0 -> h1
- vadd.i32 $D4#lo,$D4#lo,$T1#lo @ h3 -> h4
- vbic.i32 $H1,#0xfc000000
-
- bhi .Loop_neon
-
-.Lskip_loop:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- add $tbl1,$ctx,#(48+0*9*4)
- add $tbl0,$ctx,#(48+1*9*4)
- adds $len,$len,#32
-# ifdef __thumb2__
- it ne
-# endif
- movne $len,#0
- bne .Long_tail
-
- vadd.i32 $H2#hi,$H2#lo,$D2#lo @ add hash value and move to #hi
- vadd.i32 $H0#hi,$H0#lo,$D0#lo
- vadd.i32 $H3#hi,$H3#lo,$D3#lo
- vadd.i32 $H1#hi,$H1#lo,$D1#lo
- vadd.i32 $H4#hi,$H4#lo,$D4#lo
-
-.Long_tail:
- vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^1
- vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^2
-
- vadd.i32 $H2#lo,$H2#lo,$D2#lo @ can be redundant
- vmull.u32 $D2,$H2#hi,$R0
- vadd.i32 $H0#lo,$H0#lo,$D0#lo
- vmull.u32 $D0,$H0#hi,$R0
- vadd.i32 $H3#lo,$H3#lo,$D3#lo
- vmull.u32 $D3,$H3#hi,$R0
- vadd.i32 $H1#lo,$H1#lo,$D1#lo
- vmull.u32 $D1,$H1#hi,$R0
- vadd.i32 $H4#lo,$H4#lo,$D4#lo
- vmull.u32 $D4,$H4#hi,$R0
-
- vmlal.u32 $D0,$H4#hi,$S1
- vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vmlal.u32 $D3,$H2#hi,$R1
- vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vmlal.u32 $D1,$H0#hi,$R1
- vmlal.u32 $D4,$H3#hi,$R1
- vmlal.u32 $D2,$H1#hi,$R1
-
- vmlal.u32 $D3,$H1#hi,$R2
- vld1.32 ${S4}[1],[$tbl1,:32]
- vmlal.u32 $D0,$H3#hi,$S2
- vld1.32 ${S4}[0],[$tbl0,:32]
- vmlal.u32 $D4,$H2#hi,$R2
- vmlal.u32 $D1,$H4#hi,$S2
- vmlal.u32 $D2,$H0#hi,$R2
-
- vmlal.u32 $D3,$H0#hi,$R3
-# ifdef __thumb2__
- it ne
-# endif
- addne $tbl1,$ctx,#(48+2*9*4)
- vmlal.u32 $D0,$H2#hi,$S3
-# ifdef __thumb2__
- it ne
-# endif
- addne $tbl0,$ctx,#(48+3*9*4)
- vmlal.u32 $D4,$H1#hi,$R3
- vmlal.u32 $D1,$H3#hi,$S3
- vmlal.u32 $D2,$H4#hi,$S3
-
- vmlal.u32 $D3,$H4#hi,$S4
- vorn $MASK,$MASK,$MASK @ all-ones, can be redundant
- vmlal.u32 $D0,$H1#hi,$S4
- vshr.u64 $MASK,$MASK,#38
- vmlal.u32 $D4,$H0#hi,$R4
- vmlal.u32 $D1,$H2#hi,$S4
- vmlal.u32 $D2,$H3#hi,$S4
-
- beq .Lshort_tail
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ (hash+inp[0:1])*r^4:r^3 and accumulate
-
- vld4.32 {${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]! @ load r^3
- vld4.32 {${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]! @ load r^4
-
- vmlal.u32 $D2,$H2#lo,$R0
- vmlal.u32 $D0,$H0#lo,$R0
- vmlal.u32 $D3,$H3#lo,$R0
- vmlal.u32 $D1,$H1#lo,$R0
- vmlal.u32 $D4,$H4#lo,$R0
-
- vmlal.u32 $D0,$H4#lo,$S1
- vld4.32 {${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
- vmlal.u32 $D3,$H2#lo,$R1
- vld4.32 {${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
- vmlal.u32 $D1,$H0#lo,$R1
- vmlal.u32 $D4,$H3#lo,$R1
- vmlal.u32 $D2,$H1#lo,$R1
-
- vmlal.u32 $D3,$H1#lo,$R2
- vld1.32 ${S4}[1],[$tbl1,:32]
- vmlal.u32 $D0,$H3#lo,$S2
- vld1.32 ${S4}[0],[$tbl0,:32]
- vmlal.u32 $D4,$H2#lo,$R2
- vmlal.u32 $D1,$H4#lo,$S2
- vmlal.u32 $D2,$H0#lo,$R2
-
- vmlal.u32 $D3,$H0#lo,$R3
- vmlal.u32 $D0,$H2#lo,$S3
- vmlal.u32 $D4,$H1#lo,$R3
- vmlal.u32 $D1,$H3#lo,$S3
- vmlal.u32 $D2,$H4#lo,$S3
-
- vmlal.u32 $D3,$H4#lo,$S4
- vorn $MASK,$MASK,$MASK @ all-ones
- vmlal.u32 $D0,$H1#lo,$S4
- vshr.u64 $MASK,$MASK,#38
- vmlal.u32 $D4,$H0#lo,$R4
- vmlal.u32 $D1,$H2#lo,$S4
- vmlal.u32 $D2,$H3#lo,$S4
-
-.Lshort_tail:
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ horizontal addition
-
- vadd.i64 $D3#lo,$D3#lo,$D3#hi
- vadd.i64 $D0#lo,$D0#lo,$D0#hi
- vadd.i64 $D4#lo,$D4#lo,$D4#hi
- vadd.i64 $D1#lo,$D1#lo,$D1#hi
- vadd.i64 $D2#lo,$D2#lo,$D2#hi
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ lazy reduction, but without narrowing
-
- vshr.u64 $T0,$D3,#26
- vand.i64 $D3,$D3,$MASK
- vshr.u64 $T1,$D0,#26
- vand.i64 $D0,$D0,$MASK
- vadd.i64 $D4,$D4,$T0 @ h3 -> h4
- vadd.i64 $D1,$D1,$T1 @ h0 -> h1
-
- vshr.u64 $T0,$D4,#26
- vand.i64 $D4,$D4,$MASK
- vshr.u64 $T1,$D1,#26
- vand.i64 $D1,$D1,$MASK
- vadd.i64 $D2,$D2,$T1 @ h1 -> h2
-
- vadd.i64 $D0,$D0,$T0
- vshl.u64 $T0,$T0,#2
- vshr.u64 $T1,$D2,#26
- vand.i64 $D2,$D2,$MASK
- vadd.i64 $D0,$D0,$T0 @ h4 -> h0
- vadd.i64 $D3,$D3,$T1 @ h2 -> h3
-
- vshr.u64 $T0,$D0,#26
- vand.i64 $D0,$D0,$MASK
- vshr.u64 $T1,$D3,#26
- vand.i64 $D3,$D3,$MASK
- vadd.i64 $D1,$D1,$T0 @ h0 -> h1
- vadd.i64 $D4,$D4,$T1 @ h3 -> h4
-
- cmp $len,#0
- bne .Leven
-
- @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
- @ store hash value
-
- vst4.32 {$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
- vst1.32 {$D4#lo[0]},[$ctx]
-
- vldmia sp!,{d8-d15} @ epilogue
- ldmia sp!,{r4-r7}
-.Lno_data_neon:
- ret @ bx lr
-.size poly1305_blocks_neon,.-poly1305_blocks_neon
-
-.type poly1305_emit_neon,%function
-.align 5
-poly1305_emit_neon:
- ldr ip,[$ctx,#36] @ is_base2_26
-
- stmdb sp!,{r4-r11}
-
- tst ip,ip
- beq .Lpoly1305_emit_enter
-
- ldmia $ctx,{$h0-$h4}
- eor $g0,$g0,$g0
-
- adds $h0,$h0,$h1,lsl#26 @ base 2^26 -> base 2^32
- mov $h1,$h1,lsr#6
- adcs $h1,$h1,$h2,lsl#20
- mov $h2,$h2,lsr#12
- adcs $h2,$h2,$h3,lsl#14
- mov $h3,$h3,lsr#18
- adcs $h3,$h3,$h4,lsl#8
- adc $h4,$g0,$h4,lsr#24 @ can be partially reduced ...
-
- and $g0,$h4,#-4 @ ... so reduce
- and $h4,$h3,#3
- add $g0,$g0,$g0,lsr#2 @ *= 5
- adds $h0,$h0,$g0
- adcs $h1,$h1,#0
- adcs $h2,$h2,#0
- adc $h3,$h3,#0
-
- adds $g0,$h0,#5 @ compare to modulus
- adcs $g1,$h1,#0
- adcs $g2,$h2,#0
- adcs $g3,$h3,#0
- adc $g4,$h4,#0
- tst $g4,#4 @ did it carry/borrow?
-
-# ifdef __thumb2__
- it ne
-# endif
- movne $h0,$g0
- ldr $g0,[$nonce,#0]
-# ifdef __thumb2__
- it ne
-# endif
- movne $h1,$g1
- ldr $g1,[$nonce,#4]
-# ifdef __thumb2__
- it ne
-# endif
- movne $h2,$g2
- ldr $g2,[$nonce,#8]
-# ifdef __thumb2__
- it ne
-# endif
- movne $h3,$g3
- ldr $g3,[$nonce,#12]
-
- adds $h0,$h0,$g0 @ accumulate nonce
- adcs $h1,$h1,$g1
- adcs $h2,$h2,$g2
- adc $h3,$h3,$g3
-
-# ifdef __ARMEB__
- rev $h0,$h0
- rev $h1,$h1
- rev $h2,$h2
- rev $h3,$h3
-# endif
- str $h0,[$mac,#0] @ store the result
- str $h1,[$mac,#4]
- str $h2,[$mac,#8]
- str $h3,[$mac,#12]
-
- ldmia sp!,{r4-r11}
- ret @ bx lr
-.size poly1305_emit_neon,.-poly1305_emit_neon
-
-.align 5
-.Lzeros:
-.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-.LOPENSSL_armcap:
-.word OPENSSL_armcap_P-.Lpoly1305_init
-#endif
-___
-} }
-$code.=<<___;
-.asciz "Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
-#if __ARM_MAX_ARCH__>=7
-.comm OPENSSL_armcap_P,4,4
-#endif
-___
-
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/geo;
-
- s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
- s/\bret\b/bx lr/go or
- s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
-
- print $_,"\n";
-}
-close STDOUT; # enforce flush
diff --git a/src/crypto/poly1305/asm/poly1305-armv8.pl b/src/crypto/poly1305/asm/poly1305-armv8.pl
deleted file mode 100755
index 1d9a81b..0000000
--- a/src/crypto/poly1305/asm/poly1305-armv8.pl
+++ /dev/null
@@ -1,925 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# This module implements Poly1305 hash for ARMv8.
-#
-# June 2015
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone.
-#
-# IALU/gcc-4.9 NEON
-#
-# Apple A7 1.86/+5% 0.72
-# Cortex-A53 2.63/+58% 1.47
-# Cortex-A57 2.70/+7% 1.14
-# Denver 1.39/+50% 1.18(*)
-# X-Gene 2.00/+68% 2.19
-#
-# (*) estimate based on resources availability is less than 1.0,
-# i.e. measured result is worse than expected, presumably binary
-# translator is not almighty;
-
-$flavour=shift;
-$output=shift;
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
-die "can't locate arm-xlate.pl";
-
-open OUT,"| \"$^X\" $xlate $flavour $output";
-*STDOUT=*OUT;
-
-my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
-my ($mac,$nonce)=($inp,$len);
-
-my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
-
-$code.=<<___;
-#include <openssl/arm_arch.h>
-
-.text
-
-// forward "declarations" are required for Apple
-.extern OPENSSL_armcap_P
-.globl poly1305_blocks
-.globl poly1305_emit
-
-.globl poly1305_init
-.type poly1305_init,%function
-.align 5
-poly1305_init:
- cmp $inp,xzr
- stp xzr,xzr,[$ctx] // zero hash value
- stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
-
- csel x0,xzr,x0,eq
- b.eq .Lno_key
-
-#ifdef __ILP32__
- ldrsw $t1,.LOPENSSL_armcap_P
-#else
- ldr $t1,.LOPENSSL_armcap_P
-#endif
- adr $t0,.LOPENSSL_armcap_P
-
- ldp $r0,$r1,[$inp] // load key
- mov $s1,#0xfffffffc0fffffff
- movk $s1,#0x0fff,lsl#48
- ldr w17,[$t0,$t1]
-#ifdef __ARMEB__
- rev $r0,$r0 // flip bytes
- rev $r1,$r1
-#endif
- and $r0,$r0,$s1 // &=0ffffffc0fffffff
- and $s1,$s1,#-4
- and $r1,$r1,$s1 // &=0ffffffc0ffffffc
- stp $r0,$r1,[$ctx,#32] // save key value
-
- tst w17,#ARMV7_NEON
-
- adr $d0,poly1305_blocks
- adr $r0,poly1305_blocks_neon
- adr $d1,poly1305_emit
- adr $r1,poly1305_emit_neon
-
- csel $d0,$d0,$r0,eq
- csel $d1,$d1,$r1,eq
-
- stp $d0,$d1,[$len]
-
- mov x0,#1
-.Lno_key:
- ret
-.size poly1305_init,.-poly1305_init
-
-.type poly1305_blocks,%function
-.align 5
-poly1305_blocks:
- ands $len,$len,#-16
- b.eq .Lno_data
-
- ldp $h0,$h1,[$ctx] // load hash value
- ldp $r0,$r1,[$ctx,#32] // load key value
- ldr $h2,[$ctx,#16]
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
- b .Loop
-
-.align 5
-.Loop:
- ldp $t0,$t1,[$inp],#16 // load input
- sub $len,$len,#16
-#ifdef __ARMEB__
- rev $t0,$t0
- rev $t1,$t1
-#endif
- adds $h0,$h0,$t0 // accumulate input
- adcs $h1,$h1,$t1
-
- mul $d0,$h0,$r0 // h0*r0
- adc $h2,$h2,$padbit
- umulh $d1,$h0,$r0
-
- mul $t0,$h1,$s1 // h1*5*r1
- umulh $t1,$h1,$s1
-
- adds $d0,$d0,$t0
- mul $t0,$h0,$r1 // h0*r1
- adc $d1,$d1,$t1
- umulh $d2,$h0,$r1
-
- adds $d1,$d1,$t0
- mul $t0,$h1,$r0 // h1*r0
- adc $d2,$d2,xzr
- umulh $t1,$h1,$r0
-
- adds $d1,$d1,$t0
- mul $t0,$h2,$s1 // h2*5*r1
- adc $d2,$d2,$t1
- mul $t1,$h2,$r0 // h2*r0
-
- adds $d1,$d1,$t0
- adc $d2,$d2,$t1
-
- and $t0,$d2,#-4 // final reduction
- and $h2,$d2,#3
- add $t0,$t0,$d2,lsr#2
- adds $h0,$d0,$t0
- adc $h1,$d1,xzr
-
- cbnz $len,.Loop
-
- stp $h0,$h1,[$ctx] // store hash value
- str $h2,[$ctx,#16]
-
-.Lno_data:
- ret
-.size poly1305_blocks,.-poly1305_blocks
-
-.type poly1305_emit,%function
-.align 5
-poly1305_emit:
- ldp $h0,$h1,[$ctx] // load hash base 2^64
- ldr $h2,[$ctx,#16]
- ldp $t0,$t1,[$nonce] // load nonce
-
- adds $d0,$h0,#5 // compare to modulus
- adcs $d1,$h1,xzr
- adc $d2,$h2,xzr
-
- tst $d2,#-4 // see if it's carried/borrowed
-
- csel $h0,$h0,$d0,eq
- csel $h1,$h1,$d1,eq
-
-#ifdef __ARMEB__
- ror $t0,$t0,#32 // flip nonce words
- ror $t1,$t1,#32
-#endif
- adds $h0,$h0,$t0 // accumulate nonce
- adc $h1,$h1,$t1
-#ifdef __ARMEB__
- rev $h0,$h0 // flip output bytes
- rev $h1,$h1
-#endif
- stp $h0,$h1,[$mac] // write result
-
- ret
-.size poly1305_emit,.-poly1305_emit
-___
-my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
-my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
-my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
-my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
-my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
-my ($T0,$T1,$MASK) = map("v$_",(29..31));
-
-my ($in2,$zeros)=("x16","x17");
-my $is_base2_26 = $zeros; # borrow
-
-$code.=<<___;
-.type poly1305_mult,%function
-.align 5
-poly1305_mult:
- mul $d0,$h0,$r0 // h0*r0
- umulh $d1,$h0,$r0
-
- mul $t0,$h1,$s1 // h1*5*r1
- umulh $t1,$h1,$s1
-
- adds $d0,$d0,$t0
- mul $t0,$h0,$r1 // h0*r1
- adc $d1,$d1,$t1
- umulh $d2,$h0,$r1
-
- adds $d1,$d1,$t0
- mul $t0,$h1,$r0 // h1*r0
- adc $d2,$d2,xzr
- umulh $t1,$h1,$r0
-
- adds $d1,$d1,$t0
- mul $t0,$h2,$s1 // h2*5*r1
- adc $d2,$d2,$t1
- mul $t1,$h2,$r0 // h2*r0
-
- adds $d1,$d1,$t0
- adc $d2,$d2,$t1
-
- and $t0,$d2,#-4 // final reduction
- and $h2,$d2,#3
- add $t0,$t0,$d2,lsr#2
- adds $h0,$d0,$t0
- adc $h1,$d1,xzr
-
- ret
-.size poly1305_mult,.-poly1305_mult
-
-.type poly1305_splat,%function
-.align 5
-poly1305_splat:
- and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x13,$h0,#26,#26
- extr x14,$h1,$h0,#52
- and x14,x14,#0x03ffffff
- ubfx x15,$h1,#14,#26
- extr x16,$h2,$h1,#40
-
- str w12,[$ctx,#16*0] // r0
- add w12,w13,w13,lsl#2 // r1*5
- str w13,[$ctx,#16*1] // r1
- add w13,w14,w14,lsl#2 // r2*5
- str w12,[$ctx,#16*2] // s1
- str w14,[$ctx,#16*3] // r2
- add w14,w15,w15,lsl#2 // r3*5
- str w13,[$ctx,#16*4] // s2
- str w15,[$ctx,#16*5] // r3
- add w15,w16,w16,lsl#2 // r4*5
- str w14,[$ctx,#16*6] // s3
- str w16,[$ctx,#16*7] // r4
- str w15,[$ctx,#16*8] // s4
-
- ret
-.size poly1305_splat,.-poly1305_splat
-
-.type poly1305_blocks_neon,%function
-.align 5
-poly1305_blocks_neon:
- ldr $is_base2_26,[$ctx,#24]
- cmp $len,#128
- b.hs .Lblocks_neon
- cbz $is_base2_26,poly1305_blocks
-
-.Lblocks_neon:
- stp x29,x30,[sp,#-80]!
- add x29,sp,#0
-
- ands $len,$len,#-16
- b.eq .Lno_data_neon
-
- cbz $is_base2_26,.Lbase2_64_neon
-
- ldp w10,w11,[$ctx] // load hash value base 2^26
- ldp w12,w13,[$ctx,#8]
- ldr w14,[$ctx,#16]
-
- tst $len,#31
- b.eq .Leven_neon
-
- ldp $r0,$r1,[$ctx,#32] // load key value
-
- add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
- lsr $h1,x12,#12
- adds $h0,$h0,x12,lsl#52
- add $h1,$h1,x13,lsl#14
- adc $h1,$h1,xzr
- lsr $h2,x14,#24
- adds $h1,$h1,x14,lsl#40
- adc $d2,$h2,xzr // can be partially reduced...
-
- ldp $d0,$d1,[$inp],#16 // load input
- sub $len,$len,#16
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
-
- and $t0,$d2,#-4 // ... so reduce
- and $h2,$d2,#3
- add $t0,$t0,$d2,lsr#2
- adds $h0,$h0,$t0
- adc $h1,$h1,xzr
-
-#ifdef __ARMEB__
- rev $d0,$d0
- rev $d1,$d1
-#endif
- adds $h0,$h0,$d0 // accumulate input
- adcs $h1,$h1,$d1
- adc $h2,$h2,$padbit
-
- bl poly1305_mult
- ldr x30,[sp,#8]
-
- cbz $padbit,.Lstore_base2_64_neon
-
- and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x11,$h0,#26,#26
- extr x12,$h1,$h0,#52
- and x12,x12,#0x03ffffff
- ubfx x13,$h1,#14,#26
- extr x14,$h2,$h1,#40
-
- cbnz $len,.Leven_neon
-
- stp w10,w11,[$ctx] // store hash value base 2^26
- stp w12,w13,[$ctx,#8]
- str w14,[$ctx,#16]
- b .Lno_data_neon
-
-.align 4
-.Lstore_base2_64_neon:
- stp $h0,$h1,[$ctx] // store hash value base 2^64
- stp $h2,xzr,[$ctx,#16] // note that is_base2_26 is zeroed
- b .Lno_data_neon
-
-.align 4
-.Lbase2_64_neon:
- ldp $r0,$r1,[$ctx,#32] // load key value
-
- ldp $h0,$h1,[$ctx] // load hash value base 2^64
- ldr $h2,[$ctx,#16]
-
- tst $len,#31
- b.eq .Linit_neon
-
- ldp $d0,$d1,[$inp],#16 // load input
- sub $len,$len,#16
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
-#ifdef __ARMEB__
- rev $d0,$d0
- rev $d1,$d1
-#endif
- adds $h0,$h0,$d0 // accumulate input
- adcs $h1,$h1,$d1
- adc $h2,$h2,$padbit
-
- bl poly1305_mult
-
-.Linit_neon:
- and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
- ubfx x11,$h0,#26,#26
- extr x12,$h1,$h0,#52
- and x12,x12,#0x03ffffff
- ubfx x13,$h1,#14,#26
- extr x14,$h2,$h1,#40
-
- stp d8,d9,[sp,#16] // meet ABI requirements
- stp d10,d11,[sp,#32]
- stp d12,d13,[sp,#48]
- stp d14,d15,[sp,#64]
-
- fmov ${H0},x10
- fmov ${H1},x11
- fmov ${H2},x12
- fmov ${H3},x13
- fmov ${H4},x14
-
- ////////////////////////////////// initialize r^n table
- mov $h0,$r0 // r^1
- add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
- mov $h1,$r1
- mov $h2,xzr
- add $ctx,$ctx,#48+12
- bl poly1305_splat
-
- bl poly1305_mult // r^2
- sub $ctx,$ctx,#4
- bl poly1305_splat
-
- bl poly1305_mult // r^3
- sub $ctx,$ctx,#4
- bl poly1305_splat
-
- bl poly1305_mult // r^4
- sub $ctx,$ctx,#4
- bl poly1305_splat
- ldr x30,[sp,#8]
-
- add $in2,$inp,#32
- adr $zeros,.Lzeros
- subs $len,$len,#64
- csel $in2,$zeros,$in2,lo
-
- mov x4,#1
- str x4,[$ctx,#-24] // set is_base2_26
- sub $ctx,$ctx,#48 // restore original $ctx
- b .Ldo_neon
-
-.align 4
-.Leven_neon:
- add $in2,$inp,#32
- adr $zeros,.Lzeros
- subs $len,$len,#64
- csel $in2,$zeros,$in2,lo
-
- stp d8,d9,[sp,#16] // meet ABI requirements
- stp d10,d11,[sp,#32]
- stp d12,d13,[sp,#48]
- stp d14,d15,[sp,#64]
-
- fmov ${H0},x10
- fmov ${H1},x11
- fmov ${H2},x12
- fmov ${H3},x13
- fmov ${H4},x14
-
-.Ldo_neon:
- ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
- ldp x9,x13,[$in2],#48
-
- lsl $padbit,$padbit,#24
- add x15,$ctx,#48
-
-#ifdef __ARMEB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- and x5,x9,#0x03ffffff
- ubfx x6,x8,#26,#26
- ubfx x7,x9,#26,#26
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- extr x8,x12,x8,#52
- extr x9,x13,x9,#52
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- fmov $IN23_0,x4
- and x8,x8,#0x03ffffff
- and x9,x9,#0x03ffffff
- ubfx x10,x12,#14,#26
- ubfx x11,x13,#14,#26
- add x12,$padbit,x12,lsr#40
- add x13,$padbit,x13,lsr#40
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- fmov $IN23_1,x6
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- fmov $IN23_2,x8
- fmov $IN23_3,x10
- fmov $IN23_4,x12
-
- ldp x8,x12,[$inp],#16 // inp[0:1]
- ldp x9,x13,[$inp],#48
-
- ld1 {$R0,$R1,$S1,$R2},[x15],#64
- ld1 {$S2,$R3,$S3,$R4},[x15],#64
- ld1 {$S4},[x15]
-
-#ifdef __ARMEB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- and x5,x9,#0x03ffffff
- ubfx x6,x8,#26,#26
- ubfx x7,x9,#26,#26
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- extr x8,x12,x8,#52
- extr x9,x13,x9,#52
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- fmov $IN01_0,x4
- and x8,x8,#0x03ffffff
- and x9,x9,#0x03ffffff
- ubfx x10,x12,#14,#26
- ubfx x11,x13,#14,#26
- add x12,$padbit,x12,lsr#40
- add x13,$padbit,x13,lsr#40
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- fmov $IN01_1,x6
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- fmov $IN01_2,x8
- fmov $IN01_3,x10
- fmov $IN01_4,x12
-
- b.ls .Lskip_loop
-
-.align 4
-.Loop_neon:
- ////////////////////////////////////////////////////////////////
- // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- // \___________________/
- // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- // \___________________/ \____________________/
- //
- // Note that we start with inp[2:3]*r^2. This is because it
- // doesn't depend on reduction in previous iteration.
- ////////////////////////////////////////////////////////////////
- // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
- // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
- // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
- // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
- // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
-
- subs $len,$len,#64
- umull $ACC4,$IN23_0,${R4}[2]
- csel $in2,$zeros,$in2,lo
- umull $ACC3,$IN23_0,${R3}[2]
- umull $ACC2,$IN23_0,${R2}[2]
- ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
- umull $ACC1,$IN23_0,${R1}[2]
- ldp x9,x13,[$in2],#48
- umull $ACC0,$IN23_0,${R0}[2]
-#ifdef __ARMEB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
-
- umlal $ACC4,$IN23_1,${R3}[2]
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- umlal $ACC3,$IN23_1,${R2}[2]
- and x5,x9,#0x03ffffff
- umlal $ACC2,$IN23_1,${R1}[2]
- ubfx x6,x8,#26,#26
- umlal $ACC1,$IN23_1,${R0}[2]
- ubfx x7,x9,#26,#26
- umlal $ACC0,$IN23_1,${S4}[2]
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
-
- umlal $ACC4,$IN23_2,${R2}[2]
- extr x8,x12,x8,#52
- umlal $ACC3,$IN23_2,${R1}[2]
- extr x9,x13,x9,#52
- umlal $ACC2,$IN23_2,${R0}[2]
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- umlal $ACC1,$IN23_2,${S4}[2]
- fmov $IN23_0,x4
- umlal $ACC0,$IN23_2,${S3}[2]
- and x8,x8,#0x03ffffff
-
- umlal $ACC4,$IN23_3,${R1}[2]
- and x9,x9,#0x03ffffff
- umlal $ACC3,$IN23_3,${R0}[2]
- ubfx x10,x12,#14,#26
- umlal $ACC2,$IN23_3,${S4}[2]
- ubfx x11,x13,#14,#26
- umlal $ACC1,$IN23_3,${S3}[2]
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- umlal $ACC0,$IN23_3,${S2}[2]
- fmov $IN23_1,x6
-
- add $IN01_2,$IN01_2,$H2
- add x12,$padbit,x12,lsr#40
- umlal $ACC4,$IN23_4,${R0}[2]
- add x13,$padbit,x13,lsr#40
- umlal $ACC3,$IN23_4,${S4}[2]
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- umlal $ACC2,$IN23_4,${S3}[2]
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- umlal $ACC1,$IN23_4,${S2}[2]
- fmov $IN23_2,x8
- umlal $ACC0,$IN23_4,${S1}[2]
- fmov $IN23_3,x10
-
- ////////////////////////////////////////////////////////////////
- // (hash+inp[0:1])*r^4 and accumulate
-
- add $IN01_0,$IN01_0,$H0
- fmov $IN23_4,x12
- umlal $ACC3,$IN01_2,${R1}[0]
- ldp x8,x12,[$inp],#16 // inp[0:1]
- umlal $ACC0,$IN01_2,${S3}[0]
- ldp x9,x13,[$inp],#48
- umlal $ACC4,$IN01_2,${R2}[0]
- umlal $ACC1,$IN01_2,${S4}[0]
- umlal $ACC2,$IN01_2,${R0}[0]
-#ifdef __ARMEB__
- rev x8,x8
- rev x12,x12
- rev x9,x9
- rev x13,x13
-#endif
-
- add $IN01_1,$IN01_1,$H1
- umlal $ACC3,$IN01_0,${R3}[0]
- umlal $ACC4,$IN01_0,${R4}[0]
- and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
- umlal $ACC2,$IN01_0,${R2}[0]
- and x5,x9,#0x03ffffff
- umlal $ACC0,$IN01_0,${R0}[0]
- ubfx x6,x8,#26,#26
- umlal $ACC1,$IN01_0,${R1}[0]
- ubfx x7,x9,#26,#26
-
- add $IN01_3,$IN01_3,$H3
- add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
- umlal $ACC3,$IN01_1,${R2}[0]
- extr x8,x12,x8,#52
- umlal $ACC4,$IN01_1,${R3}[0]
- extr x9,x13,x9,#52
- umlal $ACC0,$IN01_1,${S4}[0]
- add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
- umlal $ACC2,$IN01_1,${R1}[0]
- fmov $IN01_0,x4
- umlal $ACC1,$IN01_1,${R0}[0]
- and x8,x8,#0x03ffffff
-
- add $IN01_4,$IN01_4,$H4
- and x9,x9,#0x03ffffff
- umlal $ACC3,$IN01_3,${R0}[0]
- ubfx x10,x12,#14,#26
- umlal $ACC0,$IN01_3,${S2}[0]
- ubfx x11,x13,#14,#26
- umlal $ACC4,$IN01_3,${R1}[0]
- add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
- umlal $ACC1,$IN01_3,${S3}[0]
- fmov $IN01_1,x6
- umlal $ACC2,$IN01_3,${S4}[0]
- add x12,$padbit,x12,lsr#40
-
- umlal $ACC3,$IN01_4,${S4}[0]
- add x13,$padbit,x13,lsr#40
- umlal $ACC0,$IN01_4,${S1}[0]
- add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
- umlal $ACC4,$IN01_4,${R0}[0]
- add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
- umlal $ACC1,$IN01_4,${S2}[0]
- fmov $IN01_2,x8
- umlal $ACC2,$IN01_4,${S3}[0]
- fmov $IN01_3,x10
-
- /////////////////////////////////////////////////////////////////
- // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- // and P. Schwabe
-
- ushr $T0.2d,$ACC3,#26
- fmov $IN01_4,x12
- xtn $H3,$ACC3
- ushr $T1.2d,$ACC0,#26
- xtn $H0,$ACC0
- add $ACC4,$ACC4,$T0.2d // h3 -> h4
- bic $H3,#0xfc,lsl#24 // &=0x03ffffff
- add $ACC1,$ACC1,$T1.2d // h0 -> h1
- bic $H0,#0xfc,lsl#24
-
- shrn $T0.2s,$ACC4,#26
- xtn $H4,$ACC4
- ushr $T1.2d,$ACC1,#26
- xtn $H1,$ACC1
- add $ACC2,$ACC2,$T1.2d // h1 -> h2
- bic $H4,#0xfc,lsl#24
- bic $H1,#0xfc,lsl#24
-
- add $H0,$H0,$T0.2s
- shl $T0.2s,$T0.2s,#2
- shrn $T1.2s,$ACC2,#26
- xtn $H2,$ACC2
- add $H0,$H0,$T0.2s // h4 -> h0
- add $H3,$H3,$T1.2s // h2 -> h3
- bic $H2,#0xfc,lsl#24
-
- ushr $T0.2s,$H0,#26
- bic $H0,#0xfc,lsl#24
- ushr $T1.2s,$H3,#26
- bic $H3,#0xfc,lsl#24
- add $H1,$H1,$T0.2s // h0 -> h1
- add $H4,$H4,$T1.2s // h3 -> h4
-
- b.hi .Loop_neon
-
-.Lskip_loop:
- dup $IN23_2,${IN23_2}[0]
- movi $MASK.2d,#-1
- add $IN01_2,$IN01_2,$H2
- ushr $MASK.2d,$MASK.2d,#38
-
- ////////////////////////////////////////////////////////////////
- // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- adds $len,$len,#32
- b.ne .Long_tail
-
- dup $IN23_2,${IN01_2}[0]
- add $IN23_0,$IN01_0,$H0
- add $IN23_3,$IN01_3,$H3
- add $IN23_1,$IN01_1,$H1
- add $IN23_4,$IN01_4,$H4
-
-.Long_tail:
- dup $IN23_0,${IN23_0}[0]
- umull2 $ACC0,$IN23_2,${S3}
- umull2 $ACC3,$IN23_2,${R1}
- umull2 $ACC4,$IN23_2,${R2}
- umull2 $ACC2,$IN23_2,${R0}
- umull2 $ACC1,$IN23_2,${S4}
-
- dup $IN23_1,${IN23_1}[0]
- umlal2 $ACC0,$IN23_0,${R0}
- umlal2 $ACC2,$IN23_0,${R2}
- umlal2 $ACC3,$IN23_0,${R3}
- umlal2 $ACC4,$IN23_0,${R4}
- umlal2 $ACC1,$IN23_0,${R1}
-
- dup $IN23_3,${IN23_3}[0]
- umlal2 $ACC0,$IN23_1,${S4}
- umlal2 $ACC3,$IN23_1,${R2}
- umlal2 $ACC2,$IN23_1,${R1}
- umlal2 $ACC4,$IN23_1,${R3}
- umlal2 $ACC1,$IN23_1,${R0}
-
- dup $IN23_4,${IN23_4}[0]
- umlal2 $ACC3,$IN23_3,${R0}
- umlal2 $ACC4,$IN23_3,${R1}
- umlal2 $ACC0,$IN23_3,${S2}
- umlal2 $ACC1,$IN23_3,${S3}
- umlal2 $ACC2,$IN23_3,${S4}
-
- umlal2 $ACC3,$IN23_4,${S4}
- umlal2 $ACC0,$IN23_4,${S1}
- umlal2 $ACC4,$IN23_4,${R0}
- umlal2 $ACC1,$IN23_4,${S2}
- umlal2 $ACC2,$IN23_4,${S3}
-
- b.eq .Lshort_tail
-
- ////////////////////////////////////////////////////////////////
- // (hash+inp[0:1])*r^4:r^3 and accumulate
-
- add $IN01_0,$IN01_0,$H0
- umlal $ACC3,$IN01_2,${R1}
- umlal $ACC0,$IN01_2,${S3}
- umlal $ACC4,$IN01_2,${R2}
- umlal $ACC1,$IN01_2,${S4}
- umlal $ACC2,$IN01_2,${R0}
-
- add $IN01_1,$IN01_1,$H1
- umlal $ACC3,$IN01_0,${R3}
- umlal $ACC0,$IN01_0,${R0}
- umlal $ACC4,$IN01_0,${R4}
- umlal $ACC1,$IN01_0,${R1}
- umlal $ACC2,$IN01_0,${R2}
-
- add $IN01_3,$IN01_3,$H3
- umlal $ACC3,$IN01_1,${R2}
- umlal $ACC0,$IN01_1,${S4}
- umlal $ACC4,$IN01_1,${R3}
- umlal $ACC1,$IN01_1,${R0}
- umlal $ACC2,$IN01_1,${R1}
-
- add $IN01_4,$IN01_4,$H4
- umlal $ACC3,$IN01_3,${R0}
- umlal $ACC0,$IN01_3,${S2}
- umlal $ACC4,$IN01_3,${R1}
- umlal $ACC1,$IN01_3,${S3}
- umlal $ACC2,$IN01_3,${S4}
-
- umlal $ACC3,$IN01_4,${S4}
- umlal $ACC0,$IN01_4,${S1}
- umlal $ACC4,$IN01_4,${R0}
- umlal $ACC1,$IN01_4,${S2}
- umlal $ACC2,$IN01_4,${S3}
-
-.Lshort_tail:
- ////////////////////////////////////////////////////////////////
- // horizontal add
-
- addp $ACC3,$ACC3,$ACC3
- ldp d8,d9,[sp,#16] // meet ABI requirements
- addp $ACC0,$ACC0,$ACC0
- ldp d10,d11,[sp,#32]
- addp $ACC4,$ACC4,$ACC4
- ldp d12,d13,[sp,#48]
- addp $ACC1,$ACC1,$ACC1
- ldp d14,d15,[sp,#64]
- addp $ACC2,$ACC2,$ACC2
-
- ////////////////////////////////////////////////////////////////
- // lazy reduction, but without narrowing
-
- ushr $T0.2d,$ACC3,#26
- and $ACC3,$ACC3,$MASK.2d
- ushr $T1.2d,$ACC0,#26
- and $ACC0,$ACC0,$MASK.2d
-
- add $ACC4,$ACC4,$T0.2d // h3 -> h4
- add $ACC1,$ACC1,$T1.2d // h0 -> h1
-
- ushr $T0.2d,$ACC4,#26
- and $ACC4,$ACC4,$MASK.2d
- ushr $T1.2d,$ACC1,#26
- and $ACC1,$ACC1,$MASK.2d
- add $ACC2,$ACC2,$T1.2d // h1 -> h2
-
- add $ACC0,$ACC0,$T0.2d
- shl $T0.2d,$T0.2d,#2
- ushr $T1.2d,$ACC2,#26
- and $ACC2,$ACC2,$MASK.2d
- add $ACC0,$ACC0,$T0.2d // h4 -> h0
- add $ACC3,$ACC3,$T1.2d // h2 -> h3
-
- ushr $T0.2d,$ACC0,#26
- and $ACC0,$ACC0,$MASK.2d
- ushr $T1.2d,$ACC3,#26
- and $ACC3,$ACC3,$MASK.2d
- add $ACC1,$ACC1,$T0.2d // h0 -> h1
- add $ACC4,$ACC4,$T1.2d // h3 -> h4
-
- ////////////////////////////////////////////////////////////////
- // write the result, can be partially reduced
-
- st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
- st1 {$ACC4}[0],[$ctx]
-
-.Lno_data_neon:
- ldr x29,[sp],#80
- ret
-.size poly1305_blocks_neon,.-poly1305_blocks_neon
-
-.type poly1305_emit_neon,%function
-.align 5
-poly1305_emit_neon:
- ldr $is_base2_26,[$ctx,#24]
- cbz $is_base2_26,poly1305_emit
-
- ldp w10,w11,[$ctx] // load hash value base 2^26
- ldp w12,w13,[$ctx,#8]
- ldr w14,[$ctx,#16]
-
- add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
- lsr $h1,x12,#12
- adds $h0,$h0,x12,lsl#52
- add $h1,$h1,x13,lsl#14
- adc $h1,$h1,xzr
- lsr $h2,x14,#24
- adds $h1,$h1,x14,lsl#40
- adc $h2,$h2,xzr // can be partially reduced...
-
- ldp $t0,$t1,[$nonce] // load nonce
-
- and $d0,$h2,#-4 // ... so reduce
- add $d0,$d0,$h2,lsr#2
- and $h2,$h2,#3
- adds $h0,$h0,$d0
- adc $h1,$h1,xzr
-
- adds $d0,$h0,#5 // compare to modulus
- adcs $d1,$h1,xzr
- adc $d2,$h2,xzr
-
- tst $d2,#-4 // see if it's carried/borrowed
-
- csel $h0,$h0,$d0,eq
- csel $h1,$h1,$d1,eq
-
-#ifdef __ARMEB__
- ror $t0,$t0,#32 // flip nonce words
- ror $t1,$t1,#32
-#endif
- adds $h0,$h0,$t0 // accumulate nonce
- adc $h1,$h1,$t1
-#ifdef __ARMEB__
- rev $h0,$h0 // flip output bytes
- rev $h1,$h1
-#endif
- stp $h0,$h1,[$mac] // write result
-
- ret
-.size poly1305_emit_neon,.-poly1305_emit_neon
-
-.align 5
-.Lzeros:
-.long 0,0,0,0,0,0,0,0
-.LOPENSSL_armcap_P:
-#ifdef __ILP32__
-.long OPENSSL_armcap_P-.
-#else
-.quad OPENSSL_armcap_P-.
-#endif
-.asciz "Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
-.align 2
-___
-
-foreach (split("\n",$code)) {
- s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
- s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
- (m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
- (m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
- (m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
- (m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
- (m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
-
- s/\.[124]([sd])\[/.$1\[/;
-
- print $_,"\n";
-}
-close STDOUT;
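(Note, not part of the diff: the NEON and SSE2 paths in the deleted modules keep the accumulator as five 26-bit limbs and use the "lazy reduction" carry chain credited above to "NEON crypto" by D.J. Bernstein and P. Schwabe. The C sketch below mirrors that carry order on scalar limbs; the carry out of h4 is folded back into h0 times 5 because 2^130 ≡ 5 (mod 2^130 − 5), and the result may remain only partially reduced, which is why the deleted code comments say "can be partially reduced". The helper name is illustrative, not an API from the tree.)

```c
#include <stdint.h>

/* Lazy reduction over a Poly1305 accumulator held as five 26-bit limbs:
 * h = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104. */
static void poly1305_lazy_reduce_sketch(uint64_t h[5]) {
  const uint64_t mask = (UINT64_C(1) << 26) - 1;
  uint64_t c;

  c = h[3] >> 26; h[3] &= mask; h[4] += c;      /* h3 -> h4 */
  c = h[0] >> 26; h[0] &= mask; h[1] += c;      /* h0 -> h1 */
  c = h[4] >> 26; h[4] &= mask; h[0] += c * 5;  /* h4 -> h0, times 5 */
  c = h[1] >> 26; h[1] &= mask; h[2] += c;      /* h1 -> h2 */
  c = h[2] >> 26; h[2] &= mask; h[3] += c;      /* h2 -> h3 */
  c = h[0] >> 26; h[0] &= mask; h[1] += c;      /* h0 -> h1 */
  c = h[3] >> 26; h[3] &= mask; h[4] += c;      /* h3 -> h4 */
}
```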
diff --git a/src/crypto/poly1305/asm/poly1305-x86.pl b/src/crypto/poly1305/asm/poly1305-x86.pl
deleted file mode 100755
index 6843995..0000000
--- a/src/crypto/poly1305/asm/poly1305-x86.pl
+++ /dev/null
@@ -1,1793 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# This module implements Poly1305 hash for x86.
-#
-# April 2015
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone,
-# measured with rdtsc at fixed clock frequency.
-#
-# IALU/gcc-3.4(*) SSE2(**) AVX2
-# Pentium 15.7/+80% -
-# PIII 6.21/+90% -
-# P4 19.8/+40% 3.24
-# Core 2 4.85/+90% 1.80
-# Westmere 4.58/+100% 1.43
-# Sandy Bridge 3.90/+100% 1.36
-# Haswell 3.88/+70% 1.18 0.72
-# Silvermont 11.0/+40% 4.80
-# VIA Nano 6.71/+90% 2.47
-# Sledgehammer 3.51/+180% 4.27
-# Bulldozer 4.53/+140% 1.31
-#
-# (*) gcc 4.8 for some reason generated worse code;
-# (**) besides SSE2 there are floating-point and AVX options; FP
-# is deemed unnecessary, because pre-SSE2 processor are too
-# old to care about, while it's not the fastest option on
-# SSE2-capable ones; AVX is omitted, because it doesn't give
-# a lot of improvement, 5-10% depending on processor;
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-push(@INC,"${dir}","${dir}../../perlasm");
-require "x86asm.pl";
-
-$output=pop;
-open STDOUT,">$output";
-
-&asm_init($ARGV[0],"poly1305-x86.pl",$ARGV[$#ARGV] eq "386");
-
-$sse2=$avx=0;
-for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
-
-if ($sse2) {
- &static_label("const_sse2");
- &static_label("enter_blocks");
- &static_label("enter_emit");
- &external_label("OPENSSL_ia32cap_P");
-
- # This may be set to 2, but valgrind can't do AVX2 on 32-bit. Without a
- # way to verify test coverage, keep it disabled.
- $avx = 0;
-}
-
-########################################################################
-# Layout of opaque area is following.
-#
-# unsigned __int32 h[5]; # current hash value base 2^32
-# unsigned __int32 pad; # is_base2_26 in vector context
-# unsigned __int32 r[4]; # key value base 2^32
-
-&align(64);
-&function_begin("poly1305_init");
- &mov ("edi",&wparam(0)); # context
- &mov ("esi",&wparam(1)); # key
- &mov ("ebp",&wparam(2)); # function table
-
- &xor ("eax","eax");
- &mov (&DWP(4*0,"edi"),"eax"); # zero hash value
- &mov (&DWP(4*1,"edi"),"eax");
- &mov (&DWP(4*2,"edi"),"eax");
- &mov (&DWP(4*3,"edi"),"eax");
- &mov (&DWP(4*4,"edi"),"eax");
- &mov (&DWP(4*5,"edi"),"eax"); # is_base2_26
-
- &cmp ("esi",0);
- &je (&label("nokey"));
-
- if ($sse2) {
- &call (&label("pic_point"));
- &set_label("pic_point");
- &blindpop("ebx");
-
- &lea ("eax",&DWP("poly1305_blocks-".&label("pic_point"),"ebx"));
- &lea ("edx",&DWP("poly1305_emit-".&label("pic_point"),"ebx"));
-
- &picmeup("edi","OPENSSL_ia32cap_P","ebx",&label("pic_point"));
- &mov ("ecx",&DWP(0,"edi"));
- &and ("ecx",1<<26|1<<24);
- &cmp ("ecx",1<<26|1<<24); # SSE2 and XMM?
- &jne (&label("no_sse2"));
-
- &lea ("eax",&DWP("_poly1305_blocks_sse2-".&label("pic_point"),"ebx"));
- &lea ("edx",&DWP("_poly1305_emit_sse2-".&label("pic_point"),"ebx"));
-
- if ($avx>1) {
- &mov ("ecx",&DWP(8,"edi"));
- &test ("ecx",1<<5); # AVX2?
- &jz (&label("no_sse2"));
-
- &lea ("eax",&DWP("_poly1305_blocks_avx2-".&label("pic_point"),"ebx"));
- }
- &set_label("no_sse2");
- &mov ("edi",&wparam(0)); # reload context
- &mov (&DWP(0,"ebp"),"eax"); # fill function table
- &mov (&DWP(4,"ebp"),"edx");
- }
-
- &mov ("eax",&DWP(4*0,"esi")); # load input key
- &mov ("ebx",&DWP(4*1,"esi"));
- &mov ("ecx",&DWP(4*2,"esi"));
- &mov ("edx",&DWP(4*3,"esi"));
- &and ("eax",0x0fffffff);
- &and ("ebx",0x0ffffffc);
- &and ("ecx",0x0ffffffc);
- &and ("edx",0x0ffffffc);
- &mov (&DWP(4*6,"edi"),"eax");
- &mov (&DWP(4*7,"edi"),"ebx");
- &mov (&DWP(4*8,"edi"),"ecx");
- &mov (&DWP(4*9,"edi"),"edx");
-
- &mov ("eax",$sse2);
-&set_label("nokey");
-&function_end("poly1305_init");
-
-($h0,$h1,$h2,$h3,$h4,
- $d0,$d1,$d2,$d3,
- $r0,$r1,$r2,$r3,
- $s1,$s2,$s3)=map(4*$_,(0..15));
-
-&function_begin("poly1305_blocks");
- &mov ("edi",&wparam(0)); # ctx
- &mov ("esi",&wparam(1)); # inp
- &mov ("ecx",&wparam(2)); # len
-&set_label("enter_blocks");
- &and ("ecx",-15);
- &jz (&label("nodata"));
-
- &stack_push(16);
- &mov ("eax",&DWP(4*6,"edi")); # r0
- &mov ("ebx",&DWP(4*7,"edi")); # r1
- &lea ("ebp",&DWP(0,"esi","ecx")); # end of input
- &mov ("ecx",&DWP(4*8,"edi")); # r2
- &mov ("edx",&DWP(4*9,"edi")); # r3
-
- &mov (&wparam(2),"ebp");
- &mov ("ebp","esi");
-
- &mov (&DWP($r0,"esp"),"eax"); # r0
- &mov ("eax","ebx");
- &shr ("eax",2);
- &mov (&DWP($r1,"esp"),"ebx"); # r1
- &add ("eax","ebx"); # s1
- &mov ("ebx","ecx");
- &shr ("ebx",2);
- &mov (&DWP($r2,"esp"),"ecx"); # r2
- &add ("ebx","ecx"); # s2
- &mov ("ecx","edx");
- &shr ("ecx",2);
- &mov (&DWP($r3,"esp"),"edx"); # r3
- &add ("ecx","edx"); # s3
- &mov (&DWP($s1,"esp"),"eax"); # s1
- &mov (&DWP($s2,"esp"),"ebx"); # s2
- &mov (&DWP($s3,"esp"),"ecx"); # s3
-
- &mov ("eax",&DWP(4*0,"edi")); # load hash value
- &mov ("ebx",&DWP(4*1,"edi"));
- &mov ("ecx",&DWP(4*2,"edi"));
- &mov ("esi",&DWP(4*3,"edi"));
- &mov ("edi",&DWP(4*4,"edi"));
- &jmp (&label("loop"));
-
-&set_label("loop",32);
- &add ("eax",&DWP(4*0,"ebp")); # accumulate input
- &adc ("ebx",&DWP(4*1,"ebp"));
- &adc ("ecx",&DWP(4*2,"ebp"));
- &adc ("esi",&DWP(4*3,"ebp"));
- &lea ("ebp",&DWP(4*4,"ebp"));
- &adc ("edi",&wparam(3)); # padbit
-
- &mov (&DWP($h0,"esp"),"eax"); # put aside hash[+inp]
- &mov (&DWP($h3,"esp"),"esi");
-
- &mul (&DWP($r0,"esp")); # h0*r0
- &mov (&DWP($h4,"esp"),"edi");
- &mov ("edi","eax");
- &mov ("eax","ebx"); # h1
- &mov ("esi","edx");
- &mul (&DWP($s3,"esp")); # h1*s3
- &add ("edi","eax");
- &mov ("eax","ecx"); # h2
- &adc ("esi","edx");
- &mul (&DWP($s2,"esp")); # h2*s2
- &add ("edi","eax");
- &mov ("eax",&DWP($h3,"esp"));
- &adc ("esi","edx");
- &mul (&DWP($s1,"esp")); # h3*s1
- &add ("edi","eax");
- &mov ("eax",&DWP($h0,"esp"));
- &adc ("esi","edx");
-
- &mul (&DWP($r1,"esp")); # h0*r1
- &mov (&DWP($d0,"esp"),"edi");
- &xor ("edi","edi");
- &add ("esi","eax");
- &mov ("eax","ebx"); # h1
- &adc ("edi","edx");
- &mul (&DWP($r0,"esp")); # h1*r0
- &add ("esi","eax");
- &mov ("eax","ecx"); # h2
- &adc ("edi","edx");
- &mul (&DWP($s3,"esp")); # h2*s3
- &add ("esi","eax");
- &mov ("eax",&DWP($h3,"esp"));
- &adc ("edi","edx");
- &mul (&DWP($s2,"esp")); # h3*s2
- &add ("esi","eax");
- &mov ("eax",&DWP($h4,"esp"));
- &adc ("edi","edx");
- &imul ("eax",&DWP($s1,"esp")); # h4*s1
- &add ("esi","eax");
- &mov ("eax",&DWP($h0,"esp"));
- &adc ("edi",0);
-
- &mul (&DWP($r2,"esp")); # h0*r2
- &mov (&DWP($d1,"esp"),"esi");
- &xor ("esi","esi");
- &add ("edi","eax");
- &mov ("eax","ebx"); # h1
- &adc ("esi","edx");
- &mul (&DWP($r1,"esp")); # h1*r1
- &add ("edi","eax");
- &mov ("eax","ecx"); # h2
- &adc ("esi","edx");
- &mul (&DWP($r0,"esp")); # h2*r0
- &add ("edi","eax");
- &mov ("eax",&DWP($h3,"esp"));
- &adc ("esi","edx");
- &mul (&DWP($s3,"esp")); # h3*s3
- &add ("edi","eax");
- &mov ("eax",&DWP($h4,"esp"));
- &adc ("esi","edx");
- &imul ("eax",&DWP($s2,"esp")); # h4*s2
- &add ("edi","eax");
- &mov ("eax",&DWP($h0,"esp"));
- &adc ("esi",0);
-
- &mul (&DWP($r3,"esp")); # h0*r3
- &mov (&DWP($d2,"esp"),"edi");
- &xor ("edi","edi");
- &add ("esi","eax");
- &mov ("eax","ebx"); # h1
- &adc ("edi","edx");
- &mul (&DWP($r2,"esp")); # h1*r2
- &add ("esi","eax");
- &mov ("eax","ecx"); # h2
- &adc ("edi","edx");
- &mul (&DWP($r1,"esp")); # h2*r1
- &add ("esi","eax");
- &mov ("eax",&DWP($h3,"esp"));
- &adc ("edi","edx");
- &mul (&DWP($r0,"esp")); # h3*r0
- &add ("esi","eax");
- &mov ("ecx",&DWP($h4,"esp"));
- &adc ("edi","edx");
-
- &mov ("edx","ecx");
- &imul ("ecx",&DWP($s3,"esp")); # h4*s3
- &add ("esi","ecx");
- &mov ("eax",&DWP($d0,"esp"));
- &adc ("edi",0);
-
- &imul ("edx",&DWP($r0,"esp")); # h4*r0
- &add ("edx","edi");
-
- &mov ("ebx",&DWP($d1,"esp"));
- &mov ("ecx",&DWP($d2,"esp"));
-
- &mov ("edi","edx"); # last reduction step
- &shr ("edx",2);
- &and ("edi",3);
- &lea ("edx",&DWP(0,"edx","edx",4)); # *5
- &add ("eax","edx");
- &adc ("ebx",0);
- &adc ("ecx",0);
- &adc ("esi",0);
-
- &cmp ("ebp",&wparam(2)); # done yet?
- &jne (&label("loop"));
-
- &mov ("edx",&wparam(0)); # ctx
- &stack_pop(16);
- &mov (&DWP(4*0,"edx"),"eax"); # store hash value
- &mov (&DWP(4*1,"edx"),"ebx");
- &mov (&DWP(4*2,"edx"),"ecx");
- &mov (&DWP(4*3,"edx"),"esi");
- &mov (&DWP(4*4,"edx"),"edi");
-&set_label("nodata");
-&function_end("poly1305_blocks");
-
-&function_begin("poly1305_emit");
- &mov ("ebp",&wparam(0)); # context
-&set_label("enter_emit");
- &mov ("edi",&wparam(1)); # output
- &mov ("eax",&DWP(4*0,"ebp")); # load hash value
- &mov ("ebx",&DWP(4*1,"ebp"));
- &mov ("ecx",&DWP(4*2,"ebp"));
- &mov ("edx",&DWP(4*3,"ebp"));
- &mov ("esi",&DWP(4*4,"ebp"));
-
- &add ("eax",5); # compare to modulus
- &adc ("ebx",0);
- &adc ("ecx",0);
- &adc ("edx",0);
- &adc ("esi",0);
- &shr ("esi",2); # did it carry/borrow?
- &neg ("esi"); # do we choose hash-modulus?
-
- &and ("eax","esi");
- &and ("ebx","esi");
- &and ("ecx","esi");
- &and ("edx","esi");
- &mov (&DWP(4*0,"edi"),"eax");
- &mov (&DWP(4*1,"edi"),"ebx");
- &mov (&DWP(4*2,"edi"),"ecx");
- &mov (&DWP(4*3,"edi"),"edx");
-
- ¬ ("esi"); # or original hash value?
- &mov ("eax",&DWP(4*0,"ebp"));
- &mov ("ebx",&DWP(4*1,"ebp"));
- &mov ("ecx",&DWP(4*2,"ebp"));
- &mov ("edx",&DWP(4*3,"ebp"));
- &mov ("ebp",&wparam(2));
- &and ("eax","esi");
- &and ("ebx","esi");
- &and ("ecx","esi");
- &and ("edx","esi");
- &or ("eax",&DWP(4*0,"edi"));
- &or ("ebx",&DWP(4*1,"edi"));
- &or ("ecx",&DWP(4*2,"edi"));
- &or ("edx",&DWP(4*3,"edi"));
-
- &add ("eax",&DWP(4*0,"ebp")); # accumulate key
- &adc ("ebx",&DWP(4*1,"ebp"));
- &adc ("ecx",&DWP(4*2,"ebp"));
- &adc ("edx",&DWP(4*3,"ebp"));
-
- &mov (&DWP(4*0,"edi"),"eax");
- &mov (&DWP(4*1,"edi"),"ebx");
- &mov (&DWP(4*2,"edi"),"ecx");
- &mov (&DWP(4*3,"edi"),"edx");
-&function_end("poly1305_emit");
-
-if ($sse2) {
-########################################################################
-# Layout of opaque area is following.
-#
-# unsigned __int32 h[5]; # current hash value base 2^26
-# unsigned __int32 is_base2_26;
-# unsigned __int32 r[4]; # key value base 2^32
-# unsigned __int32 pad[2];
-# struct { unsigned __int32 r^4, r^3, r^2, r^1; } r[9];
-#
-# where r^n are base 2^26 digits of degrees of multiplier key. There are
-# 5 digits, but last four are interleaved with multiples of 5, totalling
-# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
-
-my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("xmm$_",(0..7));
-my $MASK=$T2; # borrow and keep in mind
-
-&align (32);
-&function_begin_B("_poly1305_init_sse2");
- &movdqu ($D4,&QWP(4*6,"edi")); # key base 2^32
- &lea ("edi",&DWP(16*3,"edi")); # size optimization
- &mov ("ebp","esp");
- &sub ("esp",16*(9+5));
- &and ("esp",-16);
-
- #&pand ($D4,&QWP(96,"ebx")); # magic mask
- &movq ($MASK,&QWP(64,"ebx"));
-
- &movdqa ($D0,$D4);
- &movdqa ($D1,$D4);
- &movdqa ($D2,$D4);
-
- &pand ($D0,$MASK); # -> base 2^26
- &psrlq ($D1,26);
- &psrldq ($D2,6);
- &pand ($D1,$MASK);
- &movdqa ($D3,$D2);
- &psrlq ($D2,4)
- &psrlq ($D3,30);
- &pand ($D2,$MASK);
- &pand ($D3,$MASK);
- &psrldq ($D4,13);
-
- &lea ("edx",&DWP(16*9,"esp")); # size optimization
- &mov ("ecx",2);
-&set_label("square");
- &movdqa (&QWP(16*0,"esp"),$D0);
- &movdqa (&QWP(16*1,"esp"),$D1);
- &movdqa (&QWP(16*2,"esp"),$D2);
- &movdqa (&QWP(16*3,"esp"),$D3);
- &movdqa (&QWP(16*4,"esp"),$D4);
-
- &movdqa ($T1,$D1);
- &movdqa ($T0,$D2);
- &pslld ($T1,2);
- &pslld ($T0,2);
- &paddd ($T1,$D1); # *5
- &paddd ($T0,$D2); # *5
- &movdqa (&QWP(16*5,"esp"),$T1);
- &movdqa (&QWP(16*6,"esp"),$T0);
- &movdqa ($T1,$D3);
- &movdqa ($T0,$D4);
- &pslld ($T1,2);
- &pslld ($T0,2);
- &paddd ($T1,$D3); # *5
- &paddd ($T0,$D4); # *5
- &movdqa (&QWP(16*7,"esp"),$T1);
- &movdqa (&QWP(16*8,"esp"),$T0);
-
- &pshufd ($T1,$D0,0b01000100);
- &movdqa ($T0,$D1);
- &pshufd ($D1,$D1,0b01000100);
- &pshufd ($D2,$D2,0b01000100);
- &pshufd ($D3,$D3,0b01000100);
- &pshufd ($D4,$D4,0b01000100);
- &movdqa (&QWP(16*0,"edx"),$T1);
- &movdqa (&QWP(16*1,"edx"),$D1);
- &movdqa (&QWP(16*2,"edx"),$D2);
- &movdqa (&QWP(16*3,"edx"),$D3);
- &movdqa (&QWP(16*4,"edx"),$D4);
-
- ################################################################
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- &pmuludq ($D4,$D0); # h4*r0
- &pmuludq ($D3,$D0); # h3*r0
- &pmuludq ($D2,$D0); # h2*r0
- &pmuludq ($D1,$D0); # h1*r0
- &pmuludq ($D0,$T1); # h0*r0
-
-sub pmuladd {
-my $load = shift;
-my $base = shift; $base = "esp" if (!defined($base));
-
- ################################################################
- # As for choice to "rotate" $T0-$T2 in order to move paddq
- # past next multiplication. While it makes code harder to read
- # and doesn't have significant effect on most processors, it
- # makes a lot of difference on Atom, up to 30% improvement.
-
- &movdqa ($T1,$T0);
- &pmuludq ($T0,&QWP(16*3,$base)); # r1*h3
- &movdqa ($T2,$T1);
- &pmuludq ($T1,&QWP(16*2,$base)); # r1*h2
- &paddq ($D4,$T0);
- &movdqa ($T0,$T2);
- &pmuludq ($T2,&QWP(16*1,$base)); # r1*h1
- &paddq ($D3,$T1);
- &$load ($T1,5); # s1
- &pmuludq ($T0,&QWP(16*0,$base)); # r1*h0
- &paddq ($D2,$T2);
- &pmuludq ($T1,&QWP(16*4,$base)); # s1*h4
- &$load ($T2,2); # r2^n
- &paddq ($D1,$T0);
-
- &movdqa ($T0,$T2);
- &pmuludq ($T2,&QWP(16*2,$base)); # r2*h2
- &paddq ($D0,$T1);
- &movdqa ($T1,$T0);
- &pmuludq ($T0,&QWP(16*1,$base)); # r2*h1
- &paddq ($D4,$T2);
- &$load ($T2,6); # s2^n
- &pmuludq ($T1,&QWP(16*0,$base)); # r2*h0
- &paddq ($D3,$T0);
- &movdqa ($T0,$T2);
- &pmuludq ($T2,&QWP(16*4,$base)); # s2*h4
- &paddq ($D2,$T1);
- &pmuludq ($T0,&QWP(16*3,$base)); # s2*h3
- &$load ($T1,3); # r3^n
- &paddq ($D1,$T2);
-
- &movdqa ($T2,$T1);
- &pmuludq ($T1,&QWP(16*1,$base)); # r3*h1
- &paddq ($D0,$T0);
- &$load ($T0,7); # s3^n
- &pmuludq ($T2,&QWP(16*0,$base)); # r3*h0
- &paddq ($D4,$T1);
- &movdqa ($T1,$T0);
- &pmuludq ($T0,&QWP(16*4,$base)); # s3*h4
- &paddq ($D3,$T2);
- &movdqa ($T2,$T1);
- &pmuludq ($T1,&QWP(16*3,$base)); # s3*h3
- &paddq ($D2,$T0);
- &pmuludq ($T2,&QWP(16*2,$base)); # s3*h2
- &$load ($T0,4); # r4^n
- &paddq ($D1,$T1);
-
- &$load ($T1,8); # s4^n
- &pmuludq ($T0,&QWP(16*0,$base)); # r4*h0
- &paddq ($D0,$T2);
- &movdqa ($T2,$T1);
- &pmuludq ($T1,&QWP(16*4,$base)); # s4*h4
- &paddq ($D4,$T0);
- &movdqa ($T0,$T2);
- &pmuludq ($T2,&QWP(16*1,$base)); # s4*h1
- &paddq ($D3,$T1);
- &movdqa ($T1,$T0);
- &pmuludq ($T0,&QWP(16*2,$base)); # s4*h2
- &paddq ($D0,$T2);
- &pmuludq ($T1,&QWP(16*3,$base)); # s4*h3
- &movdqa ($MASK,&QWP(64,"ebx"));
- &paddq ($D1,$T0);
- &paddq ($D2,$T1);
-}
- &pmuladd (sub { my ($reg,$i)=@_;
- &movdqa ($reg,&QWP(16*$i,"esp"));
- },"edx");
-
-sub lazy_reduction {
-my $extra = shift;
-my $paddx = defined($extra) ? paddq : paddd;
-
- ################################################################
- # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- # and P. Schwabe
-
- &movdqa ($T0,$D3);
- &pand ($D3,$MASK);
- &psrlq ($T0,26);
- &$extra () if (defined($extra));
- &paddq ($T0,$D4); # h3 -> h4
- &movdqa ($T1,$D0);
- &pand ($D0,$MASK);
- &psrlq ($T1,26);
- &movdqa ($D4,$T0);
- &paddq ($T1,$D1); # h0 -> h1
- &psrlq ($T0,26);
- &pand ($D4,$MASK);
- &movdqa ($D1,$T1);
- &psrlq ($T1,26);
- &paddd ($D0,$T0); # favour paddd when
- # possible, because
- # paddq is "broken"
- # on Atom
- &psllq ($T0,2);
- &paddq ($T1,$D2); # h1 -> h2
- &$paddx ($T0,$D0); # h4 -> h0
- &pand ($D1,$MASK);
- &movdqa ($D2,$T1);
- &psrlq ($T1,26);
- &pand ($D2,$MASK);
- &paddd ($T1,$D3); # h2 -> h3
- &movdqa ($D0,$T0);
- &psrlq ($T0,26);
- &movdqa ($D3,$T1);
- &psrlq ($T1,26);
- &pand ($D0,$MASK);
- &paddd ($D1,$T0); # h0 -> h1
- &pand ($D3,$MASK);
- &paddd ($D4,$T1); # h3 -> h4
-}
- &lazy_reduction ();
-
- &dec ("ecx");
- &jz (&label("square_break"));
-
- &punpcklqdq ($D0,&QWP(16*0,"esp")); # 0:r^1:0:r^2
- &punpcklqdq ($D1,&QWP(16*1,"esp"));
- &punpcklqdq ($D2,&QWP(16*2,"esp"));
- &punpcklqdq ($D3,&QWP(16*3,"esp"));
- &punpcklqdq ($D4,&QWP(16*4,"esp"));
- &jmp (&label("square"));
-
-&set_label("square_break");
- &psllq ($D0,32); # -> r^3:0:r^4:0
- &psllq ($D1,32);
- &psllq ($D2,32);
- &psllq ($D3,32);
- &psllq ($D4,32);
- &por ($D0,&QWP(16*0,"esp")); # r^3:r^1:r^4:r^2
- &por ($D1,&QWP(16*1,"esp"));
- &por ($D2,&QWP(16*2,"esp"));
- &por ($D3,&QWP(16*3,"esp"));
- &por ($D4,&QWP(16*4,"esp"));
-
- &pshufd ($D0,$D0,0b10001101); # -> r^1:r^2:r^3:r^4
- &pshufd ($D1,$D1,0b10001101);
- &pshufd ($D2,$D2,0b10001101);
- &pshufd ($D3,$D3,0b10001101);
- &pshufd ($D4,$D4,0b10001101);
-
- &movdqu (&QWP(16*0,"edi"),$D0); # save the table
- &movdqu (&QWP(16*1,"edi"),$D1);
- &movdqu (&QWP(16*2,"edi"),$D2);
- &movdqu (&QWP(16*3,"edi"),$D3);
- &movdqu (&QWP(16*4,"edi"),$D4);
-
- &movdqa ($T1,$D1);
- &movdqa ($T0,$D2);
- &pslld ($T1,2);
- &pslld ($T0,2);
- &paddd ($T1,$D1); # *5
- &paddd ($T0,$D2); # *5
- &movdqu (&QWP(16*5,"edi"),$T1);
- &movdqu (&QWP(16*6,"edi"),$T0);
- &movdqa ($T1,$D3);
- &movdqa ($T0,$D4);
- &pslld ($T1,2);
- &pslld ($T0,2);
- &paddd ($T1,$D3); # *5
- &paddd ($T0,$D4); # *5
- &movdqu (&QWP(16*7,"edi"),$T1);
- &movdqu (&QWP(16*8,"edi"),$T0);
-
- &mov ("esp","ebp");
- &lea ("edi",&DWP(-16*3,"edi")); # size de-optimization
- &ret ();
-&function_end_B("_poly1305_init_sse2");
-
-&align (32);
-&function_begin("_poly1305_blocks_sse2");
- &mov ("edi",&wparam(0)); # ctx
- &mov ("esi",&wparam(1)); # inp
- &mov ("ecx",&wparam(2)); # len
-
- &mov ("eax",&DWP(4*5,"edi")); # is_base2_26
- &and ("ecx",-16);
- &jz (&label("nodata"));
- &cmp ("ecx",64);
- &jae (&label("enter_sse2"));
- &test ("eax","eax"); # is_base2_26?
- &jz (&label("enter_blocks"));
-
-&set_label("enter_sse2",16);
- &call (&label("pic_point"));
-&set_label("pic_point");
- &blindpop("ebx");
- &lea ("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx"));
-
- &test ("eax","eax"); # is_base2_26?
- &jnz (&label("base2_26"));
-
- &call ("_poly1305_init_sse2");
-
- ################################################# base 2^32 -> base 2^26
- &mov ("eax",&DWP(0,"edi"));
- &mov ("ecx",&DWP(3,"edi"));
- &mov ("edx",&DWP(6,"edi"));
- &mov ("esi",&DWP(9,"edi"));
- &mov ("ebp",&DWP(13,"edi"));
- &mov (&DWP(4*5,"edi"),1); # is_base2_26
-
- &shr ("ecx",2);
- &and ("eax",0x3ffffff);
- &shr ("edx",4);
- &and ("ecx",0x3ffffff);
- &shr ("esi",6);
- &and ("edx",0x3ffffff);
-
- &movd ($D0,"eax");
- &movd ($D1,"ecx");
- &movd ($D2,"edx");
- &movd ($D3,"esi");
- &movd ($D4,"ebp");
-
- &mov ("esi",&wparam(1)); # [reload] inp
- &mov ("ecx",&wparam(2)); # [reload] len
- &jmp (&label("base2_32"));
-
-&set_label("base2_26",16);
- &movd ($D0,&DWP(4*0,"edi")); # load hash value
- &movd ($D1,&DWP(4*1,"edi"));
- &movd ($D2,&DWP(4*2,"edi"));
- &movd ($D3,&DWP(4*3,"edi"));
- &movd ($D4,&DWP(4*4,"edi"));
- &movdqa ($MASK,&QWP(64,"ebx"));
-
-&set_label("base2_32");
- &mov ("eax",&wparam(3)); # padbit
- &mov ("ebp","esp");
-
- &sub ("esp",16*(5+5+5+9+9));
- &and ("esp",-16);
-
- &lea ("edi",&DWP(16*3,"edi")); # size optimization
- &shl ("eax",24); # padbit
-
- &test ("ecx",31);
- &jz (&label("even"));
-
- ################################################################
- # process single block, with SSE2, because it's still faster
- # even though half of result is discarded
-
- &movdqu ($T1,&QWP(0,"esi")); # input
- &lea ("esi",&DWP(16,"esi"));
-
- &movdqa ($T0,$T1); # -> base 2^26 ...
- &pand ($T1,$MASK);
- &paddd ($D0,$T1); # ... and accumuate
-
- &movdqa ($T1,$T0);
- &psrlq ($T0,26);
- &psrldq ($T1,6);
- &pand ($T0,$MASK);
- &paddd ($D1,$T0);
-
- &movdqa ($T0,$T1);
- &psrlq ($T1,4);
- &pand ($T1,$MASK);
- &paddd ($D2,$T1);
-
- &movdqa ($T1,$T0);
- &psrlq ($T0,30);
- &pand ($T0,$MASK);
- &psrldq ($T1,7);
- &paddd ($D3,$T0);
-
- &movd ($T0,"eax"); # padbit
- &paddd ($D4,$T1);
- &movd ($T1,&DWP(16*0+12,"edi")); # r0
- &paddd ($D4,$T0);
-
- &movdqa (&QWP(16*0,"esp"),$D0);
- &movdqa (&QWP(16*1,"esp"),$D1);
- &movdqa (&QWP(16*2,"esp"),$D2);
- &movdqa (&QWP(16*3,"esp"),$D3);
- &movdqa (&QWP(16*4,"esp"),$D4);
-
- ################################################################
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- &pmuludq ($D0,$T1); # h4*r0
- &pmuludq ($D1,$T1); # h3*r0
- &pmuludq ($D2,$T1); # h2*r0
- &movd ($T0,&DWP(16*1+12,"edi")); # r1
- &pmuludq ($D3,$T1); # h1*r0
- &pmuludq ($D4,$T1); # h0*r0
-
- &pmuladd (sub { my ($reg,$i)=@_;
- &movd ($reg,&DWP(16*$i+12,"edi"));
- });
-
- &lazy_reduction ();
-
- &sub ("ecx",16);
- &jz (&label("done"));
-
-&set_label("even");
- &lea ("edx",&DWP(16*(5+5+5+9),"esp"));# size optimization
- &lea ("eax",&DWP(-16*2,"esi"));
- &sub ("ecx",64);
-
- ################################################################
- # expand and copy pre-calculated table to stack
-
- &movdqu ($T0,&QWP(16*0,"edi")); # r^1:r^2:r^3:r^4
- &pshufd ($T1,$T0,0b01000100); # duplicate r^3:r^4
- &cmovb ("esi","eax");
- &pshufd ($T0,$T0,0b11101110); # duplicate r^1:r^2
- &movdqa (&QWP(16*0,"edx"),$T1);
- &lea ("eax",&DWP(16*10,"esp"));
- &movdqu ($T1,&QWP(16*1,"edi"));
- &movdqa (&QWP(16*(0-9),"edx"),$T0);
- &pshufd ($T0,$T1,0b01000100);
- &pshufd ($T1,$T1,0b11101110);
- &movdqa (&QWP(16*1,"edx"),$T0);
- &movdqu ($T0,&QWP(16*2,"edi"));
- &movdqa (&QWP(16*(1-9),"edx"),$T1);
- &pshufd ($T1,$T0,0b01000100);
- &pshufd ($T0,$T0,0b11101110);
- &movdqa (&QWP(16*2,"edx"),$T1);
- &movdqu ($T1,&QWP(16*3,"edi"));
- &movdqa (&QWP(16*(2-9),"edx"),$T0);
- &pshufd ($T0,$T1,0b01000100);
- &pshufd ($T1,$T1,0b11101110);
- &movdqa (&QWP(16*3,"edx"),$T0);
- &movdqu ($T0,&QWP(16*4,"edi"));
- &movdqa (&QWP(16*(3-9),"edx"),$T1);
- &pshufd ($T1,$T0,0b01000100);
- &pshufd ($T0,$T0,0b11101110);
- &movdqa (&QWP(16*4,"edx"),$T1);
- &movdqu ($T1,&QWP(16*5,"edi"));
- &movdqa (&QWP(16*(4-9),"edx"),$T0);
- &pshufd ($T0,$T1,0b01000100);
- &pshufd ($T1,$T1,0b11101110);
- &movdqa (&QWP(16*5,"edx"),$T0);
- &movdqu ($T0,&QWP(16*6,"edi"));
- &movdqa (&QWP(16*(5-9),"edx"),$T1);
- &pshufd ($T1,$T0,0b01000100);
- &pshufd ($T0,$T0,0b11101110);
- &movdqa (&QWP(16*6,"edx"),$T1);
- &movdqu ($T1,&QWP(16*7,"edi"));
- &movdqa (&QWP(16*(6-9),"edx"),$T0);
- &pshufd ($T0,$T1,0b01000100);
- &pshufd ($T1,$T1,0b11101110);
- &movdqa (&QWP(16*7,"edx"),$T0);
- &movdqu ($T0,&QWP(16*8,"edi"));
- &movdqa (&QWP(16*(7-9),"edx"),$T1);
- &pshufd ($T1,$T0,0b01000100);
- &pshufd ($T0,$T0,0b11101110);
- &movdqa (&QWP(16*8,"edx"),$T1);
- &movdqa (&QWP(16*(8-9),"edx"),$T0);
-
-sub load_input {
-my ($inpbase,$offbase)=@_;
-
- &movdqu ($T0,&QWP($inpbase+0,"esi")); # load input
- &movdqu ($T1,&QWP($inpbase+16,"esi"));
- &lea ("esi",&DWP(16*2,"esi"));
-
- &movdqa (&QWP($offbase+16*2,"esp"),$D2);
- &movdqa (&QWP($offbase+16*3,"esp"),$D3);
- &movdqa (&QWP($offbase+16*4,"esp"),$D4);
-
- &movdqa ($D2,$T0); # splat input
- &movdqa ($D3,$T1);
- &psrldq ($D2,6);
- &psrldq ($D3,6);
- &movdqa ($D4,$T0);
- &punpcklqdq ($D2,$D3); # 2:3
- &punpckhqdq ($D4,$T1); # 4
- &punpcklqdq ($T0,$T1); # 0:1
-
- &movdqa ($D3,$D2);
- &psrlq ($D2,4);
- &psrlq ($D3,30);
- &movdqa ($T1,$T0);
- &psrlq ($D4,40); # 4
- &psrlq ($T1,26);
- &pand ($T0,$MASK); # 0
- &pand ($T1,$MASK); # 1
- &pand ($D2,$MASK); # 2
- &pand ($D3,$MASK); # 3
- &por ($D4,&QWP(0,"ebx")); # padbit, yes, always
-
- &movdqa (&QWP($offbase+16*0,"esp"),$D0) if ($offbase);
- &movdqa (&QWP($offbase+16*1,"esp"),$D1) if ($offbase);
-}
- &load_input (16*2,16*5);
-
- &jbe (&label("skip_loop"));
- &jmp (&label("loop"));
-
-&set_label("loop",32);
- ################################################################
- # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- # \___________________/
- # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- # \___________________/ \____________________/
- ################################################################
-
- &movdqa ($T2,&QWP(16*(0-9),"edx")); # r0^2
- &movdqa (&QWP(16*1,"eax"),$T1);
- &movdqa (&QWP(16*2,"eax"),$D2);
- &movdqa (&QWP(16*3,"eax"),$D3);
- &movdqa (&QWP(16*4,"eax"),$D4);
-
- ################################################################
- # d4 = h4*r0 + h0*r4 + h1*r3 + h2*r2 + h3*r1
- # d3 = h3*r0 + h0*r3 + h1*r2 + h2*r1 + h4*5*r4
- # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
- # d1 = h1*r0 + h0*r1 + h2*5*r4 + h3*5*r3 + h4*5*r2
- # d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
-
- &movdqa ($D1,$T0);
- &pmuludq ($T0,$T2); # h0*r0
- &movdqa ($D0,$T1);
- &pmuludq ($T1,$T2); # h1*r0
- &pmuludq ($D2,$T2); # h2*r0
- &pmuludq ($D3,$T2); # h3*r0
- &pmuludq ($D4,$T2); # h4*r0
-
-sub pmuladd_alt {
-my $addr = shift;
-
- &pmuludq ($D0,&$addr(8)); # h1*s4
- &movdqa ($T2,$D1);
- &pmuludq ($D1,&$addr(1)); # h0*r1
- &paddq ($D0,$T0);
- &movdqa ($T0,$T2);
- &pmuludq ($T2,&$addr(2)); # h0*r2
- &paddq ($D1,$T1);
- &movdqa ($T1,$T0);
- &pmuludq ($T0,&$addr(3)); # h0*r3
- &paddq ($D2,$T2);
- &movdqa ($T2,&QWP(16*1,"eax")); # pull h1
- &pmuludq ($T1,&$addr(4)); # h0*r4
- &paddq ($D3,$T0);
-
- &movdqa ($T0,$T2);
- &pmuludq ($T2,&$addr(1)); # h1*r1
- &paddq ($D4,$T1);
- &movdqa ($T1,$T0);
- &pmuludq ($T0,&$addr(2)); # h1*r2
- &paddq ($D2,$T2);
- &movdqa ($T2,&QWP(16*2,"eax")); # pull h2
- &pmuludq ($T1,&$addr(3)); # h1*r3
- &paddq ($D3,$T0);
- &movdqa ($T0,$T2);
- &pmuludq ($T2,&$addr(7)); # h2*s3
- &paddq ($D4,$T1);
- &movdqa ($T1,$T0);
- &pmuludq ($T0,&$addr(8)); # h2*s4
- &paddq ($D0,$T2);
-
- &movdqa ($T2,$T1);
- &pmuludq ($T1,&$addr(1)); # h2*r1
- &paddq ($D1,$T0);
- &movdqa ($T0,&QWP(16*3,"eax")); # pull h3
- &pmuludq ($T2,&$addr(2)); # h2*r2
- &paddq ($D3,$T1);
- &movdqa ($T1,$T0);
- &pmuludq ($T0,&$addr(6)); # h3*s2
- &paddq ($D4,$T2);
- &movdqa ($T2,$T1);
- &pmuludq ($T1,&$addr(7)); # h3*s3
- &paddq ($D0,$T0);
- &movdqa ($T0,$T2);
- &pmuludq ($T2,&$addr(8)); # h3*s4
- &paddq ($D1,$T1);
-
- &movdqa ($T1,&QWP(16*4,"eax")); # pull h4
- &pmuludq ($T0,&$addr(1)); # h3*r1
- &paddq ($D2,$T2);
- &movdqa ($T2,$T1);
- &pmuludq ($T1,&$addr(8)); # h4*s4
- &paddq ($D4,$T0);
- &movdqa ($T0,$T2);
- &pmuludq ($T2,&$addr(5)); # h4*s1
- &paddq ($D3,$T1);
- &movdqa ($T1,$T0);
- &pmuludq ($T0,&$addr(6)); # h4*s2
- &paddq ($D0,$T2);
- &movdqa ($MASK,&QWP(64,"ebx"));
- &pmuludq ($T1,&$addr(7)); # h4*s3
- &paddq ($D1,$T0);
- &paddq ($D2,$T1);
-}
- &pmuladd_alt (sub { my $i=shift; &QWP(16*($i-9),"edx"); });
-
- &load_input (-16*2,0);
- &lea ("eax",&DWP(-16*2,"esi"));
- &sub ("ecx",64);
-
- &paddd ($T0,&QWP(16*(5+0),"esp")); # add hash value
- &paddd ($T1,&QWP(16*(5+1),"esp"));
- &paddd ($D2,&QWP(16*(5+2),"esp"));
- &paddd ($D3,&QWP(16*(5+3),"esp"));
- &paddd ($D4,&QWP(16*(5+4),"esp"));
-
- &cmovb ("esi","eax");
- &lea ("eax",&DWP(16*10,"esp"));
-
- &movdqa ($T2,&QWP(16*0,"edx")); # r0^4
- &movdqa (&QWP(16*1,"esp"),$D1);
- &movdqa (&QWP(16*1,"eax"),$T1);
- &movdqa (&QWP(16*2,"eax"),$D2);
- &movdqa (&QWP(16*3,"eax"),$D3);
- &movdqa (&QWP(16*4,"eax"),$D4);
-
- ################################################################
- # d4 += h4*r0 + h0*r4 + h1*r3 + h2*r2 + h3*r1
- # d3 += h3*r0 + h0*r3 + h1*r2 + h2*r1 + h4*5*r4
- # d2 += h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
- # d1 += h1*r0 + h0*r1 + h2*5*r4 + h3*5*r3 + h4*5*r2
- # d0 += h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
-
- &movdqa ($D1,$T0);
- &pmuludq ($T0,$T2); # h0*r0
- &paddq ($T0,$D0);
- &movdqa ($D0,$T1);
- &pmuludq ($T1,$T2); # h1*r0
- &pmuludq ($D2,$T2); # h2*r0
- &pmuludq ($D3,$T2); # h3*r0
- &pmuludq ($D4,$T2); # h4*r0
-
- &paddq ($T1,&QWP(16*1,"esp"));
- &paddq ($D2,&QWP(16*2,"esp"));
- &paddq ($D3,&QWP(16*3,"esp"));
- &paddq ($D4,&QWP(16*4,"esp"));
-
- &pmuladd_alt (sub { my $i=shift; &QWP(16*$i,"edx"); });
-
- &lazy_reduction ();
-
- &load_input (16*2,16*5);
-
- &ja (&label("loop"));
-
-&set_label("skip_loop");
- ################################################################
- # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- &pshufd ($T2,&QWP(16*(0-9),"edx"),0x10);# r0^n
- &add ("ecx",32);
- &jnz (&label("long_tail"));
-
- &paddd ($T0,$D0); # add hash value
- &paddd ($T1,$D1);
- &paddd ($D2,&QWP(16*7,"esp"));
- &paddd ($D3,&QWP(16*8,"esp"));
- &paddd ($D4,&QWP(16*9,"esp"));
-
-&set_label("long_tail");
-
- &movdqa (&QWP(16*0,"eax"),$T0);
- &movdqa (&QWP(16*1,"eax"),$T1);
- &movdqa (&QWP(16*2,"eax"),$D2);
- &movdqa (&QWP(16*3,"eax"),$D3);
- &movdqa (&QWP(16*4,"eax"),$D4);
-
- ################################################################
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- &pmuludq ($T0,$T2); # h0*r0
- &pmuludq ($T1,$T2); # h1*r0
- &pmuludq ($D2,$T2); # h2*r0
- &movdqa ($D0,$T0);
- &pshufd ($T0,&QWP(16*(1-9),"edx"),0x10);# r1^n
- &pmuludq ($D3,$T2); # h3*r0
- &movdqa ($D1,$T1);
- &pmuludq ($D4,$T2); # h4*r0
-
- &pmuladd (sub { my ($reg,$i)=@_;
- &pshufd ($reg,&QWP(16*($i-9),"edx"),0x10);
- },"eax");
-
- &jz (&label("short_tail"));
-
- &load_input (-16*2,0);
-
- &pshufd ($T2,&QWP(16*0,"edx"),0x10); # r0^n
- &paddd ($T0,&QWP(16*5,"esp")); # add hash value
- &paddd ($T1,&QWP(16*6,"esp"));
- &paddd ($D2,&QWP(16*7,"esp"));
- &paddd ($D3,&QWP(16*8,"esp"));
- &paddd ($D4,&QWP(16*9,"esp"));
-
- ################################################################
- # multiply inp[0:1] by r^4:r^3 and accumulate
-
- &movdqa (&QWP(16*0,"esp"),$T0);
- &pmuludq ($T0,$T2); # h0*r0
- &movdqa (&QWP(16*1,"esp"),$T1);
- &pmuludq ($T1,$T2); # h1*r0
- &paddq ($D0,$T0);
- &movdqa ($T0,$D2);
- &pmuludq ($D2,$T2); # h2*r0
- &paddq ($D1,$T1);
- &movdqa ($T1,$D3);
- &pmuludq ($D3,$T2); # h3*r0
- &paddq ($D2,&QWP(16*2,"esp"));
- &movdqa (&QWP(16*2,"esp"),$T0);
- &pshufd ($T0,&QWP(16*1,"edx"),0x10); # r1^n
- &paddq ($D3,&QWP(16*3,"esp"));
- &movdqa (&QWP(16*3,"esp"),$T1);
- &movdqa ($T1,$D4);
- &pmuludq ($D4,$T2); # h4*r0
- &paddq ($D4,&QWP(16*4,"esp"));
- &movdqa (&QWP(16*4,"esp"),$T1);
-
- &pmuladd (sub { my ($reg,$i)=@_;
- &pshufd ($reg,&QWP(16*$i,"edx"),0x10);
- });
-
-&set_label("short_tail");
-
- ################################################################
- # horizontal addition
-
- &pshufd ($T1,$D4,0b01001110);
- &pshufd ($T0,$D3,0b01001110);
- &paddq ($D4,$T1);
- &paddq ($D3,$T0);
- &pshufd ($T1,$D0,0b01001110);
- &pshufd ($T0,$D1,0b01001110);
- &paddq ($D0,$T1);
- &paddq ($D1,$T0);
- &pshufd ($T1,$D2,0b01001110);
- #&paddq ($D2,$T1);
-
- &lazy_reduction (sub { &paddq ($D2,$T1) });
-
-&set_label("done");
- &movd (&DWP(-16*3+4*0,"edi"),$D0); # store hash value
- &movd (&DWP(-16*3+4*1,"edi"),$D1);
- &movd (&DWP(-16*3+4*2,"edi"),$D2);
- &movd (&DWP(-16*3+4*3,"edi"),$D3);
- &movd (&DWP(-16*3+4*4,"edi"),$D4);
- &mov ("esp","ebp");
-&set_label("nodata");
-&function_end("_poly1305_blocks_sse2");
-
-&align (32);
-&function_begin("_poly1305_emit_sse2");
- &mov ("ebp",&wparam(0)); # context
-
- &cmp (&DWP(4*5,"ebp"),0); # is_base2_26?
- &je (&label("enter_emit"));
-
- &mov ("eax",&DWP(4*0,"ebp")); # load hash value
- &mov ("edi",&DWP(4*1,"ebp"));
- &mov ("ecx",&DWP(4*2,"ebp"));
- &mov ("edx",&DWP(4*3,"ebp"));
- &mov ("esi",&DWP(4*4,"ebp"));
-
- &mov ("ebx","edi"); # base 2^26 -> base 2^32
- &shl ("edi",26);
- &shr ("ebx",6);
- &add ("eax","edi");
- &mov ("edi","ecx");
- &adc ("ebx",0);
-
- &shl ("edi",20);
- &shr ("ecx",12);
- &add ("ebx","edi");
- &mov ("edi","edx");
- &adc ("ecx",0);
-
- &shl ("edi",14);
- &shr ("edx",18);
- &add ("ecx","edi");
- &mov ("edi","esi");
- &adc ("edx",0);
-
- &shl ("edi",8);
- &shr ("esi",24);
- &add ("edx","edi");
- &adc ("esi",0); # can be partially reduced
-
- &mov ("edi","esi"); # final reduction
- &and ("esi",3);
- &shr ("edi",2);
- &lea ("ebp",&DWP(0,"edi","edi",4)); # *5
- &mov ("edi",&wparam(1)); # output
- add ("eax","ebp");
- &mov ("ebp",&wparam(2)); # key
- adc ("ebx",0);
- adc ("ecx",0);
- adc ("edx",0);
-
- &movd ($D0,"eax"); # offload original hash value
- &add ("eax",5); # compare to modulus
- &movd ($D1,"ebx");
- &adc ("ebx",0);
- &movd ($D2,"ecx");
- &adc ("ecx",0);
- &movd ($D3,"edx");
- &adc ("edx",0);
- &adc ("esi",0);
- &shr ("esi",2); # did it carry/borrow?
-
- &neg ("esi"); # do we choose (hash-modulus) ...
- &and ("eax","esi");
- &and ("ebx","esi");
- &and ("ecx","esi");
- &and ("edx","esi");
- &mov (&DWP(4*0,"edi"),"eax");
- &movd ("eax",$D0);
- &mov (&DWP(4*1,"edi"),"ebx");
- &movd ("ebx",$D1);
- &mov (&DWP(4*2,"edi"),"ecx");
- &movd ("ecx",$D2);
- &mov (&DWP(4*3,"edi"),"edx");
- &movd ("edx",$D3);
-
- ¬ ("esi"); # ... or original hash value?
- &and ("eax","esi");
- &and ("ebx","esi");
- &or ("eax",&DWP(4*0,"edi"));
- &and ("ecx","esi");
- &or ("ebx",&DWP(4*1,"edi"));
- &and ("edx","esi");
- &or ("ecx",&DWP(4*2,"edi"));
- &or ("edx",&DWP(4*3,"edi"));
-
- &add ("eax",&DWP(4*0,"ebp")); # accumulate key
- &adc ("ebx",&DWP(4*1,"ebp"));
- &mov (&DWP(4*0,"edi"),"eax");
- &adc ("ecx",&DWP(4*2,"ebp"));
- &mov (&DWP(4*1,"edi"),"ebx");
- &adc ("edx",&DWP(4*3,"ebp"));
- &mov (&DWP(4*2,"edi"),"ecx");
- &mov (&DWP(4*3,"edi"),"edx");
-&function_end("_poly1305_emit_sse2");
-
-if ($avx>1) {
-########################################################################
-# Note that poly1305_init_avx2 operates on %xmm, I could have used
-# poly1305_init_sse2...
-
-&align (32);
-&function_begin_B("_poly1305_init_avx2");
- &vmovdqu ($D4,&QWP(4*6,"edi")); # key base 2^32
- &lea ("edi",&DWP(16*3,"edi")); # size optimization
- &mov ("ebp","esp");
- &sub ("esp",16*(9+5));
- &and ("esp",-16);
-
- #&vpand ($D4,$D4,&QWP(96,"ebx")); # magic mask
- &vmovdqa ($MASK,&QWP(64,"ebx"));
-
- &vpand ($D0,$D4,$MASK); # -> base 2^26
- &vpsrlq ($D1,$D4,26);
- &vpsrldq ($D3,$D4,6);
- &vpand ($D1,$D1,$MASK);
- &vpsrlq ($D2,$D3,4)
- &vpsrlq ($D3,$D3,30);
- &vpand ($D2,$D2,$MASK);
- &vpand ($D3,$D3,$MASK);
- &vpsrldq ($D4,$D4,13);
-
- &lea ("edx",&DWP(16*9,"esp")); # size optimization
- &mov ("ecx",2);
-&set_label("square");
- &vmovdqa (&QWP(16*0,"esp"),$D0);
- &vmovdqa (&QWP(16*1,"esp"),$D1);
- &vmovdqa (&QWP(16*2,"esp"),$D2);
- &vmovdqa (&QWP(16*3,"esp"),$D3);
- &vmovdqa (&QWP(16*4,"esp"),$D4);
-
- &vpslld ($T1,$D1,2);
- &vpslld ($T0,$D2,2);
- &vpaddd ($T1,$T1,$D1); # *5
- &vpaddd ($T0,$T0,$D2); # *5
- &vmovdqa (&QWP(16*5,"esp"),$T1);
- &vmovdqa (&QWP(16*6,"esp"),$T0);
- &vpslld ($T1,$D3,2);
- &vpslld ($T0,$D4,2);
- &vpaddd ($T1,$T1,$D3); # *5
- &vpaddd ($T0,$T0,$D4); # *5
- &vmovdqa (&QWP(16*7,"esp"),$T1);
- &vmovdqa (&QWP(16*8,"esp"),$T0);
-
- &vpshufd ($T0,$D0,0b01000100);
- &vmovdqa ($T1,$D1);
- &vpshufd ($D1,$D1,0b01000100);
- &vpshufd ($D2,$D2,0b01000100);
- &vpshufd ($D3,$D3,0b01000100);
- &vpshufd ($D4,$D4,0b01000100);
- &vmovdqa (&QWP(16*0,"edx"),$T0);
- &vmovdqa (&QWP(16*1,"edx"),$D1);
- &vmovdqa (&QWP(16*2,"edx"),$D2);
- &vmovdqa (&QWP(16*3,"edx"),$D3);
- &vmovdqa (&QWP(16*4,"edx"),$D4);
-
- ################################################################
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- &vpmuludq ($D4,$D4,$D0); # h4*r0
- &vpmuludq ($D3,$D3,$D0); # h3*r0
- &vpmuludq ($D2,$D2,$D0); # h2*r0
- &vpmuludq ($D1,$D1,$D0); # h1*r0
- &vpmuludq ($D0,$T0,$D0); # h0*r0
-
- &vpmuludq ($T0,$T1,&QWP(16*3,"edx")); # r1*h3
- &vpaddq ($D4,$D4,$T0);
- &vpmuludq ($T2,$T1,&QWP(16*2,"edx")); # r1*h2
- &vpaddq ($D3,$D3,$T2);
- &vpmuludq ($T0,$T1,&QWP(16*1,"edx")); # r1*h1
- &vpaddq ($D2,$D2,$T0);
- &vmovdqa ($T2,&QWP(16*5,"esp")); # s1
- &vpmuludq ($T1,$T1,&QWP(16*0,"edx")); # r1*h0
- &vpaddq ($D1,$D1,$T1);
- &vmovdqa ($T0,&QWP(16*2,"esp")); # r2
- &vpmuludq ($T2,$T2,&QWP(16*4,"edx")); # s1*h4
- &vpaddq ($D0,$D0,$T2);
-
- &vpmuludq ($T1,$T0,&QWP(16*2,"edx")); # r2*h2
- &vpaddq ($D4,$D4,$T1);
- &vpmuludq ($T2,$T0,&QWP(16*1,"edx")); # r2*h1
- &vpaddq ($D3,$D3,$T2);
- &vmovdqa ($T1,&QWP(16*6,"esp")); # s2
- &vpmuludq ($T0,$T0,&QWP(16*0,"edx")); # r2*h0
- &vpaddq ($D2,$D2,$T0);
- &vpmuludq ($T2,$T1,&QWP(16*4,"edx")); # s2*h4
- &vpaddq ($D1,$D1,$T2);
- &vmovdqa ($T0,&QWP(16*3,"esp")); # r3
- &vpmuludq ($T1,$T1,&QWP(16*3,"edx")); # s2*h3
- &vpaddq ($D0,$D0,$T1);
-
- &vpmuludq ($T2,$T0,&QWP(16*1,"edx")); # r3*h1
- &vpaddq ($D4,$D4,$T2);
- &vmovdqa ($T1,&QWP(16*7,"esp")); # s3
- &vpmuludq ($T0,$T0,&QWP(16*0,"edx")); # r3*h0
- &vpaddq ($D3,$D3,$T0);
- &vpmuludq ($T2,$T1,&QWP(16*4,"edx")); # s3*h4
- &vpaddq ($D2,$D2,$T2);
- &vpmuludq ($T0,$T1,&QWP(16*3,"edx")); # s3*h3
- &vpaddq ($D1,$D1,$T0);
- &vmovdqa ($T2,&QWP(16*4,"esp")); # r4
- &vpmuludq ($T1,$T1,&QWP(16*2,"edx")); # s3*h2
- &vpaddq ($D0,$D0,$T1);
-
- &vmovdqa ($T0,&QWP(16*8,"esp")); # s4
- &vpmuludq ($T2,$T2,&QWP(16*0,"edx")); # r4*h0
- &vpaddq ($D4,$D4,$T2);
- &vpmuludq ($T1,$T0,&QWP(16*4,"edx")); # s4*h4
- &vpaddq ($D3,$D3,$T1);
- &vpmuludq ($T2,$T0,&QWP(16*1,"edx")); # s4*h1
- &vpaddq ($D0,$D0,$T2);
- &vpmuludq ($T1,$T0,&QWP(16*2,"edx")); # s4*h2
- &vpaddq ($D1,$D1,$T1);
- &vmovdqa ($MASK,&QWP(64,"ebx"));
- &vpmuludq ($T0,$T0,&QWP(16*3,"edx")); # s4*h3
- &vpaddq ($D2,$D2,$T0);
-
- ################################################################
- # lazy reduction
- &vpsrlq ($T0,$D3,26);
- &vpand ($D3,$D3,$MASK);
- &vpsrlq ($T1,$D0,26);
- &vpand ($D0,$D0,$MASK);
- &vpaddq ($D4,$D4,$T0); # h3 -> h4
- &vpaddq ($D1,$D1,$T1); # h0 -> h1
- &vpsrlq ($T0,$D4,26);
- &vpand ($D4,$D4,$MASK);
- &vpsrlq ($T1,$D1,26);
- &vpand ($D1,$D1,$MASK);
- &vpaddq ($D2,$D2,$T1); # h1 -> h2
- &vpaddd ($D0,$D0,$T0);
- &vpsllq ($T0,$T0,2);
- &vpsrlq ($T1,$D2,26);
- &vpand ($D2,$D2,$MASK);
- &vpaddd ($D0,$D0,$T0); # h4 -> h0
- &vpaddd ($D3,$D3,$T1); # h2 -> h3
- &vpsrlq ($T1,$D3,26);
- &vpsrlq ($T0,$D0,26);
- &vpand ($D0,$D0,$MASK);
- &vpand ($D3,$D3,$MASK);
- &vpaddd ($D1,$D1,$T0); # h0 -> h1
- &vpaddd ($D4,$D4,$T1); # h3 -> h4
-
- &dec ("ecx");
- &jz (&label("square_break"));
-
- &vpunpcklqdq ($D0,$D0,&QWP(16*0,"esp")); # 0:r^1:0:r^2
- &vpunpcklqdq ($D1,$D1,&QWP(16*1,"esp"));
- &vpunpcklqdq ($D2,$D2,&QWP(16*2,"esp"));
- &vpunpcklqdq ($D3,$D3,&QWP(16*3,"esp"));
- &vpunpcklqdq ($D4,$D4,&QWP(16*4,"esp"));
- &jmp (&label("square"));
-
-&set_label("square_break");
- &vpsllq ($D0,$D0,32); # -> r^3:0:r^4:0
- &vpsllq ($D1,$D1,32);
- &vpsllq ($D2,$D2,32);
- &vpsllq ($D3,$D3,32);
- &vpsllq ($D4,$D4,32);
- &vpor ($D0,$D0,&QWP(16*0,"esp")); # r^3:r^1:r^4:r^2
- &vpor ($D1,$D1,&QWP(16*1,"esp"));
- &vpor ($D2,$D2,&QWP(16*2,"esp"));
- &vpor ($D3,$D3,&QWP(16*3,"esp"));
- &vpor ($D4,$D4,&QWP(16*4,"esp"));
-
- &vpshufd ($D0,$D0,0b10001101); # -> r^1:r^2:r^3:r^4
- &vpshufd ($D1,$D1,0b10001101);
- &vpshufd ($D2,$D2,0b10001101);
- &vpshufd ($D3,$D3,0b10001101);
- &vpshufd ($D4,$D4,0b10001101);
-
- &vmovdqu (&QWP(16*0,"edi"),$D0); # save the table
- &vmovdqu (&QWP(16*1,"edi"),$D1);
- &vmovdqu (&QWP(16*2,"edi"),$D2);
- &vmovdqu (&QWP(16*3,"edi"),$D3);
- &vmovdqu (&QWP(16*4,"edi"),$D4);
-
- &vpslld ($T1,$D1,2);
- &vpslld ($T0,$D2,2);
- &vpaddd ($T1,$T1,$D1); # *5
- &vpaddd ($T0,$T0,$D2); # *5
- &vmovdqu (&QWP(16*5,"edi"),$T1);
- &vmovdqu (&QWP(16*6,"edi"),$T0);
- &vpslld ($T1,$D3,2);
- &vpslld ($T0,$D4,2);
- &vpaddd ($T1,$T1,$D3); # *5
- &vpaddd ($T0,$T0,$D4); # *5
- &vmovdqu (&QWP(16*7,"edi"),$T1);
- &vmovdqu (&QWP(16*8,"edi"),$T0);
-
- &mov ("esp","ebp");
- &lea ("edi",&DWP(-16*3,"edi")); # size de-optimization
- &ret ();
-&function_end_B("_poly1305_init_avx2");
-
-########################################################################
-# now it's time to switch to %ymm
-
-my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("ymm$_",(0..7));
-my $MASK=$T2;
-
-sub X { my $reg=shift; $reg=~s/^ymm/xmm/; $reg; }
-
-&align (32);
-&function_begin("_poly1305_blocks_avx2");
- &mov ("edi",&wparam(0)); # ctx
- &mov ("esi",&wparam(1)); # inp
- &mov ("ecx",&wparam(2)); # len
-
- &mov ("eax",&DWP(4*5,"edi")); # is_base2_26
- &and ("ecx",-16);
- &jz (&label("nodata"));
- &cmp ("ecx",64);
- &jae (&label("enter_avx2"));
- &test ("eax","eax"); # is_base2_26?
- &jz (&label("enter_blocks"));
-
-&set_label("enter_avx2");
- &vzeroupper ();
-
- &call (&label("pic_point"));
-&set_label("pic_point");
- &blindpop("ebx");
- &lea ("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx"));
-
- &test ("eax","eax"); # is_base2_26?
- &jnz (&label("base2_26"));
-
- &call ("_poly1305_init_avx2");
-
- ################################################# base 2^32 -> base 2^26
- &mov ("eax",&DWP(0,"edi"));
- &mov ("ecx",&DWP(3,"edi"));
- &mov ("edx",&DWP(6,"edi"));
- &mov ("esi",&DWP(9,"edi"));
- &mov ("ebp",&DWP(13,"edi"));
-
- &shr ("ecx",2);
- &and ("eax",0x3ffffff);
- &shr ("edx",4);
- &and ("ecx",0x3ffffff);
- &shr ("esi",6);
- &and ("edx",0x3ffffff);
-
- &mov (&DWP(4*0,"edi"),"eax");
- &mov (&DWP(4*1,"edi"),"ecx");
- &mov (&DWP(4*2,"edi"),"edx");
- &mov (&DWP(4*3,"edi"),"esi");
- &mov (&DWP(4*4,"edi"),"ebp");
- &mov (&DWP(4*5,"edi"),1); # is_base2_26
-
- &mov ("esi",&wparam(1)); # [reload] inp
- &mov ("ecx",&wparam(2)); # [reload] len
-
-&set_label("base2_26");
- &mov ("eax",&wparam(3)); # padbit
- &mov ("ebp","esp");
-
- &sub ("esp",32*(5+9));
- &and ("esp",-512); # ensure that frame
- # doesn't cross page
- # boundary, which is
- # essential for
- # misaligned 32-byte
- # loads
-
- ################################################################
- # expand and copy pre-calculated table to stack
-
- &vmovdqu (&X($D0),&QWP(16*(3+0),"edi"));
- &lea ("edx",&DWP(32*5+128,"esp")); # +128 size optimization
- &vmovdqu (&X($D1),&QWP(16*(3+1),"edi"));
- &vmovdqu (&X($D2),&QWP(16*(3+2),"edi"));
- &vmovdqu (&X($D3),&QWP(16*(3+3),"edi"));
- &vmovdqu (&X($D4),&QWP(16*(3+4),"edi"));
- &lea ("edi",&DWP(16*3,"edi")); # size optimization
- &vpermq ($D0,$D0,0b01000000); # 00001234 -> 12343434
- &vpermq ($D1,$D1,0b01000000);
- &vpermq ($D2,$D2,0b01000000);
- &vpermq ($D3,$D3,0b01000000);
- &vpermq ($D4,$D4,0b01000000);
- &vpshufd ($D0,$D0,0b11001000); # 12343434 -> 14243444
- &vpshufd ($D1,$D1,0b11001000);
- &vpshufd ($D2,$D2,0b11001000);
- &vpshufd ($D3,$D3,0b11001000);
- &vpshufd ($D4,$D4,0b11001000);
- &vmovdqa (&QWP(32*0-128,"edx"),$D0);
- &vmovdqu (&X($D0),&QWP(16*5,"edi"));
- &vmovdqa (&QWP(32*1-128,"edx"),$D1);
- &vmovdqu (&X($D1),&QWP(16*6,"edi"));
- &vmovdqa (&QWP(32*2-128,"edx"),$D2);
- &vmovdqu (&X($D2),&QWP(16*7,"edi"));
- &vmovdqa (&QWP(32*3-128,"edx"),$D3);
- &vmovdqu (&X($D3),&QWP(16*8,"edi"));
- &vmovdqa (&QWP(32*4-128,"edx"),$D4);
- &vpermq ($D0,$D0,0b01000000);
- &vpermq ($D1,$D1,0b01000000);
- &vpermq ($D2,$D2,0b01000000);
- &vpermq ($D3,$D3,0b01000000);
- &vpshufd ($D0,$D0,0b11001000);
- &vpshufd ($D1,$D1,0b11001000);
- &vpshufd ($D2,$D2,0b11001000);
- &vpshufd ($D3,$D3,0b11001000);
- &vmovdqa (&QWP(32*5-128,"edx"),$D0);
- &vmovd (&X($D0),&DWP(-16*3+4*0,"edi"));# load hash value
- &vmovdqa (&QWP(32*6-128,"edx"),$D1);
- &vmovd (&X($D1),&DWP(-16*3+4*1,"edi"));
- &vmovdqa (&QWP(32*7-128,"edx"),$D2);
- &vmovd (&X($D2),&DWP(-16*3+4*2,"edi"));
- &vmovdqa (&QWP(32*8-128,"edx"),$D3);
- &vmovd (&X($D3),&DWP(-16*3+4*3,"edi"));
- &vmovd (&X($D4),&DWP(-16*3+4*4,"edi"));
- &vmovdqa ($MASK,&QWP(64,"ebx"));
- &neg ("eax"); # padbit
-
- &test ("ecx",63);
- &jz (&label("even"));
-
- &mov ("edx","ecx");
- &and ("ecx",-64);
- &and ("edx",63);
-
- &vmovdqu (&X($T0),&QWP(16*0,"esi"));
- &cmp ("edx",32);
- &jb (&label("one"));
-
- &vmovdqu (&X($T1),&QWP(16*1,"esi"));
- &je (&label("two"));
-
- &vinserti128 ($T0,$T0,&QWP(16*2,"esi"),1);
- &lea ("esi",&DWP(16*3,"esi"));
- &lea ("ebx",&DWP(8,"ebx")); # three padbits
- &lea ("edx",&DWP(32*5+128+8,"esp")); # --:r^1:r^2:r^3 (*)
- &jmp (&label("tail"));
-
-&set_label("two");
- &lea ("esi",&DWP(16*2,"esi"));
- &lea ("ebx",&DWP(16,"ebx")); # two padbits
- &lea ("edx",&DWP(32*5+128+16,"esp"));# --:--:r^1:r^2 (*)
- &jmp (&label("tail"));
-
-&set_label("one");
- &lea ("esi",&DWP(16*1,"esi"));
- &vpxor ($T1,$T1,$T1);
- &lea ("ebx",&DWP(32,"ebx","eax",8)); # one or no padbits
- &lea ("edx",&DWP(32*5+128+24,"esp"));# --:--:--:r^1 (*)
- &jmp (&label("tail"));
-
-# (*) spots marked with '--' are data from the next table entry, but they
-# are multiplied by 0 and therefore rendered insignificant
-
-&set_label("even",32);
- &vmovdqu (&X($T0),&QWP(16*0,"esi")); # load input
- &vmovdqu (&X($T1),&QWP(16*1,"esi"));
- &vinserti128 ($T0,$T0,&QWP(16*2,"esi"),1);
- &vinserti128 ($T1,$T1,&QWP(16*3,"esi"),1);
- &lea ("esi",&DWP(16*4,"esi"));
- &sub ("ecx",64);
- &jz (&label("tail"));
-
-&set_label("loop");
- ################################################################
- # ((inp[0]*r^4+r[4])*r^4+r[8])*r^4
- # ((inp[1]*r^4+r[5])*r^4+r[9])*r^3
- # ((inp[2]*r^4+r[6])*r^4+r[10])*r^2
- # ((inp[3]*r^4+r[7])*r^4+r[11])*r^1
- # \________/ \_______/
- ################################################################
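
The comment above is the usual four-way Horner split: the running hash is folded into the first block, every lane is advanced by r^4 per iteration, and the tail weights the lanes by r^4, r^3, r^2 and r. The identity it relies on can be checked with a toy C program (small hypothetical modulus and values, not part of this diff; padbit handling omitted):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy check: folding four blocks in parallel, with the running hash
     * added into lane 0 and the lanes weighted by r^4..r^1, equals the
     * serial Horner recurrence.  Small modulus so 64-bit math suffices. */
    #define P 1000003ULL

    static uint64_t mulmod(uint64_t a, uint64_t b) { return (a * b) % P; }

    int main(void) {
      uint64_t r = 123457, h = 98765;
      uint64_t m[4] = {11, 22, 33, 44};
      uint64_t r2 = mulmod(r, r), r3 = mulmod(r2, r), r4 = mulmod(r3, r);

      /* serial: h = (h + m[i]) * r, one block at a time */
      uint64_t serial = h;
      for (int i = 0; i < 4; i++) serial = mulmod((serial + m[i]) % P, r);

      /* parallel: one multiply by r^4 per lane plus a weighted tail */
      uint64_t parallel = (mulmod((h + m[0]) % P, r4) + mulmod(m[1], r3) +
                           mulmod(m[2], r2) + mulmod(m[3], r)) % P;

      assert(serial == parallel);
      printf("serial == parallel == %llu\n", (unsigned long long)serial);
      return 0;
    }
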
-
-sub vsplat_input {
- &vmovdqa (&QWP(32*2,"esp"),$D2);
- &vpsrldq ($D2,$T0,6); # splat input
- &vmovdqa (&QWP(32*0,"esp"),$D0);
- &vpsrldq ($D0,$T1,6);
- &vmovdqa (&QWP(32*1,"esp"),$D1);
- &vpunpckhqdq ($D1,$T0,$T1); # 4
- &vpunpcklqdq ($T0,$T0,$T1); # 0:1
- &vpunpcklqdq ($D2,$D2,$D0); # 2:3
-
- &vpsrlq ($D0,$D2,30);
- &vpsrlq ($D2,$D2,4);
- &vpsrlq ($T1,$T0,26);
- &vpsrlq ($D1,$D1,40); # 4
- &vpand ($D2,$D2,$MASK); # 2
- &vpand ($T0,$T0,$MASK); # 0
- &vpand ($T1,$T1,$MASK); # 1
- &vpand ($D0,$D0,$MASK); # 3 (*)
- &vpor ($D1,$D1,&QWP(0,"ebx")); # padbit, yes, always
-
- # (*) note that output is counterintuitive, inp[3:4] is
- # returned in $D1-2, while $D3-4 are preserved;
-}
- &vsplat_input ();
-
-sub vpmuladd {
-my $addr = shift;
-
- &vpaddq ($D2,$D2,&QWP(32*2,"esp")); # add hash value
- &vpaddq ($T0,$T0,&QWP(32*0,"esp"));
- &vpaddq ($T1,$T1,&QWP(32*1,"esp"));
- &vpaddq ($D0,$D0,$D3);
- &vpaddq ($D1,$D1,$D4);
-
- ################################################################
- # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
- # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
- # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
- # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
- # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
-
- &vpmuludq ($D3,$D2,&$addr(1)); # d3 = h2*r1
- &vmovdqa (QWP(32*1,"esp"),$T1);
- &vpmuludq ($D4,$D2,&$addr(2)); # d4 = h2*r2
- &vmovdqa (QWP(32*3,"esp"),$D0);
- &vpmuludq ($D0,$D2,&$addr(7)); # d0 = h2*s3
- &vmovdqa (QWP(32*4,"esp"),$D1);
- &vpmuludq ($D1,$D2,&$addr(8)); # d1 = h2*s4
- &vpmuludq ($D2,$D2,&$addr(0)); # d2 = h2*r0
-
- &vpmuludq ($T2,$T0,&$addr(3)); # h0*r3
- &vpaddq ($D3,$D3,$T2); # d3 += h0*r3
- &vpmuludq ($T1,$T0,&$addr(4)); # h0*r4
- &vpaddq ($D4,$D4,$T1); # d4 + h0*r4
- &vpmuludq ($T2,$T0,&$addr(0)); # h0*r0
- &vpaddq ($D0,$D0,$T2); # d0 + h0*r0
- &vmovdqa ($T2,&QWP(32*1,"esp")); # h1
- &vpmuludq ($T1,$T0,&$addr(1)); # h0*r1
- &vpaddq ($D1,$D1,$T1); # d1 += h0*r1
- &vpmuludq ($T0,$T0,&$addr(2)); # h0*r2
- &vpaddq ($D2,$D2,$T0); # d2 += h0*r2
-
- &vpmuludq ($T1,$T2,&$addr(2)); # h1*r2
- &vpaddq ($D3,$D3,$T1); # d3 += h1*r2
- &vpmuludq ($T0,$T2,&$addr(3)); # h1*r3
- &vpaddq ($D4,$D4,$T0); # d4 += h1*r3
- &vpmuludq ($T1,$T2,&$addr(8)); # h1*s4
- &vpaddq ($D0,$D0,$T1); # d0 += h1*s4
- &vmovdqa ($T1,&QWP(32*3,"esp")); # h3
- &vpmuludq ($T0,$T2,&$addr(0)); # h1*r0
- &vpaddq ($D1,$D1,$T0); # d1 += h1*r0
- &vpmuludq ($T2,$T2,&$addr(1)); # h1*r1
- &vpaddq ($D2,$D2,$T2); # d2 += h1*r1
-
- &vpmuludq ($T0,$T1,&$addr(0)); # h3*r0
- &vpaddq ($D3,$D3,$T0); # d3 += h3*r0
- &vpmuludq ($T2,$T1,&$addr(1)); # h3*r1
- &vpaddq ($D4,$D4,$T2); # d4 += h3*r1
- &vpmuludq ($T0,$T1,&$addr(6)); # h3*s2
- &vpaddq ($D0,$D0,$T0); # d0 += h3*s2
- &vmovdqa ($T0,&QWP(32*4,"esp")); # h4
- &vpmuludq ($T2,$T1,&$addr(7)); # h3*s3
- &vpaddq ($D1,$D1,$T2); # d1+= h3*s3
- &vpmuludq ($T1,$T1,&$addr(8)); # h3*s4
- &vpaddq ($D2,$D2,$T1); # d2 += h3*s4
-
- &vpmuludq ($T2,$T0,&$addr(8)); # h4*s4
- &vpaddq ($D3,$D3,$T2); # d3 += h4*s4
- &vpmuludq ($T1,$T0,&$addr(5)); # h4*s1
- &vpaddq ($D0,$D0,$T1); # d0 += h4*s1
- &vpmuludq ($T2,$T0,&$addr(0)); # h4*r0
- &vpaddq ($D4,$D4,$T2); # d4 += h4*r0
- &vmovdqa ($MASK,&QWP(64,"ebx"));
- &vpmuludq ($T1,$T0,&$addr(6)); # h4*s2
- &vpaddq ($D1,$D1,$T1); # d1 += h4*s2
- &vpmuludq ($T0,$T0,&$addr(7)); # h4*s3
- &vpaddq ($D2,$D2,$T0); # d2 += h4*s3
-}
- &vpmuladd (sub { my $i=shift; &QWP(32*$i-128,"edx"); });
-
-sub vlazy_reduction {
- ################################################################
- # lazy reduction
-
- &vpsrlq ($T0,$D3,26);
- &vpand ($D3,$D3,$MASK);
- &vpsrlq ($T1,$D0,26);
- &vpand ($D0,$D0,$MASK);
- &vpaddq ($D4,$D4,$T0); # h3 -> h4
- &vpaddq ($D1,$D1,$T1); # h0 -> h1
- &vpsrlq ($T0,$D4,26);
- &vpand ($D4,$D4,$MASK);
- &vpsrlq ($T1,$D1,26);
- &vpand ($D1,$D1,$MASK);
- &vpaddq ($D2,$D2,$T1); # h1 -> h2
- &vpaddq ($D0,$D0,$T0);
- &vpsllq ($T0,$T0,2);
- &vpsrlq ($T1,$D2,26);
- &vpand ($D2,$D2,$MASK);
- &vpaddq ($D0,$D0,$T0); # h4 -> h0
- &vpaddq ($D3,$D3,$T1); # h2 -> h3
- &vpsrlq ($T1,$D3,26);
- &vpsrlq ($T0,$D0,26);
- &vpand ($D0,$D0,$MASK);
- &vpand ($D3,$D3,$MASK);
- &vpaddq ($D1,$D1,$T0); # h0 -> h1
- &vpaddq ($D4,$D4,$T1); # h3 -> h4
-}
- &vlazy_reduction();
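
vlazy_reduction above is the partial carry chain for 5x26-bit limbs: each limb is trimmed to 26 bits and the carry out of h4 re-enters h0 multiplied by 5 (the add plus shift-by-2 pair), leaving the limbs only partially reduced between blocks. A scalar C rendering of the same chain, a sketch rather than part of this diff:

    #include <stdint.h>

    /* Scalar rendering of vlazy_reduction: bring every limb back under 26
     * bits, with the carry out of the top limb re-entering at the bottom
     * multiplied by 5 (c + (c << 2)), since 2^130 == 5 (mod 2^130 - 5).
     * The limbs stay only partially reduced between blocks.  Sketch only. */
    static void lazy_reduce(uint64_t d[5]) {
      uint64_t c;
      c = d[3] >> 26; d[3] &= 0x3ffffff; d[4] += c;             /* h3 -> h4 */
      c = d[0] >> 26; d[0] &= 0x3ffffff; d[1] += c;             /* h0 -> h1 */
      c = d[4] >> 26; d[4] &= 0x3ffffff; d[0] += c + (c << 2);  /* h4 -> h0, *5 */
      c = d[1] >> 26; d[1] &= 0x3ffffff; d[2] += c;             /* h1 -> h2 */
      c = d[2] >> 26; d[2] &= 0x3ffffff; d[3] += c;             /* h2 -> h3 */
      c = d[0] >> 26; d[0] &= 0x3ffffff; d[1] += c;             /* h0 -> h1 */
      c = d[3] >> 26; d[3] &= 0x3ffffff; d[4] += c;             /* h3 -> h4 */
    }
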
-
- &vmovdqu (&X($T0),&QWP(16*0,"esi")); # load input
- &vmovdqu (&X($T1),&QWP(16*1,"esi"));
- &vinserti128 ($T0,$T0,&QWP(16*2,"esi"),1);
- &vinserti128 ($T1,$T1,&QWP(16*3,"esi"),1);
- &lea ("esi",&DWP(16*4,"esi"));
- &sub ("ecx",64);
- &jnz (&label("loop"));
-
-&set_label("tail");
- &vsplat_input ();
- &and ("ebx",-64); # restore pointer
-
- &vpmuladd (sub { my $i=shift; &QWP(4+32*$i-128,"edx"); });
-
- ################################################################
- # horizontal addition
-
- &vpsrldq ($T0,$D4,8);
- &vpsrldq ($T1,$D3,8);
- &vpaddq ($D4,$D4,$T0);
- &vpsrldq ($T0,$D0,8);
- &vpaddq ($D3,$D3,$T1);
- &vpsrldq ($T1,$D1,8);
- &vpaddq ($D0,$D0,$T0);
- &vpsrldq ($T0,$D2,8);
- &vpaddq ($D1,$D1,$T1);
- &vpermq ($T1,$D4,2); # keep folding
- &vpaddq ($D2,$D2,$T0);
- &vpermq ($T0,$D3,2);
- &vpaddq ($D4,$D4,$T1);
- &vpermq ($T1,$D0,2);
- &vpaddq ($D3,$D3,$T0);
- &vpermq ($T0,$D1,2);
- &vpaddq ($D0,$D0,$T1);
- &vpermq ($T1,$D2,2);
- &vpaddq ($D1,$D1,$T0);
- &vpaddq ($D2,$D2,$T1);
-
- &vlazy_reduction();
-
- &cmp ("ecx",0);
- &je (&label("done"));
-
- ################################################################
- # clear all but single word
-
- &vpshufd (&X($D0),&X($D0),0b11111100);
- &lea ("edx",&DWP(32*5+128,"esp")); # restore pointer
- &vpshufd (&X($D1),&X($D1),0b11111100);
- &vpshufd (&X($D2),&X($D2),0b11111100);
- &vpshufd (&X($D3),&X($D3),0b11111100);
- &vpshufd (&X($D4),&X($D4),0b11111100);
- &jmp (&label("even"));
-
-&set_label("done",16);
- &vmovd (&DWP(-16*3+4*0,"edi"),&X($D0));# store hash value
- &vmovd (&DWP(-16*3+4*1,"edi"),&X($D1));
- &vmovd (&DWP(-16*3+4*2,"edi"),&X($D2));
- &vmovd (&DWP(-16*3+4*3,"edi"),&X($D3));
- &vmovd (&DWP(-16*3+4*4,"edi"),&X($D4));
- &vzeroupper ();
- &mov ("esp","ebp");
-&set_label("nodata");
-&function_end("_poly1305_blocks_avx2");
-}
-&set_label("const_sse2",64);
- &data_word(1<<24,0, 1<<24,0, 1<<24,0, 1<<24,0);
- &data_word(0,0, 0,0, 0,0, 0,0);
- &data_word(0x03ffffff,0,0x03ffffff,0, 0x03ffffff,0, 0x03ffffff,0);
- &data_word(0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc);
-}
-&asciz ("Poly1305 for x86, CRYPTOGAMS by <appro\@openssl.org>");
-&align (4);
-
-&asm_finish();
-
-close STDOUT;
diff --git a/src/crypto/poly1305/asm/poly1305-x86_64.pl b/src/crypto/poly1305/asm/poly1305-x86_64.pl
deleted file mode 100755
index 3630b47..0000000
--- a/src/crypto/poly1305/asm/poly1305-x86_64.pl
+++ /dev/null
@@ -1,2235 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# This module implements Poly1305 hash for x86_64.
-#
-# March 2015
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone,
-# measured with rdtsc at fixed clock frequency.
-#
-#               IALU/gcc-4.8(*) AVX(**)     AVX2
-# P4            4.90/+120%      -
-# Core 2        2.39/+90%       -
-# Westmere      1.86/+120%      -
-# Sandy Bridge  1.39/+140%      1.10
-# Haswell       1.10/+175%      1.11        0.65
-# Skylake       1.12/+120%      0.96        0.51
-# Silvermont    2.83/+95%       -
-# VIA Nano      1.82/+150%      -
-# Sledgehammer  1.38/+160%      -
-# Bulldozer     2.21/+130%      0.97
-#
-# (*) improvement coefficients relative to clang are more modest and
-# are ~50% on most processors, in both cases we are comparing to
-# __int128 code;
-# (**) SSE2 implementation was attempted, but among non-AVX processors
-# it was faster than integer-only code only on older Intel P4 and
-# Core processors, by 30-50%, with the gain shrinking on newer parts,
-# but slower on contemporary ones, for example almost 2x slower on
-# Atom; as the former are naturally disappearing, SSE2 is deemed
-# unnecessary;
-
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
-die "can't locate x86_64-xlate.pl";
-
-$avx = 2;
-
-open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
-*STDOUT=*OUT;
-
-my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
-my ($mac,$nonce)=($inp,$len); # *_emit arguments
-my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
-my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
-
-sub poly1305_iteration {
-# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
-# output: $h0-$h2 *= $r0-$r1
-$code.=<<___;
- mulq $h0 # h0*r1
- mov %rax,$d2
- mov $r0,%rax
- mov %rdx,$d3
-
- mulq $h0 # h0*r0
- mov %rax,$h0 # future $h0
- mov $r0,%rax
- mov %rdx,$d1
-
- mulq $h1 # h1*r0
- add %rax,$d2
- mov $s1,%rax
- adc %rdx,$d3
-
- mulq $h1 # h1*s1
- mov $h2,$h1 # borrow $h1
- add %rax,$h0
- adc %rdx,$d1
-
- imulq $s1,$h1 # h2*s1
- add $h1,$d2
- mov $d1,$h1
- adc \$0,$d3
-
- imulq $r0,$h2 # h2*r0
- add $d2,$h1
- mov \$-4,%rax # mask value
- adc $h2,$d3
-
- and $d3,%rax # last reduction step
- mov $d3,$h2
- shr \$2,$d3
- and \$3,$h2
- add $d3,%rax
- add %rax,$h0
- adc \$0,$h1
-___
-}
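
For orientation, here is a rough C model of what the mulq/imulq sequence in poly1305_iteration computes, in the __int128 style the header comment compares against: the 130-bit accumulator h2:h1:h0 is multiplied by the clamped key r1:r0 and partially reduced mod 2^130 - 5. It is a sketch under those assumptions, not the hand-scheduled code itself.

    #include <stdint.h>

    typedef unsigned __int128 u128;   /* GCC/Clang extension */

    /* Multiply the 130-bit accumulator h2:h1:h0 (base 2^64, h2 holds only a
     * few bits) by the clamped key r1:r0 and partially reduce mod 2^130 - 5.
     * s1 = r1 + (r1 >> 2) pre-folds the *5/4 factor, which is valid because
     * clamping forces r1 % 4 == 0. */
    static void poly1305_iteration_ref(uint64_t h[3], uint64_t r0, uint64_t r1) {
      const uint64_t s1 = r1 + (r1 >> 2);
      u128 lo  = (u128)h[0] * r0 + (u128)h[1] * s1;
      u128 mid = (u128)h[0] * r1 + (u128)h[1] * r0 + (u128)h[2] * s1;
      const uint64_t top = h[2] * r0;              /* fits: h[2] is tiny */

      uint64_t h0 = (uint64_t)lo;
      mid += (uint64_t)(lo >> 64);
      uint64_t h1 = (uint64_t)mid;
      uint64_t d3 = (uint64_t)(mid >> 64) + top;

      /* last reduction step: everything at or above 2^130 wraps times 5 */
      uint64_t c = (d3 & ~(uint64_t)3) + (d3 >> 2);   /* 5 * (d3 >> 2) */
      h0 += c;
      h1 += (h0 < c);                 /* carry; cannot overflow in range */
      h[0] = h0;
      h[1] = h1;
      h[2] = d3 & 3;
    }
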
-
-########################################################################
-# The layout of the opaque area is as follows.
-#
-# unsigned __int64 h[3]; # current hash value base 2^64
-# unsigned __int64 r[2]; # key value base 2^64
-
-$code.=<<___;
-.text
-
-.extern OPENSSL_ia32cap_P
-
-.globl poly1305_init
-.globl poly1305_blocks
-.globl poly1305_emit
-.type poly1305_init,\@function,3
-.align 32
-poly1305_init:
- xor %rax,%rax
- mov %rax,0($ctx) # initialize hash value
- mov %rax,8($ctx)
- mov %rax,16($ctx)
-
- cmp \$0,$inp
- je .Lno_key
-
- lea poly1305_blocks(%rip),%r10
- lea poly1305_emit(%rip),%r11
-___
-$code.=<<___ if ($avx);
- mov OPENSSL_ia32cap_P+4(%rip),%r9
- lea poly1305_blocks_avx(%rip),%rax
- lea poly1305_emit_avx(%rip),%rcx
- bt \$`60-32`,%r9 # AVX?
- cmovc %rax,%r10
- cmovc %rcx,%r11
-___
-$code.=<<___ if ($avx>1);
- lea poly1305_blocks_avx2(%rip),%rax
- bt \$`5+32`,%r9 # AVX2?
- cmovc %rax,%r10
-___
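
The bt/cmovc sequence above is runtime dispatch: poly1305_init picks the widest blocks/emit pair the CPU supports and returns the pointers through the caller-supplied table. A hypothetical C equivalent (function-pointer types inferred from the ($ctx,$inp,$len,$padbit) and ($ctx,$mac,$nonce) argument comments; names invented here) could look like:

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of the feature dispatch: pick the widest blocks/emit pair the
     * CPU supports and hand it back to the caller.  Prototypes and names
     * are illustrative assumptions, not the real symbols. */
    typedef void (*poly1305_blocks_fn)(void *ctx, const uint8_t *inp,
                                       size_t len, uint32_t padbit);
    typedef void (*poly1305_emit_fn)(void *ctx, uint8_t mac[16],
                                     const uint64_t nonce[2]);

    struct poly1305_func_table {
      poly1305_blocks_fn blocks;
      poly1305_emit_fn emit;
    };

    static void poly1305_pick_impl(struct poly1305_func_table *t,
                                   poly1305_blocks_fn base_blocks,
                                   poly1305_emit_fn base_emit,
                                   poly1305_blocks_fn avx_blocks,
                                   poly1305_emit_fn avx_emit,
                                   poly1305_blocks_fn avx2_blocks,
                                   int have_avx, int have_avx2) {
      t->blocks = base_blocks;          /* integer-only fallback */
      t->emit = base_emit;
      if (have_avx) {                   /* AVX bit in OPENSSL_ia32cap_P */
        t->blocks = avx_blocks;
        t->emit = avx_emit;
      }
      if (have_avx2) {                  /* AVX2 upgrades the blocks path only */
        t->blocks = avx2_blocks;
      }
    }
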
-$code.=<<___;
- mov \$0x0ffffffc0fffffff,%rax
- mov \$0x0ffffffc0ffffffc,%rcx
- and 0($inp),%rax
- and 8($inp),%rcx
- mov %rax,24($ctx)
- mov %rcx,32($ctx)
-___
-$code.=<<___ if ($flavour !~ /elf32/);
- mov %r10,0(%rdx)
- mov %r11,8(%rdx)
-___
-$code.=<<___ if ($flavour =~ /elf32/);
- mov %r10d,0(%rdx)
- mov %r11d,4(%rdx)
-___
-$code.=<<___;
- mov \$1,%eax
-.Lno_key:
- ret
-.size poly1305_init,.-poly1305_init
-
-.type poly1305_blocks,\@function,4
-.align 32
-poly1305_blocks:
-.Lblocks:
- sub \$16,$len # too short?
- jc .Lno_data
-
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-.Lblocks_body:
-
- mov $len,%r15 # reassign $len
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- mov 0($ctx),$h0 # load hash value
- mov 8($ctx),$h1
- mov 16($ctx),$h2
-
- mov $s1,$r1
- shr \$2,$s1
- mov $r1,%rax
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
- jmp .Loop
-
-.align 32
-.Loop:
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
-___
- &poly1305_iteration();
-$code.=<<___;
- mov $r1,%rax
- sub \$16,%r15 # len-=16
- jnc .Loop
-
- mov $h0,0($ctx) # store hash value
- mov $h1,8($ctx)
- mov $h2,16($ctx)
-
- mov 0(%rsp),%r15
- mov 8(%rsp),%r14
- mov 16(%rsp),%r13
- mov 24(%rsp),%r12
- mov 32(%rsp),%rbp
- mov 40(%rsp),%rbx
- lea 48(%rsp),%rsp
-.Lno_data:
-.Lblocks_epilogue:
- ret
-.size poly1305_blocks,.-poly1305_blocks
-
-.type poly1305_emit,\@function,3
-.align 32
-poly1305_emit:
-.Lemit:
- mov 0($ctx),%r8 # load hash value
- mov 8($ctx),%r9
- mov 16($ctx),%r10
-
- mov %r8,%rax
- add \$5,%r8 # compare to modulus
- mov %r9,%rcx
- adc \$0,%r9
- adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overflow?
- cmovnz %r8,%rax
- cmovnz %r9,%rcx
-
- add 0($nonce),%rax # accumulate nonce
- adc 8($nonce),%rcx
- mov %rax,0($mac) # write result
- mov %rcx,8($mac)
-
- ret
-.size poly1305_emit,.-poly1305_emit
-___
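
poly1305_emit above performs the final conditional subtraction of the modulus and adds the nonce. A branch-free C sketch of the same step (little-endian output assumed, as in the assembly; not part of this diff):

    #include <stdint.h>
    #include <string.h>

    /* Add 5 and keep the result only if it carried past bit 130 (i.e.
     * h >= 2^130 - 5), then add the nonce mod 2^128 and store the 16-byte
     * tag.  The select is written branch-free to mirror the cmovnz pair. */
    static void poly1305_emit_ref(const uint64_t h[3], uint8_t mac[16],
                                  const uint64_t nonce[2]) {
      uint64_t g0 = h[0] + 5;
      uint64_t c  = (g0 < 5);
      uint64_t g1 = h[1] + c;
      c = (g1 < c);
      uint64_t g2 = h[2] + c;

      uint64_t mask = 0 - (uint64_t)((g2 >> 2) != 0);  /* did it reach 2^130? */
      uint64_t t0 = (g0 & mask) | (h[0] & ~mask);
      uint64_t t1 = (g1 & mask) | (h[1] & ~mask);

      t0 += nonce[0];                                  /* accumulate nonce */
      t1 += nonce[1] + (t0 < nonce[0]);

      memcpy(mac, &t0, 8);
      memcpy(mac + 8, &t1, 8);
    }
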
-if ($avx) {
-
-########################################################################
-# The layout of the opaque area is as follows.
-#
-# unsigned __int32 h[5]; # current hash value base 2^26
-# unsigned __int32 is_base2_26;
-# unsigned __int64 r[2]; # key value base 2^64
-# unsigned __int64 pad;
-# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
-#
-# where r^n are the base 2^26 digits of powers of the multiplier key. There
-# are 5 digits, but the last four are interleaved with their multiples of 5,
-# for a total of 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
-
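
Written out as a hypothetical C struct (the assembly only ever uses byte offsets, and the field names here are invented), the opaque area described above looks roughly like this:

    #include <stdint.h>

    /* Hypothetical C view of the opaque area described above.  The real
     * code addresses it purely by byte offsets (h at 0, is_base2_26 at 20,
     * r at 24, the power table at 48); the field names are invented. */
    struct poly1305_state_avx {
      uint32_t h[5];        /* current hash value, base 2^26              */
      uint32_t is_base2_26; /* set once the vector path has converted h   */
      uint64_t r[2];        /* clamped key, base 2^64                     */
      uint64_t pad;
      struct {
        uint32_t lane[4];   /* the r^2, r^1, r^4, r^3 lanes noted above   */
      } table[9];           /* r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4 */
    };
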
-my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
- map("%xmm$_",(0..15));
-
-$code.=<<___;
-.type __poly1305_block,\@abi-omnipotent
-.align 32
-__poly1305_block:
-___
- &poly1305_iteration();
-$code.=<<___;
- ret
-.size __poly1305_block,.-__poly1305_block
-
-.type __poly1305_init_avx,\@abi-omnipotent
-.align 32
-__poly1305_init_avx:
- mov $r0,$h0
- mov $r1,$h1
- xor $h2,$h2
-
- lea 48+64($ctx),$ctx # size optimization
-
- mov $r1,%rax
- call __poly1305_block # r^2
-
- mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
- mov \$0x3ffffff,%edx
- mov $h0,$d1
- and $h0#d,%eax
- mov $r0,$d2
- and $r0#d,%edx
- mov %eax,`16*0+0-64`($ctx)
- shr \$26,$d1
- mov %edx,`16*0+4-64`($ctx)
- shr \$26,$d2
-
- mov \$0x3ffffff,%eax
- mov \$0x3ffffff,%edx
- and $d1#d,%eax
- and $d2#d,%edx
- mov %eax,`16*1+0-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov %edx,`16*1+4-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- mov %eax,`16*2+0-64`($ctx)
- shr \$26,$d1
- mov %edx,`16*2+4-64`($ctx)
- shr \$26,$d2
-
- mov $h1,%rax
- mov $r1,%rdx
- shl \$12,%rax
- shl \$12,%rdx
- or $d1,%rax
- or $d2,%rdx
- and \$0x3ffffff,%eax
- and \$0x3ffffff,%edx
- mov %eax,`16*3+0-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov %edx,`16*3+4-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- mov %eax,`16*4+0-64`($ctx)
- mov $h1,$d1
- mov %edx,`16*4+4-64`($ctx)
- mov $r1,$d2
-
- mov \$0x3ffffff,%eax
- mov \$0x3ffffff,%edx
- shr \$14,$d1
- shr \$14,$d2
- and $d1#d,%eax
- and $d2#d,%edx
- mov %eax,`16*5+0-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov %edx,`16*5+4-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- mov %eax,`16*6+0-64`($ctx)
- shr \$26,$d1
- mov %edx,`16*6+4-64`($ctx)
- shr \$26,$d2
-
- mov $h2,%rax
- shl \$24,%rax
- or %rax,$d1
- mov $d1#d,`16*7+0-64`($ctx)
- lea ($d1,$d1,4),$d1 # *5
- mov $d2#d,`16*7+4-64`($ctx)
- lea ($d2,$d2,4),$d2 # *5
- mov $d1#d,`16*8+0-64`($ctx)
- mov $d2#d,`16*8+4-64`($ctx)
-
- mov $r1,%rax
- call __poly1305_block # r^3
-
- mov \$0x3ffffff,%eax # save r^3 base 2^26
- mov $h0,$d1
- and $h0#d,%eax
- shr \$26,$d1
- mov %eax,`16*0+12-64`($ctx)
-
- mov \$0x3ffffff,%edx
- and $d1#d,%edx
- mov %edx,`16*1+12-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*2+12-64`($ctx)
-
- mov $h1,%rax
- shl \$12,%rax
- or $d1,%rax
- and \$0x3ffffff,%eax
- mov %eax,`16*3+12-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov $h1,$d1
- mov %eax,`16*4+12-64`($ctx)
-
- mov \$0x3ffffff,%edx
- shr \$14,$d1
- and $d1#d,%edx
- mov %edx,`16*5+12-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*6+12-64`($ctx)
-
- mov $h2,%rax
- shl \$24,%rax
- or %rax,$d1
- mov $d1#d,`16*7+12-64`($ctx)
- lea ($d1,$d1,4),$d1 # *5
- mov $d1#d,`16*8+12-64`($ctx)
-
- mov $r1,%rax
- call __poly1305_block # r^4
-
- mov \$0x3ffffff,%eax # save r^4 base 2^26
- mov $h0,$d1
- and $h0#d,%eax
- shr \$26,$d1
- mov %eax,`16*0+8-64`($ctx)
-
- mov \$0x3ffffff,%edx
- and $d1#d,%edx
- mov %edx,`16*1+8-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*2+8-64`($ctx)
-
- mov $h1,%rax
- shl \$12,%rax
- or $d1,%rax
- and \$0x3ffffff,%eax
- mov %eax,`16*3+8-64`($ctx)
- lea (%rax,%rax,4),%eax # *5
- mov $h1,$d1
- mov %eax,`16*4+8-64`($ctx)
-
- mov \$0x3ffffff,%edx
- shr \$14,$d1
- and $d1#d,%edx
- mov %edx,`16*5+8-64`($ctx)
- lea (%rdx,%rdx,4),%edx # *5
- shr \$26,$d1
- mov %edx,`16*6+8-64`($ctx)
-
- mov $h2,%rax
- shl \$24,%rax
- or %rax,$d1
- mov $d1#d,`16*7+8-64`($ctx)
- lea ($d1,$d1,4),$d1 # *5
- mov $d1#d,`16*8+8-64`($ctx)
-
- lea -48-64($ctx),$ctx # size [de-]optimization
- ret
-.size __poly1305_init_avx,.-__poly1305_init_avx
-
-.type poly1305_blocks_avx,\@function,4
-.align 32
-poly1305_blocks_avx:
- mov 20($ctx),%r8d # is_base2_26
- cmp \$128,$len
- jae .Lblocks_avx
- test %r8d,%r8d
- jz .Lblocks
-
-.Lblocks_avx:
- and \$-16,$len
- jz .Lno_data_avx
-
- vzeroupper
-
- test %r8d,%r8d
- jz .Lbase2_64_avx
-
- test \$31,$len
- jz .Leven_avx
-
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-.Lblocks_avx_body:
-
- mov $len,%r15 # reassign $len
-
- mov 0($ctx),$d1 # load hash value
- mov 8($ctx),$d2
- mov 16($ctx),$h2#d
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- ################################# base 2^26 -> base 2^64
- mov $d1#d,$h0#d
- and \$-1<<31,$d1
- mov $d2,$r1 # borrow $r1
- mov $d2#d,$h1#d
- and \$-1<<31,$d2
-
- shr \$6,$d1
- shl \$52,$r1
- add $d1,$h0
- shr \$12,$h1
- shr \$18,$d2
- add $r1,$h0
- adc $d2,$h1
-
- mov $h2,$d1
- shl \$40,$d1
- shr \$24,$h2
- add $d1,$h1
- adc \$0,$h2 # can be partially reduced...
-
- mov \$-4,$d2 # ... so reduce
- mov $h2,$d1
- and $h2,$d2
- shr \$2,$d1
- and \$3,$h2
- add $d2,$d1 # =*5
- add $d1,$h0
- adc \$0,$h1
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
-
- call __poly1305_block
-
- test $padbit,$padbit # if $padbit is zero,
- jz .Lstore_base2_64_avx # store hash in base 2^64 format
-
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$r0
- mov $h1,$r1
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$r0
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $r0,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$r1
- and \$0x3ffffff,$h1 # h[3]
- or $r1,$h2 # h[4]
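
The shift/mask ladder above re-splits the 130-bit value held in h2:h1:h0 (base 2^64) into five 26-bit digits. A C sketch of the same conversion, for reference only:

    #include <stdint.h>

    /* Base 2^64 -> base 2^26: re-split the 130-bit value h2:h1:h0 into five
     * 26-bit digits d[0..4].  Sketch mirroring the shifts above. */
    static void base2_64_to_2_26(uint32_t d[5], const uint64_t h[3]) {
      d[0] = (uint32_t)(h[0] & 0x3ffffff);                         /* bits   0..25  */
      d[1] = (uint32_t)((h[0] >> 26) & 0x3ffffff);                 /* bits  26..51  */
      d[2] = (uint32_t)(((h[0] >> 52) | (h[1] << 12)) & 0x3ffffff);/* bits  52..77  */
      d[3] = (uint32_t)((h[1] >> 14) & 0x3ffffff);                 /* bits  78..103 */
      d[4] = (uint32_t)((h[1] >> 40) | (h[2] << 24));              /* bits 104..129 */
    }
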
-
- sub \$16,%r15
- jz .Lstore_base2_26_avx
-
- vmovd %rax#d,$H0
- vmovd %rdx#d,$H1
- vmovd $h0#d,$H2
- vmovd $h1#d,$H3
- vmovd $h2#d,$H4
- jmp .Lproceed_avx
-
-.align 32
-.Lstore_base2_64_avx:
- mov $h0,0($ctx)
- mov $h1,8($ctx)
- mov $h2,16($ctx) # note that is_base2_26 is zeroed
- jmp .Ldone_avx
-
-.align 16
-.Lstore_base2_26_avx:
- mov %rax#d,0($ctx) # store hash value base 2^26
- mov %rdx#d,4($ctx)
- mov $h0#d,8($ctx)
- mov $h1#d,12($ctx)
- mov $h2#d,16($ctx)
-.align 16
-.Ldone_avx:
- mov 0(%rsp),%r15
- mov 8(%rsp),%r14
- mov 16(%rsp),%r13
- mov 24(%rsp),%r12
- mov 32(%rsp),%rbp
- mov 40(%rsp),%rbx
- lea 48(%rsp),%rsp
-.Lno_data_avx:
-.Lblocks_avx_epilogue:
- ret
-
-.align 32
-.Lbase2_64_avx:
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-.Lbase2_64_avx_body:
-
- mov $len,%r15 # reassign $len
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- mov 0($ctx),$h0 # load hash value
- mov 8($ctx),$h1
- mov 16($ctx),$h2#d
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
- test \$31,$len
- jz .Linit_avx
-
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
- sub \$16,%r15
-
- call __poly1305_block
-
-.Linit_avx:
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$d1
- mov $h1,$d2
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$d1
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $d1,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$d2
- and \$0x3ffffff,$h1 # h[3]
- or $d2,$h2 # h[4]
-
- vmovd %rax#d,$H0
- vmovd %rdx#d,$H1
- vmovd $h0#d,$H2
- vmovd $h1#d,$H3
- vmovd $h2#d,$H4
- movl \$1,20($ctx) # set is_base2_26
-
- call __poly1305_init_avx
-
-.Lproceed_avx:
- mov %r15,$len
-
- mov 0(%rsp),%r15
- mov 8(%rsp),%r14
- mov 16(%rsp),%r13
- mov 24(%rsp),%r12
- mov 32(%rsp),%rbp
- mov 40(%rsp),%rbx
- lea 48(%rsp),%rax
- lea 48(%rsp),%rsp
-.Lbase2_64_avx_epilogue:
- jmp .Ldo_avx
-
-.align 32
-.Leven_avx:
- vmovd 4*0($ctx),$H0 # load hash value
- vmovd 4*1($ctx),$H1
- vmovd 4*2($ctx),$H2
- vmovd 4*3($ctx),$H3
- vmovd 4*4($ctx),$H4
-
-.Ldo_avx:
-___
-$code.=<<___ if (!$win64);
- lea -0x58(%rsp),%r11
- sub \$0x178,%rsp
-___
-$code.=<<___ if ($win64);
- lea -0xf8(%rsp),%r11
- sub \$0x218,%rsp
- vmovdqa %xmm6,0x50(%r11)
- vmovdqa %xmm7,0x60(%r11)
- vmovdqa %xmm8,0x70(%r11)
- vmovdqa %xmm9,0x80(%r11)
- vmovdqa %xmm10,0x90(%r11)
- vmovdqa %xmm11,0xa0(%r11)
- vmovdqa %xmm12,0xb0(%r11)
- vmovdqa %xmm13,0xc0(%r11)
- vmovdqa %xmm14,0xd0(%r11)
- vmovdqa %xmm15,0xe0(%r11)
-.Ldo_avx_body:
-___
-$code.=<<___;
- sub \$64,$len
- lea -32($inp),%rax
- cmovc %rax,$inp
-
- vmovdqu `16*3`($ctx),$D4 # preload r0^2
- lea `16*3+64`($ctx),$ctx # size optimization
- lea .Lconst(%rip),%rcx
-
- ################################################################
- # load input
- vmovdqu 16*2($inp),$T0
- vmovdqu 16*3($inp),$T1
- vmovdqa 64(%rcx),$MASK # .Lmask26
-
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpunpcklqdq $T3,$T2,$T3 # 2:3
-
- vpsrlq \$40,$T4,$T4 # 4
- vpsrlq \$26,$T0,$T1
- vpand $MASK,$T0,$T0 # 0
- vpsrlq \$4,$T3,$T2
- vpand $MASK,$T1,$T1 # 1
- vpsrlq \$30,$T3,$T3
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- jbe .Lskip_loop_avx
-
- # expand and copy pre-calculated table to stack
- vmovdqu `16*1-64`($ctx),$D1
- vmovdqu `16*2-64`($ctx),$D2
- vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
- vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
- vmovdqa $D3,-0x90(%r11)
- vmovdqa $D0,0x00(%rsp)
- vpshufd \$0xEE,$D1,$D4
- vmovdqu `16*3-64`($ctx),$D0
- vpshufd \$0x44,$D1,$D1
- vmovdqa $D4,-0x80(%r11)
- vmovdqa $D1,0x10(%rsp)
- vpshufd \$0xEE,$D2,$D3
- vmovdqu `16*4-64`($ctx),$D1
- vpshufd \$0x44,$D2,$D2
- vmovdqa $D3,-0x70(%r11)
- vmovdqa $D2,0x20(%rsp)
- vpshufd \$0xEE,$D0,$D4
- vmovdqu `16*5-64`($ctx),$D2
- vpshufd \$0x44,$D0,$D0
- vmovdqa $D4,-0x60(%r11)
- vmovdqa $D0,0x30(%rsp)
- vpshufd \$0xEE,$D1,$D3
- vmovdqu `16*6-64`($ctx),$D0
- vpshufd \$0x44,$D1,$D1
- vmovdqa $D3,-0x50(%r11)
- vmovdqa $D1,0x40(%rsp)
- vpshufd \$0xEE,$D2,$D4
- vmovdqu `16*7-64`($ctx),$D1
- vpshufd \$0x44,$D2,$D2
- vmovdqa $D4,-0x40(%r11)
- vmovdqa $D2,0x50(%rsp)
- vpshufd \$0xEE,$D0,$D3
- vmovdqu `16*8-64`($ctx),$D2
- vpshufd \$0x44,$D0,$D0
- vmovdqa $D3,-0x30(%r11)
- vmovdqa $D0,0x60(%rsp)
- vpshufd \$0xEE,$D1,$D4
- vpshufd \$0x44,$D1,$D1
- vmovdqa $D4,-0x20(%r11)
- vmovdqa $D1,0x70(%rsp)
- vpshufd \$0xEE,$D2,$D3
- vmovdqa 0x00(%rsp),$D4 # preload r0^2
- vpshufd \$0x44,$D2,$D2
- vmovdqa $D3,-0x10(%r11)
- vmovdqa $D2,0x80(%rsp)
-
- jmp .Loop_avx
-
-.align 32
-.Loop_avx:
- ################################################################
- # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
- # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
- # \___________________/
- # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
- # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
- # \___________________/ \____________________/
- #
-# Note that we start with inp[2:3]*r^2. This is because it
-# doesn't depend on the reduction in the previous iteration.
- ################################################################
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- #
- # though note that $Tx and $Hx are "reversed" in this section,
- # and $D4 is preloaded with r0^2...
-
- vpmuludq $T0,$D4,$D0 # d0 = h0*r0
- vpmuludq $T1,$D4,$D1 # d1 = h1*r0
- vmovdqa $H2,0x20(%r11) # offload hash
- vpmuludq $T2,$D4,$D2 # d3 = h2*r0
- vmovdqa 0x10(%rsp),$H2 # r1^2
- vpmuludq $T3,$D4,$D3 # d3 = h3*r0
- vpmuludq $T4,$D4,$D4 # d4 = h4*r0
-
- vmovdqa $H0,0x00(%r11) #
- vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
- vmovdqa $H1,0x10(%r11) #
- vpmuludq $T3,$H2,$H1 # h3*r1
- vpaddq $H0,$D0,$D0 # d0 += h4*s1
- vpaddq $H1,$D4,$D4 # d4 += h3*r1
- vmovdqa $H3,0x30(%r11) #
- vpmuludq $T2,$H2,$H0 # h2*r1
- vpmuludq $T1,$H2,$H1 # h1*r1
- vpaddq $H0,$D3,$D3 # d3 += h2*r1
- vmovdqa 0x30(%rsp),$H3 # r2^2
- vpaddq $H1,$D2,$D2 # d2 += h1*r1
- vmovdqa $H4,0x40(%r11) #
- vpmuludq $T0,$H2,$H2 # h0*r1
- vpmuludq $T2,$H3,$H0 # h2*r2
- vpaddq $H2,$D1,$D1 # d1 += h0*r1
-
- vmovdqa 0x40(%rsp),$H4 # s2^2
- vpaddq $H0,$D4,$D4 # d4 += h2*r2
- vpmuludq $T1,$H3,$H1 # h1*r2
- vpmuludq $T0,$H3,$H3 # h0*r2
- vpaddq $H1,$D3,$D3 # d3 += h1*r2
- vmovdqa 0x50(%rsp),$H2 # r3^2
- vpaddq $H3,$D2,$D2 # d2 += h0*r2
- vpmuludq $T4,$H4,$H0 # h4*s2
- vpmuludq $T3,$H4,$H4 # h3*s2
- vpaddq $H0,$D1,$D1 # d1 += h4*s2
- vmovdqa 0x60(%rsp),$H3 # s3^2
- vpaddq $H4,$D0,$D0 # d0 += h3*s2
-
- vmovdqa 0x80(%rsp),$H4 # s4^2
- vpmuludq $T1,$H2,$H1 # h1*r3
- vpmuludq $T0,$H2,$H2 # h0*r3
- vpaddq $H1,$D4,$D4 # d4 += h1*r3
- vpaddq $H2,$D3,$D3 # d3 += h0*r3
- vpmuludq $T4,$H3,$H0 # h4*s3
- vpmuludq $T3,$H3,$H1 # h3*s3
- vpaddq $H0,$D2,$D2 # d2 += h4*s3
- vmovdqu 16*0($inp),$H0 # load input
- vpaddq $H1,$D1,$D1 # d1 += h3*s3
- vpmuludq $T2,$H3,$H3 # h2*s3
- vpmuludq $T2,$H4,$T2 # h2*s4
- vpaddq $H3,$D0,$D0 # d0 += h2*s3
-
- vmovdqu 16*1($inp),$H1 #
- vpaddq $T2,$D1,$D1 # d1 += h2*s4
- vpmuludq $T3,$H4,$T3 # h3*s4
- vpmuludq $T4,$H4,$T4 # h4*s4
- vpsrldq \$6,$H0,$H2 # splat input
- vpaddq $T3,$D2,$D2 # d2 += h3*s4
- vpaddq $T4,$D3,$D3 # d3 += h4*s4
- vpsrldq \$6,$H1,$H3 #
- vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
- vpmuludq $T1,$H4,$T0 # h1*s4
- vpunpckhqdq $H1,$H0,$H4 # 4
- vpaddq $T4,$D4,$D4 # d4 += h0*r4
- vmovdqa -0x90(%r11),$T4 # r0^4
- vpaddq $T0,$D0,$D0 # d0 += h1*s4
-
- vpunpcklqdq $H1,$H0,$H0 # 0:1
- vpunpcklqdq $H3,$H2,$H3 # 2:3
-
- #vpsrlq \$40,$H4,$H4 # 4
- vpsrldq \$`40/8`,$H4,$H4 # 4
- vpsrlq \$26,$H0,$H1
- vpand $MASK,$H0,$H0 # 0
- vpsrlq \$4,$H3,$H2
- vpand $MASK,$H1,$H1 # 1
- vpand 0(%rcx),$H4,$H4 # .Lmask24
- vpsrlq \$30,$H3,$H3
- vpand $MASK,$H2,$H2 # 2
- vpand $MASK,$H3,$H3 # 3
- vpor 32(%rcx),$H4,$H4 # padbit, yes, always
-
- vpaddq 0x00(%r11),$H0,$H0 # add hash value
- vpaddq 0x10(%r11),$H1,$H1
- vpaddq 0x20(%r11),$H2,$H2
- vpaddq 0x30(%r11),$H3,$H3
- vpaddq 0x40(%r11),$H4,$H4
-
- lea 16*2($inp),%rax
- lea 16*4($inp),$inp
- sub \$64,$len
- cmovc %rax,$inp
-
- ################################################################
- # Now we accumulate (inp[0:1]+hash)*r^4
- ################################################################
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- vpmuludq $H0,$T4,$T0 # h0*r0
- vpmuludq $H1,$T4,$T1 # h1*r0
- vpaddq $T0,$D0,$D0
- vpaddq $T1,$D1,$D1
- vmovdqa -0x80(%r11),$T2 # r1^4
- vpmuludq $H2,$T4,$T0 # h2*r0
- vpmuludq $H3,$T4,$T1 # h3*r0
- vpaddq $T0,$D2,$D2
- vpaddq $T1,$D3,$D3
- vpmuludq $H4,$T4,$T4 # h4*r0
- vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
- vpaddq $T4,$D4,$D4
-
- vpaddq $T0,$D0,$D0 # d0 += h4*s1
- vpmuludq $H2,$T2,$T1 # h2*r1
- vpmuludq $H3,$T2,$T0 # h3*r1
- vpaddq $T1,$D3,$D3 # d3 += h2*r1
- vmovdqa -0x60(%r11),$T3 # r2^4
- vpaddq $T0,$D4,$D4 # d4 += h3*r1
- vpmuludq $H1,$T2,$T1 # h1*r1
- vpmuludq $H0,$T2,$T2 # h0*r1
- vpaddq $T1,$D2,$D2 # d2 += h1*r1
- vpaddq $T2,$D1,$D1 # d1 += h0*r1
-
- vmovdqa -0x50(%r11),$T4 # s2^4
- vpmuludq $H2,$T3,$T0 # h2*r2
- vpmuludq $H1,$T3,$T1 # h1*r2
- vpaddq $T0,$D4,$D4 # d4 += h2*r2
- vpaddq $T1,$D3,$D3 # d3 += h1*r2
- vmovdqa -0x40(%r11),$T2 # r3^4
- vpmuludq $H0,$T3,$T3 # h0*r2
- vpmuludq $H4,$T4,$T0 # h4*s2
- vpaddq $T3,$D2,$D2 # d2 += h0*r2
- vpaddq $T0,$D1,$D1 # d1 += h4*s2
- vmovdqa -0x30(%r11),$T3 # s3^4
- vpmuludq $H3,$T4,$T4 # h3*s2
- vpmuludq $H1,$T2,$T1 # h1*r3
- vpaddq $T4,$D0,$D0 # d0 += h3*s2
-
- vmovdqa -0x10(%r11),$T4 # s4^4
- vpaddq $T1,$D4,$D4 # d4 += h1*r3
- vpmuludq $H0,$T2,$T2 # h0*r3
- vpmuludq $H4,$T3,$T0 # h4*s3
- vpaddq $T2,$D3,$D3 # d3 += h0*r3
- vpaddq $T0,$D2,$D2 # d2 += h4*s3
- vmovdqu 16*2($inp),$T0 # load input
- vpmuludq $H3,$T3,$T2 # h3*s3
- vpmuludq $H2,$T3,$T3 # h2*s3
- vpaddq $T2,$D1,$D1 # d1 += h3*s3
- vmovdqu 16*3($inp),$T1 #
- vpaddq $T3,$D0,$D0 # d0 += h2*s3
-
- vpmuludq $H2,$T4,$H2 # h2*s4
- vpmuludq $H3,$T4,$H3 # h3*s4
- vpsrldq \$6,$T0,$T2 # splat input
- vpaddq $H2,$D1,$D1 # d1 += h2*s4
- vpmuludq $H4,$T4,$H4 # h4*s4
- vpsrldq \$6,$T1,$T3 #
- vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
- vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
- vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
- vpmuludq $H1,$T4,$H0
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
- vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
-
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpunpcklqdq $T3,$T2,$T3 # 2:3
-
- #vpsrlq \$40,$T4,$T4 # 4
- vpsrldq \$`40/8`,$T4,$T4 # 4
- vpsrlq \$26,$T0,$T1
- vmovdqa 0x00(%rsp),$D4 # preload r0^2
- vpand $MASK,$T0,$T0 # 0
- vpsrlq \$4,$T3,$T2
- vpand $MASK,$T1,$T1 # 1
- vpand 0(%rcx),$T4,$T4 # .Lmask24
- vpsrlq \$30,$T3,$T3
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- ################################################################
- # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
- # and P. Schwabe
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$D1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D0
- vpand $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D0,$H0,$H0
- vpsllq \$2,$D0,$D0
- vpaddq $D0,$H0,$H0 # h4 -> h0
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- ja .Loop_avx
-
-.Lskip_loop_avx:
- ################################################################
- # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
- vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
- add \$32,$len
- jnz .Long_tail_avx
-
- vpaddq $H2,$T2,$T2
- vpaddq $H0,$T0,$T0
- vpaddq $H1,$T1,$T1
- vpaddq $H3,$T3,$T3
- vpaddq $H4,$T4,$T4
-
-.Long_tail_avx:
- vmovdqa $H2,0x20(%r11)
- vmovdqa $H0,0x00(%r11)
- vmovdqa $H1,0x10(%r11)
- vmovdqa $H3,0x30(%r11)
- vmovdqa $H4,0x40(%r11)
-
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
- vpmuludq $T2,$D4,$D2 # d2 = h2*r0
- vpmuludq $T0,$D4,$D0 # d0 = h0*r0
- vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
- vpmuludq $T1,$D4,$D1 # d1 = h1*r0
- vpmuludq $T3,$D4,$D3 # d3 = h3*r0
- vpmuludq $T4,$D4,$D4 # d4 = h4*r0
-
- vpmuludq $T3,$H2,$H0 # h3*r1
- vpaddq $H0,$D4,$D4 # d4 += h3*r1
- vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
- vpmuludq $T2,$H2,$H1 # h2*r1
- vpaddq $H1,$D3,$D3 # d3 += h2*r1
- vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
- vpmuludq $T1,$H2,$H0 # h1*r1
- vpaddq $H0,$D2,$D2 # d2 += h1*r1
- vpmuludq $T0,$H2,$H2 # h0*r1
- vpaddq $H2,$D1,$D1 # d1 += h0*r1
- vpmuludq $T4,$H3,$H3 # h4*s1
- vpaddq $H3,$D0,$D0 # d0 += h4*s1
-
- vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
- vpmuludq $T2,$H4,$H1 # h2*r2
- vpaddq $H1,$D4,$D4 # d4 += h2*r2
- vpmuludq $T1,$H4,$H0 # h1*r2
- vpaddq $H0,$D3,$D3 # d3 += h1*r2
- vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
- vpmuludq $T0,$H4,$H4 # h0*r2
- vpaddq $H4,$D2,$D2 # d2 += h0*r2
- vpmuludq $T4,$H2,$H1 # h4*s2
- vpaddq $H1,$D1,$D1 # d1 += h4*s2
- vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
- vpmuludq $T3,$H2,$H2 # h3*s2
- vpaddq $H2,$D0,$D0 # d0 += h3*s2
-
- vpmuludq $T1,$H3,$H0 # h1*r3
- vpaddq $H0,$D4,$D4 # d4 += h1*r3
- vpmuludq $T0,$H3,$H3 # h0*r3
- vpaddq $H3,$D3,$D3 # d3 += h0*r3
- vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
- vpmuludq $T4,$H4,$H1 # h4*s3
- vpaddq $H1,$D2,$D2 # d2 += h4*s3
- vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
- vpmuludq $T3,$H4,$H0 # h3*s3
- vpaddq $H0,$D1,$D1 # d1 += h3*s3
- vpmuludq $T2,$H4,$H4 # h2*s3
- vpaddq $H4,$D0,$D0 # d0 += h2*s3
-
- vpmuludq $T0,$H2,$H2 # h0*r4
- vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
- vpmuludq $T4,$H3,$H1 # h4*s4
- vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
- vpmuludq $T3,$H3,$H0 # h3*s4
- vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
- vpmuludq $T2,$H3,$H1 # h2*s4
- vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
- vpmuludq $T1,$H3,$H3 # h1*s4
- vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
-
- jz .Lshort_tail_avx
-
- vmovdqu 16*0($inp),$H0 # load input
- vmovdqu 16*1($inp),$H1
-
- vpsrldq \$6,$H0,$H2 # splat input
- vpsrldq \$6,$H1,$H3
- vpunpckhqdq $H1,$H0,$H4 # 4
- vpunpcklqdq $H1,$H0,$H0 # 0:1
- vpunpcklqdq $H3,$H2,$H3 # 2:3
-
- vpsrlq \$40,$H4,$H4 # 4
- vpsrlq \$26,$H0,$H1
- vpand $MASK,$H0,$H0 # 0
- vpsrlq \$4,$H3,$H2
- vpand $MASK,$H1,$H1 # 1
- vpsrlq \$30,$H3,$H3
- vpand $MASK,$H2,$H2 # 2
- vpand $MASK,$H3,$H3 # 3
- vpor 32(%rcx),$H4,$H4 # padbit, yes, always
-
- vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
- vpaddq 0x00(%r11),$H0,$H0
- vpaddq 0x10(%r11),$H1,$H1
- vpaddq 0x20(%r11),$H2,$H2
- vpaddq 0x30(%r11),$H3,$H3
- vpaddq 0x40(%r11),$H4,$H4
-
- ################################################################
- # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
-
- vpmuludq $H0,$T4,$T0 # h0*r0
- vpaddq $T0,$D0,$D0 # d0 += h0*r0
- vpmuludq $H1,$T4,$T1 # h1*r0
- vpaddq $T1,$D1,$D1 # d1 += h1*r0
- vpmuludq $H2,$T4,$T0 # h2*r0
- vpaddq $T0,$D2,$D2 # d2 += h2*r0
- vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
- vpmuludq $H3,$T4,$T1 # h3*r0
- vpaddq $T1,$D3,$D3 # d3 += h3*r0
- vpmuludq $H4,$T4,$T4 # h4*r0
- vpaddq $T4,$D4,$D4 # d4 += h4*r0
-
- vpmuludq $H3,$T2,$T0 # h3*r1
- vpaddq $T0,$D4,$D4 # d4 += h3*r1
- vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
- vpmuludq $H2,$T2,$T1 # h2*r1
- vpaddq $T1,$D3,$D3 # d3 += h2*r1
- vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
- vpmuludq $H1,$T2,$T0 # h1*r1
- vpaddq $T0,$D2,$D2 # d2 += h1*r1
- vpmuludq $H0,$T2,$T2 # h0*r1
- vpaddq $T2,$D1,$D1 # d1 += h0*r1
- vpmuludq $H4,$T3,$T3 # h4*s1
- vpaddq $T3,$D0,$D0 # d0 += h4*s1
-
- vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
- vpmuludq $H2,$T4,$T1 # h2*r2
- vpaddq $T1,$D4,$D4 # d4 += h2*r2
- vpmuludq $H1,$T4,$T0 # h1*r2
- vpaddq $T0,$D3,$D3 # d3 += h1*r2
- vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
- vpmuludq $H0,$T4,$T4 # h0*r2
- vpaddq $T4,$D2,$D2 # d2 += h0*r2
- vpmuludq $H4,$T2,$T1 # h4*s2
- vpaddq $T1,$D1,$D1 # d1 += h4*s2
- vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
- vpmuludq $H3,$T2,$T2 # h3*s2
- vpaddq $T2,$D0,$D0 # d0 += h3*s2
-
- vpmuludq $H1,$T3,$T0 # h1*r3
- vpaddq $T0,$D4,$D4 # d4 += h1*r3
- vpmuludq $H0,$T3,$T3 # h0*r3
- vpaddq $T3,$D3,$D3 # d3 += h0*r3
- vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
- vpmuludq $H4,$T4,$T1 # h4*s3
- vpaddq $T1,$D2,$D2 # d2 += h4*s3
- vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
- vpmuludq $H3,$T4,$T0 # h3*s3
- vpaddq $T0,$D1,$D1 # d1 += h3*s3
- vpmuludq $H2,$T4,$T4 # h2*s3
- vpaddq $T4,$D0,$D0 # d0 += h2*s3
-
- vpmuludq $H0,$T2,$T2 # h0*r4
- vpaddq $T2,$D4,$D4 # d4 += h0*r4
- vpmuludq $H4,$T3,$T1 # h4*s4
- vpaddq $T1,$D3,$D3 # d3 += h4*s4
- vpmuludq $H3,$T3,$T0 # h3*s4
- vpaddq $T0,$D2,$D2 # d2 += h3*s4
- vpmuludq $H2,$T3,$T1 # h2*s4
- vpaddq $T1,$D1,$D1 # d1 += h2*s4
- vpmuludq $H1,$T3,$T3 # h1*s4
- vpaddq $T3,$D0,$D0 # d0 += h1*s4
-
-.Lshort_tail_avx:
- ################################################################
- # horizontal addition
-
- vpsrldq \$8,$D4,$T4
- vpsrldq \$8,$D3,$T3
- vpsrldq \$8,$D1,$T1
- vpsrldq \$8,$D0,$T0
- vpsrldq \$8,$D2,$T2
- vpaddq $T3,$D3,$D3
- vpaddq $T4,$D4,$D4
- vpaddq $T0,$D0,$D0
- vpaddq $T1,$D1,$D1
- vpaddq $T2,$D2,$D2
-
- ################################################################
- # lazy reduction
-
- vpsrlq \$26,$D3,$H3
- vpand $MASK,$D3,$D3
- vpaddq $H3,$D4,$D4 # h3 -> h4
-
- vpsrlq \$26,$D0,$H0
- vpand $MASK,$D0,$D0
- vpaddq $H0,$D1,$D1 # h0 -> h1
-
- vpsrlq \$26,$D4,$H4
- vpand $MASK,$D4,$D4
-
- vpsrlq \$26,$D1,$H1
- vpand $MASK,$D1,$D1
- vpaddq $H1,$D2,$D2 # h1 -> h2
-
- vpaddq $H4,$D0,$D0
- vpsllq \$2,$H4,$H4
- vpaddq $H4,$D0,$D0 # h4 -> h0
-
- vpsrlq \$26,$D2,$H2
- vpand $MASK,$D2,$D2
- vpaddq $H2,$D3,$D3 # h2 -> h3
-
- vpsrlq \$26,$D0,$H0
- vpand $MASK,$D0,$D0
- vpaddq $H0,$D1,$D1 # h0 -> h1
-
- vpsrlq \$26,$D3,$H3
- vpand $MASK,$D3,$D3
- vpaddq $H3,$D4,$D4 # h3 -> h4
-
- vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
- vmovd $D1,`4*1-48-64`($ctx)
- vmovd $D2,`4*2-48-64`($ctx)
- vmovd $D3,`4*3-48-64`($ctx)
- vmovd $D4,`4*4-48-64`($ctx)
-___
-$code.=<<___ if ($win64);
- vmovdqa 0x50(%r11),%xmm6
- vmovdqa 0x60(%r11),%xmm7
- vmovdqa 0x70(%r11),%xmm8
- vmovdqa 0x80(%r11),%xmm9
- vmovdqa 0x90(%r11),%xmm10
- vmovdqa 0xa0(%r11),%xmm11
- vmovdqa 0xb0(%r11),%xmm12
- vmovdqa 0xc0(%r11),%xmm13
- vmovdqa 0xd0(%r11),%xmm14
- vmovdqa 0xe0(%r11),%xmm15
- lea 0xf8(%r11),%rsp
-.Ldo_avx_epilogue:
-___
-$code.=<<___ if (!$win64);
- lea 0x58(%r11),%rsp
-___
-$code.=<<___;
- vzeroupper
- ret
-.size poly1305_blocks_avx,.-poly1305_blocks_avx
-
-.type poly1305_emit_avx,\@function,3
-.align 32
-poly1305_emit_avx:
- cmpl \$0,20($ctx) # is_base2_26?
- je .Lemit
-
- mov 0($ctx),%eax # load hash value base 2^26
- mov 4($ctx),%ecx
- mov 8($ctx),%r8d
- mov 12($ctx),%r11d
- mov 16($ctx),%r10d
-
- shl \$26,%rcx # base 2^26 -> base 2^64
- mov %r8,%r9
- shl \$52,%r8
- add %rcx,%rax
- shr \$12,%r9
- add %rax,%r8 # h0
- adc \$0,%r9
-
- shl \$14,%r11
- mov %r10,%rax
- shr \$24,%r10
- add %r11,%r9
- shl \$40,%rax
- add %rax,%r9 # h1
- adc \$0,%r10 # h2
-
- mov %r10,%rax # could be partially reduced, so reduce
- mov %r10,%rcx
- and \$3,%r10
- shr \$2,%rax
- and \$-4,%rcx
- add %rcx,%rax
- add %rax,%r8
- adc \$0,%r9
-
- mov %r8,%rax
- add \$5,%r8 # compare to modulus
- mov %r9,%rcx
- adc \$0,%r9
- adc \$0,%r10
- shr \$2,%r10 # did 130-bit value overflow?
- cmovnz %r8,%rax
- cmovnz %r9,%rcx
-
- add 0($nonce),%rax # accumulate nonce
- adc 8($nonce),%rcx
- mov %rax,0($mac) # write result
- mov %rcx,8($mac)
-
- ret
-.size poly1305_emit_avx,.-poly1305_emit_avx
-___
-
-if ($avx>1) {
-my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
- map("%ymm$_",(0..15));
-my $S4=$MASK;
-
-$code.=<<___;
-.type poly1305_blocks_avx2,\@function,4
-.align 32
-poly1305_blocks_avx2:
- mov 20($ctx),%r8d # is_base2_26
- cmp \$128,$len
- jae .Lblocks_avx2
- test %r8d,%r8d
- jz .Lblocks
-
-.Lblocks_avx2:
- and \$-16,$len
- jz .Lno_data_avx2
-
- vzeroupper
-
- test %r8d,%r8d
- jz .Lbase2_64_avx2
-
- test \$63,$len
- jz .Leven_avx2
-
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-.Lblocks_avx2_body:
-
- mov $len,%r15 # reassign $len
-
- mov 0($ctx),$d1 # load hash value
- mov 8($ctx),$d2
- mov 16($ctx),$h2#d
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- ################################# base 2^26 -> base 2^64
- mov $d1#d,$h0#d
- and \$-1<<31,$d1
- mov $d2,$r1 # borrow $r1
- mov $d2#d,$h1#d
- and \$-1<<31,$d2
-
- shr \$6,$d1
- shl \$52,$r1
- add $d1,$h0
- shr \$12,$h1
- shr \$18,$d2
- add $r1,$h0
- adc $d2,$h1
-
- mov $h2,$d1
- shl \$40,$d1
- shr \$24,$h2
- add $d1,$h1
- adc \$0,$h2 # can be partially reduced...
-
- mov \$-4,$d2 # ... so reduce
- mov $h2,$d1
- and $h2,$d2
- shr \$2,$d1
- and \$3,$h2
- add $d2,$d1 # =*5
- add $d1,$h0
- adc \$0,$h1
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
-.Lbase2_26_pre_avx2:
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
- sub \$16,%r15
-
- call __poly1305_block
- mov $r1,%rax
-
- test \$63,%r15
- jnz .Lbase2_26_pre_avx2
-
- test $padbit,$padbit # if $padbit is zero,
- jz .Lstore_base2_64_avx2 # store hash in base 2^64 format
-
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$r0
- mov $h1,$r1
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$r0
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $r0,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$r1
- and \$0x3ffffff,$h1 # h[3]
- or $r1,$h2 # h[4]
-
- test %r15,%r15
- jz .Lstore_base2_26_avx2
-
- vmovd %rax#d,%x#$H0
- vmovd %rdx#d,%x#$H1
- vmovd $h0#d,%x#$H2
- vmovd $h1#d,%x#$H3
- vmovd $h2#d,%x#$H4
- jmp .Lproceed_avx2
-
-.align 32
-.Lstore_base2_64_avx2:
- mov $h0,0($ctx)
- mov $h1,8($ctx)
- mov $h2,16($ctx) # note that is_base2_26 is zeroed
- jmp .Ldone_avx2
-
-.align 16
-.Lstore_base2_26_avx2:
- mov %rax#d,0($ctx) # store hash value base 2^26
- mov %rdx#d,4($ctx)
- mov $h0#d,8($ctx)
- mov $h1#d,12($ctx)
- mov $h2#d,16($ctx)
-.align 16
-.Ldone_avx2:
- mov 0(%rsp),%r15
- mov 8(%rsp),%r14
- mov 16(%rsp),%r13
- mov 24(%rsp),%r12
- mov 32(%rsp),%rbp
- mov 40(%rsp),%rbx
- lea 48(%rsp),%rsp
-.Lno_data_avx2:
-.Lblocks_avx2_epilogue:
- ret
-
-.align 32
-.Lbase2_64_avx2:
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
-.Lbase2_64_avx2_body:
-
- mov $len,%r15 # reassign $len
-
- mov 24($ctx),$r0 # load r
- mov 32($ctx),$s1
-
- mov 0($ctx),$h0 # load hash value
- mov 8($ctx),$h1
- mov 16($ctx),$h2#d
-
- mov $s1,$r1
- mov $s1,%rax
- shr \$2,$s1
- add $r1,$s1 # s1 = r1 + (r1 >> 2)
-
- test \$63,$len
- jz .Linit_avx2
-
-.Lbase2_64_pre_avx2:
- add 0($inp),$h0 # accumulate input
- adc 8($inp),$h1
- lea 16($inp),$inp
- adc $padbit,$h2
- sub \$16,%r15
-
- call __poly1305_block
- mov $r1,%rax
-
- test \$63,%r15
- jnz .Lbase2_64_pre_avx2
-
-.Linit_avx2:
- ################################# base 2^64 -> base 2^26
- mov $h0,%rax
- mov $h0,%rdx
- shr \$52,$h0
- mov $h1,$d1
- mov $h1,$d2
- shr \$26,%rdx
- and \$0x3ffffff,%rax # h[0]
- shl \$12,$d1
- and \$0x3ffffff,%rdx # h[1]
- shr \$14,$h1
- or $d1,$h0
- shl \$24,$h2
- and \$0x3ffffff,$h0 # h[2]
- shr \$40,$d2
- and \$0x3ffffff,$h1 # h[3]
- or $d2,$h2 # h[4]
-
- vmovd %rax#d,%x#$H0
- vmovd %rdx#d,%x#$H1
- vmovd $h0#d,%x#$H2
- vmovd $h1#d,%x#$H3
- vmovd $h2#d,%x#$H4
- movl \$1,20($ctx) # set is_base2_26
-
- call __poly1305_init_avx
-
-.Lproceed_avx2:
- mov %r15,$len
-
- mov 0(%rsp),%r15
- mov 8(%rsp),%r14
- mov 16(%rsp),%r13
- mov 24(%rsp),%r12
- mov 32(%rsp),%rbp
- mov 40(%rsp),%rbx
- lea 48(%rsp),%rax
- lea 48(%rsp),%rsp
-.Lbase2_64_avx2_epilogue:
- jmp .Ldo_avx2
-
-.align 32
-.Leven_avx2:
- vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
- vmovd 4*1($ctx),%x#$H1
- vmovd 4*2($ctx),%x#$H2
- vmovd 4*3($ctx),%x#$H3
- vmovd 4*4($ctx),%x#$H4
-
-.Ldo_avx2:
-___
-$code.=<<___ if (!$win64);
- lea -8(%rsp),%r11
- sub \$0x128,%rsp
-___
-$code.=<<___ if ($win64);
- lea -0xf8(%rsp),%r11
- sub \$0x1c8,%rsp
- vmovdqa %xmm6,0x50(%r11)
- vmovdqa %xmm7,0x60(%r11)
- vmovdqa %xmm8,0x70(%r11)
- vmovdqa %xmm9,0x80(%r11)
- vmovdqa %xmm10,0x90(%r11)
- vmovdqa %xmm11,0xa0(%r11)
- vmovdqa %xmm12,0xb0(%r11)
- vmovdqa %xmm13,0xc0(%r11)
- vmovdqa %xmm14,0xd0(%r11)
- vmovdqa %xmm15,0xe0(%r11)
-.Ldo_avx2_body:
-___
-$code.=<<___;
- lea 48+64($ctx),$ctx # size optimization
- lea .Lconst(%rip),%rcx
-
- # expand and copy pre-calculated table to stack
- vmovdqu `16*0-64`($ctx),%x#$T2
- and \$-512,%rsp
- vmovdqu `16*1-64`($ctx),%x#$T3
- vmovdqu `16*2-64`($ctx),%x#$T4
- vmovdqu `16*3-64`($ctx),%x#$D0
- vmovdqu `16*4-64`($ctx),%x#$D1
- vmovdqu `16*5-64`($ctx),%x#$D2
- vmovdqu `16*6-64`($ctx),%x#$D3
- vpermq \$0x15,$T2,$T2 # 00003412 -> 12343434
- vmovdqu `16*7-64`($ctx),%x#$D4
- vpermq \$0x15,$T3,$T3
- vpshufd \$0xc8,$T2,$T2 # 12343434 -> 14243444
- vmovdqu `16*8-64`($ctx),%x#$MASK
- vpermq \$0x15,$T4,$T4
- vpshufd \$0xc8,$T3,$T3
- vmovdqa $T2,0x00(%rsp)
- vpermq \$0x15,$D0,$D0
- vpshufd \$0xc8,$T4,$T4
- vmovdqa $T3,0x20(%rsp)
- vpermq \$0x15,$D1,$D1
- vpshufd \$0xc8,$D0,$D0
- vmovdqa $T4,0x40(%rsp)
- vpermq \$0x15,$D2,$D2
- vpshufd \$0xc8,$D1,$D1
- vmovdqa $D0,0x60(%rsp)
- vpermq \$0x15,$D3,$D3
- vpshufd \$0xc8,$D2,$D2
- vmovdqa $D1,0x80(%rsp)
- vpermq \$0x15,$D4,$D4
- vpshufd \$0xc8,$D3,$D3
- vmovdqa $D2,0xa0(%rsp)
- vpermq \$0x15,$MASK,$MASK
- vpshufd \$0xc8,$D4,$D4
- vmovdqa $D3,0xc0(%rsp)
- vpshufd \$0xc8,$MASK,$MASK
- vmovdqa $D4,0xe0(%rsp)
- vmovdqa $MASK,0x100(%rsp)
- vmovdqa 64(%rcx),$MASK # .Lmask26
-
- ################################################################
- # load input
- vmovdqu 16*0($inp),%x#$T0
- vmovdqu 16*1($inp),%x#$T1
- vinserti128 \$1,16*2($inp),$T0,$T0
- vinserti128 \$1,16*3($inp),$T1,$T1
- lea 16*4($inp),$inp
-
- vpsrldq \$6,$T0,$T2 # splat input
- vpsrldq \$6,$T1,$T3
- vpunpckhqdq $T1,$T0,$T4 # 4
- vpunpcklqdq $T3,$T2,$T2 # 2:3
- vpunpcklqdq $T1,$T0,$T0 # 0:1
-
- vpsrlq \$30,$T2,$T3
- vpsrlq \$4,$T2,$T2
- vpsrlq \$26,$T0,$T1
- vpsrlq \$40,$T4,$T4 # 4
- vpand $MASK,$T2,$T2 # 2
- vpand $MASK,$T0,$T0 # 0
- vpand $MASK,$T1,$T1 # 1
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- lea 0x90(%rsp),%rax # size optimization
- vpaddq $H2,$T2,$H2 # accumulate input
- sub \$64,$len
- jz .Ltail_avx2
- jmp .Loop_avx2
-
-.align 32
-.Loop_avx2:
- ################################################################
- # ((inp[0]*r^4+r[4])*r^4+r[8])*r^4
- # ((inp[1]*r^4+r[5])*r^4+r[9])*r^3
- # ((inp[2]*r^4+r[6])*r^4+r[10])*r^2
- # ((inp[3]*r^4+r[7])*r^4+r[11])*r^1
- # \________/\________/
- ################################################################
- #vpaddq $H2,$T2,$H2 # accumulate input
- vpaddq $H0,$T0,$H0
- vmovdqa `32*0`(%rsp),$T0 # r0^4
- vpaddq $H1,$T1,$H1
- vmovdqa `32*1`(%rsp),$T1 # r1^4
- vpaddq $H3,$T3,$H3
- vmovdqa `32*3`(%rsp),$T2 # r2^4
- vpaddq $H4,$T4,$H4
- vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
- vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
-
- # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
- # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
- # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
- #
-# however, as h2 is "chronologically" the first one available, pull the
-# corresponding operations up, so it becomes
- #
- # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
- # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
- # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
- # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
- # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
-
- vpmuludq $H2,$T0,$D2 # d2 = h2*r0
- vpmuludq $H2,$T1,$D3 # d3 = h2*r1
- vpmuludq $H2,$T2,$D4 # d4 = h2*r2
- vpmuludq $H2,$T3,$D0 # d0 = h2*s3
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
-
- vpmuludq $H0,$T1,$T4 # h0*r1
- vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
- vpaddq $T4,$D1,$D1 # d1 += h0*r1
- vpaddq $H2,$D2,$D2 # d2 += h1*r1
- vpmuludq $H3,$T1,$T4 # h3*r1
- vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
- vpaddq $T4,$D4,$D4 # d4 += h3*r1
- vpaddq $H2,$D0,$D0 # d0 += h4*s1
- vmovdqa `32*4-0x90`(%rax),$T1 # s2
-
- vpmuludq $H0,$T0,$T4 # h0*r0
- vpmuludq $H1,$T0,$H2 # h1*r0
- vpaddq $T4,$D0,$D0 # d0 += h0*r0
- vpaddq $H2,$D1,$D1 # d1 += h1*r0
- vpmuludq $H3,$T0,$T4 # h3*r0
- vpmuludq $H4,$T0,$H2 # h4*r0
- vmovdqu 16*0($inp),%x#$T0 # load input
- vpaddq $T4,$D3,$D3 # d3 += h3*r0
- vpaddq $H2,$D4,$D4 # d4 += h4*r0
- vinserti128 \$1,16*2($inp),$T0,$T0
-
- vpmuludq $H3,$T1,$T4 # h3*s2
- vpmuludq $H4,$T1,$H2 # h4*s2
- vmovdqu 16*1($inp),%x#$T1
- vpaddq $T4,$D0,$D0 # d0 += h3*s2
- vpaddq $H2,$D1,$D1 # d1 += h4*s2
- vmovdqa `32*5-0x90`(%rax),$H2 # r3
- vpmuludq $H1,$T2,$T4 # h1*r2
- vpmuludq $H0,$T2,$T2 # h0*r2
- vpaddq $T4,$D3,$D3 # d3 += h1*r2
- vpaddq $T2,$D2,$D2 # d2 += h0*r2
- vinserti128 \$1,16*3($inp),$T1,$T1
- lea 16*4($inp),$inp
-
- vpmuludq $H1,$H2,$T4 # h1*r3
- vpmuludq $H0,$H2,$H2 # h0*r3
- vpsrldq \$6,$T0,$T2 # splat input
- vpaddq $T4,$D4,$D4 # d4 += h1*r3
- vpaddq $H2,$D3,$D3 # d3 += h0*r3
- vpmuludq $H3,$T3,$T4 # h3*s3
- vpmuludq $H4,$T3,$H2 # h4*s3
- vpsrldq \$6,$T1,$T3
- vpaddq $T4,$D1,$D1 # d1 += h3*s3
- vpaddq $H2,$D2,$D2 # d2 += h4*s3
- vpunpckhqdq $T1,$T0,$T4 # 4
-
- vpmuludq $H3,$S4,$H3 # h3*s4
- vpmuludq $H4,$S4,$H4 # h4*s4
- vpunpcklqdq $T1,$T0,$T0 # 0:1
- vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
- vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
- vpunpcklqdq $T3,$T2,$T3 # 2:3
- vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
- vpmuludq $H1,$S4,$H0 # h1*s4
- vmovdqa 64(%rcx),$MASK # .Lmask26
- vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
- vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
-
- ################################################################
- # lazy reduction (interleaved with tail of input splat)
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$D1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D4
- vpand $MASK,$H4,$H4
-
- vpsrlq \$4,$T3,$T2
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpand $MASK,$T2,$T2 # 2
- vpsrlq \$26,$T0,$T1
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpaddq $T2,$H2,$H2 # modulo-scheduled
- vpsrlq \$30,$T3,$T3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$40,$T4,$T4 # 4
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpand $MASK,$T0,$T0 # 0
- vpand $MASK,$T1,$T1 # 1
- vpand $MASK,$T3,$T3 # 3
- vpor 32(%rcx),$T4,$T4 # padbit, yes, always
-
- sub \$64,$len
- jnz .Loop_avx2
-
- .byte 0x66,0x90
-.Ltail_avx2:
- ################################################################
- # while above multiplications were by r^4 in all lanes, in last
- # iteration we multiply least significant lane by r^4 and most
- # significant one by r, so copy of above except that references
- # to the precomputed table are displaced by 4...
-
- #vpaddq $H2,$T2,$H2 # accumulate input
- vpaddq $H0,$T0,$H0
- vmovdqu `32*0+4`(%rsp),$T0 # r0^4
- vpaddq $H1,$T1,$H1
- vmovdqu `32*1+4`(%rsp),$T1 # r1^4
- vpaddq $H3,$T3,$H3
- vmovdqu `32*3+4`(%rsp),$T2 # r2^4
- vpaddq $H4,$T4,$H4
- vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
- vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
-
- vpmuludq $H2,$T0,$D2 # d2 = h2*r0
- vpmuludq $H2,$T1,$D3 # d3 = h2*r1
- vpmuludq $H2,$T2,$D4 # d4 = h2*r2
- vpmuludq $H2,$T3,$D0 # d0 = h2*s3
- vpmuludq $H2,$S4,$D1 # d1 = h2*s4
-
- vpmuludq $H0,$T1,$T4 # h0*r1
- vpmuludq $H1,$T1,$H2 # h1*r1
- vpaddq $T4,$D1,$D1 # d1 += h0*r1
- vpaddq $H2,$D2,$D2 # d2 += h1*r1
- vpmuludq $H3,$T1,$T4 # h3*r1
- vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
- vpaddq $T4,$D4,$D4 # d4 += h3*r1
- vpaddq $H2,$D0,$D0 # d0 += h4*s1
-
- vpmuludq $H0,$T0,$T4 # h0*r0
- vpmuludq $H1,$T0,$H2 # h1*r0
- vpaddq $T4,$D0,$D0 # d0 += h0*r0
- vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
- vpaddq $H2,$D1,$D1 # d1 += h1*r0
- vpmuludq $H3,$T0,$T4 # h3*r0
- vpmuludq $H4,$T0,$H2 # h4*r0
- vpaddq $T4,$D3,$D3 # d3 += h3*r0
- vpaddq $H2,$D4,$D4 # d4 += h4*r0
-
- vpmuludq $H3,$T1,$T4 # h3*s2
- vpmuludq $H4,$T1,$H2 # h4*s2
- vpaddq $T4,$D0,$D0 # d0 += h3*s2
- vpaddq $H2,$D1,$D1 # d1 += h4*s2
- vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
- vpmuludq $H1,$T2,$T4 # h1*r2
- vpmuludq $H0,$T2,$T2 # h0*r2
- vpaddq $T4,$D3,$D3 # d3 += h1*r2
- vpaddq $T2,$D2,$D2 # d2 += h0*r2
-
- vpmuludq $H1,$H2,$T4 # h1*r3
- vpmuludq $H0,$H2,$H2 # h0*r3
- vpaddq $T4,$D4,$D4 # d4 += h1*r3
- vpaddq $H2,$D3,$D3 # d3 += h0*r3
- vpmuludq $H3,$T3,$T4 # h3*s3
- vpmuludq $H4,$T3,$H2 # h4*s3
- vpaddq $T4,$D1,$D1 # d1 += h3*s3
- vpaddq $H2,$D2,$D2 # d2 += h4*s3
-
- vpmuludq $H3,$S4,$H3 # h3*s4
- vpmuludq $H4,$S4,$H4 # h4*s4
- vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
- vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
- vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
- vpmuludq $H1,$S4,$H0 # h1*s4
- vmovdqa 64(%rcx),$MASK # .Lmask26
- vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
- vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
-
- ################################################################
- # horizontal addition
-
- vpsrldq \$8,$D1,$T1
- vpsrldq \$8,$H2,$T2
- vpsrldq \$8,$H3,$T3
- vpsrldq \$8,$H4,$T4
- vpsrldq \$8,$H0,$T0
- vpaddq $T1,$D1,$D1
- vpaddq $T2,$H2,$H2
- vpaddq $T3,$H3,$H3
- vpaddq $T4,$H4,$H4
- vpaddq $T0,$H0,$H0
-
- vpermq \$0x2,$H3,$T3
- vpermq \$0x2,$H4,$T4
- vpermq \$0x2,$H0,$T0
- vpermq \$0x2,$D1,$T1
- vpermq \$0x2,$H2,$T2
- vpaddq $T3,$H3,$H3
- vpaddq $T4,$H4,$H4
- vpaddq $T0,$H0,$H0
- vpaddq $T1,$D1,$D1
- vpaddq $T2,$H2,$H2
-
- ################################################################
- # lazy reduction
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$D1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H4,$D4
- vpand $MASK,$H4,$H4
-
- vpsrlq \$26,$H1,$D1
- vpand $MASK,$H1,$H1
- vpaddq $D1,$H2,$H2 # h1 -> h2
-
- vpaddq $D4,$H0,$H0
- vpsllq \$2,$D4,$D4
- vpaddq $D4,$H0,$H0 # h4 -> h0
-
- vpsrlq \$26,$H2,$D2
- vpand $MASK,$H2,$H2
- vpaddq $D2,$H3,$H3 # h2 -> h3
-
- vpsrlq \$26,$H0,$D0
- vpand $MASK,$H0,$H0
- vpaddq $D0,$H1,$H1 # h0 -> h1
-
- vpsrlq \$26,$H3,$D3
- vpand $MASK,$H3,$H3
- vpaddq $D3,$H4,$H4 # h3 -> h4
-
- vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
- vmovd %x#$H1,`4*1-48-64`($ctx)
- vmovd %x#$H2,`4*2-48-64`($ctx)
- vmovd %x#$H3,`4*3-48-64`($ctx)
- vmovd %x#$H4,`4*4-48-64`($ctx)
-___
-$code.=<<___ if ($win64);
- vmovdqa 0x50(%r11),%xmm6
- vmovdqa 0x60(%r11),%xmm7
- vmovdqa 0x70(%r11),%xmm8
- vmovdqa 0x80(%r11),%xmm9
- vmovdqa 0x90(%r11),%xmm10
- vmovdqa 0xa0(%r11),%xmm11
- vmovdqa 0xb0(%r11),%xmm12
- vmovdqa 0xc0(%r11),%xmm13
- vmovdqa 0xd0(%r11),%xmm14
- vmovdqa 0xe0(%r11),%xmm15
- lea 0xf8(%r11),%rsp
-.Ldo_avx2_epilogue:
-___
-$code.=<<___ if (!$win64);
- lea 8(%r11),%rsp
-___
-$code.=<<___;
- vzeroupper
- ret
-.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
-___
-}
-$code.=<<___;
-.align 64
-.Lconst:
-.Lmask24:
-.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
-.L129:
-.long 1<<24,0,1<<24,0,1<<24,0,1<<24,0
-.Lmask26:
-.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
-.Lfive:
-.long 5,0,5,0,5,0,5,0
-___
-}
-
-$code.=<<___;
-.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-.align 16
-___
-
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-# CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {
-$rec="%rcx";
-$frame="%rdx";
-$context="%r8";
-$disp="%r9";
-
-$code.=<<___;
-.extern __imp_RtlVirtualUnwind
-.type se_handler,\@abi-omnipotent
-.align 16
-se_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<.Lprologue
- jb .Lcommon_seh_tail
-
- mov 152($context),%rax # pull context->Rsp
-
- mov 4(%r11),%r10d # HandlerData[1]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=.Lepilogue
- jae .Lcommon_seh_tail
-
- lea 48(%rax),%rax
-
- mov -8(%rax),%rbx
- mov -16(%rax),%rbp
- mov -24(%rax),%r12
- mov -32(%rax),%r13
- mov -40(%rax),%r14
- mov -48(%rax),%r15
- mov %rbx,144($context) # restore context->Rbx
- mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore context->R12
- mov %r13,224($context) # restore context->R13
- mov %r14,232($context) # restore context->R14
- mov %r15,240($context) # restore context->R15
-
- jmp .Lcommon_seh_tail
-.size se_handler,.-se_handler
-
-.type avx_handler,\@abi-omnipotent
-.align 16
-avx_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<prologue label
- jb .Lcommon_seh_tail
-
- mov 152($context),%rax # pull context->Rsp
-
- mov 4(%r11),%r10d # HandlerData[1]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=epilogue label
- jae .Lcommon_seh_tail
-
- mov 208($context),%rax # pull context->R11
-
- lea 0x50(%rax),%rsi
- lea 0xf8(%rax),%rax
- lea 512($context),%rdi # &context.Xmm6
- mov \$20,%ecx
- .long 0xa548f3fc # cld; rep movsq
-
-.Lcommon_seh_tail:
- mov 8(%rax),%rdi
- mov 16(%rax),%rsi
- mov %rax,152($context) # restore context->Rsp
- mov %rsi,168($context) # restore context->Rsi
- mov %rdi,176($context) # restore context->Rdi
-
- mov 40($disp),%rdi # disp->ContextRecord
- mov $context,%rsi # context
- mov \$154,%ecx # sizeof(CONTEXT)
- .long 0xa548f3fc # cld; rep movsq
-
- mov $disp,%rsi
- xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
- mov 8(%rsi),%rdx # arg2, disp->ImageBase
- mov 0(%rsi),%r8 # arg3, disp->ControlPc
- mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
- mov 40(%rsi),%r10 # disp->ContextRecord
- lea 56(%rsi),%r11 # &disp->HandlerData
- lea 24(%rsi),%r12 # &disp->EstablisherFrame
- mov %r10,32(%rsp) # arg5
- mov %r11,40(%rsp) # arg6
- mov %r12,48(%rsp) # arg7
- mov %rcx,56(%rsp) # arg8, (NULL)
- call *__imp_RtlVirtualUnwind(%rip)
-
- mov \$1,%eax # ExceptionContinueSearch
- add \$64,%rsp
- popfq
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- pop %rdi
- pop %rsi
- ret
-.size avx_handler,.-avx_handler
-
-.section .pdata
-.align 4
- .rva .LSEH_begin_poly1305_init
- .rva .LSEH_end_poly1305_init
- .rva .LSEH_info_poly1305_init
-
- .rva .LSEH_begin_poly1305_blocks
- .rva .LSEH_end_poly1305_blocks
- .rva .LSEH_info_poly1305_blocks
-
- .rva .LSEH_begin_poly1305_emit
- .rva .LSEH_end_poly1305_emit
- .rva .LSEH_info_poly1305_emit
-___
-$code.=<<___ if ($avx);
- .rva .LSEH_begin_poly1305_blocks_avx
- .rva .Lbase2_64_avx
- .rva .LSEH_info_poly1305_blocks_avx_1
-
- .rva .Lbase2_64_avx
- .rva .Leven_avx
- .rva .LSEH_info_poly1305_blocks_avx_2
-
- .rva .Leven_avx
- .rva .LSEH_end_poly1305_blocks_avx
- .rva .LSEH_info_poly1305_blocks_avx_3
-
- .rva .LSEH_begin_poly1305_emit_avx
- .rva .LSEH_end_poly1305_emit_avx
- .rva .LSEH_info_poly1305_emit_avx
-___
-$code.=<<___ if ($avx>1);
- .rva .LSEH_begin_poly1305_blocks_avx2
- .rva .Lbase2_64_avx2
- .rva .LSEH_info_poly1305_blocks_avx2_1
-
- .rva .Lbase2_64_avx2
- .rva .Leven_avx2
- .rva .LSEH_info_poly1305_blocks_avx2_2
-
- .rva .Leven_avx2
- .rva .LSEH_end_poly1305_blocks_avx2
- .rva .LSEH_info_poly1305_blocks_avx2_3
-___
-$code.=<<___;
-.section .xdata
-.align 8
-.LSEH_info_poly1305_init:
- .byte 9,0,0,0
- .rva se_handler
- .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
-
-.LSEH_info_poly1305_blocks:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lblocks_body,.Lblocks_epilogue
-
-.LSEH_info_poly1305_emit:
- .byte 9,0,0,0
- .rva se_handler
- .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
-___
-$code.=<<___ if ($avx);
-.LSEH_info_poly1305_blocks_avx_1:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx_2:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx_3:
- .byte 9,0,0,0
- .rva avx_handler
- .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_emit_avx:
- .byte 9,0,0,0
- .rva se_handler
- .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
-___
-$code.=<<___ if ($avx>1);
-.LSEH_info_poly1305_blocks_avx2_1:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx2_2:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx2_3:
- .byte 9,0,0,0
- .rva avx_handler
- .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
-___
-}
-
-foreach (split('\n',$code)) {
- s/\`([^\`]*)\`/eval($1)/ge;
- s/%r([a-z]+)#d/%e$1/g;
- s/%r([0-9]+)#d/%r$1d/g;
- s/%x#%y/%x/g;
-
- print $_,"\n";
-}
-close STDOUT;
diff --git a/src/crypto/test/CMakeLists.txt b/src/crypto/test/CMakeLists.txt
index 8c75314..8857913 100644
--- a/src/crypto/test/CMakeLists.txt
+++ b/src/crypto/test/CMakeLists.txt
@@ -7,3 +7,11 @@
malloc.cc
test_util.cc
)
+
+add_library(
+ gtest_main
+
+ OBJECT
+
+ gtest_main.cc
+)
diff --git a/src/crypto/test/gtest_main.cc b/src/crypto/test/gtest_main.cc
new file mode 100644
index 0000000..50147bc
--- /dev/null
+++ b/src/crypto/test/gtest_main.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <gtest/gtest.h>
+
+#include <openssl/crypto.h>
+
+int main(int argc, char **argv) {
+ CRYPTO_library_init();
+ testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
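For context, a minimal sketch of how a test converted to GTest would use the pieces added above: gtest_main.cc supplies main(), calling CRYPTO_library_init() before running the tests, so a converted test file only needs to define TEST() cases and be compiled into the GTest-based target. The file name example_test.cc, the test name, and its body are illustrative assumptions, not part of this change.

/* example_test.cc -- hypothetical GTest-based test; relies on gtest_main for main(). */

#include <gtest/gtest.h>

#include <stdint.h>

#include <openssl/mem.h>

TEST(ExampleTest, MallocAndFree) {
  // OPENSSL_malloc returns NULL on failure; a small allocation should succeed.
  uint8_t *buf = reinterpret_cast<uint8_t *>(OPENSSL_malloc(16));
  ASSERT_TRUE(buf);
  OPENSSL_free(buf);
}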