external/boringssl: Sync to 6d50f475e319de153a43e1dba5a1beca95948c63.

This includes the following changes:

https://boringssl.googlesource.com/boringssl/+log/0726fb76ebe7f422e3c4fb2e25a0064926975770..6d50f475e319de153a43e1dba5a1beca95948c63

This also updates the build files to add the new GTest-based targets and
work with the C++ file in libssl.

Test: cts-tradefed run cts -m CtsLibcoreOkHttpTestCases -a arm64-v8a
Test: cts-tradefed run cts -m CtsLibcoreTestCases -a arm64-v8a

Change-Id: I99718d51c901fe2e2e1e0398fc61fe1e76ccdb3f
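
Note (illustrative only, not part of the upstream change): the aead_test.cc hunk below adds a TestTruncatedTags case for the AEADs flagged with truncated_tags. A minimal stand-alone sketch of the EVP_AEAD usage it exercises — initialise with a one-byte tag, then check that sealing adds exactly one byte of overhead — looks roughly like this, assuming an AEAD such as AES-128-GCM that the new table marks as supporting truncated tags:

    // Sketch only; mirrors the calls made by TestTruncatedTags below.
    #include <stdint.h>
    #include <string.h>

    #include <openssl/aead.h>

    int main(void) {
      const EVP_AEAD *aead = EVP_aead_aes_128_gcm();

      uint8_t key[EVP_AEAD_MAX_KEY_LENGTH] = {0};
      uint8_t nonce[EVP_AEAD_MAX_NONCE_LENGTH] = {0};

      EVP_AEAD_CTX ctx;
      // Request a one-byte tag rather than the default tag length.
      if (!EVP_AEAD_CTX_init(&ctx, aead, key, EVP_AEAD_key_length(aead),
                             1 /* truncated tag */, NULL /* ENGINE */)) {
        return 1;
      }

      const uint8_t plaintext[1] = {'A'};
      uint8_t ciphertext[128];
      size_t ciphertext_len;
      if (!EVP_AEAD_CTX_seal(&ctx, ciphertext, &ciphertext_len,
                             sizeof(ciphertext), nonce,
                             EVP_AEAD_nonce_length(aead), plaintext,
                             sizeof(plaintext), NULL /* ad */, 0)) {
        EVP_AEAD_CTX_cleanup(&ctx);
        return 1;
      }
      EVP_AEAD_CTX_cleanup(&ctx);

      // With the tag truncated to one byte, overhead is exactly one byte.
      return ciphertext_len == sizeof(plaintext) + 1 ? 0 : 1;
    }
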
diff --git a/src/crypto/CMakeLists.txt b/src/crypto/CMakeLists.txt
index 97fea5f..36224fc 100644
--- a/src/crypto/CMakeLists.txt
+++ b/src/crypto/CMakeLists.txt
@@ -207,3 +207,17 @@
 
 target_link_libraries(refcount_test crypto)
 add_dependencies(all_tests refcount_test)
+
+# TODO(davidben): Convert the remaining tests to GTest.
+add_executable(
+  crypto_test
+
+  dh/dh_test.cc
+  dsa/dsa_test.cc
+
+  $<TARGET_OBJECTS:gtest_main>
+  $<TARGET_OBJECTS:test_support>
+)
+
+target_link_libraries(crypto_test crypto gtest)
+add_dependencies(all_tests crypto_test)
diff --git a/src/crypto/cipher/CMakeLists.txt b/src/crypto/cipher/CMakeLists.txt
index 52b87b6..db46c4b 100644
--- a/src/crypto/cipher/CMakeLists.txt
+++ b/src/crypto/cipher/CMakeLists.txt
@@ -1,5 +1,13 @@
 include_directories(../../include)
 
+if (${ARCH} STREQUAL "x86_64")
+  set(
+    CIPHER_ARCH_SOURCES
+
+    chacha20_poly1305_x86_64.${ASM_EXT}
+  )
+endif()
+
 add_library(
   cipher
 
@@ -19,6 +27,8 @@
   tls_cbc.c
   e_tls.c
   e_ssl3.c
+
+  ${CIPHER_ARCH_SOURCES}
 )
 
 add_executable(
@@ -35,6 +45,8 @@
   $<TARGET_OBJECTS:test_support>
 )
 
+perlasm(chacha20_poly1305_x86_64.${ASM_EXT} asm/chacha20_poly1305_x86_64.pl)
+
 target_link_libraries(cipher_test crypto)
 target_link_libraries(aead_test crypto)
 add_dependencies(all_tests cipher_test aead_test)
diff --git a/src/crypto/cipher/aead_test.cc b/src/crypto/cipher/aead_test.cc
index 0c95fb4..fb5200e 100644
--- a/src/crypto/cipher/aead_test.cc
+++ b/src/crypto/cipher/aead_test.cc
@@ -12,6 +12,7 @@
  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
 
+#include <assert.h>
 #include <stdint.h>
 #include <string.h>
 
@@ -168,16 +169,12 @@
 }
 
 static int TestCleanupAfterInitFailure(const EVP_AEAD *aead) {
-  EVP_AEAD_CTX ctx;
-  uint8_t key[128];
-
+  uint8_t key[EVP_AEAD_MAX_KEY_LENGTH];
   OPENSSL_memset(key, 0, sizeof(key));
   const size_t key_len = EVP_AEAD_key_length(aead);
-  if (key_len > sizeof(key)) {
-    fprintf(stderr, "Key length of AEAD too long.\n");
-    return 0;
-  }
+  assert(sizeof(key) >= key_len);
 
+  EVP_AEAD_CTX ctx;
   if (EVP_AEAD_CTX_init(&ctx, aead, key, key_len,
                         9999 /* a silly tag length to trigger an error */,
                         NULL /* ENGINE */) != 0) {
@@ -201,6 +198,80 @@
   return 1;
 }
 
+static int TestTruncatedTags(const EVP_AEAD *aead) {
+  uint8_t key[EVP_AEAD_MAX_KEY_LENGTH];
+  OPENSSL_memset(key, 0, sizeof(key));
+  const size_t key_len = EVP_AEAD_key_length(aead);
+  assert(sizeof(key) >= key_len);
+
+  uint8_t nonce[EVP_AEAD_MAX_NONCE_LENGTH];
+  OPENSSL_memset(nonce, 0, sizeof(nonce));
+  const size_t nonce_len = EVP_AEAD_nonce_length(aead);
+  assert(sizeof(nonce) >= nonce_len);
+
+  bssl::ScopedEVP_AEAD_CTX ctx;
+  if (!EVP_AEAD_CTX_init(ctx.get(), aead, key, key_len, 1 /* one byte tag */,
+                         NULL /* ENGINE */)) {
+    fprintf(stderr, "Couldn't initialise AEAD with truncated tag.\n");
+    return 1;
+  }
+
+  const uint8_t plaintext[1] = {'A'};
+
+  uint8_t ciphertext[128];
+  size_t ciphertext_len;
+  constexpr uint8_t kSentinel = 42;
+  OPENSSL_memset(ciphertext, kSentinel, sizeof(ciphertext));
+
+  if (!EVP_AEAD_CTX_seal(ctx.get(), ciphertext, &ciphertext_len,
+                         sizeof(ciphertext), nonce, nonce_len, plaintext,
+                         sizeof(plaintext), nullptr /* ad */, 0)) {
+    fprintf(stderr, "Sealing with truncated tag didn't work.\n");
+    return 0;
+  }
+
+  for (size_t i = ciphertext_len; i < sizeof(ciphertext); i++) {
+    // Sealing must not write past where it said it did.
+    if (ciphertext[i] != kSentinel) {
+      fprintf(stderr, "Sealing wrote off the end of the buffer.\n");
+      return 0;
+    }
+  }
+
+  const size_t overhead_used = ciphertext_len - sizeof(plaintext);
+  if (overhead_used != 1) {
+    fprintf(stderr, "AEAD is probably ignoring request to truncate tags.\n");
+    return 0;
+  }
+
+  uint8_t plaintext2[sizeof(plaintext) + 16];
+  OPENSSL_memset(plaintext2, kSentinel, sizeof(plaintext2));
+
+  size_t plaintext2_len;
+  if (!EVP_AEAD_CTX_open(ctx.get(), plaintext2, &plaintext2_len,
+                         sizeof(plaintext2), nonce, nonce_len, ciphertext,
+                         ciphertext_len, nullptr /* ad */, 0)) {
+    fprintf(stderr, "Opening with truncated tag didn't work.\n");
+    return 0;
+  }
+
+  for (size_t i = plaintext2_len; i < sizeof(plaintext2); i++) {
+    // Likewise, opening should also stay within bounds.
+    if (plaintext2[i] != kSentinel) {
+      fprintf(stderr, "Opening wrote off the end of the buffer.\n");
+      return 0;
+    }
+  }
+
+  if (plaintext2_len != sizeof(plaintext) ||
+      OPENSSL_memcmp(plaintext2, plaintext, sizeof(plaintext)) != 0) {
+    fprintf(stderr, "Opening with truncated tag gave wrong result.\n");
+    return 0;
+  }
+
+  return 1;
+}
+
 static bool TestWithAliasedBuffers(const EVP_AEAD *aead) {
   const size_t key_len = EVP_AEAD_key_length(aead);
   const size_t nonce_len = EVP_AEAD_nonce_length(aead);
@@ -306,30 +377,32 @@
   // handle inputs that are a multiple of eight bytes in length and the
   // SSLv3/TLS AEADs have the concept of “direction”.
   bool limited_implementation;
+  // truncated_tags is true if the AEAD supports truncating tags to arbitrary
+  // lengths.
+  bool truncated_tags;
 };
 
 static const struct KnownAEAD kAEADs[] = {
-  { "aes-128-gcm", EVP_aead_aes_128_gcm, false },
-  { "aes-256-gcm", EVP_aead_aes_256_gcm, false },
-  { "aes-128-gcm-siv", EVP_aead_aes_128_gcm_siv, false },
-  { "aes-256-gcm-siv", EVP_aead_aes_256_gcm_siv, false },
-  { "chacha20-poly1305", EVP_aead_chacha20_poly1305, false },
-  { "chacha20-poly1305-old", EVP_aead_chacha20_poly1305_old, false },
-  { "aes-128-cbc-sha1-tls", EVP_aead_aes_128_cbc_sha1_tls, true },
-  { "aes-128-cbc-sha1-tls-implicit-iv", EVP_aead_aes_128_cbc_sha1_tls_implicit_iv, true },
-  { "aes-128-cbc-sha256-tls", EVP_aead_aes_128_cbc_sha256_tls, true },
-  { "aes-256-cbc-sha1-tls", EVP_aead_aes_256_cbc_sha1_tls, true },
-  { "aes-256-cbc-sha1-tls-implicit-iv", EVP_aead_aes_256_cbc_sha1_tls_implicit_iv, true },
-  { "aes-256-cbc-sha256-tls", EVP_aead_aes_256_cbc_sha256_tls, true },
-  { "aes-256-cbc-sha384-tls", EVP_aead_aes_256_cbc_sha384_tls, true },
-  { "des-ede3-cbc-sha1-tls", EVP_aead_des_ede3_cbc_sha1_tls, true },
-  { "des-ede3-cbc-sha1-tls-implicit-iv", EVP_aead_des_ede3_cbc_sha1_tls_implicit_iv, true },
-  { "aes-128-cbc-sha1-ssl3", EVP_aead_aes_128_cbc_sha1_ssl3, true },
-  { "aes-256-cbc-sha1-ssl3", EVP_aead_aes_256_cbc_sha1_ssl3, true },
-  { "des-ede3-cbc-sha1-ssl3", EVP_aead_des_ede3_cbc_sha1_ssl3, true },
-  { "aes-128-ctr-hmac-sha256", EVP_aead_aes_128_ctr_hmac_sha256, false },
-  { "aes-256-ctr-hmac-sha256", EVP_aead_aes_256_ctr_hmac_sha256, false },
-  { "", NULL, false },
+  { "aes-128-gcm", EVP_aead_aes_128_gcm, false, true },
+  { "aes-256-gcm", EVP_aead_aes_256_gcm, false, true },
+  { "aes-128-gcm-siv", EVP_aead_aes_128_gcm_siv, false, false },
+  { "aes-256-gcm-siv", EVP_aead_aes_256_gcm_siv, false, false },
+  { "chacha20-poly1305", EVP_aead_chacha20_poly1305, false, true },
+  { "aes-128-cbc-sha1-tls", EVP_aead_aes_128_cbc_sha1_tls, true, false },
+  { "aes-128-cbc-sha1-tls-implicit-iv", EVP_aead_aes_128_cbc_sha1_tls_implicit_iv, true, false },
+  { "aes-128-cbc-sha256-tls", EVP_aead_aes_128_cbc_sha256_tls, true, false },
+  { "aes-256-cbc-sha1-tls", EVP_aead_aes_256_cbc_sha1_tls, true, false },
+  { "aes-256-cbc-sha1-tls-implicit-iv", EVP_aead_aes_256_cbc_sha1_tls_implicit_iv, true, false },
+  { "aes-256-cbc-sha256-tls", EVP_aead_aes_256_cbc_sha256_tls, true, false },
+  { "aes-256-cbc-sha384-tls", EVP_aead_aes_256_cbc_sha384_tls, true, false },
+  { "des-ede3-cbc-sha1-tls", EVP_aead_des_ede3_cbc_sha1_tls, true, false },
+  { "des-ede3-cbc-sha1-tls-implicit-iv", EVP_aead_des_ede3_cbc_sha1_tls_implicit_iv, true, false },
+  { "aes-128-cbc-sha1-ssl3", EVP_aead_aes_128_cbc_sha1_ssl3, true, false },
+  { "aes-256-cbc-sha1-ssl3", EVP_aead_aes_256_cbc_sha1_ssl3, true, false },
+  { "des-ede3-cbc-sha1-ssl3", EVP_aead_des_ede3_cbc_sha1_ssl3, true, false },
+  { "aes-128-ctr-hmac-sha256", EVP_aead_aes_128_ctr_hmac_sha256, false, true },
+  { "aes-256-ctr-hmac-sha256", EVP_aead_aes_256_ctr_hmac_sha256, false, true },
+  { "", NULL, false, false },
 };
 
 int main(int argc, char **argv) {
@@ -363,6 +436,11 @@
     return 1;
   }
 
+  if (known_aead->truncated_tags && !TestTruncatedTags(aead)) {
+    fprintf(stderr, "Truncated tags test failed for %s.\n", known_aead->name);
+    return 1;
+  }
+
   if (!known_aead->limited_implementation && !TestWithAliasedBuffers(aead)) {
     fprintf(stderr, "Aliased buffers test failed for %s.\n", known_aead->name);
     return 1;
diff --git a/src/crypto/cipher/asm/chacha20_poly1305_x86_64.pl b/src/crypto/cipher/asm/chacha20_poly1305_x86_64.pl
new file mode 100644
index 0000000..c3f3e0b
--- /dev/null
+++ b/src/crypto/cipher/asm/chacha20_poly1305_x86_64.pl
@@ -0,0 +1,2379 @@
+#!/usr/bin/env perl
+
+# Copyright (c) 2015, CloudFlare Ltd.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+##############################################################################
+#                                                                            #
+# Author:  Vlad Krasnov                                                      #
+#                                                                            #
+##############################################################################
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$avx = 2;
+
+$code.=<<___;
+.text
+.extern OPENSSL_ia32cap_P
+.align 64
+.chacha20_consts:
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.rol8:
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.rol16:
+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.avx2_init:
+.long 0,0,0,0
+.sse_inc:
+.long 1,0,0,0
+.avx2_inc:
+.long 2,0,0,0,2,0,0,0
+.clamp:
+.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
+.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
+.align 16
+.and_masks:
+.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
+___
+
+my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8");
+my ($acc0,$acc1,$acc2)=map("%r$_",(10..12));
+my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9");
+my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15));
+my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
+my $r_store="0*16(%rbp)";
+my $s_store="1*16(%rbp)";
+my $len_store="2*16(%rbp)";
+my $state1_store="3*16(%rbp)";
+my $state2_store="4*16(%rbp)";
+my $tmp_store="5*16(%rbp)";
+my $ctr0_store="6*16(%rbp)";
+my $ctr1_store="7*16(%rbp)";
+my $ctr2_store="8*16(%rbp)";
+my $ctr3_store="9*16(%rbp)";
+
+sub chacha_qr {
+my ($a,$b,$c,$d,$t,$dir)=@_;
+$code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/);
+$code.="paddd $b, $a
+        pxor $a, $d
+        pshufb .rol16(%rip), $d
+        paddd $d, $c
+        pxor $c, $b
+        movdqa $b, $t
+        pslld \$12, $t
+        psrld \$20, $b
+        pxor $t, $b
+        paddd $b, $a
+        pxor $a, $d
+        pshufb .rol8(%rip), $d
+        paddd $d, $c
+        pxor $c, $b
+        movdqa $b, $t
+        pslld \$7, $t
+        psrld \$25, $b
+        pxor $t, $b\n";
+$code.="palignr \$4, $b, $b
+        palignr \$8, $c, $c
+        palignr \$12, $d, $d\n" if ($dir =~ /left/);
+$code.="palignr \$12, $b, $b
+        palignr \$8, $c, $c
+        palignr \$4, $d, $d\n" if ($dir =~ /right/);
+$code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/);
+}
+
+sub poly_add {
+my ($src)=@_;
+$code.="add $src, $acc0
+        adc 8+$src, $acc1
+        adc \$1, $acc2\n";
+}
+
+sub poly_stage1 {
+$code.="mov 0+$r_store, %rax
+        mov %rax, $t2
+        mul $acc0
+        mov %rax, $t0
+        mov %rdx, $t1
+        mov 0+$r_store, %rax
+        mul $acc1
+        imul $acc2, $t2
+        add %rax, $t1
+        adc %rdx, $t2\n";
+}
+
+sub poly_stage2 {
+$code.="mov 8+$r_store, %rax
+        mov %rax, $t3
+        mul $acc0
+        add %rax, $t1
+        adc \$0, %rdx
+        mov %rdx, $acc0
+        mov 8+$r_store, %rax
+        mul $acc1
+        add %rax, $t2
+        adc \$0, %rdx\n";
+}
+
+sub poly_stage3 {
+$code.="imul $acc2, $t3
+        add $acc0, $t2
+        adc %rdx, $t3\n";
+}
+
+sub poly_reduce_stage {
+$code.="mov $t0, $acc0
+        mov $t1, $acc1
+        mov $t2, $acc2
+        and \$3, $acc2
+        mov $t2, $t0
+        and \$-4, $t0
+        mov $t3, $t1
+        shrd \$2, $t3, $t2
+        shr \$2, $t3
+        add $t0, $acc0
+        adc $t1, $acc1
+        adc \$0, $acc2
+        add $t2, $acc0
+        adc $t3, $acc1
+        adc \$0, $acc2\n";
+}
+
+sub poly_mul {
+    &poly_stage1();
+    &poly_stage2();
+    &poly_stage3();
+    &poly_reduce_stage();
+}
+
+sub prep_state {
+my ($n)=@_;
+$code.="movdqa .chacha20_consts(%rip), $A0
+        movdqa $state1_store, $B0
+        movdqa $state2_store, $C0\n";
+$code.="movdqa $A0, $A1
+        movdqa $B0, $B1
+        movdqa $C0, $C1\n" if ($n ge 2);
+$code.="movdqa $A0, $A2
+        movdqa $B0, $B2
+        movdqa $C0, $C2\n" if ($n ge 3);
+$code.="movdqa $A0, $A3
+        movdqa $B0, $B3
+        movdqa $C0, $C3\n" if ($n ge 4);
+$code.="movdqa $ctr0_store, $D0
+        paddd .sse_inc(%rip), $D0
+        movdqa $D0, $ctr0_store\n" if ($n eq 1);
+$code.="movdqa $ctr0_store, $D1
+        paddd .sse_inc(%rip), $D1
+        movdqa $D1, $D0
+        paddd .sse_inc(%rip), $D0
+        movdqa $D0, $ctr0_store
+        movdqa $D1, $ctr1_store\n" if ($n eq 2);
+$code.="movdqa $ctr0_store, $D2
+        paddd .sse_inc(%rip), $D2
+        movdqa $D2, $D1
+        paddd .sse_inc(%rip), $D1
+        movdqa $D1, $D0
+        paddd .sse_inc(%rip), $D0
+        movdqa $D0, $ctr0_store
+        movdqa $D1, $ctr1_store
+        movdqa $D2, $ctr2_store\n" if ($n eq 3);
+$code.="movdqa $ctr0_store, $D3
+        paddd .sse_inc(%rip), $D3
+        movdqa $D3, $D2
+        paddd .sse_inc(%rip), $D2
+        movdqa $D2, $D1
+        paddd .sse_inc(%rip), $D1
+        movdqa $D1, $D0
+        paddd .sse_inc(%rip), $D0
+        movdqa $D0, $ctr0_store
+        movdqa $D1, $ctr1_store
+        movdqa $D2, $ctr2_store
+        movdqa $D3, $ctr3_store\n" if ($n eq 4);
+}
+
+sub finalize_state {
+my ($n)=@_;
+$code.="paddd .chacha20_consts(%rip), $A3
+        paddd $state1_store, $B3
+        paddd $state2_store, $C3
+        paddd $ctr3_store, $D3\n" if ($n eq 4);
+$code.="paddd .chacha20_consts(%rip), $A2
+        paddd $state1_store, $B2
+        paddd $state2_store, $C2
+        paddd $ctr2_store, $D2\n" if ($n ge 3);
+$code.="paddd .chacha20_consts(%rip), $A1
+        paddd $state1_store, $B1
+        paddd $state2_store, $C1
+        paddd $ctr1_store, $D1\n" if ($n ge 2);
+$code.="paddd .chacha20_consts(%rip), $A0
+        paddd $state1_store, $B0
+        paddd $state2_store, $C0
+        paddd $ctr0_store, $D0\n";
+}
+
+sub xor_stream {
+my ($A, $B, $C, $D, $offset)=@_;
+$code.="movdqu 0*16 + $offset($inp), $A3
+        movdqu 1*16 + $offset($inp), $B3
+        movdqu 2*16 + $offset($inp), $C3
+        movdqu 3*16 + $offset($inp), $D3
+        pxor $A3, $A
+        pxor $B3, $B
+        pxor $C3, $C
+        pxor $D, $D3
+        movdqu $A, 0*16 + $offset($oup)
+        movdqu $B, 1*16 + $offset($oup)
+        movdqu $C, 2*16 + $offset($oup)
+        movdqu $D3, 3*16 + $offset($oup)\n";
+}
+
+sub xor_stream_using_temp {
+my ($A, $B, $C, $D, $offset, $temp)=@_;
+$code.="movdqa $temp, $tmp_store
+        movdqu 0*16 + $offset($inp), $temp
+        pxor $A, $temp
+        movdqu $temp, 0*16 + $offset($oup)
+        movdqu 1*16 + $offset($inp), $temp
+        pxor $B, $temp
+        movdqu $temp, 1*16 + $offset($oup)
+        movdqu 2*16 + $offset($inp), $temp
+        pxor $C, $temp
+        movdqu $temp, 2*16 + $offset($oup)
+        movdqu 3*16 + $offset($inp), $temp
+        pxor $D, $temp
+        movdqu $temp, 3*16 + $offset($oup)\n";
+}
+
+sub gen_chacha_round {
+my ($rot1, $rot2, $shift)=@_;
+my $round="";
+$round.="movdqa $C0, $tmp_store\n" if ($rot1 eq 20);
+$round.="movdqa $rot2, $C0
+         paddd $B3, $A3
+         paddd $B2, $A2
+         paddd $B1, $A1
+         paddd $B0, $A0
+         pxor $A3, $D3
+         pxor $A2, $D2
+         pxor $A1, $D1
+         pxor $A0, $D0
+         pshufb $C0, $D3
+         pshufb $C0, $D2
+         pshufb $C0, $D1
+         pshufb $C0, $D0
+         movdqa $tmp_store, $C0
+         paddd $D3, $C3
+         paddd $D2, $C2
+         paddd $D1, $C1
+         paddd $D0, $C0
+         pxor $C3, $B3
+         pxor $C2, $B2
+         pxor $C1, $B1
+         pxor $C0, $B0
+         movdqa $C0, $tmp_store
+         movdqa $B3, $C0
+         psrld \$$rot1, $C0
+         pslld \$32-$rot1, $B3
+         pxor $C0, $B3
+         movdqa $B2, $C0
+         psrld \$$rot1, $C0
+         pslld \$32-$rot1, $B2
+         pxor $C0, $B2
+         movdqa $B1, $C0
+         psrld \$$rot1, $C0
+         pslld \$32-$rot1, $B1
+         pxor $C0, $B1
+         movdqa $B0, $C0
+         psrld \$$rot1, $C0
+         pslld \$32-$rot1, $B0
+         pxor $C0, $B0\n";
+($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
+($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
+$round.="movdqa $tmp_store, $C0
+         palignr \$$s1, $B3, $B3
+         palignr \$$s2, $C3, $C3
+         palignr \$$s3, $D3, $D3
+         palignr \$$s1, $B2, $B2
+         palignr \$$s2, $C2, $C2
+         palignr \$$s3, $D2, $D2
+         palignr \$$s1, $B1, $B1
+         palignr \$$s2, $C1, $C1
+         palignr \$$s3, $D1, $D1
+         palignr \$$s1, $B0, $B0
+         palignr \$$s2, $C0, $C0
+         palignr \$$s3, $D0, $D0\n"
+if (($shift =~ /left/) || ($shift =~ /right/));
+return $round;
+};
+
+$chacha_body = &gen_chacha_round(20, ".rol16(%rip)") .
+               &gen_chacha_round(25, ".rol8(%rip)", "left") .
+               &gen_chacha_round(20, ".rol16(%rip)") .
+               &gen_chacha_round(25, ".rol8(%rip)", "right");
+
+my @loop_body = split /\n/, $chacha_body;
+
+sub emit_body {
+my ($n)=@_;
+    for (my $i=0; $i < $n; $i++) {
+        $code=$code.shift(@loop_body)."\n";
+    };
+}
+
+{
+################################################################################
+# void poly_hash_ad_internal();
+$code.="
+.type poly_hash_ad_internal,\@function,2
+.align 64
+poly_hash_ad_internal:
+.cfi_startproc
+    xor $acc0, $acc0
+    xor $acc1, $acc1
+    xor $acc2, $acc2
+    cmp \$13,  $itr2
+    jne hash_ad_loop
+poly_fast_tls_ad:
+    # Special treatment for the TLS case of 13 bytes
+    mov ($adp), $acc0
+    mov 5($adp), $acc1
+    shr \$24, $acc1
+    mov \$1, $acc2\n";
+    &poly_mul(); $code.="
+    ret
+hash_ad_loop:
+        # Hash in 16 byte chunk
+        cmp \$16, $itr2
+        jb hash_ad_tail\n";
+        &poly_add("0($adp)");
+        &poly_mul(); $code.="
+        lea (1*16)($adp), $adp
+        sub \$16, $itr2
+    jmp hash_ad_loop
+hash_ad_tail:
+    cmp \$0, $itr2
+    je 1f
+    # Hash last < 16 byte tail
+    xor $t0, $t0
+    xor $t1, $t1
+    xor $t2, $t2
+    add $itr2, $adp
+hash_ad_tail_loop:
+        shld \$8, $t0, $t1
+        shl \$8, $t0
+        movzxb -1($adp), $t2
+        xor $t2, $t0
+        dec $adp
+        dec $itr2
+    jne hash_ad_tail_loop
+
+    add $t0, $acc0
+    adc $t1, $acc1
+    adc \$1, $acc2\n";
+    &poly_mul(); $code.="
+    # Finished AD
+1:
+    ret
+.cfi_endproc
+.size poly_hash_ad_internal, .-poly_hash_ad_internal\n";
+}
+
+{
+################################################################################
+# void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
+$code.="
+.globl chacha20_poly1305_open
+.type chacha20_poly1305_open,\@function,2
+.align 64
+chacha20_poly1305_open:
+.cfi_startproc
+    push %rbp
+.cfi_adjust_cfa_offset 8
+    push %rbx
+.cfi_adjust_cfa_offset 8
+    push %r12
+.cfi_adjust_cfa_offset 8
+    push %r13
+.cfi_adjust_cfa_offset 8
+    push %r14
+.cfi_adjust_cfa_offset 8
+    push %r15
+.cfi_adjust_cfa_offset 8
+    # We write the calculated authenticator back to keyp at the end, so save
+    # the pointer on the stack too.
+    push $keyp
+.cfi_adjust_cfa_offset 8
+    sub \$288 + 32, %rsp
+.cfi_adjust_cfa_offset 288 + 32
+.cfi_offset rbp, -16
+.cfi_offset rbx, -24
+.cfi_offset r12, -32
+.cfi_offset r13, -40
+.cfi_offset r14, -48
+.cfi_offset r15, -56
+.cfi_offset $keyp, -64
+    lea 32(%rsp), %rbp
+    and \$-32, %rbp
+    mov %rdx, 8+$len_store
+    mov %r8, 0+$len_store
+    mov %rdx, $inl\n"; $code.="
+    mov OPENSSL_ia32cap_P+8(%rip), %eax
+    and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
+    xor \$`(1<<5) + (1<<8)`, %eax
+    jz  chacha20_poly1305_open_avx2\n" if ($avx>1);
+$code.="
+1:
+    cmp \$128, $inl
+    jbe open_sse_128
+    # For long buffers, prepare the poly key first
+    movdqa .chacha20_consts(%rip), $A0
+    movdqu 0*16($keyp), $B0
+    movdqu 1*16($keyp), $C0
+    movdqu 2*16($keyp), $D0
+    movdqa $D0, $T1
+    # Store on stack, to free keyp
+    movdqa $B0, $state1_store
+    movdqa $C0, $state2_store
+    movdqa $D0, $ctr0_store
+    mov \$10, $acc0
+1:  \n";
+        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
+        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
+        dec $acc0
+    jne 1b
+    # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
+    paddd .chacha20_consts(%rip), $A0
+    paddd $state1_store, $B0
+    # Clamp and store the key
+    pand .clamp(%rip), $A0
+    movdqa $A0, $r_store
+    movdqa $B0, $s_store
+    # Hash
+    mov %r8, $itr2
+    call poly_hash_ad_internal
+open_sse_main_loop:
+        cmp \$16*16, $inl
+        jb 2f
+        # Load state, increment counter blocks\n";
+        &prep_state(4); $code.="
+        # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we
+        # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
+        mov \$4, $itr1
+        mov $inp, $itr2
+1:  \n";
+            &emit_body(20);
+            &poly_add("0($itr2)"); $code.="
+            lea 2*8($itr2), $itr2\n";
+            &emit_body(20);
+            &poly_stage1();
+            &emit_body(20);
+            &poly_stage2();
+            &emit_body(20);
+            &poly_stage3();
+            &emit_body(20);
+            &poly_reduce_stage();
+            foreach $l (@loop_body) {$code.=$l."\n";}
+            @loop_body = split /\n/, $chacha_body; $code.="
+            dec $itr1
+        jge 1b\n";
+            &poly_add("0($itr2)");
+            &poly_mul(); $code.="
+            lea 2*8($itr2), $itr2
+            cmp \$-6, $itr1
+        jg 1b\n";
+        &finalize_state(4);
+        &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
+        &xor_stream($A2, $B2, $C2, $D2, "4*16");
+        &xor_stream($A1, $B1, $C1, $D1, "8*16");
+        &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.="
+        lea 16*16($inp), $inp
+        lea 16*16($oup), $oup
+        sub \$16*16, $inl
+    jmp open_sse_main_loop
+2:
+    # Handle the various tail sizes efficiently
+    test $inl, $inl
+    jz open_sse_finalize
+    cmp \$4*16, $inl
+    ja 3f\n";
+###############################################################################
+    # At most 64 bytes are left
+    &prep_state(1); $code.="
+    xor $itr2, $itr2
+    mov $inl, $itr1
+    cmp \$16, $itr1
+    jb 2f
+1:  \n";
+        &poly_add("0($inp, $itr2)");
+        &poly_mul(); $code.="
+        sub \$16, $itr1
+2:
+        add \$16, $itr2\n";
+        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
+        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
+        cmp \$16, $itr1
+    jae 1b
+        cmp \$10*16, $itr2
+    jne 2b\n";
+    &finalize_state(1); $code.="
+    jmp open_sse_tail_64_dec_loop
+3:
+    cmp \$8*16, $inl
+    ja 3f\n";
+###############################################################################
+    # 65 - 128 bytes are left
+    &prep_state(2); $code.="
+    mov $inl, $itr1
+    and \$-16, $itr1
+    xor $itr2, $itr2
+1:  \n";
+        &poly_add("0($inp, $itr2)");
+        &poly_mul(); $code.="
+2:
+        add \$16, $itr2\n";
+        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
+        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
+        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
+        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.="
+        cmp $itr1, $itr2
+    jb 1b
+        cmp \$10*16, $itr2
+    jne 2b\n";
+    &finalize_state(2);
+    &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.="
+    sub \$4*16, $inl
+    lea 4*16($inp), $inp
+    lea 4*16($oup), $oup
+    jmp open_sse_tail_64_dec_loop
+3:
+    cmp \$12*16, $inl
+    ja 3f\n";
+###############################################################################
+    # 129 - 192 bytes are left
+    &prep_state(3); $code.="
+    mov $inl, $itr1
+    mov \$10*16, $itr2
+    cmp \$10*16, $itr1
+    cmovg $itr2, $itr1
+    and \$-16, $itr1
+    xor $itr2, $itr2
+1:  \n";
+        &poly_add("0($inp, $itr2)");
+        &poly_mul(); $code.="
+2:
+        add \$16, $itr2\n";
+        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
+        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
+        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
+        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
+        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
+        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
+        cmp $itr1, $itr2
+    jb 1b
+        cmp \$10*16, $itr2
+    jne 2b
+    cmp \$11*16, $inl
+    jb 1f\n";
+    &poly_add("10*16($inp)");
+    &poly_mul(); $code.="
+    cmp \$12*16, $inl
+    jb 1f\n";
+    &poly_add("11*16($inp)");
+    &poly_mul(); $code.="
+1:  \n";
+    &finalize_state(3);
+    &xor_stream($A2, $B2, $C2, $D2, "0*16");
+    &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.="
+    sub \$8*16, $inl
+    lea 8*16($inp), $inp
+    lea 8*16($oup), $oup
+    jmp open_sse_tail_64_dec_loop
+3:
+###############################################################################\n";
+    # 193 - 255 bytes are left
+    &prep_state(4); $code.="
+    xor $itr2, $itr2
+1:  \n";
+        &poly_add("0($inp, $itr2)");
+        &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left");
+        &chacha_qr($A1,$B1,$C1,$D1,$C3,"left");
+        &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load");
+        &poly_stage1();
+        &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load");
+        &poly_stage2();
+        &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right");
+        &chacha_qr($A1,$B1,$C1,$D1,$C3,"right");
+        &poly_stage3();
+        &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load");
+        &poly_reduce_stage();
+        &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.="
+        add \$16, $itr2
+        cmp \$10*16, $itr2
+    jb 1b
+    mov $inl, $itr1
+    and \$-16, $itr1
+1:  \n";
+        &poly_add("0($inp, $itr2)");
+        &poly_mul(); $code.="
+        add \$16, $itr2
+        cmp $itr1, $itr2
+    jb 1b\n";
+    &finalize_state(4);
+    &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
+    &xor_stream($A2, $B2, $C2, $D2, "4*16");
+    &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.="
+    movdqa $tmp_store, $D0
+    sub \$12*16, $inl
+    lea 12*16($inp), $inp
+    lea 12*16($oup), $oup
+###############################################################################
+    # Decrypt the remaining data, 16B at a time, using existing stream
+open_sse_tail_64_dec_loop:
+    cmp \$16, $inl
+    jb 1f
+        sub \$16, $inl
+        movdqu ($inp), $T0
+        pxor $T0, $A0
+        movdqu $A0, ($oup)
+        lea 16($inp), $inp
+        lea 16($oup), $oup
+        movdqa $B0, $A0
+        movdqa $C0, $B0
+        movdqa $D0, $C0
+    jmp open_sse_tail_64_dec_loop
+1:
+    movdqa $A0, $A1
+
+    # Decrypt up to 16 bytes at the end.
+open_sse_tail_16:
+    test $inl, $inl
+    jz open_sse_finalize
+
+    # Read the final bytes into $T0. They need to be read in reverse order so
+    # that they end up in the correct order in $T0.
+    pxor $T0, $T0
+    lea -1($inp, $inl), $inp
+    movq $inl, $itr2
+2:
+        pslldq \$1, $T0
+        pinsrb \$0, ($inp), $T0
+        sub \$1, $inp
+        sub \$1, $itr2
+        jnz 2b
+
+3:
+    movq $T0, $t0
+    pextrq \$1, $T0, $t1
+    # The final bytes of keystream are in $A1.
+    pxor $A1, $T0
+
+    # Copy the plaintext bytes out.
+2:
+        pextrb \$0, $T0, ($oup)
+        psrldq \$1, $T0
+        add \$1, $oup
+        sub \$1, $inl
+    jne 2b
+
+    add $t0, $acc0
+    adc $t1, $acc1
+    adc \$1, $acc2\n";
+    &poly_mul(); $code.="
+
+open_sse_finalize:\n";
+    &poly_add($len_store);
+    &poly_mul(); $code.="
+    # Final reduce
+    mov $acc0, $t0
+    mov $acc1, $t1
+    mov $acc2, $t2
+    sub \$-5, $acc0
+    sbb \$-1, $acc1
+    sbb \$3, $acc2
+    cmovc $t0, $acc0
+    cmovc $t1, $acc1
+    cmovc $t2, $acc2
+    # Add in s part of the key
+    add 0+$s_store, $acc0
+    adc 8+$s_store, $acc1
+
+    add \$288 + 32, %rsp
+.cfi_adjust_cfa_offset -(288 + 32)
+    pop $keyp
+.cfi_adjust_cfa_offset -8
+    movq $acc0, ($keyp)
+    movq $acc1, 8($keyp)
+
+    pop %r15
+.cfi_adjust_cfa_offset -8
+    pop %r14
+.cfi_adjust_cfa_offset -8
+    pop %r13
+.cfi_adjust_cfa_offset -8
+    pop %r12
+.cfi_adjust_cfa_offset -8
+    pop %rbx
+.cfi_adjust_cfa_offset -8
+    pop %rbp
+.cfi_adjust_cfa_offset -8
+    ret
+.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
+###############################################################################
+open_sse_128:
+    movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
+    movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
+    movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
+    movdqu 2*16($keyp), $D0
+    movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
+    movdqa $D1, $D2\npaddd .sse_inc(%rip), $D2
+    movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3
+    mov \$10, $acc0
+1:  \n";
+        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
+        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
+        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
+        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
+        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
+        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
+    dec $acc0
+    jnz 1b
+    paddd .chacha20_consts(%rip), $A0
+    paddd .chacha20_consts(%rip), $A1
+    paddd .chacha20_consts(%rip), $A2
+    paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
+    paddd $T2, $C1\npaddd $T2, $C2
+    paddd $T3, $D1
+    paddd .sse_inc(%rip), $T3
+    paddd $T3, $D2
+    # Clamp and store the key
+    pand .clamp(%rip), $A0
+    movdqa $A0, $r_store
+    movdqa $B0, $s_store
+    # Hash
+    mov %r8, $itr2
+    call poly_hash_ad_internal
+1:
+        cmp \$16, $inl
+        jb open_sse_tail_16
+        sub \$16, $inl\n";
+        # Load for hashing
+        &poly_add("0*8($inp)"); $code.="
+        # Load for decryption
+        movdqu 0*16($inp), $T0
+        pxor $T0, $A1
+        movdqu $A1, 0*16($oup)
+        lea 1*16($inp), $inp
+        lea 1*16($oup), $oup\n";
+        &poly_mul(); $code.="
+        # Shift the stream left
+        movdqa $B1, $A1
+        movdqa $C1, $B1
+        movdqa $D1, $C1
+        movdqa $A2, $D1
+        movdqa $B2, $A2
+        movdqa $C2, $B2
+        movdqa $D2, $C2
+    jmp 1b
+    jmp open_sse_tail_16
+.size chacha20_poly1305_open, .-chacha20_poly1305_open
+.cfi_endproc
+
+################################################################################
+################################################################################
+# void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
+.globl  chacha20_poly1305_seal
+.type chacha20_poly1305_seal,\@function,2
+.align 64
+chacha20_poly1305_seal:
+.cfi_startproc
+    push %rbp
+.cfi_adjust_cfa_offset 8
+    push %rbx
+.cfi_adjust_cfa_offset 8
+    push %r12
+.cfi_adjust_cfa_offset 8
+    push %r13
+.cfi_adjust_cfa_offset 8
+    push %r14
+.cfi_adjust_cfa_offset 8
+    push %r15
+.cfi_adjust_cfa_offset 8
+    # We write the calculated authenticator back to keyp at the end, so save
+    # the pointer on the stack too.
+    push $keyp
+.cfi_adjust_cfa_offset 8
+    sub \$288 + 32, %rsp
+.cfi_adjust_cfa_offset 288 + 32
+.cfi_offset rbp, -16
+.cfi_offset rbx, -24
+.cfi_offset r12, -32
+.cfi_offset r13, -40
+.cfi_offset r14, -48
+.cfi_offset r15, -56
+.cfi_offset $keyp, -64
+    lea 32(%rsp), %rbp
+    and \$-32, %rbp
+    mov %rdx, 8+$len_store
+    mov %r8, 0+$len_store
+    mov %rdx, $inl\n"; $code.="
+    mov OPENSSL_ia32cap_P+8(%rip), %eax
+    and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
+    xor \$`(1<<5) + (1<<8)`, %eax
+    jz  chacha20_poly1305_seal_avx2\n" if ($avx>1);
+$code.="
+    cmp \$128, $inl
+    jbe seal_sse_128
+    # For longer buffers, prepare the poly key + some stream
+    movdqa .chacha20_consts(%rip), $A0
+    movdqu 0*16($keyp), $B0
+    movdqu 1*16($keyp), $C0
+    movdqu 2*16($keyp), $D0
+    movdqa $A0, $A1
+    movdqa $A0, $A2
+    movdqa $A0, $A3
+    movdqa $B0, $B1
+    movdqa $B0, $B2
+    movdqa $B0, $B3
+    movdqa $C0, $C1
+    movdqa $C0, $C2
+    movdqa $C0, $C3
+    movdqa $D0, $D3
+    paddd .sse_inc(%rip), $D0
+    movdqa $D0, $D2
+    paddd .sse_inc(%rip), $D0
+    movdqa $D0, $D1
+    paddd .sse_inc(%rip), $D0
+    # Store on stack
+    movdqa $B0, $state1_store
+    movdqa $C0, $state2_store
+    movdqa $D0, $ctr0_store
+    movdqa $D1, $ctr1_store
+    movdqa $D2, $ctr2_store
+    movdqa $D3, $ctr3_store
+    mov \$10, $acc0
+1:  \n";
+        foreach $l (@loop_body) {$code.=$l."\n";}
+        @loop_body = split /\n/, $chacha_body; $code.="
+        dec $acc0
+    jnz 1b\n";
+    &finalize_state(4); $code.="
+    # Clamp and store the key
+    pand .clamp(%rip), $A3
+    movdqa $A3, $r_store
+    movdqa $B3, $s_store
+    # Hash
+    mov %r8, $itr2
+    call poly_hash_ad_internal\n";
+    &xor_stream($A2,$B2,$C2,$D2,"0*16");
+    &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.="
+    cmp \$12*16, $inl
+    ja 1f
+    mov \$8*16, $itr1
+    sub \$8*16, $inl
+    lea 8*16($inp), $inp
+    jmp seal_sse_128_seal_hash
+1:  \n";
+    &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.="
+    mov \$12*16, $itr1
+    sub \$12*16, $inl
+    lea 12*16($inp), $inp
+    mov \$2, $itr1
+    mov \$8, $itr2
+    cmp \$4*16, $inl
+    jbe seal_sse_tail_64
+    cmp \$8*16, $inl
+    jbe seal_sse_tail_128
+    cmp \$12*16, $inl
+    jbe seal_sse_tail_192
+
+1:  \n";
+    # The main loop
+        &prep_state(4); $code.="
+2:  \n";
+            &emit_body(20);
+            &poly_add("0($oup)");
+            &emit_body(20);
+            &poly_stage1();
+            &emit_body(20);
+            &poly_stage2();
+            &emit_body(20);
+            &poly_stage3();
+            &emit_body(20);
+            &poly_reduce_stage();
+            foreach $l (@loop_body) {$code.=$l."\n";}
+            @loop_body = split /\n/, $chacha_body; $code.="
+            lea 16($oup), $oup
+            dec $itr2
+        jge 2b\n";
+            &poly_add("0*8($oup)");
+            &poly_mul(); $code.="
+            lea 16($oup), $oup
+            dec $itr1
+        jg 2b\n";
+
+        &finalize_state(4);$code.="
+        movdqa $D2, $tmp_store\n";
+        &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.="
+        movdqa $tmp_store, $D2\n";
+        &xor_stream($A2,$B2,$C2,$D2, 4*16);
+        &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.="
+        cmp \$16*16, $inl
+        ja 3f
+
+        mov \$12*16, $itr1
+        sub \$12*16, $inl
+        lea 12*16($inp), $inp
+        jmp seal_sse_128_seal_hash
+3:  \n";
+        &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.="
+        lea 16*16($inp), $inp
+        sub \$16*16, $inl
+        mov \$6, $itr1
+        mov \$4, $itr2
+        cmp \$12*16, $inl
+    jg 1b
+    mov $inl, $itr1
+    test $inl, $inl
+    je seal_sse_128_seal_hash
+    mov \$6, $itr1
+    cmp \$4*16, $inl
+    jg 3f
+###############################################################################
+seal_sse_tail_64:\n";
+    &prep_state(1); $code.="
+1:  \n";
+        &poly_add("0($oup)");
+        &poly_mul(); $code.="
+        lea 16($oup), $oup
+2:  \n";
+        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
+        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
+        &poly_add("0($oup)");
+        &poly_mul(); $code.="
+        lea 16($oup), $oup
+    dec $itr1
+    jg 1b
+    dec $itr2
+    jge 2b\n";
+    &finalize_state(1); $code.="
+    jmp seal_sse_128_seal
+3:
+    cmp \$8*16, $inl
+    jg 3f
+###############################################################################
+seal_sse_tail_128:\n";
+    &prep_state(2); $code.="
+1:  \n";
+        &poly_add("0($oup)");
+        &poly_mul(); $code.="
+        lea 16($oup), $oup
+2:  \n";
+        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
+        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
+        &poly_add("0($oup)");
+        &poly_mul();
+        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
+        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.="
+        lea 16($oup), $oup
+    dec $itr1
+    jg 1b
+    dec $itr2
+    jge 2b\n";
+    &finalize_state(2);
+    &xor_stream($A1,$B1,$C1,$D1,0*16); $code.="
+    mov \$4*16, $itr1
+    sub \$4*16, $inl
+    lea 4*16($inp), $inp
+    jmp seal_sse_128_seal_hash
+3:
+###############################################################################
+seal_sse_tail_192:\n";
+    &prep_state(3); $code.="
+1:  \n";
+        &poly_add("0($oup)");
+        &poly_mul(); $code.="
+        lea 16($oup), $oup
+2:  \n";
+        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
+        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
+        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
+        &poly_add("0($oup)");
+        &poly_mul();
+        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
+        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
+        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
+        lea 16($oup), $oup
+    dec $itr1
+    jg 1b
+    dec $itr2
+    jge 2b\n";
+    &finalize_state(3);
+    &xor_stream($A2,$B2,$C2,$D2,0*16);
+    &xor_stream($A1,$B1,$C1,$D1,4*16); $code.="
+    mov \$8*16, $itr1
+    sub \$8*16, $inl
+    lea 8*16($inp), $inp
+###############################################################################
+seal_sse_128_seal_hash:
+        cmp \$16, $itr1
+        jb seal_sse_128_seal\n";
+        &poly_add("0($oup)");
+        &poly_mul(); $code.="
+        sub \$16, $itr1
+        lea 16($oup), $oup
+    jmp seal_sse_128_seal_hash
+
+seal_sse_128_seal:
+        cmp \$16, $inl
+        jb seal_sse_tail_16
+        sub \$16, $inl
+        # Load for decryption
+        movdqu 0*16($inp), $T0
+        pxor $T0, $A0
+        movdqu $A0, 0*16($oup)
+        # Then hash
+        add 0*8($oup), $acc0
+        adc 1*8($oup), $acc1
+        adc \$1, $acc2
+        lea 1*16($inp), $inp
+        lea 1*16($oup), $oup\n";
+        &poly_mul(); $code.="
+        # Shift the stream left
+        movdqa $B0, $A0
+        movdqa $C0, $B0
+        movdqa $D0, $C0
+        movdqa $A1, $D0
+        movdqa $B1, $A1
+        movdqa $C1, $B1
+        movdqa $D1, $C1
+    jmp seal_sse_128_seal
+
+seal_sse_tail_16:
+    test $inl, $inl
+    jz seal_sse_finalize
+    # We can only load the PT one byte at a time to avoid buffer overread
+    mov $inl, $itr2
+    shl \$4, $itr2
+    lea .and_masks(%rip), $t0
+    mov $inl, $itr1
+    lea -1($inp, $inl), $inp
+    pxor $T3, $T3
+1:
+        pslldq \$1, $T3
+        pinsrb \$0, ($inp), $T3
+        lea -1($inp), $inp
+        dec $itr1
+    jne 1b
+
+    # XOR the keystream with the plaintext.
+    pxor $A0, $T3
+
+    # Write ciphertext out, byte-by-byte.
+    movq $inl, $itr1
+    movdqu $T3, $A0
+2:
+        pextrb \$0, $A0, ($oup)
+        psrldq \$1, $A0
+        add \$1, $oup
+        sub \$1, $itr1
+        jnz 2b
+
+    pand -16($t0, $itr2), $T3
+    movq $T3, $t0
+    pextrq \$1, $T3, $t1
+    add $t0, $acc0
+    adc $t1, $acc1
+    adc \$1, $acc2\n";
+    &poly_mul(); $code.="
+seal_sse_finalize:\n";
+    &poly_add($len_store);
+    &poly_mul(); $code.="
+    # Final reduce
+    mov $acc0, $t0
+    mov $acc1, $t1
+    mov $acc2, $t2
+    sub \$-5, $acc0
+    sbb \$-1, $acc1
+    sbb \$3, $acc2
+    cmovc $t0, $acc0
+    cmovc $t1, $acc1
+    cmovc $t2, $acc2
+    # Add in s part of the key
+    add 0+$s_store, $acc0
+    adc 8+$s_store, $acc1
+
+    add \$288 + 32, %rsp
+.cfi_adjust_cfa_offset -(288 + 32)
+    pop $keyp
+.cfi_adjust_cfa_offset -8
+    mov $acc0, 0*8($keyp)
+    mov $acc1, 1*8($keyp)
+
+    pop %r15
+.cfi_adjust_cfa_offset -8
+    pop %r14
+.cfi_adjust_cfa_offset -8
+    pop %r13
+.cfi_adjust_cfa_offset -8
+    pop %r12
+.cfi_adjust_cfa_offset -8
+    pop %rbx
+.cfi_adjust_cfa_offset -8
+    pop %rbp
+.cfi_adjust_cfa_offset -8
+    ret
+.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
+################################################################################
+seal_sse_128:
+    movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
+    movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
+    movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
+    movdqu 2*16($keyp), $D2
+    movdqa $D2, $D0\npaddd .sse_inc(%rip), $D0
+    movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
+    movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3
+    mov \$10, $acc0
+1:\n";
+        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
+        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
+        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
+        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
+        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
+        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
+        dec $acc0
+    jnz 1b
+    paddd .chacha20_consts(%rip), $A0
+    paddd .chacha20_consts(%rip), $A1
+    paddd .chacha20_consts(%rip), $A2
+    paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
+    paddd $T2, $C0\npaddd $T2, $C1
+    paddd $T3, $D0
+    paddd .sse_inc(%rip), $T3
+    paddd $T3, $D1
+    # Clamp and store the key
+    pand .clamp(%rip), $A2
+    movdqa $A2, $r_store
+    movdqa $B2, $s_store
+    # Hash
+    mov %r8, $itr2
+    call poly_hash_ad_internal
+    jmp seal_sse_128_seal
+.size chacha20_poly1305_seal, .-chacha20_poly1305_seal\n";
+}
+
+# There should have been a cfi_endproc at the end of that function, but the two
+# following blocks of code are jumped to without a stack frame and the CFI
+# context which they are used in happens to match the CFI context at the end of
+# the previous function. So the CFI table is just extended to the end of them.
+
+if ($avx>1) {
+
+($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15));
+my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15));
+($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
+$state1_store="2*32(%rbp)";
+$state2_store="3*32(%rbp)";
+$tmp_store="4*32(%rbp)";
+$ctr0_store="5*32(%rbp)";
+$ctr1_store="6*32(%rbp)";
+$ctr2_store="7*32(%rbp)";
+$ctr3_store="8*32(%rbp)";
+
+sub chacha_qr_avx2 {
+my ($a,$b,$c,$d,$t,$dir)=@_;
+$code.=<<___ if ($dir =~ /store/);
+    vmovdqa $t, $tmp_store
+___
+$code.=<<___;
+    vpaddd $b, $a, $a
+    vpxor $a, $d, $d
+    vpshufb .rol16(%rip), $d, $d
+    vpaddd $d, $c, $c
+    vpxor $c, $b, $b
+    vpsrld \$20, $b, $t
+    vpslld \$12, $b, $b
+    vpxor $t, $b, $b
+    vpaddd $b, $a, $a
+    vpxor $a, $d, $d
+    vpshufb .rol8(%rip), $d, $d
+    vpaddd $d, $c, $c
+    vpxor $c, $b, $b
+    vpslld \$7, $b, $t
+    vpsrld \$25, $b, $b
+    vpxor $t, $b, $b
+___
+$code.=<<___ if ($dir =~ /left/);
+    vpalignr \$12, $d, $d, $d
+    vpalignr \$8, $c, $c, $c
+    vpalignr \$4, $b, $b, $b
+___
+$code.=<<___ if ($dir =~ /right/);
+    vpalignr \$4, $d, $d, $d
+    vpalignr \$8, $c, $c, $c
+    vpalignr \$12, $b, $b, $b
+___
+$code.=<<___ if ($dir =~ /load/);
+    vmovdqa $tmp_store, $t
+___
+}
+
+sub prep_state_avx2 {
+my ($n)=@_;
+$code.=<<___;
+    vmovdqa .chacha20_consts(%rip), $A0
+    vmovdqa $state1_store, $B0
+    vmovdqa $state2_store, $C0
+___
+$code.=<<___ if ($n ge 2);
+    vmovdqa $A0, $A1
+    vmovdqa $B0, $B1
+    vmovdqa $C0, $C1
+___
+$code.=<<___ if ($n ge 3);
+    vmovdqa $A0, $A2
+    vmovdqa $B0, $B2
+    vmovdqa $C0, $C2
+___
+$code.=<<___ if ($n ge 4);
+    vmovdqa $A0, $A3
+    vmovdqa $B0, $B3
+    vmovdqa $C0, $C3
+___
+$code.=<<___ if ($n eq 1);
+    vmovdqa .avx2_inc(%rip), $D0
+    vpaddd $ctr0_store, $D0, $D0
+    vmovdqa $D0, $ctr0_store
+___
+$code.=<<___ if ($n eq 2);
+    vmovdqa .avx2_inc(%rip), $D0
+    vpaddd $ctr0_store, $D0, $D1
+    vpaddd $D1, $D0, $D0
+    vmovdqa $D0, $ctr0_store
+    vmovdqa $D1, $ctr1_store
+___
+$code.=<<___ if ($n eq 3);
+    vmovdqa .avx2_inc(%rip), $D0
+    vpaddd $ctr0_store, $D0, $D2
+    vpaddd $D2, $D0, $D1
+    vpaddd $D1, $D0, $D0
+    vmovdqa $D0, $ctr0_store
+    vmovdqa $D1, $ctr1_store
+    vmovdqa $D2, $ctr2_store
+___
+$code.=<<___ if ($n eq 4);
+    vmovdqa .avx2_inc(%rip), $D0
+    vpaddd $ctr0_store, $D0, $D3
+    vpaddd $D3, $D0, $D2
+    vpaddd $D2, $D0, $D1
+    vpaddd $D1, $D0, $D0
+    vmovdqa $D3, $ctr3_store
+    vmovdqa $D2, $ctr2_store
+    vmovdqa $D1, $ctr1_store
+    vmovdqa $D0, $ctr0_store
+___
+}
+
+sub finalize_state_avx2 {
+my ($n)=@_;
+$code.=<<___ if ($n eq 4);
+    vpaddd .chacha20_consts(%rip), $A3, $A3
+    vpaddd $state1_store, $B3, $B3
+    vpaddd $state2_store, $C3, $C3
+    vpaddd $ctr3_store, $D3, $D3
+___
+$code.=<<___ if ($n ge 3);
+    vpaddd .chacha20_consts(%rip), $A2, $A2
+    vpaddd $state1_store, $B2, $B2
+    vpaddd $state2_store, $C2, $C2
+    vpaddd $ctr2_store, $D2, $D2
+___
+$code.=<<___ if ($n ge 2);
+    vpaddd .chacha20_consts(%rip), $A1, $A1
+    vpaddd $state1_store, $B1, $B1
+    vpaddd $state2_store, $C1, $C1
+    vpaddd $ctr1_store, $D1, $D1
+___
+$code.=<<___;
+    vpaddd .chacha20_consts(%rip), $A0, $A0
+    vpaddd $state1_store, $B0, $B0
+    vpaddd $state2_store, $C0, $C0
+    vpaddd $ctr0_store, $D0, $D0
+___
+}
+
+sub xor_stream_avx2 {
+my ($A, $B, $C, $D, $offset, $hlp)=@_;
+$code.=<<___;
+    vperm2i128 \$0x02, $A, $B, $hlp
+    vperm2i128 \$0x13, $A, $B, $B
+    vperm2i128 \$0x02, $C, $D, $A
+    vperm2i128 \$0x13, $C, $D, $C
+    vpxor 0*32+$offset($inp), $hlp, $hlp
+    vpxor 1*32+$offset($inp), $A, $A
+    vpxor 2*32+$offset($inp), $B, $B
+    vpxor 3*32+$offset($inp), $C, $C
+    vmovdqu $hlp, 0*32+$offset($oup)
+    vmovdqu $A, 1*32+$offset($oup)
+    vmovdqu $B, 2*32+$offset($oup)
+    vmovdqu $C, 3*32+$offset($oup)
+___
+}
+
+sub finish_stream_avx2 {
+my ($A, $B, $C, $D, $hlp)=@_;
+$code.=<<___;
+    vperm2i128 \$0x13, $A, $B, $hlp
+    vperm2i128 \$0x02, $A, $B, $A
+    vperm2i128 \$0x02, $C, $D, $B
+    vperm2i128 \$0x13, $C, $D, $D
+    vmovdqa $hlp, $C
+___
+}
+
+sub poly_stage1_mulx {
+$code.=<<___;
+    mov 0+$r_store, %rdx
+    mov %rdx, $t2
+    mulx $acc0, $t0, $t1
+    mulx $acc1, %rax, %rdx
+    imul $acc2, $t2
+    add %rax, $t1
+    adc %rdx, $t2
+___
+}
+
+sub poly_stage2_mulx {
+$code.=<<___;
+    mov 8+$r_store, %rdx
+    mulx $acc0, $acc0, %rax
+    add $acc0, $t1
+    mulx $acc1, $acc1, $t3
+    adc $acc1, $t2
+    adc \$0, $t3
+    imul $acc2, %rdx
+___
+}
+
+sub poly_stage3_mulx {
+$code.=<<___;
+    add %rax, $t2
+    adc %rdx, $t3
+___
+}
+
+sub poly_mul_mulx {
+    &poly_stage1_mulx();
+    &poly_stage2_mulx();
+    &poly_stage3_mulx();
+    &poly_reduce_stage();
+}
+
+sub gen_chacha_round_avx2 {
+my ($rot1, $rot2, $shift)=@_;
+my $round="";
+$round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 eq 20);
+$round=$round ."vmovdqa $rot2, $C0
+                vpaddd $B3, $A3, $A3
+                vpaddd $B2, $A2, $A2
+                vpaddd $B1, $A1, $A1
+                vpaddd $B0, $A0, $A0
+                vpxor $A3, $D3, $D3
+                vpxor $A2, $D2, $D2
+                vpxor $A1, $D1, $D1
+                vpxor $A0, $D0, $D0
+                vpshufb $C0, $D3, $D3
+                vpshufb $C0, $D2, $D2
+                vpshufb $C0, $D1, $D1
+                vpshufb $C0, $D0, $D0
+                vmovdqa $tmp_store, $C0
+                vpaddd $D3, $C3, $C3
+                vpaddd $D2, $C2, $C2
+                vpaddd $D1, $C1, $C1
+                vpaddd $D0, $C0, $C0
+                vpxor $C3, $B3, $B3
+                vpxor $C2, $B2, $B2
+                vpxor $C1, $B1, $B1
+                vpxor $C0, $B0, $B0
+                vmovdqa $C0, $tmp_store
+                vpsrld \$$rot1, $B3, $C0
+                vpslld \$32-$rot1, $B3, $B3
+                vpxor $C0, $B3, $B3
+                vpsrld \$$rot1, $B2, $C0
+                vpslld \$32-$rot1, $B2, $B2
+                vpxor $C0, $B2, $B2
+                vpsrld \$$rot1, $B1, $C0
+                vpslld \$32-$rot1, $B1, $B1
+                vpxor $C0, $B1, $B1
+                vpsrld \$$rot1, $B0, $C0
+                vpslld \$32-$rot1, $B0, $B0
+                vpxor $C0, $B0, $B0\n";
+($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
+($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
+$round=$round ."vmovdqa $tmp_store, $C0
+                vpalignr \$$s1, $B3, $B3, $B3
+                vpalignr \$$s2, $C3, $C3, $C3
+                vpalignr \$$s3, $D3, $D3, $D3
+                vpalignr \$$s1, $B2, $B2, $B2
+                vpalignr \$$s2, $C2, $C2, $C2
+                vpalignr \$$s3, $D2, $D2, $D2
+                vpalignr \$$s1, $B1, $B1, $B1
+                vpalignr \$$s2, $C1, $C1, $C1
+                vpalignr \$$s3, $D1, $D1, $D1
+                vpalignr \$$s1, $B0, $B0, $B0
+                vpalignr \$$s2, $C0, $C0, $C0
+                vpalignr \$$s3, $D0, $D0, $D0\n"
+if (($shift =~ /left/) || ($shift =~ /right/));
+return $round;
+};
+
+$chacha_body = &gen_chacha_round_avx2(20, ".rol16(%rip)") .
+               &gen_chacha_round_avx2(25, ".rol8(%rip)", "left") .
+               &gen_chacha_round_avx2(20, ".rol16(%rip)") .
+               &gen_chacha_round_avx2(25, ".rol8(%rip)", "right");
+
+@loop_body = split /\n/, $chacha_body;
+
+$code.="
+###############################################################################
+.type chacha20_poly1305_open_avx2,\@function,2
+.align 64
+chacha20_poly1305_open_avx2:
+    vzeroupper
+    vmovdqa .chacha20_consts(%rip), $A0
+    vbroadcasti128 0*16($keyp), $B0
+    vbroadcasti128 1*16($keyp), $C0
+    vbroadcasti128 2*16($keyp), $D0
+    vpaddd .avx2_init(%rip), $D0, $D0
+    cmp \$6*32, $inl
+    jbe open_avx2_192
+    cmp \$10*32, $inl
+    jbe open_avx2_320
+
+    vmovdqa $B0, $state1_store
+    vmovdqa $C0, $state2_store
+    vmovdqa $D0, $ctr0_store
+    mov \$10, $acc0
+1:  \n";
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
+        dec $acc0
+    jne 1b
+    vpaddd .chacha20_consts(%rip), $A0, $A0
+    vpaddd $state1_store, $B0, $B0
+    vpaddd $state2_store, $C0, $C0
+    vpaddd $ctr0_store, $D0, $D0
+
+    vperm2i128 \$0x02, $A0, $B0, $T0
+    # Clamp and store key
+    vpand .clamp(%rip), $T0, $T0
+    vmovdqa $T0, $r_store
+    # Stream for the first 64 bytes
+    vperm2i128 \$0x13, $A0, $B0, $A0
+    vperm2i128 \$0x13, $C0, $D0, $B0
+    # Hash AD + first 64 bytes
+    mov %r8, $itr2
+    call poly_hash_ad_internal
+    xor $itr1, $itr1
+    # Hash first 64 bytes
+1:  \n";
+       &poly_add("0($inp, $itr1)");
+       &poly_mul(); $code.="
+       add \$16, $itr1
+       cmp \$2*32, $itr1
+    jne 1b
+    # Decrypt first 64 bytes
+    vpxor 0*32($inp), $A0, $A0
+    vpxor 1*32($inp), $B0, $B0
+    vmovdqu $A0, 0*32($oup)
+    vmovdqu $B0, 1*32($oup)
+    lea 2*32($inp), $inp
+    lea 2*32($oup), $oup
+    sub \$2*32, $inl
+1:
+        # Hash and decrypt 512 bytes each iteration
+        cmp \$16*32, $inl
+        jb 3f\n";
+        &prep_state_avx2(4); $code.="
+        xor $itr1, $itr1
+2:  \n";
+            &poly_add("0*8($inp, $itr1)");
+            &emit_body(10);
+            &poly_stage1_mulx();
+            &emit_body(9);
+            &poly_stage2_mulx();
+            &emit_body(12);
+            &poly_stage3_mulx();
+            &emit_body(10);
+            &poly_reduce_stage();
+            &emit_body(9);
+            &poly_add("2*8($inp, $itr1)");
+            &emit_body(8);
+            &poly_stage1_mulx();
+            &emit_body(18);
+            &poly_stage2_mulx();
+            &emit_body(18);
+            &poly_stage3_mulx();
+            &emit_body(9);
+            &poly_reduce_stage();
+            &emit_body(8);
+            &poly_add("4*8($inp, $itr1)"); $code.="
+            lea 6*8($itr1), $itr1\n";
+            &emit_body(18);
+            &poly_stage1_mulx();
+            &emit_body(8);
+            &poly_stage2_mulx();
+            &emit_body(8);
+            &poly_stage3_mulx();
+            &emit_body(18);
+            &poly_reduce_stage();
+            foreach $l (@loop_body) {$code.=$l."\n";}
+            @loop_body = split /\n/, $chacha_body; $code.="
+            cmp \$10*6*8, $itr1
+        jne 2b\n";
+        &finalize_state_avx2(4); $code.="
+        vmovdqa $A0, $tmp_store\n";
+        &poly_add("10*6*8($inp)");
+        &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
+        vmovdqa $tmp_store, $A0\n";
+        &poly_mul();
+        &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
+        &poly_add("10*6*8+2*8($inp)");
+        &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
+        &poly_mul();
+        &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
+        lea 16*32($inp), $inp
+        lea 16*32($oup), $oup
+        sub \$16*32, $inl
+    jmp 1b
+3:
+    test $inl, $inl
+    vzeroupper
+    je open_sse_finalize
+3:
+    cmp \$4*32, $inl
+    ja 3f\n";
+###############################################################################
+    # 1-128 bytes left
+    &prep_state_avx2(1); $code.="
+    xor $itr2, $itr2
+    mov $inl, $itr1
+    and \$-16, $itr1
+    test $itr1, $itr1
+    je 2f
+1:  \n";
+        &poly_add("0*8($inp, $itr2)");
+        &poly_mul(); $code.="
+2:
+        add \$16, $itr2\n";
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
+        cmp $itr1, $itr2
+    jb 1b
+        cmp \$160, $itr2
+    jne 2b\n";
+    &finalize_state_avx2(1);
+    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
+    jmp open_avx2_tail_loop
+3:
+    cmp \$8*32, $inl
+    ja 3f\n";
+###############################################################################
+    # 129-256 bytes left
+    &prep_state_avx2(2); $code.="
+    mov $inl, $tmp_store
+    mov $inl, $itr1
+    sub \$4*32, $itr1
+    shr \$4, $itr1
+    mov \$10, $itr2
+    cmp \$10, $itr1
+    cmovg $itr2, $itr1
+    mov $inp, $inl
+    xor $itr2, $itr2
+1:  \n";
+        &poly_add("0*8($inl)");
+        &poly_mul_mulx(); $code.="
+        lea 16($inl), $inl
+2:  \n";
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.="
+        inc $itr2\n";
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
+        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
+        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
+        cmp $itr1, $itr2
+    jb 1b
+        cmp \$10, $itr2
+    jne 2b
+    mov $inl, $itr2
+    sub $inp, $inl
+    mov $inl, $itr1
+    mov $tmp_store, $inl
+1:
+        add \$16, $itr1
+        cmp $inl, $itr1
+        jg 1f\n";
+        &poly_add("0*8($itr2)");
+        &poly_mul_mulx(); $code.="
+        lea 16($itr2), $itr2
+    jmp 1b
+1:  \n";
+    &finalize_state_avx2(2);
+    &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0);
+    &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
+    lea 4*32($inp), $inp
+    lea 4*32($oup), $oup
+    sub \$4*32, $inl
+    jmp open_avx2_tail_loop
+3:
+    cmp \$12*32, $inl
+    ja 3f\n";
+###############################################################################
+    # 257-383 bytes left
+    &prep_state_avx2(3); $code.="
+    mov $inl, $tmp_store
+    mov $inl, $itr1
+    sub \$8*32, $itr1
+    shr \$4, $itr1
+    add \$6, $itr1
+    mov \$10, $itr2
+    cmp \$10, $itr1
+    cmovg $itr2, $itr1
+    mov $inp, $inl
+    xor $itr2, $itr2
+1:  \n";
+        &poly_add("0*8($inl)");
+        &poly_mul_mulx(); $code.="
+        lea 16($inl), $inl
+2:  \n";
+        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
+        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+        &poly_add("0*8($inl)");
+        &poly_mul(); $code.="
+        lea 16($inl), $inl
+        inc $itr2\n";
+        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right");
+        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
+        cmp $itr1, $itr2
+    jb 1b
+        cmp \$10, $itr2
+    jne 2b
+    mov $inl, $itr2
+    sub $inp, $inl
+    mov $inl, $itr1
+    mov $tmp_store, $inl
+1:
+        add \$16, $itr1
+        cmp $inl, $itr1
+        jg 1f\n";
+        &poly_add("0*8($itr2)");
+        &poly_mul_mulx(); $code.="
+        lea 16($itr2), $itr2
+    jmp 1b
+1:  \n";
+    &finalize_state_avx2(3);
+    &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0);
+    &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0);
+    &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
+    lea 8*32($inp), $inp
+    lea 8*32($oup), $oup
+    sub \$8*32, $inl
+    jmp open_avx2_tail_loop
+3:  \n";
+###############################################################################
+    # 384-512 bytes left
+    &prep_state_avx2(4); $code.="
+    xor $itr1, $itr1
+    mov $inp, $itr2
+1:  \n";
+        &poly_add("0*8($itr2)");
+        &poly_mul(); $code.="
+        lea 2*8($itr2), $itr2
+2:  \n";
+        &emit_body(37);
+        &poly_add("0*8($itr2)");
+        &poly_mul_mulx();
+        &emit_body(48);
+        &poly_add("2*8($itr2)");
+        &poly_mul_mulx(); $code.="
+        lea 4*8($itr2), $itr2\n";
+        foreach $l (@loop_body) {$code.=$l."\n";}
+        @loop_body = split /\n/, $chacha_body; $code.="
+        inc $itr1
+        cmp \$4, $itr1
+    jl  1b
+        cmp \$10, $itr1
+    jne 2b
+    mov $inl, $itr1
+    sub \$12*32, $itr1
+    and \$-16, $itr1
+1:
+        test $itr1, $itr1
+        je 1f\n";
+        &poly_add("0*8($itr2)");
+        &poly_mul_mulx(); $code.="
+        lea 2*8($itr2), $itr2
+        sub \$2*8, $itr1
+    jmp 1b
+1:  \n";
+    &finalize_state_avx2(4); $code.="
+    vmovdqa $A0, $tmp_store\n";
+    &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
+    vmovdqa $tmp_store, $A0\n";
+    &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
+    &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
+    &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.="
+    lea 12*32($inp), $inp
+    lea 12*32($oup), $oup
+    sub \$12*32, $inl
+open_avx2_tail_loop:
+    cmp \$32, $inl
+    jb open_avx2_tail
+        sub \$32, $inl
+        vpxor ($inp), $A0, $A0
+        vmovdqu $A0, ($oup)
+        lea 1*32($inp), $inp
+        lea 1*32($oup), $oup
+        vmovdqa $B0, $A0
+        vmovdqa $C0, $B0
+        vmovdqa $D0, $C0
+    jmp open_avx2_tail_loop
+open_avx2_tail:
+    cmp \$16, $inl
+    vmovdqa $A0x, $A1x
+    jb 1f
+    sub \$16, $inl
+    # Load for decryption
+    vpxor ($inp), $A0x, $A1x
+    vmovdqu $A1x, ($oup)
+    lea 1*16($inp), $inp
+    lea 1*16($oup), $oup
+    vperm2i128 \$0x11, $A0, $A0, $A0
+    vmovdqa $A0x, $A1x
+1:
+    vzeroupper
+    jmp open_sse_tail_16
+###############################################################################
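+    # Special-case path for at most 192 bytes of ciphertext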
+open_avx2_192:
+    vmovdqa $A0, $A1
+    vmovdqa $A0, $A2
+    vmovdqa $B0, $B1
+    vmovdqa $B0, $B2
+    vmovdqa $C0, $C1
+    vmovdqa $C0, $C2
+    vpaddd .avx2_inc(%rip), $D0, $D1
+    vmovdqa $D0, $T2
+    vmovdqa $D1, $T3
+    mov \$10, $acc0
+1:  \n";
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
+        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
+        dec $acc0
+    jne 1b
+    vpaddd $A2, $A0, $A0
+    vpaddd $A2, $A1, $A1
+    vpaddd $B2, $B0, $B0
+    vpaddd $B2, $B1, $B1
+    vpaddd $C2, $C0, $C0
+    vpaddd $C2, $C1, $C1
+    vpaddd $T2, $D0, $D0
+    vpaddd $T3, $D1, $D1
+    vperm2i128 \$0x02, $A0, $B0, $T0
+    # Clamp and store the key
+    vpand .clamp(%rip), $T0, $T0
+    vmovdqa $T0, $r_store
+    # Stream for up to 192 bytes
+    vperm2i128 \$0x13, $A0, $B0, $A0
+    vperm2i128 \$0x13, $C0, $D0, $B0
+    vperm2i128 \$0x02, $A1, $B1, $C0
+    vperm2i128 \$0x02, $C1, $D1, $D0
+    vperm2i128 \$0x13, $A1, $B1, $A1
+    vperm2i128 \$0x13, $C1, $D1, $B1
+open_avx2_short:
+    mov %r8, $itr2
+    call poly_hash_ad_internal
+open_avx2_hash_and_xor_loop:
+        cmp \$32, $inl
+        jb open_avx2_short_tail_32
+        sub \$32, $inl\n";
+        # Load + hash
+        &poly_add("0*8($inp)");
+        &poly_mul();
+        &poly_add("2*8($inp)");
+        &poly_mul(); $code.="
+        # Load + decrypt
+        vpxor ($inp), $A0, $A0
+        vmovdqu $A0, ($oup)
+        lea 1*32($inp), $inp
+        lea 1*32($oup), $oup
+        # Shift stream
+        vmovdqa $B0, $A0
+        vmovdqa $C0, $B0
+        vmovdqa $D0, $C0
+        vmovdqa $A1, $D0
+        vmovdqa $B1, $A1
+        vmovdqa $C1, $B1
+        vmovdqa $D1, $C1
+        vmovdqa $A2, $D1
+        vmovdqa $B2, $A2
+    jmp open_avx2_hash_and_xor_loop
+open_avx2_short_tail_32:
+    cmp \$16, $inl
+    vmovdqa $A0x, $A1x
+    jb 1f
+    sub \$16, $inl\n";
+    &poly_add("0*8($inp)");
+    &poly_mul(); $code.="
+    vpxor ($inp), $A0x, $A3x
+    vmovdqu $A3x, ($oup)
+    lea 1*16($inp), $inp
+    lea 1*16($oup), $oup
+    vextracti128 \$1, $A0, $A1x
+1:
+    vzeroupper
+    jmp open_sse_tail_16
+###############################################################################
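+    # Special-case path for 193 to 320 bytes of ciphertext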
+open_avx2_320:
+    vmovdqa $A0, $A1
+    vmovdqa $A0, $A2
+    vmovdqa $B0, $B1
+    vmovdqa $B0, $B2
+    vmovdqa $C0, $C1
+    vmovdqa $C0, $C2
+    vpaddd .avx2_inc(%rip), $D0, $D1
+    vpaddd .avx2_inc(%rip), $D1, $D2
+    vmovdqa $B0, $T1
+    vmovdqa $C0, $T2
+    vmovdqa $D0, $ctr0_store
+    vmovdqa $D1, $ctr1_store
+    vmovdqa $D2, $ctr2_store
+    mov \$10, $acc0
+1:  \n";
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
+        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
+        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
+        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
+        dec $acc0
+    jne 1b
+    vpaddd .chacha20_consts(%rip), $A0, $A0
+    vpaddd .chacha20_consts(%rip), $A1, $A1
+    vpaddd .chacha20_consts(%rip), $A2, $A2
+    vpaddd $T1, $B0, $B0
+    vpaddd $T1, $B1, $B1
+    vpaddd $T1, $B2, $B2
+    vpaddd $T2, $C0, $C0
+    vpaddd $T2, $C1, $C1
+    vpaddd $T2, $C2, $C2
+    vpaddd $ctr0_store, $D0, $D0
+    vpaddd $ctr1_store, $D1, $D1
+    vpaddd $ctr2_store, $D2, $D2
+    vperm2i128 \$0x02, $A0, $B0, $T0
+    # Clamp and store the key
+    vpand .clamp(%rip), $T0, $T0
+    vmovdqa $T0, $r_store
+    # Stream for up to 320 bytes
+    vperm2i128 \$0x13, $A0, $B0, $A0
+    vperm2i128 \$0x13, $C0, $D0, $B0
+    vperm2i128 \$0x02, $A1, $B1, $C0
+    vperm2i128 \$0x02, $C1, $D1, $D0
+    vperm2i128 \$0x13, $A1, $B1, $A1
+    vperm2i128 \$0x13, $C1, $D1, $B1
+    vperm2i128 \$0x02, $A2, $B2, $C1
+    vperm2i128 \$0x02, $C2, $D2, $D1
+    vperm2i128 \$0x13, $A2, $B2, $A2
+    vperm2i128 \$0x13, $C2, $D2, $B2
+    jmp open_avx2_short
+.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
+###############################################################################
+###############################################################################
+.type chacha20_poly1305_seal_avx2,\@function,2
+.align 64
+chacha20_poly1305_seal_avx2:
+    vzeroupper
+    vmovdqa .chacha20_consts(%rip), $A0
+    vbroadcasti128 0*16($keyp), $B0
+    vbroadcasti128 1*16($keyp), $C0
+    vbroadcasti128 2*16($keyp), $D0
+    vpaddd .avx2_init(%rip), $D0, $D0
+    cmp \$6*32, $inl
+    jbe seal_avx2_192
+    cmp \$10*32, $inl
+    jbe seal_avx2_320
+    vmovdqa $A0, $A1
+    vmovdqa $A0, $A2
+    vmovdqa $A0, $A3
+    vmovdqa $B0, $B1
+    vmovdqa $B0, $B2
+    vmovdqa $B0, $B3
+    vmovdqa $B0, $state1_store
+    vmovdqa $C0, $C1
+    vmovdqa $C0, $C2
+    vmovdqa $C0, $C3
+    vmovdqa $C0, $state2_store
+    vmovdqa $D0, $D3
+    vpaddd .avx2_inc(%rip), $D3, $D2
+    vpaddd .avx2_inc(%rip), $D2, $D1
+    vpaddd .avx2_inc(%rip), $D1, $D0
+    vmovdqa $D0, $ctr0_store
+    vmovdqa $D1, $ctr1_store
+    vmovdqa $D2, $ctr2_store
+    vmovdqa $D3, $ctr3_store
+    mov \$10, $acc0
+1:  \n";
+        foreach $l (@loop_body) {$code.=$l."\n";}
+        @loop_body = split /\n/, $chacha_body; $code.="
+        dec $acc0
+        jnz 1b\n";
+    &finalize_state_avx2(4); $code.="
+    vperm2i128 \$0x13, $C3, $D3, $C3
+    vperm2i128 \$0x02, $A3, $B3, $D3
+    vperm2i128 \$0x13, $A3, $B3, $A3
+    vpand .clamp(%rip), $D3, $D3
+    vmovdqa $D3, $r_store
+    mov %r8, $itr2
+    call poly_hash_ad_internal
+    # Safely store the first 320 bytes (otherwise they would be handled by the optimized path)
+    vpxor 0*32($inp), $A3, $A3
+    vpxor 1*32($inp), $C3, $C3
+    vmovdqu $A3, 0*32($oup)
+    vmovdqu $C3, 1*32($oup)\n";
+    &xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3);
+    &xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3);
+    &finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.="
+    lea 10*32($inp), $inp
+    sub \$10*32, $inl
+    mov \$10*32, $itr1
+    cmp \$4*32, $inl
+    jbe seal_avx2_hash
+    vpxor 0*32($inp), $A0, $A0
+    vpxor 1*32($inp), $B0, $B0
+    vpxor 2*32($inp), $C0, $C0
+    vpxor 3*32($inp), $D0, $D0
+    vmovdqu $A0, 10*32($oup)
+    vmovdqu $B0, 11*32($oup)
+    vmovdqu $C0, 12*32($oup)
+    vmovdqu $D0, 13*32($oup)
+    lea 4*32($inp), $inp
+    sub \$4*32, $inl
+    mov \$8, $itr1
+    mov \$2, $itr2
+    cmp \$4*32, $inl
+    jbe seal_avx2_tail_128
+    cmp \$8*32, $inl
+    jbe seal_avx2_tail_256
+    cmp \$12*32, $inl
+    jbe seal_avx2_tail_384
+    cmp \$16*32, $inl
+    jbe seal_avx2_tail_512\n";
+    # We have 448 bytes to hash, but the main loop hashes 512 bytes at a time, so perform some rounds before entering the main loop
+    &prep_state_avx2(4);
+    foreach $l (@loop_body) {$code.=$l."\n";}
+    @loop_body = split /\n/, $chacha_body;
+    &emit_body(41);
+    @loop_body = split /\n/, $chacha_body; $code.="
+    sub \$16, $oup
+    mov \$9, $itr1
+    jmp 4f
+1:  \n";
+        &prep_state_avx2(4); $code.="
+        mov \$10, $itr1
+2:  \n";
+            &poly_add("0*8($oup)");
+            &emit_body(10);
+            &poly_stage1_mulx();
+            &emit_body(9);
+            &poly_stage2_mulx();
+            &emit_body(12);
+            &poly_stage3_mulx();
+            &emit_body(10);
+            &poly_reduce_stage(); $code.="
+4:  \n";
+            &emit_body(9);
+            &poly_add("2*8($oup)");
+            &emit_body(8);
+            &poly_stage1_mulx();
+            &emit_body(18);
+            &poly_stage2_mulx();
+            &emit_body(18);
+            &poly_stage3_mulx();
+            &emit_body(9);
+            &poly_reduce_stage();
+            &emit_body(8);
+            &poly_add("4*8($oup)"); $code.="
+            lea 6*8($oup), $oup\n";
+            &emit_body(18);
+            &poly_stage1_mulx();
+            &emit_body(8);
+            &poly_stage2_mulx();
+            &emit_body(8);
+            &poly_stage3_mulx();
+            &emit_body(18);
+            &poly_reduce_stage();
+            foreach $l (@loop_body) {$code.=$l."\n";}
+            @loop_body = split /\n/, $chacha_body; $code.="
+            dec $itr1
+        jne 2b\n";
+        &finalize_state_avx2(4); $code.="
+        lea 4*8($oup), $oup
+        vmovdqa $A0, $tmp_store\n";
+        &poly_add("-4*8($oup)");
+        &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
+        vmovdqa $tmp_store, $A0\n";
+        &poly_mul();
+        &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
+        &poly_add("-2*8($oup)");
+        &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
+        &poly_mul();
+        &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
+        lea 16*32($inp), $inp
+        sub \$16*32, $inl
+        cmp \$16*32, $inl
+    jg 1b\n";
+    &poly_add("0*8($oup)");
+    &poly_mul();
+    &poly_add("2*8($oup)");
+    &poly_mul(); $code.="
+    lea 4*8($oup), $oup
+    mov \$10, $itr1
+    xor $itr2, $itr2
+    cmp \$4*32, $inl
+    ja 3f
+###############################################################################
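+    # 1-128 bytes of plaintext left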
+seal_avx2_tail_128:\n";
+    &prep_state_avx2(1); $code.="
+1:  \n";
+        &poly_add("0($oup)");
+        &poly_mul(); $code.="
+        lea 2*8($oup), $oup
+2:  \n";
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+        &poly_add("0*8($oup)");
+        &poly_mul();
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
+        &poly_add("2*8($oup)");
+        &poly_mul(); $code.="
+        lea 4*8($oup), $oup
+        dec $itr1
+    jg 1b
+        dec $itr2
+    jge 2b\n";
+    &finalize_state_avx2(1);
+    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
+    jmp seal_avx2_short_loop
+3:
+    cmp \$8*32, $inl
+    ja 3f
+###############################################################################
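+    # 129-256 bytes of plaintext left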
+seal_avx2_tail_256:\n";
+    &prep_state_avx2(2); $code.="
+1:  \n";
+        &poly_add("0($oup)");
+        &poly_mul(); $code.="
+        lea 2*8($oup), $oup
+2:  \n";
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
+        &poly_add("0*8($oup)");
+        &poly_mul();
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
+        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
+        &poly_add("2*8($oup)");
+        &poly_mul(); $code.="
+        lea 4*8($oup), $oup
+        dec $itr1
+    jg 1b
+        dec $itr2
+    jge 2b\n";
+    &finalize_state_avx2(2);
+    &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0);
+    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
+    mov \$4*32, $itr1
+    lea 4*32($inp), $inp
+    sub \$4*32, $inl
+    jmp seal_avx2_hash
+3:
+    cmp \$12*32, $inl
+    ja seal_avx2_tail_512
+###############################################################################
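+    # 257-384 bytes of plaintext left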
+seal_avx2_tail_384:\n";
+    &prep_state_avx2(3); $code.="
+1:  \n";
+        &poly_add("0($oup)");
+        &poly_mul(); $code.="
+        lea 2*8($oup), $oup
+2:  \n";
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
+        &poly_add("0*8($oup)");
+        &poly_mul();
+        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
+        &poly_add("2*8($oup)");
+        &poly_mul();
+        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
+        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
+        lea 4*8($oup), $oup
+        dec $itr1
+    jg 1b
+        dec $itr2
+    jge 2b\n";
+    &finalize_state_avx2(3);
+    &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0);
+    &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0);
+    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
+    mov \$8*32, $itr1
+    lea 8*32($inp), $inp
+    sub \$8*32, $inl
+    jmp seal_avx2_hash
+###############################################################################
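+    # 385-512 bytes of plaintext left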
+seal_avx2_tail_512:\n";
+    &prep_state_avx2(4); $code.="
+1:  \n";
+        &poly_add("0($oup)");
+        &poly_mul_mulx(); $code.="
+        lea 2*8($oup), $oup
+2:  \n";
+        &emit_body(20);
+        &poly_add("0*8($oup)");
+        &emit_body(20);
+        &poly_stage1_mulx();
+        &emit_body(20);
+        &poly_stage2_mulx();
+        &emit_body(20);
+        &poly_stage3_mulx();
+        &emit_body(20);
+        &poly_reduce_stage();
+        &emit_body(20);
+        &poly_add("2*8($oup)");
+        &emit_body(20);
+        &poly_stage1_mulx();
+        &emit_body(20);
+        &poly_stage2_mulx();
+        &emit_body(20);
+        &poly_stage3_mulx();
+        &emit_body(20);
+        &poly_reduce_stage();
+        foreach $l (@loop_body) {$code.=$l."\n";}
+        @loop_body = split /\n/, $chacha_body; $code.="
+        lea 4*8($oup), $oup
+        dec $itr1
+    jg 1b
+        dec $itr2
+    jge 2b\n";
+    &finalize_state_avx2(4); $code.="
+    vmovdqa $A0, $tmp_store\n";
+    &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
+    vmovdqa $tmp_store, $A0\n";
+    &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
+    &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
+    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
+    mov \$12*32, $itr1
+    lea 12*32($inp), $inp
+    sub \$12*32, $inl
+    jmp seal_avx2_hash
+################################################################################
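+    # Special-case path for 193 to 320 bytes of plaintext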
+seal_avx2_320:
+    vmovdqa $A0, $A1
+    vmovdqa $A0, $A2
+    vmovdqa $B0, $B1
+    vmovdqa $B0, $B2
+    vmovdqa $C0, $C1
+    vmovdqa $C0, $C2
+    vpaddd .avx2_inc(%rip), $D0, $D1
+    vpaddd .avx2_inc(%rip), $D1, $D2
+    vmovdqa $B0, $T1
+    vmovdqa $C0, $T2
+    vmovdqa $D0, $ctr0_store
+    vmovdqa $D1, $ctr1_store
+    vmovdqa $D2, $ctr2_store
+    mov \$10, $acc0
+1:  \n";
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
+        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
+        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
+        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
+        dec $acc0
+    jne 1b
+    vpaddd .chacha20_consts(%rip), $A0, $A0
+    vpaddd .chacha20_consts(%rip), $A1, $A1
+    vpaddd .chacha20_consts(%rip), $A2, $A2
+    vpaddd $T1, $B0, $B0
+    vpaddd $T1, $B1, $B1
+    vpaddd $T1, $B2, $B2
+    vpaddd $T2, $C0, $C0
+    vpaddd $T2, $C1, $C1
+    vpaddd $T2, $C2, $C2
+    vpaddd $ctr0_store, $D0, $D0
+    vpaddd $ctr1_store, $D1, $D1
+    vpaddd $ctr2_store, $D2, $D2
+    vperm2i128 \$0x02, $A0, $B0, $T0
+    # Clamp and store the key
+    vpand .clamp(%rip), $T0, $T0
+    vmovdqa $T0, $r_store
+    # Stream for up to 320 bytes
+    vperm2i128 \$0x13, $A0, $B0, $A0
+    vperm2i128 \$0x13, $C0, $D0, $B0
+    vperm2i128 \$0x02, $A1, $B1, $C0
+    vperm2i128 \$0x02, $C1, $D1, $D0
+    vperm2i128 \$0x13, $A1, $B1, $A1
+    vperm2i128 \$0x13, $C1, $D1, $B1
+    vperm2i128 \$0x02, $A2, $B2, $C1
+    vperm2i128 \$0x02, $C2, $D2, $D1
+    vperm2i128 \$0x13, $A2, $B2, $A2
+    vperm2i128 \$0x13, $C2, $D2, $B2
+    jmp seal_avx2_short
+################################################################################
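+    # Special-case path for at most 192 bytes of plaintext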
+seal_avx2_192:
+    vmovdqa $A0, $A1
+    vmovdqa $A0, $A2
+    vmovdqa $B0, $B1
+    vmovdqa $B0, $B2
+    vmovdqa $C0, $C1
+    vmovdqa $C0, $C2
+    vpaddd .avx2_inc(%rip), $D0, $D1
+    vmovdqa $D0, $T2
+    vmovdqa $D1, $T3
+    mov \$10, $acc0
+1:  \n";
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
+        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
+        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
+        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
+        dec $acc0
+    jne 1b
+    vpaddd $A2, $A0, $A0
+    vpaddd $A2, $A1, $A1
+    vpaddd $B2, $B0, $B0
+    vpaddd $B2, $B1, $B1
+    vpaddd $C2, $C0, $C0
+    vpaddd $C2, $C1, $C1
+    vpaddd $T2, $D0, $D0
+    vpaddd $T3, $D1, $D1
+    vperm2i128 \$0x02, $A0, $B0, $T0
+    # Clamp and store the key
+    vpand .clamp(%rip), $T0, $T0
+    vmovdqa $T0, $r_store
+    # Stream for up to 192 bytes
+    vperm2i128 \$0x13, $A0, $B0, $A0
+    vperm2i128 \$0x13, $C0, $D0, $B0
+    vperm2i128 \$0x02, $A1, $B1, $C0
+    vperm2i128 \$0x02, $C1, $D1, $D0
+    vperm2i128 \$0x13, $A1, $B1, $A1
+    vperm2i128 \$0x13, $C1, $D1, $B1
+seal_avx2_short:
+    mov %r8, $itr2
+    call poly_hash_ad_internal
+    xor $itr1, $itr1
+seal_avx2_hash:
+        cmp \$16, $itr1
+        jb seal_avx2_short_loop\n";
+        &poly_add("0($oup)");
+        &poly_mul(); $code.="
+        sub \$16, $itr1
+        add \$16, $oup
+    jmp seal_avx2_hash
+seal_avx2_short_loop:
+        cmp \$32, $inl
+        jb seal_avx2_short_tail
+        sub \$32, $inl
+        # Encrypt
+        vpxor ($inp), $A0, $A0
+        vmovdqu $A0, ($oup)
+        lea 1*32($inp), $inp
+        # Load + hash\n";
+        &poly_add("0*8($oup)");
+        &poly_mul();
+        &poly_add("2*8($oup)");
+        &poly_mul(); $code.="
+        lea 1*32($oup), $oup
+        # Shift stream
+        vmovdqa $B0, $A0
+        vmovdqa $C0, $B0
+        vmovdqa $D0, $C0
+        vmovdqa $A1, $D0
+        vmovdqa $B1, $A1
+        vmovdqa $C1, $B1
+        vmovdqa $D1, $C1
+        vmovdqa $A2, $D1
+        vmovdqa $B2, $A2
+    jmp seal_avx2_short_loop
+seal_avx2_short_tail:
+    cmp \$16, $inl
+    jb 1f
+    sub \$16, $inl
+    vpxor ($inp), $A0x, $A3x
+    vmovdqu $A3x, ($oup)
+    lea 1*16($inp), $inp\n";
+    &poly_add("0*8($oup)");
+    &poly_mul(); $code.="
+    lea 1*16($oup), $oup
+    vextracti128 \$1, $A0, $A0x
+1:
+    vzeroupper
+    jmp seal_sse_tail_16
+.cfi_endproc
+";
+}
+
+if (!$win64) {
+  $code =~ s/\`([^\`]*)\`/eval $1/gem;
+  print $code;
+} else {
+  print <<___;
+.globl dummy_chacha20_poly1305_asm
+.type dummy_chacha20_poly1305_asm,\@abi-omnipotent
+dummy_chacha20_poly1305_asm:
+    ret
+___
+}
+
+close STDOUT;
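Note on the "vpand .clamp(%rip), $T0, $T0" steps above: they clamp the Poly1305 "r" half of the derived key before it is stored via $r_store. Below is a minimal C sketch of that clamp, assuming the .clamp constant (defined earlier in this file) encodes the standard RFC 7539 mask; the helper is illustrative only and not part of this change.

    #include <stdint.h>

    /* Illustrative only: the Poly1305 "r" clamp that the .clamp mask is
     * assumed to encode (RFC 7539, section 2.5). The top four bits of bytes
     * 3, 7, 11 and 15 and the bottom two bits of bytes 4, 8 and 12 are
     * cleared before r is used as the one-time MAC key. */
    static void poly1305_clamp_r(uint8_t r[16]) {
      r[3] &= 0x0f; r[7] &= 0x0f; r[11] &= 0x0f; r[15] &= 0x0f;
      r[4] &= 0xfc; r[8] &= 0xfc; r[12] &= 0xfc;
    }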
diff --git a/src/crypto/cipher/e_chacha20poly1305.c b/src/crypto/cipher/e_chacha20poly1305.c
index ed0d74c..34d094b 100644
--- a/src/crypto/cipher/e_chacha20poly1305.c
+++ b/src/crypto/cipher/e_chacha20poly1305.c
@@ -33,6 +33,42 @@
   unsigned char tag_len;
 };
 
+#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) && \
+    !defined(OPENSSL_WINDOWS)
+static const int kHaveAsm = 1;
+// chacha20_poly1305_open is defined in chacha20_poly1305_x86_64.pl. It
+// decrypts |plaintext_len| bytes from |ciphertext| and writes them to
+// |out_plaintext|. On entry, |aead_data| must contain the final 48 bytes of
+// the initial ChaCha20 block, i.e. the key, followed by four zeros, followed
+// by the nonce. On exit, it will contain the calculated tag value, which the
+// caller must check.
+void chacha20_poly1305_open(uint8_t *out_plaintext, const uint8_t *ciphertext,
+                            size_t plaintext_len, const uint8_t *ad,
+                            size_t ad_len, uint8_t *aead_data);
+
+// chacha20_poly1305_seal is defined in chacha20_poly1305_x86_64.pl. It
+// encrypts |plaintext_len| bytes from |plaintext| and writes them to
+// |out_ciphertext|. On entry, |aead_data| must contain the final 48 bytes of
+// the initial ChaCha20 block, i.e. the key, followed by four zeros, followed
+// by the nonce. On exit, it will contain the calculated tag value, which the
+// caller must append to the ciphertext.
+void chacha20_poly1305_seal(uint8_t *out_ciphertext, const uint8_t *plaintext,
+                            size_t plaintext_len, const uint8_t *ad,
+                            size_t ad_len, uint8_t *aead_data);
+#else
+static const int kHaveAsm = 0;
+
+static void chacha20_poly1305_open(uint8_t *out_plaintext,
+                                   const uint8_t *ciphertext,
+                                   size_t plaintext_len, const uint8_t *ad,
+                                   size_t ad_len, uint8_t *aead_data) {}
+
+static void chacha20_poly1305_seal(uint8_t *out_ciphertext,
+                                   const uint8_t *plaintext,
+                                   size_t plaintext_len, const uint8_t *ad,
+                                   size_t ad_len, uint8_t *aead_data) {}
+#endif
+
 static int aead_chacha20_poly1305_init(EVP_AEAD_CTX *ctx, const uint8_t *key,
                                        size_t key_len, size_t tag_len) {
   struct aead_chacha20_poly1305_ctx *c20_ctx;
@@ -70,9 +106,8 @@
 
 static void poly1305_update_length(poly1305_state *poly1305, size_t data_len) {
   uint8_t length_bytes[8];
-  unsigned i;
 
-  for (i = 0; i < sizeof(length_bytes); i++) {
+  for (unsigned i = 0; i < sizeof(length_bytes); i++) {
     length_bytes[i] = data_len;
     data_len >>= 8;
   }
@@ -80,37 +115,49 @@
   CRYPTO_poly1305_update(poly1305, length_bytes, sizeof(length_bytes));
 }
 
-typedef void (*aead_poly1305_update)(poly1305_state *ctx, const uint8_t *ad,
-                                     size_t ad_len, const uint8_t *ciphertext,
-                                     size_t ciphertext_len);
+static void poly1305_update_padded_16(poly1305_state *poly1305,
+                                      const uint8_t *data, size_t data_len) {
+  static const uint8_t padding[16] = { 0 }; /* Padding is all zeros. */
 
-/* aead_poly1305 fills |tag| with the authentication tag for the given
- * inputs, using |update| to control the order and format that the inputs are
- * signed/authenticated. */
-static void aead_poly1305(aead_poly1305_update update,
-                          uint8_t tag[POLY1305_TAG_LEN],
-                          const struct aead_chacha20_poly1305_ctx *c20_ctx,
-                          const uint8_t nonce[12], const uint8_t *ad,
-                          size_t ad_len, const uint8_t *ciphertext,
-                          size_t ciphertext_len) {
+  CRYPTO_poly1305_update(poly1305, data, data_len);
+  if (data_len % 16 != 0) {
+    CRYPTO_poly1305_update(poly1305, padding,
+                           sizeof(padding) - (data_len % 16));
+  }
+}
+
+/* calc_tag fills |tag| with the authentication tag for the given inputs. */
+static void calc_tag(uint8_t tag[POLY1305_TAG_LEN],
+                     const struct aead_chacha20_poly1305_ctx *c20_ctx,
+                     const uint8_t nonce[12], const uint8_t *ad, size_t ad_len,
+                     const uint8_t *ciphertext, size_t ciphertext_len) {
   alignas(16) uint8_t poly1305_key[32];
   OPENSSL_memset(poly1305_key, 0, sizeof(poly1305_key));
   CRYPTO_chacha_20(poly1305_key, poly1305_key, sizeof(poly1305_key),
                    c20_ctx->key, nonce, 0);
+
   poly1305_state ctx;
   CRYPTO_poly1305_init(&ctx, poly1305_key);
-  update(&ctx, ad, ad_len, ciphertext, ciphertext_len);
+  poly1305_update_padded_16(&ctx, ad, ad_len);
+  poly1305_update_padded_16(&ctx, ciphertext, ciphertext_len);
+  poly1305_update_length(&ctx, ad_len);
+  poly1305_update_length(&ctx, ciphertext_len);
   CRYPTO_poly1305_finish(&ctx, tag);
 }
 
-static int seal_impl(aead_poly1305_update poly1305_update,
-                     const EVP_AEAD_CTX *ctx, uint8_t *out, size_t *out_len,
-                     size_t max_out_len, const uint8_t nonce[12],
-                     const uint8_t *in, size_t in_len, const uint8_t *ad,
-                     size_t ad_len) {
+static int aead_chacha20_poly1305_seal(const EVP_AEAD_CTX *ctx, uint8_t *out,
+                                       size_t *out_len, size_t max_out_len,
+                                       const uint8_t *nonce, size_t nonce_len,
+                                       const uint8_t *in, size_t in_len,
+                                       const uint8_t *ad, size_t ad_len) {
   const struct aead_chacha20_poly1305_ctx *c20_ctx = ctx->aead_state;
   const uint64_t in_len_64 = in_len;
 
+  if (nonce_len != 12) {
+    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
+    return 0;
+  }
+
   /* |CRYPTO_chacha_20| uses a 32-bit block counter. Therefore we disallow
    * individual operations that work on more than 256GB at a time.
    * |in_len_64| is needed because, on 32-bit platforms, size_t is only
@@ -132,25 +179,37 @@
     return 0;
   }
 
-  CRYPTO_chacha_20(out, in, in_len, c20_ctx->key, nonce, 1);
+  alignas(16) uint8_t tag[48];
 
-  alignas(16) uint8_t tag[POLY1305_TAG_LEN];
-  aead_poly1305(poly1305_update, tag, c20_ctx, nonce, ad, ad_len, out, in_len);
+  if (kHaveAsm) {
+    OPENSSL_memcpy(tag, c20_ctx->key, 32);
+    OPENSSL_memset(tag + 32, 0, 4);
+    OPENSSL_memcpy(tag + 32 + 4, nonce, 12);
+    chacha20_poly1305_seal(out, in, in_len, ad, ad_len, tag);
+  } else {
+    CRYPTO_chacha_20(out, in, in_len, c20_ctx->key, nonce, 1);
+    calc_tag(tag, c20_ctx, nonce, ad, ad_len, out, in_len);
+  }
 
   OPENSSL_memcpy(out + in_len, tag, c20_ctx->tag_len);
   *out_len = in_len + c20_ctx->tag_len;
   return 1;
 }
 
-static int open_impl(aead_poly1305_update poly1305_update,
-                     const EVP_AEAD_CTX *ctx, uint8_t *out, size_t *out_len,
-                     size_t max_out_len, const uint8_t nonce[12],
-                     const uint8_t *in, size_t in_len, const uint8_t *ad,
-                     size_t ad_len) {
+static int aead_chacha20_poly1305_open(const EVP_AEAD_CTX *ctx, uint8_t *out,
+                                       size_t *out_len, size_t max_out_len,
+                                       const uint8_t *nonce, size_t nonce_len,
+                                       const uint8_t *in, size_t in_len,
+                                       const uint8_t *ad, size_t ad_len) {
   const struct aead_chacha20_poly1305_ctx *c20_ctx = ctx->aead_state;
   size_t plaintext_len;
   const uint64_t in_len_64 = in_len;
 
+  if (nonce_len != 12) {
+    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
+    return 0;
+  }
+
   if (in_len < c20_ctx->tag_len) {
     OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT);
     return 0;
@@ -168,64 +227,27 @@
   }
 
   plaintext_len = in_len - c20_ctx->tag_len;
-  alignas(16) uint8_t tag[POLY1305_TAG_LEN];
-  aead_poly1305(poly1305_update, tag, c20_ctx, nonce, ad, ad_len, in,
-                plaintext_len);
+  alignas(16) uint8_t tag[48];
+
+  if (kHaveAsm) {
+    OPENSSL_memcpy(tag, c20_ctx->key, 32);
+    OPENSSL_memset(tag + 32, 0, 4);
+    OPENSSL_memcpy(tag + 32 + 4, nonce, 12);
+    chacha20_poly1305_open(out, in, plaintext_len, ad, ad_len, tag);
+  } else {
+    calc_tag(tag, c20_ctx, nonce, ad, ad_len, in, plaintext_len);
+    CRYPTO_chacha_20(out, in, plaintext_len, c20_ctx->key, nonce, 1);
+  }
+
   if (CRYPTO_memcmp(tag, in + plaintext_len, c20_ctx->tag_len) != 0) {
     OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT);
     return 0;
   }
 
-  CRYPTO_chacha_20(out, in, plaintext_len, c20_ctx->key, nonce, 1);
   *out_len = plaintext_len;
   return 1;
 }
 
-static void poly1305_update_padded_16(poly1305_state *poly1305,
-                                      const uint8_t *data, size_t data_len) {
-  static const uint8_t padding[16] = { 0 }; /* Padding is all zeros. */
-
-  CRYPTO_poly1305_update(poly1305, data, data_len);
-  if (data_len % 16 != 0) {
-    CRYPTO_poly1305_update(poly1305, padding, sizeof(padding) - (data_len % 16));
-  }
-}
-
-static void poly1305_update(poly1305_state *ctx, const uint8_t *ad,
-                            size_t ad_len, const uint8_t *ciphertext,
-                            size_t ciphertext_len) {
-  poly1305_update_padded_16(ctx, ad, ad_len);
-  poly1305_update_padded_16(ctx, ciphertext, ciphertext_len);
-  poly1305_update_length(ctx, ad_len);
-  poly1305_update_length(ctx, ciphertext_len);
-}
-
-static int aead_chacha20_poly1305_seal(const EVP_AEAD_CTX *ctx, uint8_t *out,
-                                       size_t *out_len, size_t max_out_len,
-                                       const uint8_t *nonce, size_t nonce_len,
-                                       const uint8_t *in, size_t in_len,
-                                       const uint8_t *ad, size_t ad_len) {
-  if (nonce_len != 12) {
-    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
-    return 0;
-  }
-  return seal_impl(poly1305_update, ctx, out, out_len, max_out_len, nonce, in,
-                   in_len, ad, ad_len);
-}
-
-static int aead_chacha20_poly1305_open(const EVP_AEAD_CTX *ctx, uint8_t *out,
-                                       size_t *out_len, size_t max_out_len,
-                                       const uint8_t *nonce, size_t nonce_len,
-                                       const uint8_t *in, size_t in_len,
-                                       const uint8_t *ad, size_t ad_len) {
-  if (nonce_len != 12) {
-    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
-    return 0;
-  }
-  return open_impl(poly1305_update, ctx, out, out_len, max_out_len, nonce, in,
-                   in_len, ad, ad_len);
-}
-
 static const EVP_AEAD aead_chacha20_poly1305 = {
     32,                 /* key len */
     12,                 /* nonce len */
@@ -242,59 +264,3 @@
 const EVP_AEAD *EVP_aead_chacha20_poly1305(void) {
   return &aead_chacha20_poly1305;
 }
-
-static void poly1305_update_old(poly1305_state *ctx, const uint8_t *ad,
-                                size_t ad_len, const uint8_t *ciphertext,
-                                size_t ciphertext_len) {
-  CRYPTO_poly1305_update(ctx, ad, ad_len);
-  poly1305_update_length(ctx, ad_len);
-  CRYPTO_poly1305_update(ctx, ciphertext, ciphertext_len);
-  poly1305_update_length(ctx, ciphertext_len);
-}
-
-static int aead_chacha20_poly1305_old_seal(
-    const EVP_AEAD_CTX *ctx, uint8_t *out, size_t *out_len, size_t max_out_len,
-    const uint8_t *nonce, size_t nonce_len, const uint8_t *in, size_t in_len,
-    const uint8_t *ad, size_t ad_len) {
-  if (nonce_len != 8) {
-    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
-    return 0;
-  }
-  uint8_t nonce_96[12];
-  OPENSSL_memset(nonce_96, 0, 4);
-  OPENSSL_memcpy(nonce_96 + 4, nonce, 8);
-  return seal_impl(poly1305_update_old, ctx, out, out_len, max_out_len,
-                   nonce_96, in, in_len, ad, ad_len);
-}
-
-static int aead_chacha20_poly1305_old_open(
-    const EVP_AEAD_CTX *ctx, uint8_t *out, size_t *out_len, size_t max_out_len,
-    const uint8_t *nonce, size_t nonce_len, const uint8_t *in, size_t in_len,
-    const uint8_t *ad, size_t ad_len) {
-  if (nonce_len != 8) {
-    OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE);
-    return 0;
-  }
-  uint8_t nonce_96[12];
-  OPENSSL_memset(nonce_96, 0, 4);
-  OPENSSL_memcpy(nonce_96 + 4, nonce, 8);
-  return open_impl(poly1305_update_old, ctx, out, out_len, max_out_len,
-                   nonce_96, in, in_len, ad, ad_len);
-}
-
-static const EVP_AEAD aead_chacha20_poly1305_old = {
-    32,                 /* key len */
-    8,                  /* nonce len */
-    POLY1305_TAG_LEN,   /* overhead */
-    POLY1305_TAG_LEN,   /* max tag length */
-    aead_chacha20_poly1305_init,
-    NULL, /* init_with_direction */
-    aead_chacha20_poly1305_cleanup,
-    aead_chacha20_poly1305_old_seal,
-    aead_chacha20_poly1305_old_open,
-    NULL,               /* get_iv */
-};
-
-const EVP_AEAD *EVP_aead_chacha20_poly1305_old(void) {
-  return &aead_chacha20_poly1305_old;
-}
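For reference, a minimal sketch of how a caller drives the new assembly entry point, mirroring the kHaveAsm branches above. The helper name and the plain memcpy/memset calls are illustrative only and not part of this change; on entry aead_data holds the 32-byte key, four zero bytes and the 96-bit nonce, and on return its leading bytes hold the computed tag.

    #include <stdalign.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    void chacha20_poly1305_seal(uint8_t *out_ciphertext, const uint8_t *plaintext,
                                size_t plaintext_len, const uint8_t *ad,
                                size_t ad_len, uint8_t *aead_data);

    /* Hypothetical caller: encrypts |in_len| bytes and appends the 16-byte tag,
     * so |out| must have room for in_len + 16 bytes. */
    static void example_seal(uint8_t *out, const uint8_t *in, size_t in_len,
                             const uint8_t *ad, size_t ad_len,
                             const uint8_t key[32], const uint8_t nonce[12]) {
      alignas(16) uint8_t aead_data[48];
      memcpy(aead_data, key, 32);          /* bytes  0-31: ChaCha20 key        */
      memset(aead_data + 32, 0, 4);        /* bytes 32-35: zero block counter  */
      memcpy(aead_data + 36, nonce, 12);   /* bytes 36-47: 96-bit nonce        */
      chacha20_poly1305_seal(out, in, in_len, ad, ad_len, aead_data);
      memcpy(out + in_len, aead_data, 16); /* append the Poly1305 tag */
    }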
diff --git a/src/crypto/cipher/test/chacha20_poly1305_tests.txt b/src/crypto/cipher/test/chacha20_poly1305_tests.txt
index 103c196..018eb56 100644
--- a/src/crypto/cipher/test/chacha20_poly1305_tests.txt
+++ b/src/crypto/cipher/test/chacha20_poly1305_tests.txt
@@ -47,9 +47,6 @@
 CT: e275aeb341e1fc9a70c4fd4496fc7cdb
 TAG: 41acd0560ea6843d3e5d4e5babf6e946
 
-# Test vectors from chacha20_poly1305_old_tests.txt, modified for the RFC 7539
-# AEAD construction.
-
 KEY: 9a97f65b9b4c721b960a672145fca8d4e32e67f9111ea979ce9c4826806aeee6
 NONCE: 000000003de9c0da2bd7f91e
 IN: ""
diff --git a/src/crypto/dh/CMakeLists.txt b/src/crypto/dh/CMakeLists.txt
index f1e8616..83ae6d4 100644
--- a/src/crypto/dh/CMakeLists.txt
+++ b/src/crypto/dh/CMakeLists.txt
@@ -10,14 +10,3 @@
   check.c
   dh_asn1.c
 )
-
-add_executable(
-  dh_test
-
-  dh_test.cc
-
-  $<TARGET_OBJECTS:test_support>
-)
-
-target_link_libraries(dh_test crypto)
-add_dependencies(all_tests dh_test)
diff --git a/src/crypto/dh/dh_test.cc b/src/crypto/dh/dh_test.cc
index 8165c1a..9cde679 100644
--- a/src/crypto/dh/dh_test.cc
+++ b/src/crypto/dh/dh_test.cc
@@ -61,6 +61,8 @@
 
 #include <vector>
 
+#include <gtest/gtest.h>
+
 #include <openssl/bn.h>
 #include <openssl/bytestring.h>
 #include <openssl/crypto.h>
@@ -77,20 +79,16 @@
 static bool TestASN1();
 static bool TestRFC3526();
 
-int main() {
-  CRYPTO_library_init();
-
+// TODO(davidben): Convert this file to GTest properly.
+TEST(DHTest, AllTests) {
   if (!RunBasicTests() ||
       !RunRFC5114Tests() ||
       !TestBadY() ||
       !TestASN1() ||
       !TestRFC3526()) {
     ERR_print_errors_fp(stderr);
-    return 1;
+    ADD_FAILURE() << "Tests failed.";
   }
-
-  printf("PASS\n");
-  return 0;
 }
 
 static int GenerateCallback(int p, int n, BN_GENCB *arg) {
diff --git a/src/crypto/dsa/CMakeLists.txt b/src/crypto/dsa/CMakeLists.txt
index 4d66136..d3c12f5 100644
--- a/src/crypto/dsa/CMakeLists.txt
+++ b/src/crypto/dsa/CMakeLists.txt
@@ -8,14 +8,3 @@
   dsa.c
   dsa_asn1.c
 )
-
-add_executable(
-  dsa_test
-
-  dsa_test.cc
-
-  $<TARGET_OBJECTS:test_support>
-)
-
-target_link_libraries(dsa_test crypto)
-add_dependencies(all_tests dsa_test)
diff --git a/src/crypto/dsa/dsa_test.cc b/src/crypto/dsa/dsa_test.cc
index 5fee6aa..d2cd33e 100644
--- a/src/crypto/dsa/dsa_test.cc
+++ b/src/crypto/dsa/dsa_test.cc
@@ -62,6 +62,8 @@
 #include <stdio.h>
 #include <string.h>
 
+#include <gtest/gtest.h>
+
 #include <openssl/bn.h>
 #include <openssl/crypto.h>
 #include <openssl/err.h>
@@ -302,9 +304,8 @@
   return true;
 }
 
-int main(int argc, char **argv) {
-  CRYPTO_library_init();
-
+// TODO(davidben): Convert this file to GTest properly.
+TEST(DSATest, AllTests) {
   if (!TestGenerate(stdout) ||
       !TestVerify(fips_sig, sizeof(fips_sig), 1) ||
       !TestVerify(fips_sig_negative, sizeof(fips_sig_negative), -1) ||
@@ -312,9 +313,6 @@
       !TestVerify(fips_sig_bad_length, sizeof(fips_sig_bad_length), -1) ||
       !TestVerify(fips_sig_bad_r, sizeof(fips_sig_bad_r), 0)) {
     ERR_print_errors_fp(stderr);
-    return 1;
+    ADD_FAILURE() << "Tests failed";
   }
-
-  printf("PASS\n");
-  return 0;
 }
diff --git a/src/crypto/poly1305/asm/poly1305-armv4.pl b/src/crypto/poly1305/asm/poly1305-armv4.pl
deleted file mode 100755
index 8d35e28..0000000
--- a/src/crypto/poly1305/asm/poly1305-armv4.pl
+++ /dev/null
@@ -1,1216 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-#			IALU(*)/gcc-4.4		NEON
-#
-# ARM11xx(ARMv6)	7.78/+100%		-
-# Cortex-A5		6.30/+130%		2.96
-# Cortex-A8		6.25/+115%		2.36
-# Cortex-A9		5.10/+95%		2.55
-# Cortex-A15		3.79/+85%		1.25(**)
-# Snapdragon S4		5.70/+100%		1.48(**)
-#
-# (*)	this is for -march=armv6, i.e. with bunch of ldrb loading data;
-# (**)	these are trade-off results, they can be improved by ~8% but at
-#	the cost of 15/12% regression on Cortex-A5/A7, it's even possible
-#	to improve Cortex-A9 result, but then A5/A7 loose more than 20%;
-
-$flavour = shift;
-if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
-else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
-
-if ($flavour && $flavour ne "void") {
-    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
-    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
-    die "can't locate arm-xlate.pl";
-
-    open STDOUT,"| \"$^X\" $xlate $flavour $output";
-} else {
-    open STDOUT,">$output";
-}
-
-($ctx,$inp,$len,$padbit)=map("r$_",(0..3));
-
-$code.=<<___;
-#include <openssl/arm_arch.h>
-
-.text
-#if defined(__thumb2__)
-.syntax	unified
-.thumb
-#else
-.code	32
-#endif
-
-.globl	poly1305_emit
-.globl	poly1305_blocks
-.globl	poly1305_init
-.type	poly1305_init,%function
-.align	5
-poly1305_init:
-.Lpoly1305_init:
-	stmdb	sp!,{r4-r11}
-
-	eor	r3,r3,r3
-	cmp	$inp,#0
-	str	r3,[$ctx,#0]		@ zero hash value
-	str	r3,[$ctx,#4]
-	str	r3,[$ctx,#8]
-	str	r3,[$ctx,#12]
-	str	r3,[$ctx,#16]
-	str	r3,[$ctx,#36]		@ is_base2_26
-	add	$ctx,$ctx,#20
-
-#ifdef	__thumb2__
-	it	eq
-#endif
-	moveq	r0,#0
-	beq	.Lno_key
-
-#if	__ARM_MAX_ARCH__>=7
-	adr	r11,.Lpoly1305_init
-	ldr	r12,.LOPENSSL_armcap
-#endif
-	ldrb	r4,[$inp,#0]
-	mov	r10,#0x0fffffff
-	ldrb	r5,[$inp,#1]
-	and	r3,r10,#-4		@ 0x0ffffffc
-	ldrb	r6,[$inp,#2]
-	ldrb	r7,[$inp,#3]
-	orr	r4,r4,r5,lsl#8
-	ldrb	r5,[$inp,#4]
-	orr	r4,r4,r6,lsl#16
-	ldrb	r6,[$inp,#5]
-	orr	r4,r4,r7,lsl#24
-	ldrb	r7,[$inp,#6]
-	and	r4,r4,r10
-
-#if	__ARM_MAX_ARCH__>=7
-	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
-# ifdef	__APPLE__
-	ldr	r12,[r12]
-# endif
-#endif
-	ldrb	r8,[$inp,#7]
-	orr	r5,r5,r6,lsl#8
-	ldrb	r6,[$inp,#8]
-	orr	r5,r5,r7,lsl#16
-	ldrb	r7,[$inp,#9]
-	orr	r5,r5,r8,lsl#24
-	ldrb	r8,[$inp,#10]
-	and	r5,r5,r3
-
-#if	__ARM_MAX_ARCH__>=7
-	tst	r12,#ARMV7_NEON		@ check for NEON
-# ifdef	__APPLE__
-	adr	r9,poly1305_blocks_neon
-	adr	r11,poly1305_blocks
-#  ifdef __thumb2__
-	it	ne
-#  endif
-	movne	r11,r9
-	adr	r12,poly1305_emit
-	adr	r10,poly1305_emit_neon
-#  ifdef __thumb2__
-	it	ne
-#  endif
-	movne	r12,r10
-# else
-#  ifdef __thumb2__
-	itete	eq
-#  endif
-	addeq	r12,r11,#(poly1305_emit-.Lpoly1305_init)
-	addne	r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
-	addeq	r11,r11,#(poly1305_blocks-.Lpoly1305_init)
-	addne	r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
-# endif
-# ifdef	__thumb2__
-	orr	r12,r12,#1	@ thumb-ify address
-	orr	r11,r11,#1
-# endif
-#endif
-	ldrb	r9,[$inp,#11]
-	orr	r6,r6,r7,lsl#8
-	ldrb	r7,[$inp,#12]
-	orr	r6,r6,r8,lsl#16
-	ldrb	r8,[$inp,#13]
-	orr	r6,r6,r9,lsl#24
-	ldrb	r9,[$inp,#14]
-	and	r6,r6,r3
-
-	ldrb	r10,[$inp,#15]
-	orr	r7,r7,r8,lsl#8
-	str	r4,[$ctx,#0]
-	orr	r7,r7,r9,lsl#16
-	str	r5,[$ctx,#4]
-	orr	r7,r7,r10,lsl#24
-	str	r6,[$ctx,#8]
-	and	r7,r7,r3
-	str	r7,[$ctx,#12]
-#if	__ARM_MAX_ARCH__>=7
-	stmia	r2,{r11,r12}		@ fill functions table
-	mov	r0,#1
-#else
-	mov	r0,#0
-#endif
-.Lno_key:
-	ldmia	sp!,{r4-r11}
-#if	__ARM_ARCH__>=5
-	ret				@ bx	lr
-#else
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
-#endif
-.size	poly1305_init,.-poly1305_init
-___
-{
-my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
-my ($s1,$s2,$s3)=($r1,$r2,$r3);
-
-$code.=<<___;
-.type	poly1305_blocks,%function
-.align	5
-poly1305_blocks:
-	stmdb	sp!,{r3-r11,lr}
-
-	ands	$len,$len,#-16
-	beq	.Lno_data
-
-	cmp	$padbit,#0
-	add	$len,$len,$inp		@ end pointer
-	sub	sp,sp,#32
-
-	ldmia	$ctx,{$h0-$r3}		@ load context
-
-	str	$ctx,[sp,#12]		@ offload stuff
-	mov	lr,$inp
-	str	$len,[sp,#16]
-	str	$r1,[sp,#20]
-	str	$r2,[sp,#24]
-	str	$r3,[sp,#28]
-	b	.Loop
-
-.Loop:
-#if __ARM_ARCH__<7
-	ldrb	r0,[lr],#16		@ load input
-# ifdef	__thumb2__
-	it	hi
-# endif
-	addhi	$h4,$h4,#1		@ 1<<128
-	ldrb	r1,[lr,#-15]
-	ldrb	r2,[lr,#-14]
-	ldrb	r3,[lr,#-13]
-	orr	r1,r0,r1,lsl#8
-	ldrb	r0,[lr,#-12]
-	orr	r2,r1,r2,lsl#16
-	ldrb	r1,[lr,#-11]
-	orr	r3,r2,r3,lsl#24
-	ldrb	r2,[lr,#-10]
-	adds	$h0,$h0,r3		@ accumulate input
-
-	ldrb	r3,[lr,#-9]
-	orr	r1,r0,r1,lsl#8
-	ldrb	r0,[lr,#-8]
-	orr	r2,r1,r2,lsl#16
-	ldrb	r1,[lr,#-7]
-	orr	r3,r2,r3,lsl#24
-	ldrb	r2,[lr,#-6]
-	adcs	$h1,$h1,r3
-
-	ldrb	r3,[lr,#-5]
-	orr	r1,r0,r1,lsl#8
-	ldrb	r0,[lr,#-4]
-	orr	r2,r1,r2,lsl#16
-	ldrb	r1,[lr,#-3]
-	orr	r3,r2,r3,lsl#24
-	ldrb	r2,[lr,#-2]
-	adcs	$h2,$h2,r3
-
-	ldrb	r3,[lr,#-1]
-	orr	r1,r0,r1,lsl#8
-	str	lr,[sp,#8]		@ offload input pointer
-	orr	r2,r1,r2,lsl#16
-	add	$s1,$r1,$r1,lsr#2
-	orr	r3,r2,r3,lsl#24
-#else
-	ldr	r0,[lr],#16		@ load input
-# ifdef	__thumb2__
-	it	hi
-# endif
-	addhi	$h4,$h4,#1		@ padbit
-	ldr	r1,[lr,#-12]
-	ldr	r2,[lr,#-8]
-	ldr	r3,[lr,#-4]
-# ifdef	__ARMEB__
-	rev	r0,r0
-	rev	r1,r1
-	rev	r2,r2
-	rev	r3,r3
-# endif
-	adds	$h0,$h0,r0		@ accumulate input
-	str	lr,[sp,#8]		@ offload input pointer
-	adcs	$h1,$h1,r1
-	add	$s1,$r1,$r1,lsr#2
-	adcs	$h2,$h2,r2
-#endif
-	add	$s2,$r2,$r2,lsr#2
-	adcs	$h3,$h3,r3
-	add	$s3,$r3,$r3,lsr#2
-
-	umull	r2,r3,$h1,$r0
-	 adc	$h4,$h4,#0
-	umull	r0,r1,$h0,$r0
-	umlal	r2,r3,$h4,$s1
-	umlal	r0,r1,$h3,$s1
-	ldr	$r1,[sp,#20]		@ reload $r1
-	umlal	r2,r3,$h2,$s3
-	umlal	r0,r1,$h1,$s3
-	umlal	r2,r3,$h3,$s2
-	umlal	r0,r1,$h2,$s2
-	umlal	r2,r3,$h0,$r1
-	str	r0,[sp,#0]		@ future $h0
-	 mul	r0,$s2,$h4
-	ldr	$r2,[sp,#24]		@ reload $r2
-	adds	r2,r2,r1		@ d1+=d0>>32
-	 eor	r1,r1,r1
-	adc	lr,r3,#0		@ future $h2
-	str	r2,[sp,#4]		@ future $h1
-
-	mul	r2,$s3,$h4
-	eor	r3,r3,r3
-	umlal	r0,r1,$h3,$s3
-	ldr	$r3,[sp,#28]		@ reload $r3
-	umlal	r2,r3,$h3,$r0
-	umlal	r0,r1,$h2,$r0
-	umlal	r2,r3,$h2,$r1
-	umlal	r0,r1,$h1,$r1
-	umlal	r2,r3,$h1,$r2
-	umlal	r0,r1,$h0,$r2
-	umlal	r2,r3,$h0,$r3
-	ldr	$h0,[sp,#0]
-	mul	$h4,$r0,$h4
-	ldr	$h1,[sp,#4]
-
-	adds	$h2,lr,r0		@ d2+=d1>>32
-	ldr	lr,[sp,#8]		@ reload input pointer
-	adc	r1,r1,#0
-	adds	$h3,r2,r1		@ d3+=d2>>32
-	ldr	r0,[sp,#16]		@ reload end pointer
-	adc	r3,r3,#0
-	add	$h4,$h4,r3		@ h4+=d3>>32
-
-	and	r1,$h4,#-4
-	and	$h4,$h4,#3
-	add	r1,r1,r1,lsr#2		@ *=5
-	adds	$h0,$h0,r1
-	adcs	$h1,$h1,#0
-	adcs	$h2,$h2,#0
-	adc	$h3,$h3,#0
-
-	cmp	r0,lr			@ done yet?
-	bhi	.Loop
-
-	ldr	$ctx,[sp,#12]
-	add	sp,sp,#32
-	stmia	$ctx,{$h0-$h4}		@ store the result
-
-.Lno_data:
-#if	__ARM_ARCH__>=5
-	ldmia	sp!,{r3-r11,pc}
-#else
-	ldmia	sp!,{r3-r11,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
-#endif
-.size	poly1305_blocks,.-poly1305_blocks
-___
-}
-{
-my ($ctx,$mac,$nonce)=map("r$_",(0..2));
-my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
-my $g4=$h4;
-
-$code.=<<___;
-.type	poly1305_emit,%function
-.align	5
-poly1305_emit:
-	stmdb	sp!,{r4-r11}
-.Lpoly1305_emit_enter:
-
-	ldmia	$ctx,{$h0-$h4}
-	adds	$g0,$h0,#5		@ compare to modulus
-	adcs	$g1,$h1,#0
-	adcs	$g2,$h2,#0
-	adcs	$g3,$h3,#0
-	adc	$g4,$h4,#0
-	tst	$g4,#4			@ did it carry/borrow?
-
-#ifdef	__thumb2__
-	it	ne
-#endif
-	movne	$h0,$g0
-	ldr	$g0,[$nonce,#0]
-#ifdef	__thumb2__
-	it	ne
-#endif
-	movne	$h1,$g1
-	ldr	$g1,[$nonce,#4]
-#ifdef	__thumb2__
-	it	ne
-#endif
-	movne	$h2,$g2
-	ldr	$g2,[$nonce,#8]
-#ifdef	__thumb2__
-	it	ne
-#endif
-	movne	$h3,$g3
-	ldr	$g3,[$nonce,#12]
-
-	adds	$h0,$h0,$g0
-	adcs	$h1,$h1,$g1
-	adcs	$h2,$h2,$g2
-	adc	$h3,$h3,$g3
-
-#if __ARM_ARCH__>=7
-# ifdef __ARMEB__
-	rev	$h0,$h0
-	rev	$h1,$h1
-	rev	$h2,$h2
-	rev	$h3,$h3
-# endif
-	str	$h0,[$mac,#0]
-	str	$h1,[$mac,#4]
-	str	$h2,[$mac,#8]
-	str	$h3,[$mac,#12]
-#else
-	strb	$h0,[$mac,#0]
-	mov	$h0,$h0,lsr#8
-	strb	$h1,[$mac,#4]
-	mov	$h1,$h1,lsr#8
-	strb	$h2,[$mac,#8]
-	mov	$h2,$h2,lsr#8
-	strb	$h3,[$mac,#12]
-	mov	$h3,$h3,lsr#8
-
-	strb	$h0,[$mac,#1]
-	mov	$h0,$h0,lsr#8
-	strb	$h1,[$mac,#5]
-	mov	$h1,$h1,lsr#8
-	strb	$h2,[$mac,#9]
-	mov	$h2,$h2,lsr#8
-	strb	$h3,[$mac,#13]
-	mov	$h3,$h3,lsr#8
-
-	strb	$h0,[$mac,#2]
-	mov	$h0,$h0,lsr#8
-	strb	$h1,[$mac,#6]
-	mov	$h1,$h1,lsr#8
-	strb	$h2,[$mac,#10]
-	mov	$h2,$h2,lsr#8
-	strb	$h3,[$mac,#14]
-	mov	$h3,$h3,lsr#8
-
-	strb	$h0,[$mac,#3]
-	strb	$h1,[$mac,#7]
-	strb	$h2,[$mac,#11]
-	strb	$h3,[$mac,#15]
-#endif
-	ldmia	sp!,{r4-r11}
-#if	__ARM_ARCH__>=5
-	ret				@ bx	lr
-#else
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
-#endif
-.size	poly1305_emit,.-poly1305_emit
-___
-{
-my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
-my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
-my ($T0,$T1,$MASK) = map("q$_",(15,4,0));
-
-my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));
-
-$code.=<<___;
-#if	__ARM_MAX_ARCH__>=7
-.fpu	neon
-
-.type	poly1305_init_neon,%function
-.align	5
-poly1305_init_neon:
-	ldr	r4,[$ctx,#20]		@ load key base 2^32
-	ldr	r5,[$ctx,#24]
-	ldr	r6,[$ctx,#28]
-	ldr	r7,[$ctx,#32]
-
-	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
-	mov	r3,r4,lsr#26
-	mov	r4,r5,lsr#20
-	orr	r3,r3,r5,lsl#6
-	mov	r5,r6,lsr#14
-	orr	r4,r4,r6,lsl#12
-	mov	r6,r7,lsr#8
-	orr	r5,r5,r7,lsl#18
-	and	r3,r3,#0x03ffffff
-	and	r4,r4,#0x03ffffff
-	and	r5,r5,#0x03ffffff
-
-	vdup.32	$R0,r2			@ r^1 in both lanes
-	add	r2,r3,r3,lsl#2		@ *5
-	vdup.32	$R1,r3
-	add	r3,r4,r4,lsl#2
-	vdup.32	$S1,r2
-	vdup.32	$R2,r4
-	add	r4,r5,r5,lsl#2
-	vdup.32	$S2,r3
-	vdup.32	$R3,r5
-	add	r5,r6,r6,lsl#2
-	vdup.32	$S3,r4
-	vdup.32	$R4,r6
-	vdup.32	$S4,r5
-
-	mov	$zeros,#2		@ counter
-
-.Lsquare_neon:
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
-	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
-	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
-	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
-
-	vmull.u32	$D0,$R0,${R0}[1]
-	vmull.u32	$D1,$R1,${R0}[1]
-	vmull.u32	$D2,$R2,${R0}[1]
-	vmull.u32	$D3,$R3,${R0}[1]
-	vmull.u32	$D4,$R4,${R0}[1]
-
-	vmlal.u32	$D0,$R4,${S1}[1]
-	vmlal.u32	$D1,$R0,${R1}[1]
-	vmlal.u32	$D2,$R1,${R1}[1]
-	vmlal.u32	$D3,$R2,${R1}[1]
-	vmlal.u32	$D4,$R3,${R1}[1]
-
-	vmlal.u32	$D0,$R3,${S2}[1]
-	vmlal.u32	$D1,$R4,${S2}[1]
-	vmlal.u32	$D3,$R1,${R2}[1]
-	vmlal.u32	$D2,$R0,${R2}[1]
-	vmlal.u32	$D4,$R2,${R2}[1]
-
-	vmlal.u32	$D0,$R2,${S3}[1]
-	vmlal.u32	$D3,$R0,${R3}[1]
-	vmlal.u32	$D1,$R3,${S3}[1]
-	vmlal.u32	$D2,$R4,${S3}[1]
-	vmlal.u32	$D4,$R1,${R3}[1]
-
-	vmlal.u32	$D3,$R4,${S4}[1]
-	vmlal.u32	$D0,$R1,${S4}[1]
-	vmlal.u32	$D1,$R2,${S4}[1]
-	vmlal.u32	$D2,$R3,${S4}[1]
-	vmlal.u32	$D4,$R0,${R4}[1]
-
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
-	@ and P. Schwabe
-
-	vshr.u64	$T0,$D3,#26
-	vmovn.i64	$D3#lo,$D3
-	 vshr.u64	$T1,$D0,#26
-	 vmovn.i64	$D0#lo,$D0
-	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
-	vbic.i32	$D3#lo,#0xfc000000	@ &=0x03ffffff
-	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
-	 vbic.i32	$D0#lo,#0xfc000000
-
-	vshrn.u64	$T0#lo,$D4,#26
-	vmovn.i64	$D4#lo,$D4
-	 vshr.u64	$T1,$D1,#26
-	 vmovn.i64	$D1#lo,$D1
-	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
-	vbic.i32	$D4#lo,#0xfc000000
-	 vbic.i32	$D1#lo,#0xfc000000
-
-	vadd.i32	$D0#lo,$D0#lo,$T0#lo
-	vshl.u32	$T0#lo,$T0#lo,#2
-	 vshrn.u64	$T1#lo,$D2,#26
-	 vmovn.i64	$D2#lo,$D2
-	vadd.i32	$D0#lo,$D0#lo,$T0#lo	@ h4 -> h0
-	 vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
-	 vbic.i32	$D2#lo,#0xfc000000
-
-	vshr.u32	$T0#lo,$D0#lo,#26
-	vbic.i32	$D0#lo,#0xfc000000
-	 vshr.u32	$T1#lo,$D3#lo,#26
-	 vbic.i32	$D3#lo,#0xfc000000
-	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
-	 vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
-
-	subs		$zeros,$zeros,#1
-	beq		.Lsquare_break_neon
-
-	add		$tbl0,$ctx,#(48+0*9*4)
-	add		$tbl1,$ctx,#(48+1*9*4)
-
-	vtrn.32		$R0,$D0#lo		@ r^2:r^1
-	vtrn.32		$R2,$D2#lo
-	vtrn.32		$R3,$D3#lo
-	vtrn.32		$R1,$D1#lo
-	vtrn.32		$R4,$D4#lo
-
-	vshl.u32	$S2,$R2,#2		@ *5
-	vshl.u32	$S3,$R3,#2
-	vshl.u32	$S1,$R1,#2
-	vshl.u32	$S4,$R4,#2
-	vadd.i32	$S2,$S2,$R2
-	vadd.i32	$S1,$S1,$R1
-	vadd.i32	$S3,$S3,$R3
-	vadd.i32	$S4,$S4,$R4
-
-	vst4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
-	vst4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
-	vst4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-	vst4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-	vst1.32		{${S4}[0]},[$tbl0,:32]
-	vst1.32		{${S4}[1]},[$tbl1,:32]
-
-	b		.Lsquare_neon
-
-.align	4
-.Lsquare_break_neon:
-	add		$tbl0,$ctx,#(48+2*4*9)
-	add		$tbl1,$ctx,#(48+3*4*9)
-
-	vmov		$R0,$D0#lo		@ r^4:r^3
-	vshl.u32	$S1,$D1#lo,#2		@ *5
-	vmov		$R1,$D1#lo
-	vshl.u32	$S2,$D2#lo,#2
-	vmov		$R2,$D2#lo
-	vshl.u32	$S3,$D3#lo,#2
-	vmov		$R3,$D3#lo
-	vshl.u32	$S4,$D4#lo,#2
-	vmov		$R4,$D4#lo
-	vadd.i32	$S1,$S1,$D1#lo
-	vadd.i32	$S2,$S2,$D2#lo
-	vadd.i32	$S3,$S3,$D3#lo
-	vadd.i32	$S4,$S4,$D4#lo
-
-	vst4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
-	vst4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
-	vst4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-	vst4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-	vst1.32		{${S4}[0]},[$tbl0]
-	vst1.32		{${S4}[1]},[$tbl1]
-
-	ret				@ bx	lr
-.size	poly1305_init_neon,.-poly1305_init_neon
-
-.type	poly1305_blocks_neon,%function
-.align	5
-poly1305_blocks_neon:
-	ldr	ip,[$ctx,#36]		@ is_base2_26
-	ands	$len,$len,#-16
-	beq	.Lno_data_neon
-
-	cmp	$len,#64
-	bhs	.Lenter_neon
-	tst	ip,ip			@ is_base2_26?
-	beq	poly1305_blocks
-
-.Lenter_neon:
-	stmdb	sp!,{r4-r7}
-	vstmdb	sp!,{d8-d15}		@ ABI specification says so
-
-	tst	ip,ip			@ is_base2_26?
-	bne	.Lbase2_26_neon
-
-	stmdb	sp!,{r1-r3,lr}
-	bl	poly1305_init_neon
-
-	ldr	r4,[$ctx,#0]		@ load hash value base 2^32
-	ldr	r5,[$ctx,#4]
-	ldr	r6,[$ctx,#8]
-	ldr	r7,[$ctx,#12]
-	ldr	ip,[$ctx,#16]
-
-	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
-	mov	r3,r4,lsr#26
-	 veor	$D0#lo,$D0#lo,$D0#lo
-	mov	r4,r5,lsr#20
-	orr	r3,r3,r5,lsl#6
-	 veor	$D1#lo,$D1#lo,$D1#lo
-	mov	r5,r6,lsr#14
-	orr	r4,r4,r6,lsl#12
-	 veor	$D2#lo,$D2#lo,$D2#lo
-	mov	r6,r7,lsr#8
-	orr	r5,r5,r7,lsl#18
-	 veor	$D3#lo,$D3#lo,$D3#lo
-	and	r3,r3,#0x03ffffff
-	orr	r6,r6,ip,lsl#24
-	 veor	$D4#lo,$D4#lo,$D4#lo
-	and	r4,r4,#0x03ffffff
-	mov	r1,#1
-	and	r5,r5,#0x03ffffff
-	str	r1,[$ctx,#36]		@ is_base2_26
-
-	vmov.32	$D0#lo[0],r2
-	vmov.32	$D1#lo[0],r3
-	vmov.32	$D2#lo[0],r4
-	vmov.32	$D3#lo[0],r5
-	vmov.32	$D4#lo[0],r6
-	adr	$zeros,.Lzeros
-
-	ldmia	sp!,{r1-r3,lr}
-	b	.Lbase2_32_neon
-
-.align	4
-.Lbase2_26_neon:
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ load hash value
-
-	veor		$D0#lo,$D0#lo,$D0#lo
-	veor		$D1#lo,$D1#lo,$D1#lo
-	veor		$D2#lo,$D2#lo,$D2#lo
-	veor		$D3#lo,$D3#lo,$D3#lo
-	veor		$D4#lo,$D4#lo,$D4#lo
-	vld4.32		{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
-	adr		$zeros,.Lzeros
-	vld1.32		{$D4#lo[0]},[$ctx]
-	sub		$ctx,$ctx,#16		@ rewind
-
-.Lbase2_32_neon:
-	add		$in2,$inp,#32
-	mov		$padbit,$padbit,lsl#24
-	tst		$len,#31
-	beq		.Leven
-
-	vld4.32		{$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
-	vmov.32		$H4#lo[0],$padbit
-	sub		$len,$len,#16
-	add		$in2,$inp,#32
-
-# ifdef	__ARMEB__
-	vrev32.8	$H0,$H0
-	vrev32.8	$H3,$H3
-	vrev32.8	$H1,$H1
-	vrev32.8	$H2,$H2
-# endif
-	vsri.u32	$H4#lo,$H3#lo,#8	@ base 2^32 -> base 2^26
-	vshl.u32	$H3#lo,$H3#lo,#18
-
-	vsri.u32	$H3#lo,$H2#lo,#14
-	vshl.u32	$H2#lo,$H2#lo,#12
-	vadd.i32	$H4#hi,$H4#lo,$D4#lo	@ add hash value and move to #hi
-
-	vbic.i32	$H3#lo,#0xfc000000
-	vsri.u32	$H2#lo,$H1#lo,#20
-	vshl.u32	$H1#lo,$H1#lo,#6
-
-	vbic.i32	$H2#lo,#0xfc000000
-	vsri.u32	$H1#lo,$H0#lo,#26
-	vadd.i32	$H3#hi,$H3#lo,$D3#lo
-
-	vbic.i32	$H0#lo,#0xfc000000
-	vbic.i32	$H1#lo,#0xfc000000
-	vadd.i32	$H2#hi,$H2#lo,$D2#lo
-
-	vadd.i32	$H0#hi,$H0#lo,$D0#lo
-	vadd.i32	$H1#hi,$H1#lo,$D1#lo
-
-	mov		$tbl1,$zeros
-	add		$tbl0,$ctx,#48
-
-	cmp		$len,$len
-	b		.Long_tail
-
-.align	4
-.Leven:
-	subs		$len,$len,#64
-# ifdef	__thumb2__
-	it		lo
-# endif
-	movlo		$in2,$zeros
-
-	vmov.i32	$H4,#1<<24		@ padbit, yes, always
-	vld4.32		{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
-	add		$inp,$inp,#64
-	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
-	add		$in2,$in2,#64
-# ifdef	__thumb2__
-	itt		hi
-# endif
-	addhi		$tbl1,$ctx,#(48+1*9*4)
-	addhi		$tbl0,$ctx,#(48+3*9*4)
-
-# ifdef	__ARMEB__
-	vrev32.8	$H0,$H0
-	vrev32.8	$H3,$H3
-	vrev32.8	$H1,$H1
-	vrev32.8	$H2,$H2
-# endif
-	vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
-	vshl.u32	$H3,$H3,#18
-
-	vsri.u32	$H3,$H2,#14
-	vshl.u32	$H2,$H2,#12
-
-	vbic.i32	$H3,#0xfc000000
-	vsri.u32	$H2,$H1,#20
-	vshl.u32	$H1,$H1,#6
-
-	vbic.i32	$H2,#0xfc000000
-	vsri.u32	$H1,$H0,#26
-
-	vbic.i32	$H0,#0xfc000000
-	vbic.i32	$H1,#0xfc000000
-
-	bls		.Lskip_loop
-
-	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^2
-	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
-	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-	b		.Loop_neon
-
-.align	5
-.Loop_neon:
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
-	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
-	@   \___________________/
-	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
-	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
-	@   \___________________/ \____________________/
-	@
-	@ Note that we start with inp[2:3]*r^2. This is because it
-	@ doesn't depend on reduction in previous iteration.
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
-	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
-	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
-	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
-	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ inp[2:3]*r^2
-
-	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ accumulate inp[0:1]
-	vmull.u32	$D2,$H2#hi,${R0}[1]
-	vadd.i32	$H0#lo,$H0#lo,$D0#lo
-	vmull.u32	$D0,$H0#hi,${R0}[1]
-	vadd.i32	$H3#lo,$H3#lo,$D3#lo
-	vmull.u32	$D3,$H3#hi,${R0}[1]
-	vmlal.u32	$D2,$H1#hi,${R1}[1]
-	vadd.i32	$H1#lo,$H1#lo,$D1#lo
-	vmull.u32	$D1,$H1#hi,${R0}[1]
-
-	vadd.i32	$H4#lo,$H4#lo,$D4#lo
-	vmull.u32	$D4,$H4#hi,${R0}[1]
-	subs		$len,$len,#64
-	vmlal.u32	$D0,$H4#hi,${S1}[1]
-# ifdef	__thumb2__
-	it		lo
-# endif
-	movlo		$in2,$zeros
-	vmlal.u32	$D3,$H2#hi,${R1}[1]
-	vld1.32		${S4}[1],[$tbl1,:32]
-	vmlal.u32	$D1,$H0#hi,${R1}[1]
-	vmlal.u32	$D4,$H3#hi,${R1}[1]
-
-	vmlal.u32	$D0,$H3#hi,${S2}[1]
-	vmlal.u32	$D3,$H1#hi,${R2}[1]
-	vmlal.u32	$D4,$H2#hi,${R2}[1]
-	vmlal.u32	$D1,$H4#hi,${S2}[1]
-	vmlal.u32	$D2,$H0#hi,${R2}[1]
-
-	vmlal.u32	$D3,$H0#hi,${R3}[1]
-	vmlal.u32	$D0,$H2#hi,${S3}[1]
-	vmlal.u32	$D4,$H1#hi,${R3}[1]
-	vmlal.u32	$D1,$H3#hi,${S3}[1]
-	vmlal.u32	$D2,$H4#hi,${S3}[1]
-
-	vmlal.u32	$D3,$H4#hi,${S4}[1]
-	vmlal.u32	$D0,$H1#hi,${S4}[1]
-	vmlal.u32	$D4,$H0#hi,${R4}[1]
-	vmlal.u32	$D1,$H2#hi,${S4}[1]
-	vmlal.u32	$D2,$H3#hi,${S4}[1]
-
-	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
-	add		$in2,$in2,#64
-
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ (hash+inp[0:1])*r^4 and accumulate
-
-	vmlal.u32	$D3,$H3#lo,${R0}[0]
-	vmlal.u32	$D0,$H0#lo,${R0}[0]
-	vmlal.u32	$D4,$H4#lo,${R0}[0]
-	vmlal.u32	$D1,$H1#lo,${R0}[0]
-	vmlal.u32	$D2,$H2#lo,${R0}[0]
-	vld1.32		${S4}[0],[$tbl0,:32]
-
-	vmlal.u32	$D3,$H2#lo,${R1}[0]
-	vmlal.u32	$D0,$H4#lo,${S1}[0]
-	vmlal.u32	$D4,$H3#lo,${R1}[0]
-	vmlal.u32	$D1,$H0#lo,${R1}[0]
-	vmlal.u32	$D2,$H1#lo,${R1}[0]
-
-	vmlal.u32	$D3,$H1#lo,${R2}[0]
-	vmlal.u32	$D0,$H3#lo,${S2}[0]
-	vmlal.u32	$D4,$H2#lo,${R2}[0]
-	vmlal.u32	$D1,$H4#lo,${S2}[0]
-	vmlal.u32	$D2,$H0#lo,${R2}[0]
-
-	vmlal.u32	$D3,$H0#lo,${R3}[0]
-	vmlal.u32	$D0,$H2#lo,${S3}[0]
-	vmlal.u32	$D4,$H1#lo,${R3}[0]
-	vmlal.u32	$D1,$H3#lo,${S3}[0]
-	vmlal.u32	$D3,$H4#lo,${S4}[0]
-
-	vmlal.u32	$D2,$H4#lo,${S3}[0]
-	vmlal.u32	$D0,$H1#lo,${S4}[0]
-	vmlal.u32	$D4,$H0#lo,${R4}[0]
-	vmov.i32	$H4,#1<<24		@ padbit, yes, always
-	vmlal.u32	$D1,$H2#lo,${S4}[0]
-	vmlal.u32	$D2,$H3#lo,${S4}[0]
-
-	vld4.32		{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
-	add		$inp,$inp,#64
-# ifdef	__ARMEB__
-	vrev32.8	$H0,$H0
-	vrev32.8	$H1,$H1
-	vrev32.8	$H2,$H2
-	vrev32.8	$H3,$H3
-# endif
-
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ lazy reduction interleaved with base 2^32 -> base 2^26
-
-	vshr.u64	$T0,$D3,#26
-	vmovn.i64	$D3#lo,$D3
-	 vshr.u64	$T1,$D0,#26
-	 vmovn.i64	$D0#lo,$D0
-	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
-	vbic.i32	$D3#lo,#0xfc000000
-	  vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
-	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
-	  vshl.u32	$H3,$H3,#18
-	 vbic.i32	$D0#lo,#0xfc000000
-
-	vshrn.u64	$T0#lo,$D4,#26
-	vmovn.i64	$D4#lo,$D4
-	 vshr.u64	$T1,$D1,#26
-	 vmovn.i64	$D1#lo,$D1
-	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
-	  vsri.u32	$H3,$H2,#14
-	vbic.i32	$D4#lo,#0xfc000000
-	  vshl.u32	$H2,$H2,#12
-	 vbic.i32	$D1#lo,#0xfc000000
-
-	vadd.i32	$D0#lo,$D0#lo,$T0#lo
-	vshl.u32	$T0#lo,$T0#lo,#2
-	  vbic.i32	$H3,#0xfc000000
-	 vshrn.u64	$T1#lo,$D2,#26
-	 vmovn.i64	$D2#lo,$D2
-	vadd.i32	$D0#lo,$D0#lo,$T0#lo	@ h4 -> h0
-	  vsri.u32	$H2,$H1,#20
-	 vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
-	  vshl.u32	$H1,$H1,#6
-	 vbic.i32	$D2#lo,#0xfc000000
-	  vbic.i32	$H2,#0xfc000000
-
-	vshr.u32	$T0#lo,$D0#lo,#26
-	vbic.i32	$D0#lo,#0xfc000000
-	  vsri.u32	$H1,$H0,#26
-	  vbic.i32	$H0,#0xfc000000
-	 vshr.u32	$T1#lo,$D3#lo,#26
-	 vbic.i32	$D3#lo,#0xfc000000
-	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
-	 vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
-	  vbic.i32	$H1,#0xfc000000
-
-	bhi		.Loop_neon
-
-.Lskip_loop:
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
-	add		$tbl1,$ctx,#(48+0*9*4)
-	add		$tbl0,$ctx,#(48+1*9*4)
-	adds		$len,$len,#32
-# ifdef	__thumb2__
-	it		ne
-# endif
-	movne		$len,#0
-	bne		.Long_tail
-
-	vadd.i32	$H2#hi,$H2#lo,$D2#lo	@ add hash value and move to #hi
-	vadd.i32	$H0#hi,$H0#lo,$D0#lo
-	vadd.i32	$H3#hi,$H3#lo,$D3#lo
-	vadd.i32	$H1#hi,$H1#lo,$D1#lo
-	vadd.i32	$H4#hi,$H4#lo,$D4#lo
-
-.Long_tail:
-	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^1
-	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^2
-
-	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ can be redundant
-	vmull.u32	$D2,$H2#hi,$R0
-	vadd.i32	$H0#lo,$H0#lo,$D0#lo
-	vmull.u32	$D0,$H0#hi,$R0
-	vadd.i32	$H3#lo,$H3#lo,$D3#lo
-	vmull.u32	$D3,$H3#hi,$R0
-	vadd.i32	$H1#lo,$H1#lo,$D1#lo
-	vmull.u32	$D1,$H1#hi,$R0
-	vadd.i32	$H4#lo,$H4#lo,$D4#lo
-	vmull.u32	$D4,$H4#hi,$R0
-
-	vmlal.u32	$D0,$H4#hi,$S1
-	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-	vmlal.u32	$D3,$H2#hi,$R1
-	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-	vmlal.u32	$D1,$H0#hi,$R1
-	vmlal.u32	$D4,$H3#hi,$R1
-	vmlal.u32	$D2,$H1#hi,$R1
-
-	vmlal.u32	$D3,$H1#hi,$R2
-	vld1.32		${S4}[1],[$tbl1,:32]
-	vmlal.u32	$D0,$H3#hi,$S2
-	vld1.32		${S4}[0],[$tbl0,:32]
-	vmlal.u32	$D4,$H2#hi,$R2
-	vmlal.u32	$D1,$H4#hi,$S2
-	vmlal.u32	$D2,$H0#hi,$R2
-
-	vmlal.u32	$D3,$H0#hi,$R3
-# ifdef	__thumb2__
-	it		ne
-# endif
-	 addne		$tbl1,$ctx,#(48+2*9*4)
-	vmlal.u32	$D0,$H2#hi,$S3
-# ifdef	__thumb2__
-	it		ne
-# endif
-	 addne		$tbl0,$ctx,#(48+3*9*4)
-	vmlal.u32	$D4,$H1#hi,$R3
-	vmlal.u32	$D1,$H3#hi,$S3
-	vmlal.u32	$D2,$H4#hi,$S3
-
-	vmlal.u32	$D3,$H4#hi,$S4
-	 vorn		$MASK,$MASK,$MASK	@ all-ones, can be redundant
-	vmlal.u32	$D0,$H1#hi,$S4
-	 vshr.u64	$MASK,$MASK,#38
-	vmlal.u32	$D4,$H0#hi,$R4
-	vmlal.u32	$D1,$H2#hi,$S4
-	vmlal.u32	$D2,$H3#hi,$S4
-
-	beq		.Lshort_tail
-
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ (hash+inp[0:1])*r^4:r^3 and accumulate
-
-	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^3
-	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
-
-	vmlal.u32	$D2,$H2#lo,$R0
-	vmlal.u32	$D0,$H0#lo,$R0
-	vmlal.u32	$D3,$H3#lo,$R0
-	vmlal.u32	$D1,$H1#lo,$R0
-	vmlal.u32	$D4,$H4#lo,$R0
-
-	vmlal.u32	$D0,$H4#lo,$S1
-	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
-	vmlal.u32	$D3,$H2#lo,$R1
-	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
-	vmlal.u32	$D1,$H0#lo,$R1
-	vmlal.u32	$D4,$H3#lo,$R1
-	vmlal.u32	$D2,$H1#lo,$R1
-
-	vmlal.u32	$D3,$H1#lo,$R2
-	vld1.32		${S4}[1],[$tbl1,:32]
-	vmlal.u32	$D0,$H3#lo,$S2
-	vld1.32		${S4}[0],[$tbl0,:32]
-	vmlal.u32	$D4,$H2#lo,$R2
-	vmlal.u32	$D1,$H4#lo,$S2
-	vmlal.u32	$D2,$H0#lo,$R2
-
-	vmlal.u32	$D3,$H0#lo,$R3
-	vmlal.u32	$D0,$H2#lo,$S3
-	vmlal.u32	$D4,$H1#lo,$R3
-	vmlal.u32	$D1,$H3#lo,$S3
-	vmlal.u32	$D2,$H4#lo,$S3
-
-	vmlal.u32	$D3,$H4#lo,$S4
-	 vorn		$MASK,$MASK,$MASK	@ all-ones
-	vmlal.u32	$D0,$H1#lo,$S4
-	 vshr.u64	$MASK,$MASK,#38
-	vmlal.u32	$D4,$H0#lo,$R4
-	vmlal.u32	$D1,$H2#lo,$S4
-	vmlal.u32	$D2,$H3#lo,$S4
-
-.Lshort_tail:
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ horizontal addition
-
-	vadd.i64	$D3#lo,$D3#lo,$D3#hi
-	vadd.i64	$D0#lo,$D0#lo,$D0#hi
-	vadd.i64	$D4#lo,$D4#lo,$D4#hi
-	vadd.i64	$D1#lo,$D1#lo,$D1#hi
-	vadd.i64	$D2#lo,$D2#lo,$D2#hi
-
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ lazy reduction, but without narrowing
-
-	vshr.u64	$T0,$D3,#26
-	vand.i64	$D3,$D3,$MASK
-	 vshr.u64	$T1,$D0,#26
-	 vand.i64	$D0,$D0,$MASK
-	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
-	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
-
-	vshr.u64	$T0,$D4,#26
-	vand.i64	$D4,$D4,$MASK
-	 vshr.u64	$T1,$D1,#26
-	 vand.i64	$D1,$D1,$MASK
-	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
-
-	vadd.i64	$D0,$D0,$T0
-	vshl.u64	$T0,$T0,#2
-	 vshr.u64	$T1,$D2,#26
-	 vand.i64	$D2,$D2,$MASK
-	vadd.i64	$D0,$D0,$T0		@ h4 -> h0
-	 vadd.i64	$D3,$D3,$T1		@ h2 -> h3
-
-	vshr.u64	$T0,$D0,#26
-	vand.i64	$D0,$D0,$MASK
-	 vshr.u64	$T1,$D3,#26
-	 vand.i64	$D3,$D3,$MASK
-	vadd.i64	$D1,$D1,$T0		@ h0 -> h1
-	 vadd.i64	$D4,$D4,$T1		@ h3 -> h4
-
-	cmp		$len,#0
-	bne		.Leven
-
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ store hash value
-
-	vst4.32		{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
-	vst1.32		{$D4#lo[0]},[$ctx]
-
-	vldmia	sp!,{d8-d15}			@ epilogue
-	ldmia	sp!,{r4-r7}
-.Lno_data_neon:
-	ret					@ bx	lr
-.size	poly1305_blocks_neon,.-poly1305_blocks_neon
-
-.type	poly1305_emit_neon,%function
-.align	5
-poly1305_emit_neon:
-	ldr	ip,[$ctx,#36]		@ is_base2_26
-
-	stmdb	sp!,{r4-r11}
-
-	tst	ip,ip
-	beq	.Lpoly1305_emit_enter
-
-	ldmia	$ctx,{$h0-$h4}
-	eor	$g0,$g0,$g0
-
-	adds	$h0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
-	mov	$h1,$h1,lsr#6
-	adcs	$h1,$h1,$h2,lsl#20
-	mov	$h2,$h2,lsr#12
-	adcs	$h2,$h2,$h3,lsl#14
-	mov	$h3,$h3,lsr#18
-	adcs	$h3,$h3,$h4,lsl#8
-	adc	$h4,$g0,$h4,lsr#24	@ can be partially reduced ...
-
-	and	$g0,$h4,#-4		@ ... so reduce
-	and	$h4,$h3,#3
-	add	$g0,$g0,$g0,lsr#2	@ *= 5
-	adds	$h0,$h0,$g0
-	adcs	$h1,$h1,#0
-	adcs	$h2,$h2,#0
-	adc	$h3,$h3,#0
-
-	adds	$g0,$h0,#5		@ compare to modulus
-	adcs	$g1,$h1,#0
-	adcs	$g2,$h2,#0
-	adcs	$g3,$h3,#0
-	adc	$g4,$h4,#0
-	tst	$g4,#4			@ did it carry/borrow?
-
-# ifdef	__thumb2__
-	it	ne
-# endif
-	movne	$h0,$g0
-	ldr	$g0,[$nonce,#0]
-# ifdef	__thumb2__
-	it	ne
-# endif
-	movne	$h1,$g1
-	ldr	$g1,[$nonce,#4]
-# ifdef	__thumb2__
-	it	ne
-# endif
-	movne	$h2,$g2
-	ldr	$g2,[$nonce,#8]
-# ifdef	__thumb2__
-	it	ne
-# endif
-	movne	$h3,$g3
-	ldr	$g3,[$nonce,#12]
-
-	adds	$h0,$h0,$g0		@ accumulate nonce
-	adcs	$h1,$h1,$g1
-	adcs	$h2,$h2,$g2
-	adc	$h3,$h3,$g3
-
-# ifdef __ARMEB__
-	rev	$h0,$h0
-	rev	$h1,$h1
-	rev	$h2,$h2
-	rev	$h3,$h3
-# endif
-	str	$h0,[$mac,#0]		@ store the result
-	str	$h1,[$mac,#4]
-	str	$h2,[$mac,#8]
-	str	$h3,[$mac,#12]
-
-	ldmia	sp!,{r4-r11}
-	ret				@ bx	lr
-.size	poly1305_emit_neon,.-poly1305_emit_neon
-
-.align	5
-.Lzeros:
-.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-.LOPENSSL_armcap:
-.word	OPENSSL_armcap_P-.Lpoly1305_init
-#endif
-___
-}	}
-$code.=<<___;
-.asciz	"Poly1305 for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
-.align	2
-#if	__ARM_MAX_ARCH__>=7
-.comm   OPENSSL_armcap_P,4,4
-#endif
-___
-
-foreach (split("\n",$code)) {
-	s/\`([^\`]*)\`/eval $1/geo;
-
-	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
-	s/\bret\b/bx	lr/go						or
-	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
-
-	print $_,"\n";
-}
-close STDOUT; # enforce flush
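
The d0..d4 and lazy-reduction comments in the deleted ARMv4/NEON code above are the whole per-block computation: add the 16-byte block (plus the 2^128 pad bit) to the accumulator held in five 26-bit limbs, multiply by r using 2^130 == 5 (mod 2^130 - 5), and propagate carries lazily. The NEON loop applies the same formulas to interleaved blocks with precomputed powers of r, but the limb arithmetic is the schoolbook form sketched below in scalar C (hypothetical helper name, assumes a little-endian host; illustrative only, not BoringSSL's implementation and not constant-time tuned):

#include <stdint.h>
#include <string.h>

/* One Poly1305 block step over five 26-bit limbs: h[5] is the accumulator,
 * r[5] the already-clamped key limbs.  Sketch of what the deleted NEON loops
 * compute per block; assumes a little-endian host. */
static void poly1305_block_2_26(uint32_t h[5], const uint32_t r[5],
                                const uint8_t in[16]) {
  uint32_t t[4];
  memcpy(t, in, 16);                              /* four little-endian words */

  /* accumulate the block in base 2^26, plus the 2^128 pad bit */
  h[0] += t[0] & 0x3ffffff;
  h[1] += ((t[0] >> 26) | (t[1] << 6)) & 0x3ffffff;
  h[2] += ((t[1] >> 20) | (t[2] << 12)) & 0x3ffffff;
  h[3] += ((t[2] >> 14) | (t[3] << 18)) & 0x3ffffff;
  h[4] += (t[3] >> 8) | (1u << 24);

  /* d0..d4 exactly as in the comment block, with s_i = 5*r_i folding the
   * 2^130 == 5 wrap-around into the schoolbook products */
  uint64_t h0 = h[0], h1 = h[1], h2 = h[2], h3 = h[3], h4 = h[4];
  uint64_t r0 = r[0], r1 = r[1], r2 = r[2], r3 = r[3], r4 = r[4];
  uint64_t s1 = 5*r1, s2 = 5*r2, s3 = 5*r3, s4 = 5*r4;

  uint64_t d0 = h0*r0 + h1*s4 + h2*s3 + h3*s2 + h4*s1;
  uint64_t d1 = h0*r1 + h1*r0 + h2*s4 + h3*s3 + h4*s2;
  uint64_t d2 = h0*r2 + h1*r1 + h2*r0 + h3*s4 + h4*s3;
  uint64_t d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s4;
  uint64_t d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0;

  /* straight-line version of the lazy carry chain; the result stays only
   * partially reduced, which is all the next multiplication needs */
  uint64_t c;
  c = d0 >> 26; d0 &= 0x3ffffff; d1 += c;
  c = d1 >> 26; d1 &= 0x3ffffff; d2 += c;
  c = d2 >> 26; d2 &= 0x3ffffff; d3 += c;
  c = d3 >> 26; d3 &= 0x3ffffff; d4 += c;
  c = d4 >> 26; d4 &= 0x3ffffff; d0 += c * 5;     /* 2^130 == 5 (mod p) */
  c = d0 >> 26; d0 &= 0x3ffffff; d1 += c;

  h[0] = (uint32_t)d0; h[1] = (uint32_t)d1; h[2] = (uint32_t)d2;
  h[3] = (uint32_t)d3; h[4] = (uint32_t)d4;
}
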
diff --git a/src/crypto/poly1305/asm/poly1305-armv8.pl b/src/crypto/poly1305/asm/poly1305-armv8.pl
deleted file mode 100755
index 1d9a81b..0000000
--- a/src/crypto/poly1305/asm/poly1305-armv8.pl
+++ /dev/null
@@ -1,925 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# This module implements Poly1305 hash for ARMv8.
-#
-# June 2015
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone.
-#
-#		IALU/gcc-4.9	NEON
-#
-# Apple A7	1.86/+5%	0.72
-# Cortex-A53	2.63/+58%	1.47
-# Cortex-A57	2.70/+7%	1.14
-# Denver	1.39/+50%	1.18(*)
-# X-Gene	2.00/+68%	2.19
-#
-# (*)	estimate based on resources availability is less than 1.0,
-#	i.e. measured result is worse than expected, presumably binary
-#	translator is not almighty;
-
-$flavour=shift;
-$output=shift;
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
-die "can't locate arm-xlate.pl";
-
-open OUT,"| \"$^X\" $xlate $flavour $output";
-*STDOUT=*OUT;
-
-my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
-my ($mac,$nonce)=($inp,$len);
-
-my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
-
-$code.=<<___;
-#include <openssl/arm_arch.h>
-
-.text
-
-// forward "declarations" are required for Apple
-.extern	OPENSSL_armcap_P
-.globl	poly1305_blocks
-.globl	poly1305_emit
-
-.globl	poly1305_init
-.type	poly1305_init,%function
-.align	5
-poly1305_init:
-	cmp	$inp,xzr
-	stp	xzr,xzr,[$ctx]		// zero hash value
-	stp	xzr,xzr,[$ctx,#16]	// [along with is_base2_26]
-
-	csel	x0,xzr,x0,eq
-	b.eq	.Lno_key
-
-#ifdef	__ILP32__
-	ldrsw	$t1,.LOPENSSL_armcap_P
-#else
-	ldr	$t1,.LOPENSSL_armcap_P
-#endif
-	adr	$t0,.LOPENSSL_armcap_P
-
-	ldp	$r0,$r1,[$inp]		// load key
-	mov	$s1,#0xfffffffc0fffffff
-	movk	$s1,#0x0fff,lsl#48
-	ldr	w17,[$t0,$t1]
-#ifdef	__ARMEB__
-	rev	$r0,$r0			// flip bytes
-	rev	$r1,$r1
-#endif
-	and	$r0,$r0,$s1		// &=0ffffffc0fffffff
-	and	$s1,$s1,#-4
-	and	$r1,$r1,$s1		// &=0ffffffc0ffffffc
-	stp	$r0,$r1,[$ctx,#32]	// save key value
-
-	tst	w17,#ARMV7_NEON
-
-	adr	$d0,poly1305_blocks
-	adr	$r0,poly1305_blocks_neon
-	adr	$d1,poly1305_emit
-	adr	$r1,poly1305_emit_neon
-
-	csel	$d0,$d0,$r0,eq
-	csel	$d1,$d1,$r1,eq
-
-	stp	$d0,$d1,[$len]
-
-	mov	x0,#1
-.Lno_key:
-	ret
-.size	poly1305_init,.-poly1305_init
-
-.type	poly1305_blocks,%function
-.align	5
-poly1305_blocks:
-	ands	$len,$len,#-16
-	b.eq	.Lno_data
-
-	ldp	$h0,$h1,[$ctx]		// load hash value
-	ldp	$r0,$r1,[$ctx,#32]	// load key value
-	ldr	$h2,[$ctx,#16]
-	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
-	b	.Loop
-
-.align	5
-.Loop:
-	ldp	$t0,$t1,[$inp],#16	// load input
-	sub	$len,$len,#16
-#ifdef	__ARMEB__
-	rev	$t0,$t0
-	rev	$t1,$t1
-#endif
-	adds	$h0,$h0,$t0		// accumulate input
-	adcs	$h1,$h1,$t1
-
-	mul	$d0,$h0,$r0		// h0*r0
-	adc	$h2,$h2,$padbit
-	umulh	$d1,$h0,$r0
-
-	mul	$t0,$h1,$s1		// h1*5*r1
-	umulh	$t1,$h1,$s1
-
-	adds	$d0,$d0,$t0
-	mul	$t0,$h0,$r1		// h0*r1
-	adc	$d1,$d1,$t1
-	umulh	$d2,$h0,$r1
-
-	adds	$d1,$d1,$t0
-	mul	$t0,$h1,$r0		// h1*r0
-	adc	$d2,$d2,xzr
-	umulh	$t1,$h1,$r0
-
-	adds	$d1,$d1,$t0
-	mul	$t0,$h2,$s1		// h2*5*r1
-	adc	$d2,$d2,$t1
-	mul	$t1,$h2,$r0		// h2*r0
-
-	adds	$d1,$d1,$t0
-	adc	$d2,$d2,$t1
-
-	and	$t0,$d2,#-4		// final reduction
-	and	$h2,$d2,#3
-	add	$t0,$t0,$d2,lsr#2
-	adds	$h0,$d0,$t0
-	adc	$h1,$d1,xzr
-
-	cbnz	$len,.Loop
-
-	stp	$h0,$h1,[$ctx]		// store hash value
-	str	$h2,[$ctx,#16]
-
-.Lno_data:
-	ret
-.size	poly1305_blocks,.-poly1305_blocks
-
-.type	poly1305_emit,%function
-.align	5
-poly1305_emit:
-	ldp	$h0,$h1,[$ctx]		// load hash base 2^64
-	ldr	$h2,[$ctx,#16]
-	ldp	$t0,$t1,[$nonce]	// load nonce
-
-	adds	$d0,$h0,#5		// compare to modulus
-	adcs	$d1,$h1,xzr
-	adc	$d2,$h2,xzr
-
-	tst	$d2,#-4			// see if it's carried/borrowed
-
-	csel	$h0,$h0,$d0,eq
-	csel	$h1,$h1,$d1,eq
-
-#ifdef	__ARMEB__
-	ror	$t0,$t0,#32		// flip nonce words
-	ror	$t1,$t1,#32
-#endif
-	adds	$h0,$h0,$t0		// accumulate nonce
-	adc	$h1,$h1,$t1
-#ifdef	__ARMEB__
-	rev	$h0,$h0			// flip output bytes
-	rev	$h1,$h1
-#endif
-	stp	$h0,$h1,[$mac]		// write result
-
-	ret
-.size	poly1305_emit,.-poly1305_emit
-___
-my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
-my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
-my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
-my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
-my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
-my ($T0,$T1,$MASK) = map("v$_",(29..31));
-
-my ($in2,$zeros)=("x16","x17");
-my $is_base2_26 = $zeros;		# borrow
-
-$code.=<<___;
-.type	poly1305_mult,%function
-.align	5
-poly1305_mult:
-	mul	$d0,$h0,$r0		// h0*r0
-	umulh	$d1,$h0,$r0
-
-	mul	$t0,$h1,$s1		// h1*5*r1
-	umulh	$t1,$h1,$s1
-
-	adds	$d0,$d0,$t0
-	mul	$t0,$h0,$r1		// h0*r1
-	adc	$d1,$d1,$t1
-	umulh	$d2,$h0,$r1
-
-	adds	$d1,$d1,$t0
-	mul	$t0,$h1,$r0		// h1*r0
-	adc	$d2,$d2,xzr
-	umulh	$t1,$h1,$r0
-
-	adds	$d1,$d1,$t0
-	mul	$t0,$h2,$s1		// h2*5*r1
-	adc	$d2,$d2,$t1
-	mul	$t1,$h2,$r0		// h2*r0
-
-	adds	$d1,$d1,$t0
-	adc	$d2,$d2,$t1
-
-	and	$t0,$d2,#-4		// final reduction
-	and	$h2,$d2,#3
-	add	$t0,$t0,$d2,lsr#2
-	adds	$h0,$d0,$t0
-	adc	$h1,$d1,xzr
-
-	ret
-.size	poly1305_mult,.-poly1305_mult
-
-.type	poly1305_splat,%function
-.align	5
-poly1305_splat:
-	and	x12,$h0,#0x03ffffff	// base 2^64 -> base 2^26
-	ubfx	x13,$h0,#26,#26
-	extr	x14,$h1,$h0,#52
-	and	x14,x14,#0x03ffffff
-	ubfx	x15,$h1,#14,#26
-	extr	x16,$h2,$h1,#40
-
-	str	w12,[$ctx,#16*0]	// r0
-	add	w12,w13,w13,lsl#2	// r1*5
-	str	w13,[$ctx,#16*1]	// r1
-	add	w13,w14,w14,lsl#2	// r2*5
-	str	w12,[$ctx,#16*2]	// s1
-	str	w14,[$ctx,#16*3]	// r2
-	add	w14,w15,w15,lsl#2	// r3*5
-	str	w13,[$ctx,#16*4]	// s2
-	str	w15,[$ctx,#16*5]	// r3
-	add	w15,w16,w16,lsl#2	// r4*5
-	str	w14,[$ctx,#16*6]	// s3
-	str	w16,[$ctx,#16*7]	// r4
-	str	w15,[$ctx,#16*8]	// s4
-
-	ret
-.size	poly1305_splat,.-poly1305_splat
-
-.type	poly1305_blocks_neon,%function
-.align	5
-poly1305_blocks_neon:
-	ldr	$is_base2_26,[$ctx,#24]
-	cmp	$len,#128
-	b.hs	.Lblocks_neon
-	cbz	$is_base2_26,poly1305_blocks
-
-.Lblocks_neon:
-	stp	x29,x30,[sp,#-80]!
-	add	x29,sp,#0
-
-	ands	$len,$len,#-16
-	b.eq	.Lno_data_neon
-
-	cbz	$is_base2_26,.Lbase2_64_neon
-
-	ldp	w10,w11,[$ctx]		// load hash value base 2^26
-	ldp	w12,w13,[$ctx,#8]
-	ldr	w14,[$ctx,#16]
-
-	tst	$len,#31
-	b.eq	.Leven_neon
-
-	ldp	$r0,$r1,[$ctx,#32]	// load key value
-
-	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
-	lsr	$h1,x12,#12
-	adds	$h0,$h0,x12,lsl#52
-	add	$h1,$h1,x13,lsl#14
-	adc	$h1,$h1,xzr
-	lsr	$h2,x14,#24
-	adds	$h1,$h1,x14,lsl#40
-	adc	$d2,$h2,xzr		// can be partially reduced...
-
-	ldp	$d0,$d1,[$inp],#16	// load input
-	sub	$len,$len,#16
-	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
-
-	and	$t0,$d2,#-4		// ... so reduce
-	and	$h2,$d2,#3
-	add	$t0,$t0,$d2,lsr#2
-	adds	$h0,$h0,$t0
-	adc	$h1,$h1,xzr
-
-#ifdef	__ARMEB__
-	rev	$d0,$d0
-	rev	$d1,$d1
-#endif
-	adds	$h0,$h0,$d0		// accumulate input
-	adcs	$h1,$h1,$d1
-	adc	$h2,$h2,$padbit
-
-	bl	poly1305_mult
-	ldr	x30,[sp,#8]
-
-	cbz	$padbit,.Lstore_base2_64_neon
-
-	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
-	ubfx	x11,$h0,#26,#26
-	extr	x12,$h1,$h0,#52
-	and	x12,x12,#0x03ffffff
-	ubfx	x13,$h1,#14,#26
-	extr	x14,$h2,$h1,#40
-
-	cbnz	$len,.Leven_neon
-
-	stp	w10,w11,[$ctx]		// store hash value base 2^26
-	stp	w12,w13,[$ctx,#8]
-	str	w14,[$ctx,#16]
-	b	.Lno_data_neon
-
-.align	4
-.Lstore_base2_64_neon:
-	stp	$h0,$h1,[$ctx]		// store hash value base 2^64
-	stp	$h2,xzr,[$ctx,#16]	// note that is_base2_26 is zeroed
-	b	.Lno_data_neon
-
-.align	4
-.Lbase2_64_neon:
-	ldp	$r0,$r1,[$ctx,#32]	// load key value
-
-	ldp	$h0,$h1,[$ctx]		// load hash value base 2^64
-	ldr	$h2,[$ctx,#16]
-
-	tst	$len,#31
-	b.eq	.Linit_neon
-
-	ldp	$d0,$d1,[$inp],#16	// load input
-	sub	$len,$len,#16
-	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
-#ifdef	__ARMEB__
-	rev	$d0,$d0
-	rev	$d1,$d1
-#endif
-	adds	$h0,$h0,$d0		// accumulate input
-	adcs	$h1,$h1,$d1
-	adc	$h2,$h2,$padbit
-
-	bl	poly1305_mult
-
-.Linit_neon:
-	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
-	ubfx	x11,$h0,#26,#26
-	extr	x12,$h1,$h0,#52
-	and	x12,x12,#0x03ffffff
-	ubfx	x13,$h1,#14,#26
-	extr	x14,$h2,$h1,#40
-
-	stp	d8,d9,[sp,#16]		// meet ABI requirements
-	stp	d10,d11,[sp,#32]
-	stp	d12,d13,[sp,#48]
-	stp	d14,d15,[sp,#64]
-
-	fmov	${H0},x10
-	fmov	${H1},x11
-	fmov	${H2},x12
-	fmov	${H3},x13
-	fmov	${H4},x14
-
-	////////////////////////////////// initialize r^n table
-	mov	$h0,$r0			// r^1
-	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
-	mov	$h1,$r1
-	mov	$h2,xzr
-	add	$ctx,$ctx,#48+12
-	bl	poly1305_splat
-
-	bl	poly1305_mult		// r^2
-	sub	$ctx,$ctx,#4
-	bl	poly1305_splat
-
-	bl	poly1305_mult		// r^3
-	sub	$ctx,$ctx,#4
-	bl	poly1305_splat
-
-	bl	poly1305_mult		// r^4
-	sub	$ctx,$ctx,#4
-	bl	poly1305_splat
-	ldr	x30,[sp,#8]
-
-	add	$in2,$inp,#32
-	adr	$zeros,.Lzeros
-	subs	$len,$len,#64
-	csel	$in2,$zeros,$in2,lo
-
-	mov	x4,#1
-	str	x4,[$ctx,#-24]		// set is_base2_26
-	sub	$ctx,$ctx,#48		// restore original $ctx
-	b	.Ldo_neon
-
-.align	4
-.Leven_neon:
-	add	$in2,$inp,#32
-	adr	$zeros,.Lzeros
-	subs	$len,$len,#64
-	csel	$in2,$zeros,$in2,lo
-
-	stp	d8,d9,[sp,#16]		// meet ABI requirements
-	stp	d10,d11,[sp,#32]
-	stp	d12,d13,[sp,#48]
-	stp	d14,d15,[sp,#64]
-
-	fmov	${H0},x10
-	fmov	${H1},x11
-	fmov	${H2},x12
-	fmov	${H3},x13
-	fmov	${H4},x14
-
-.Ldo_neon:
-	ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
-	ldp	x9,x13,[$in2],#48
-
-	lsl	$padbit,$padbit,#24
-	add	x15,$ctx,#48
-
-#ifdef	__ARMEB__
-	rev	x8,x8
-	rev	x12,x12
-	rev	x9,x9
-	rev	x13,x13
-#endif
-	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
-	and	x5,x9,#0x03ffffff
-	ubfx	x6,x8,#26,#26
-	ubfx	x7,x9,#26,#26
-	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
-	extr	x8,x12,x8,#52
-	extr	x9,x13,x9,#52
-	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
-	fmov	$IN23_0,x4
-	and	x8,x8,#0x03ffffff
-	and	x9,x9,#0x03ffffff
-	ubfx	x10,x12,#14,#26
-	ubfx	x11,x13,#14,#26
-	add	x12,$padbit,x12,lsr#40
-	add	x13,$padbit,x13,lsr#40
-	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
-	fmov	$IN23_1,x6
-	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
-	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
-	fmov	$IN23_2,x8
-	fmov	$IN23_3,x10
-	fmov	$IN23_4,x12
-
-	ldp	x8,x12,[$inp],#16	// inp[0:1]
-	ldp	x9,x13,[$inp],#48
-
-	ld1	{$R0,$R1,$S1,$R2},[x15],#64
-	ld1	{$S2,$R3,$S3,$R4},[x15],#64
-	ld1	{$S4},[x15]
-
-#ifdef	__ARMEB__
-	rev	x8,x8
-	rev	x12,x12
-	rev	x9,x9
-	rev	x13,x13
-#endif
-	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
-	and	x5,x9,#0x03ffffff
-	ubfx	x6,x8,#26,#26
-	ubfx	x7,x9,#26,#26
-	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
-	extr	x8,x12,x8,#52
-	extr	x9,x13,x9,#52
-	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
-	fmov	$IN01_0,x4
-	and	x8,x8,#0x03ffffff
-	and	x9,x9,#0x03ffffff
-	ubfx	x10,x12,#14,#26
-	ubfx	x11,x13,#14,#26
-	add	x12,$padbit,x12,lsr#40
-	add	x13,$padbit,x13,lsr#40
-	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
-	fmov	$IN01_1,x6
-	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
-	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
-	fmov	$IN01_2,x8
-	fmov	$IN01_3,x10
-	fmov	$IN01_4,x12
-
-	b.ls	.Lskip_loop
-
-.align	4
-.Loop_neon:
-	////////////////////////////////////////////////////////////////
-	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
-	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
-	//   \___________________/
-	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
-	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
-	//   \___________________/ \____________________/
-	//
-	// Note that we start with inp[2:3]*r^2. This is because it
-	// doesn't depend on reduction in previous iteration.
-	////////////////////////////////////////////////////////////////
-	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
-	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
-	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
-	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
-	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
-
-	subs	$len,$len,#64
-	umull	$ACC4,$IN23_0,${R4}[2]
-	csel	$in2,$zeros,$in2,lo
-	umull	$ACC3,$IN23_0,${R3}[2]
-	umull	$ACC2,$IN23_0,${R2}[2]
-	 ldp	x8,x12,[$in2],#16	// inp[2:3] (or zero)
-	umull	$ACC1,$IN23_0,${R1}[2]
-	 ldp	x9,x13,[$in2],#48
-	umull	$ACC0,$IN23_0,${R0}[2]
-#ifdef	__ARMEB__
-	 rev	x8,x8
-	 rev	x12,x12
-	 rev	x9,x9
-	 rev	x13,x13
-#endif
-
-	umlal	$ACC4,$IN23_1,${R3}[2]
-	 and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
-	umlal	$ACC3,$IN23_1,${R2}[2]
-	 and	x5,x9,#0x03ffffff
-	umlal	$ACC2,$IN23_1,${R1}[2]
-	 ubfx	x6,x8,#26,#26
-	umlal	$ACC1,$IN23_1,${R0}[2]
-	 ubfx	x7,x9,#26,#26
-	umlal	$ACC0,$IN23_1,${S4}[2]
-	 add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
-
-	umlal	$ACC4,$IN23_2,${R2}[2]
-	 extr	x8,x12,x8,#52
-	umlal	$ACC3,$IN23_2,${R1}[2]
-	 extr	x9,x13,x9,#52
-	umlal	$ACC2,$IN23_2,${R0}[2]
-	 add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
-	umlal	$ACC1,$IN23_2,${S4}[2]
-	 fmov	$IN23_0,x4
-	umlal	$ACC0,$IN23_2,${S3}[2]
-	 and	x8,x8,#0x03ffffff
-
-	umlal	$ACC4,$IN23_3,${R1}[2]
-	 and	x9,x9,#0x03ffffff
-	umlal	$ACC3,$IN23_3,${R0}[2]
-	 ubfx	x10,x12,#14,#26
-	umlal	$ACC2,$IN23_3,${S4}[2]
-	 ubfx	x11,x13,#14,#26
-	umlal	$ACC1,$IN23_3,${S3}[2]
-	 add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
-	umlal	$ACC0,$IN23_3,${S2}[2]
-	 fmov	$IN23_1,x6
-
-	add	$IN01_2,$IN01_2,$H2
-	 add	x12,$padbit,x12,lsr#40
-	umlal	$ACC4,$IN23_4,${R0}[2]
-	 add	x13,$padbit,x13,lsr#40
-	umlal	$ACC3,$IN23_4,${S4}[2]
-	 add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
-	umlal	$ACC2,$IN23_4,${S3}[2]
-	 add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
-	umlal	$ACC1,$IN23_4,${S2}[2]
-	 fmov	$IN23_2,x8
-	umlal	$ACC0,$IN23_4,${S1}[2]
-	 fmov	$IN23_3,x10
-
-	////////////////////////////////////////////////////////////////
-	// (hash+inp[0:1])*r^4 and accumulate
-
-	add	$IN01_0,$IN01_0,$H0
-	 fmov	$IN23_4,x12
-	umlal	$ACC3,$IN01_2,${R1}[0]
-	 ldp	x8,x12,[$inp],#16	// inp[0:1]
-	umlal	$ACC0,$IN01_2,${S3}[0]
-	 ldp	x9,x13,[$inp],#48
-	umlal	$ACC4,$IN01_2,${R2}[0]
-	umlal	$ACC1,$IN01_2,${S4}[0]
-	umlal	$ACC2,$IN01_2,${R0}[0]
-#ifdef	__ARMEB__
-	 rev	x8,x8
-	 rev	x12,x12
-	 rev	x9,x9
-	 rev	x13,x13
-#endif
-
-	add	$IN01_1,$IN01_1,$H1
-	umlal	$ACC3,$IN01_0,${R3}[0]
-	umlal	$ACC4,$IN01_0,${R4}[0]
-	 and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
-	umlal	$ACC2,$IN01_0,${R2}[0]
-	 and	x5,x9,#0x03ffffff
-	umlal	$ACC0,$IN01_0,${R0}[0]
-	 ubfx	x6,x8,#26,#26
-	umlal	$ACC1,$IN01_0,${R1}[0]
-	 ubfx	x7,x9,#26,#26
-
-	add	$IN01_3,$IN01_3,$H3
-	 add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
-	umlal	$ACC3,$IN01_1,${R2}[0]
-	 extr	x8,x12,x8,#52
-	umlal	$ACC4,$IN01_1,${R3}[0]
-	 extr	x9,x13,x9,#52
-	umlal	$ACC0,$IN01_1,${S4}[0]
-	 add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
-	umlal	$ACC2,$IN01_1,${R1}[0]
-	 fmov	$IN01_0,x4
-	umlal	$ACC1,$IN01_1,${R0}[0]
-	 and	x8,x8,#0x03ffffff
-
-	add	$IN01_4,$IN01_4,$H4
-	 and	x9,x9,#0x03ffffff
-	umlal	$ACC3,$IN01_3,${R0}[0]
-	 ubfx	x10,x12,#14,#26
-	umlal	$ACC0,$IN01_3,${S2}[0]
-	 ubfx	x11,x13,#14,#26
-	umlal	$ACC4,$IN01_3,${R1}[0]
-	 add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
-	umlal	$ACC1,$IN01_3,${S3}[0]
-	 fmov	$IN01_1,x6
-	umlal	$ACC2,$IN01_3,${S4}[0]
-	 add	x12,$padbit,x12,lsr#40
-
-	umlal	$ACC3,$IN01_4,${S4}[0]
-	 add	x13,$padbit,x13,lsr#40
-	umlal	$ACC0,$IN01_4,${S1}[0]
-	 add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
-	umlal	$ACC4,$IN01_4,${R0}[0]
-	 add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
-	umlal	$ACC1,$IN01_4,${S2}[0]
-	 fmov	$IN01_2,x8
-	umlal	$ACC2,$IN01_4,${S3}[0]
-	 fmov	$IN01_3,x10
-
-	/////////////////////////////////////////////////////////////////
-	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
-	// and P. Schwabe
-
-	ushr	$T0.2d,$ACC3,#26
-	 fmov	$IN01_4,x12
-	xtn	$H3,$ACC3
-	 ushr	$T1.2d,$ACC0,#26
-	 xtn	$H0,$ACC0
-	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
-	bic	$H3,#0xfc,lsl#24	// &=0x03ffffff
-	 add	$ACC1,$ACC1,$T1.2d	// h0 -> h1
-	 bic	$H0,#0xfc,lsl#24
-
-	shrn	$T0.2s,$ACC4,#26
-	xtn	$H4,$ACC4
-	 ushr	$T1.2d,$ACC1,#26
-	 xtn	$H1,$ACC1
-	 add	$ACC2,$ACC2,$T1.2d	// h1 -> h2
-	bic	$H4,#0xfc,lsl#24
-	 bic	$H1,#0xfc,lsl#24
-
-	add	$H0,$H0,$T0.2s
-	shl	$T0.2s,$T0.2s,#2
-	 shrn	$T1.2s,$ACC2,#26
-	 xtn	$H2,$ACC2
-	add	$H0,$H0,$T0.2s		// h4 -> h0
-	 add	$H3,$H3,$T1.2s		// h2 -> h3
-	 bic	$H2,#0xfc,lsl#24
-
-	ushr	$T0.2s,$H0,#26
-	bic	$H0,#0xfc,lsl#24
-	 ushr	$T1.2s,$H3,#26
-	 bic	$H3,#0xfc,lsl#24
-	add	$H1,$H1,$T0.2s		// h0 -> h1
-	 add	$H4,$H4,$T1.2s		// h3 -> h4
-
-	b.hi	.Loop_neon
-
-.Lskip_loop:
-	dup	$IN23_2,${IN23_2}[0]
-	movi	$MASK.2d,#-1
-	add	$IN01_2,$IN01_2,$H2
-	ushr	$MASK.2d,$MASK.2d,#38
-
-	////////////////////////////////////////////////////////////////
-	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
-	adds	$len,$len,#32
-	b.ne	.Long_tail
-
-	dup	$IN23_2,${IN01_2}[0]
-	add	$IN23_0,$IN01_0,$H0
-	add	$IN23_3,$IN01_3,$H3
-	add	$IN23_1,$IN01_1,$H1
-	add	$IN23_4,$IN01_4,$H4
-
-.Long_tail:
-	dup	$IN23_0,${IN23_0}[0]
-	umull2	$ACC0,$IN23_2,${S3}
-	umull2	$ACC3,$IN23_2,${R1}
-	umull2	$ACC4,$IN23_2,${R2}
-	umull2	$ACC2,$IN23_2,${R0}
-	umull2	$ACC1,$IN23_2,${S4}
-
-	dup	$IN23_1,${IN23_1}[0]
-	umlal2	$ACC0,$IN23_0,${R0}
-	umlal2	$ACC2,$IN23_0,${R2}
-	umlal2	$ACC3,$IN23_0,${R3}
-	umlal2	$ACC4,$IN23_0,${R4}
-	umlal2	$ACC1,$IN23_0,${R1}
-
-	dup	$IN23_3,${IN23_3}[0]
-	umlal2	$ACC0,$IN23_1,${S4}
-	umlal2	$ACC3,$IN23_1,${R2}
-	umlal2	$ACC2,$IN23_1,${R1}
-	umlal2	$ACC4,$IN23_1,${R3}
-	umlal2	$ACC1,$IN23_1,${R0}
-
-	dup	$IN23_4,${IN23_4}[0]
-	umlal2	$ACC3,$IN23_3,${R0}
-	umlal2	$ACC4,$IN23_3,${R1}
-	umlal2	$ACC0,$IN23_3,${S2}
-	umlal2	$ACC1,$IN23_3,${S3}
-	umlal2	$ACC2,$IN23_3,${S4}
-
-	umlal2	$ACC3,$IN23_4,${S4}
-	umlal2	$ACC0,$IN23_4,${S1}
-	umlal2	$ACC4,$IN23_4,${R0}
-	umlal2	$ACC1,$IN23_4,${S2}
-	umlal2	$ACC2,$IN23_4,${S3}
-
-	b.eq	.Lshort_tail
-
-	////////////////////////////////////////////////////////////////
-	// (hash+inp[0:1])*r^4:r^3 and accumulate
-
-	add	$IN01_0,$IN01_0,$H0
-	umlal	$ACC3,$IN01_2,${R1}
-	umlal	$ACC0,$IN01_2,${S3}
-	umlal	$ACC4,$IN01_2,${R2}
-	umlal	$ACC1,$IN01_2,${S4}
-	umlal	$ACC2,$IN01_2,${R0}
-
-	add	$IN01_1,$IN01_1,$H1
-	umlal	$ACC3,$IN01_0,${R3}
-	umlal	$ACC0,$IN01_0,${R0}
-	umlal	$ACC4,$IN01_0,${R4}
-	umlal	$ACC1,$IN01_0,${R1}
-	umlal	$ACC2,$IN01_0,${R2}
-
-	add	$IN01_3,$IN01_3,$H3
-	umlal	$ACC3,$IN01_1,${R2}
-	umlal	$ACC0,$IN01_1,${S4}
-	umlal	$ACC4,$IN01_1,${R3}
-	umlal	$ACC1,$IN01_1,${R0}
-	umlal	$ACC2,$IN01_1,${R1}
-
-	add	$IN01_4,$IN01_4,$H4
-	umlal	$ACC3,$IN01_3,${R0}
-	umlal	$ACC0,$IN01_3,${S2}
-	umlal	$ACC4,$IN01_3,${R1}
-	umlal	$ACC1,$IN01_3,${S3}
-	umlal	$ACC2,$IN01_3,${S4}
-
-	umlal	$ACC3,$IN01_4,${S4}
-	umlal	$ACC0,$IN01_4,${S1}
-	umlal	$ACC4,$IN01_4,${R0}
-	umlal	$ACC1,$IN01_4,${S2}
-	umlal	$ACC2,$IN01_4,${S3}
-
-.Lshort_tail:
-	////////////////////////////////////////////////////////////////
-	// horizontal add
-
-	addp	$ACC3,$ACC3,$ACC3
-	 ldp	d8,d9,[sp,#16]		// meet ABI requirements
-	addp	$ACC0,$ACC0,$ACC0
-	 ldp	d10,d11,[sp,#32]
-	addp	$ACC4,$ACC4,$ACC4
-	 ldp	d12,d13,[sp,#48]
-	addp	$ACC1,$ACC1,$ACC1
-	 ldp	d14,d15,[sp,#64]
-	addp	$ACC2,$ACC2,$ACC2
-
-	////////////////////////////////////////////////////////////////
-	// lazy reduction, but without narrowing
-
-	ushr	$T0.2d,$ACC3,#26
-	and	$ACC3,$ACC3,$MASK.2d
-	 ushr	$T1.2d,$ACC0,#26
-	 and	$ACC0,$ACC0,$MASK.2d
-
-	add	$ACC4,$ACC4,$T0.2d	// h3 -> h4
-	 add	$ACC1,$ACC1,$T1.2d	// h0 -> h1
-
-	ushr	$T0.2d,$ACC4,#26
-	and	$ACC4,$ACC4,$MASK.2d
-	 ushr	$T1.2d,$ACC1,#26
-	 and	$ACC1,$ACC1,$MASK.2d
-	 add	$ACC2,$ACC2,$T1.2d	// h1 -> h2
-
-	add	$ACC0,$ACC0,$T0.2d
-	shl	$T0.2d,$T0.2d,#2
-	 ushr	$T1.2d,$ACC2,#26
-	 and	$ACC2,$ACC2,$MASK.2d
-	add	$ACC0,$ACC0,$T0.2d	// h4 -> h0
-	 add	$ACC3,$ACC3,$T1.2d	// h2 -> h3
-
-	ushr	$T0.2d,$ACC0,#26
-	and	$ACC0,$ACC0,$MASK.2d
-	 ushr	$T1.2d,$ACC3,#26
-	 and	$ACC3,$ACC3,$MASK.2d
-	add	$ACC1,$ACC1,$T0.2d	// h0 -> h1
-	 add	$ACC4,$ACC4,$T1.2d	// h3 -> h4
-
-	////////////////////////////////////////////////////////////////
-	// write the result, can be partially reduced
-
-	st4	{$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
-	st1	{$ACC4}[0],[$ctx]
-
-.Lno_data_neon:
-	ldr	x29,[sp],#80
-	ret
-.size	poly1305_blocks_neon,.-poly1305_blocks_neon
-
-.type	poly1305_emit_neon,%function
-.align	5
-poly1305_emit_neon:
-	ldr	$is_base2_26,[$ctx,#24]
-	cbz	$is_base2_26,poly1305_emit
-
-	ldp	w10,w11,[$ctx]		// load hash value base 2^26
-	ldp	w12,w13,[$ctx,#8]
-	ldr	w14,[$ctx,#16]
-
-	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
-	lsr	$h1,x12,#12
-	adds	$h0,$h0,x12,lsl#52
-	add	$h1,$h1,x13,lsl#14
-	adc	$h1,$h1,xzr
-	lsr	$h2,x14,#24
-	adds	$h1,$h1,x14,lsl#40
-	adc	$h2,$h2,xzr		// can be partially reduced...
-
-	ldp	$t0,$t1,[$nonce]	// load nonce
-
-	and	$d0,$h2,#-4		// ... so reduce
-	add	$d0,$d0,$h2,lsr#2
-	and	$h2,$h2,#3
-	adds	$h0,$h0,$d0
-	adc	$h1,$h1,xzr
-
-	adds	$d0,$h0,#5		// compare to modulus
-	adcs	$d1,$h1,xzr
-	adc	$d2,$h2,xzr
-
-	tst	$d2,#-4			// see if it's carried/borrowed
-
-	csel	$h0,$h0,$d0,eq
-	csel	$h1,$h1,$d1,eq
-
-#ifdef	__ARMEB__
-	ror	$t0,$t0,#32		// flip nonce words
-	ror	$t1,$t1,#32
-#endif
-	adds	$h0,$h0,$t0		// accumulate nonce
-	adc	$h1,$h1,$t1
-#ifdef	__ARMEB__
-	rev	$h0,$h0			// flip output bytes
-	rev	$h1,$h1
-#endif
-	stp	$h0,$h1,[$mac]		// write result
-
-	ret
-.size	poly1305_emit_neon,.-poly1305_emit_neon
-
-.align	5
-.Lzeros:
-.long	0,0,0,0,0,0,0,0
-.LOPENSSL_armcap_P:
-#ifdef	__ILP32__
-.long	OPENSSL_armcap_P-.
-#else
-.quad	OPENSSL_armcap_P-.
-#endif
-.asciz	"Poly1305 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
-.align	2
-___
-
-foreach (split("\n",$code)) {
-	s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/			or
-	s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/	or
-	(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1))			or
-	(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1))	or
-	(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1))		or
-	(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1))		or
-	(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
-
-	s/\.[124]([sd])\[/.$1\[/;
-
-	print $_,"\n";
-}
-close STDOUT;
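
In the deleted ARMv8 file the scalar path keeps the accumulator in base 2^64 as (h0, h1, h2), with h2 only a few bits wide, and poly1305_splat plus the repeated and/ubfx/extr sequences re-express it (and the r^1..r^4 table) in the five 26-bit limbs the NEON loop consumes. A sketch of that radix conversion in C (hypothetical helper name, not the library's interface):

#include <stdint.h>

/* Base 2^64 -> base 2^26 split as done by poly1305_splat and the
 * .Linit_neon / .Lbase2_64_neon paths: (h0, h1, h2) is a possibly partially
 * reduced 130-bit value with h2 holding the top bits.  Sketch only. */
static void poly1305_split_2_26(uint32_t out[5],
                                uint64_t h0, uint64_t h1, uint64_t h2) {
  out[0] = (uint32_t)(h0 & 0x03ffffff);                        /* bits   0..25  */
  out[1] = (uint32_t)((h0 >> 26) & 0x03ffffff);                /* bits  26..51  */
  out[2] = (uint32_t)(((h0 >> 52) | (h1 << 12)) & 0x03ffffff); /* bits  52..77  */
  out[3] = (uint32_t)((h1 >> 14) & 0x03ffffff);                /* bits  78..103 */
  out[4] = (uint32_t)((h1 >> 40) | (h2 << 24));                /* bits 104..129 */
}

The inverse direction is the add/lsr/lsl chain at the top of poly1305_blocks_neon and poly1305_emit_neon, which also folds anything above 2^130 back into the low limbs by multiplying the excess by 5.
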
diff --git a/src/crypto/poly1305/asm/poly1305-x86.pl b/src/crypto/poly1305/asm/poly1305-x86.pl
deleted file mode 100755
index 6843995..0000000
--- a/src/crypto/poly1305/asm/poly1305-x86.pl
+++ /dev/null
@@ -1,1793 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# This module implements Poly1305 hash for x86.
-#
-# April 2015
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone,
-# measured with rdtsc at fixed clock frequency.
-#
-#		IALU/gcc-3.4(*)	SSE2(**)	AVX2
-# Pentium	15.7/+80%	-
-# PIII		6.21/+90%	-
-# P4		19.8/+40%	3.24
-# Core 2	4.85/+90%	1.80
-# Westmere	4.58/+100%	1.43
-# Sandy Bridge	3.90/+100%	1.36
-# Haswell	3.88/+70%	1.18		0.72
-# Silvermont	11.0/+40%	4.80
-# VIA Nano	6.71/+90%	2.47
-# Sledgehammer	3.51/+180%	4.27
-# Bulldozer	4.53/+140%	1.31
-#
-# (*)	gcc 4.8 for some reason generated worse code;
-# (**)	besides SSE2 there are floating-point and AVX options; FP
-#	is deemed unnecessary, because pre-SSE2 processor are too
-#	old to care about, while it's not the fastest option on
-#	SSE2-capable ones; AVX is omitted, because it doesn't give
-#	a lot of improvement, 5-10% depending on processor;
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-push(@INC,"${dir}","${dir}../../perlasm");
-require "x86asm.pl";
-
-$output=pop;
-open STDOUT,">$output";
-
-&asm_init($ARGV[0],"poly1305-x86.pl",$ARGV[$#ARGV] eq "386");
-
-$sse2=$avx=0;
-for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
-
-if ($sse2) {
-	&static_label("const_sse2");
-	&static_label("enter_blocks");
-	&static_label("enter_emit");
-	&external_label("OPENSSL_ia32cap_P");
-
-	# This may be set to 2, but valgrind can't do AVX2 on 32-bit. Without a
-	# way to verify test coverage, keep it disabled.
-	$avx = 0;
-}
-
-########################################################################
-# Layout of opaque area is following.
-#
-#	unsigned __int32 h[5];		# current hash value base 2^32
-#	unsigned __int32 pad;		# is_base2_26 in vector context
-#	unsigned __int32 r[4];		# key value base 2^32
-
-&align(64);
-&function_begin("poly1305_init");
-	&mov	("edi",&wparam(0));		# context
-	&mov	("esi",&wparam(1));		# key
-	&mov	("ebp",&wparam(2));		# function table
-
-	&xor	("eax","eax");
-	&mov	(&DWP(4*0,"edi"),"eax");	# zero hash value
-	&mov	(&DWP(4*1,"edi"),"eax");
-	&mov	(&DWP(4*2,"edi"),"eax");
-	&mov	(&DWP(4*3,"edi"),"eax");
-	&mov	(&DWP(4*4,"edi"),"eax");
-	&mov	(&DWP(4*5,"edi"),"eax");	# is_base2_26
-
-	&cmp	("esi",0);
-	&je	(&label("nokey"));
-
-    if ($sse2) {
-	&call	(&label("pic_point"));
-    &set_label("pic_point");
-	&blindpop("ebx");
-
-	&lea	("eax",&DWP("poly1305_blocks-".&label("pic_point"),"ebx"));
-	&lea	("edx",&DWP("poly1305_emit-".&label("pic_point"),"ebx"));
-
-	&picmeup("edi","OPENSSL_ia32cap_P","ebx",&label("pic_point"));
-	&mov	("ecx",&DWP(0,"edi"));
-	&and	("ecx",1<<26|1<<24);
-	&cmp	("ecx",1<<26|1<<24);		# SSE2 and XMM?
-	&jne	(&label("no_sse2"));
-
-	&lea	("eax",&DWP("_poly1305_blocks_sse2-".&label("pic_point"),"ebx"));
-	&lea	("edx",&DWP("_poly1305_emit_sse2-".&label("pic_point"),"ebx"));
-
-      if ($avx>1) {
-	&mov	("ecx",&DWP(8,"edi"));
-	&test	("ecx",1<<5);			# AVX2?
-	&jz	(&label("no_sse2"));
-
-	&lea	("eax",&DWP("_poly1305_blocks_avx2-".&label("pic_point"),"ebx"));
-      }
-    &set_label("no_sse2");
-	&mov	("edi",&wparam(0));		# reload context
-	&mov	(&DWP(0,"ebp"),"eax");		# fill function table
-	&mov	(&DWP(4,"ebp"),"edx");
-    }
-
-	&mov	("eax",&DWP(4*0,"esi"));	# load input key
-	&mov	("ebx",&DWP(4*1,"esi"));
-	&mov	("ecx",&DWP(4*2,"esi"));
-	&mov	("edx",&DWP(4*3,"esi"));
-	&and	("eax",0x0fffffff);
-	&and	("ebx",0x0ffffffc);
-	&and	("ecx",0x0ffffffc);
-	&and	("edx",0x0ffffffc);
-	&mov	(&DWP(4*6,"edi"),"eax");
-	&mov	(&DWP(4*7,"edi"),"ebx");
-	&mov	(&DWP(4*8,"edi"),"ecx");
-	&mov	(&DWP(4*9,"edi"),"edx");
-
-	&mov	("eax",$sse2);
-&set_label("nokey");
-&function_end("poly1305_init");
-
-($h0,$h1,$h2,$h3,$h4,
- $d0,$d1,$d2,$d3,
- $r0,$r1,$r2,$r3,
-     $s1,$s2,$s3)=map(4*$_,(0..15));
-
-&function_begin("poly1305_blocks");
-	&mov	("edi",&wparam(0));		# ctx
-	&mov	("esi",&wparam(1));		# inp
-	&mov	("ecx",&wparam(2));		# len
-&set_label("enter_blocks");
-	&and	("ecx",-15);
-	&jz	(&label("nodata"));
-
-	&stack_push(16);
-	&mov	("eax",&DWP(4*6,"edi"));	# r0
-	&mov	("ebx",&DWP(4*7,"edi"));	# r1
-	 &lea	("ebp",&DWP(0,"esi","ecx"));	# end of input
-	&mov	("ecx",&DWP(4*8,"edi"));	# r2
-	&mov	("edx",&DWP(4*9,"edi"));	# r3
-
-	&mov	(&wparam(2),"ebp");
-	&mov	("ebp","esi");
-
-	&mov	(&DWP($r0,"esp"),"eax");	# r0
-	&mov	("eax","ebx");
-	&shr	("eax",2);
-	&mov	(&DWP($r1,"esp"),"ebx");	# r1
-	&add	("eax","ebx");			# s1
-	&mov	("ebx","ecx");
-	&shr	("ebx",2);
-	&mov	(&DWP($r2,"esp"),"ecx");	# r2
-	&add	("ebx","ecx");			# s2
-	&mov	("ecx","edx");
-	&shr	("ecx",2);
-	&mov	(&DWP($r3,"esp"),"edx");	# r3
-	&add	("ecx","edx");			# s3
-	&mov	(&DWP($s1,"esp"),"eax");	# s1
-	&mov	(&DWP($s2,"esp"),"ebx");	# s2
-	&mov	(&DWP($s3,"esp"),"ecx");	# s3
-
-	&mov	("eax",&DWP(4*0,"edi"));	# load hash value
-	&mov	("ebx",&DWP(4*1,"edi"));
-	&mov	("ecx",&DWP(4*2,"edi"));
-	&mov	("esi",&DWP(4*3,"edi"));
-	&mov	("edi",&DWP(4*4,"edi"));
-	&jmp	(&label("loop"));
-
-&set_label("loop",32);
-	&add	("eax",&DWP(4*0,"ebp"));	# accumulate input
-	&adc	("ebx",&DWP(4*1,"ebp"));
-	&adc	("ecx",&DWP(4*2,"ebp"));
-	&adc	("esi",&DWP(4*3,"ebp"));
-	&lea	("ebp",&DWP(4*4,"ebp"));
-	&adc	("edi",&wparam(3));		# padbit
-
-	&mov	(&DWP($h0,"esp"),"eax");	# put aside hash[+inp]
-	&mov	(&DWP($h3,"esp"),"esi");
-
-	&mul	(&DWP($r0,"esp"));		# h0*r0
-	 &mov	(&DWP($h4,"esp"),"edi");
-	&mov	("edi","eax");
-	&mov	("eax","ebx");			# h1
-	&mov	("esi","edx");
-	&mul	(&DWP($s3,"esp"));		# h1*s3
-	&add	("edi","eax");
-	&mov	("eax","ecx");			# h2
-	&adc	("esi","edx");
-	&mul	(&DWP($s2,"esp"));		# h2*s2
-	&add	("edi","eax");
-	&mov	("eax",&DWP($h3,"esp"));
-	&adc	("esi","edx");
-	&mul	(&DWP($s1,"esp"));		# h3*s1
-	&add	("edi","eax");
-	 &mov	("eax",&DWP($h0,"esp"));
-	&adc	("esi","edx");
-
-	&mul	(&DWP($r1,"esp"));		# h0*r1
-	 &mov	(&DWP($d0,"esp"),"edi");
-	&xor	("edi","edi");
-	&add	("esi","eax");
-	&mov	("eax","ebx");			# h1
-	&adc	("edi","edx");
-	&mul	(&DWP($r0,"esp"));		# h1*r0
-	&add	("esi","eax");
-	&mov	("eax","ecx");			# h2
-	&adc	("edi","edx");
-	&mul	(&DWP($s3,"esp"));		# h2*s3
-	&add	("esi","eax");
-	&mov	("eax",&DWP($h3,"esp"));
-	&adc	("edi","edx");
-	&mul	(&DWP($s2,"esp"));		# h3*s2
-	&add	("esi","eax");
-	&mov	("eax",&DWP($h4,"esp"));
-	&adc	("edi","edx");
-	&imul	("eax",&DWP($s1,"esp"));	# h4*s1
-	&add	("esi","eax");
-	 &mov	("eax",&DWP($h0,"esp"));
-	&adc	("edi",0);
-
-	&mul	(&DWP($r2,"esp"));		# h0*r2
-	 &mov	(&DWP($d1,"esp"),"esi");
-	&xor	("esi","esi");
-	&add	("edi","eax");
-	&mov	("eax","ebx");			# h1
-	&adc	("esi","edx");
-	&mul	(&DWP($r1,"esp"));		# h1*r1
-	&add	("edi","eax");
-	&mov	("eax","ecx");			# h2
-	&adc	("esi","edx");
-	&mul	(&DWP($r0,"esp"));		# h2*r0
-	&add	("edi","eax");
-	&mov	("eax",&DWP($h3,"esp"));
-	&adc	("esi","edx");
-	&mul	(&DWP($s3,"esp"));		# h3*s3
-	&add	("edi","eax");
-	&mov	("eax",&DWP($h4,"esp"));
-	&adc	("esi","edx");
-	&imul	("eax",&DWP($s2,"esp"));	# h4*s2
-	&add	("edi","eax");
-	 &mov	("eax",&DWP($h0,"esp"));
-	&adc	("esi",0);
-
-	&mul	(&DWP($r3,"esp"));		# h0*r3
-	 &mov	(&DWP($d2,"esp"),"edi");
-	&xor	("edi","edi");
-	&add	("esi","eax");
-	&mov	("eax","ebx");			# h1
-	&adc	("edi","edx");
-	&mul	(&DWP($r2,"esp"));		# h1*r2
-	&add	("esi","eax");
-	&mov	("eax","ecx");			# h2
-	&adc	("edi","edx");
-	&mul	(&DWP($r1,"esp"));		# h2*r1
-	&add	("esi","eax");
-	&mov	("eax",&DWP($h3,"esp"));
-	&adc	("edi","edx");
-	&mul	(&DWP($r0,"esp"));		# h3*r0
-	&add	("esi","eax");
-	 &mov	("ecx",&DWP($h4,"esp"));
-	&adc	("edi","edx");
-
-	&mov	("edx","ecx");
-	&imul	("ecx",&DWP($s3,"esp"));	# h4*s3
-	&add	("esi","ecx");
-	 &mov	("eax",&DWP($d0,"esp"));
-	&adc	("edi",0);
-
-	&imul	("edx",&DWP($r0,"esp"));	# h4*r0
-	&add	("edx","edi");
-
-	&mov	("ebx",&DWP($d1,"esp"));
-	&mov	("ecx",&DWP($d2,"esp"));
-
-	&mov	("edi","edx");			# last reduction step
-	&shr	("edx",2);
-	&and	("edi",3);
-	&lea	("edx",&DWP(0,"edx","edx",4));	# *5
-	&add	("eax","edx");
-	&adc	("ebx",0);
-	&adc	("ecx",0);
-	&adc	("esi",0);
-
-	&cmp	("ebp",&wparam(2));		# done yet?
-	&jne	(&label("loop"));
-
-	&mov	("edx",&wparam(0));		# ctx
-	&stack_pop(16);
-	&mov	(&DWP(4*0,"edx"),"eax");	# store hash value
-	&mov	(&DWP(4*1,"edx"),"ebx");
-	&mov	(&DWP(4*2,"edx"),"ecx");
-	&mov	(&DWP(4*3,"edx"),"esi");
-	&mov	(&DWP(4*4,"edx"),"edi");
-&set_label("nodata");
-&function_end("poly1305_blocks");
-
-&function_begin("poly1305_emit");
-	&mov	("ebp",&wparam(0));		# context
-&set_label("enter_emit");
-	&mov	("edi",&wparam(1));		# output
-	&mov	("eax",&DWP(4*0,"ebp"));	# load hash value
-	&mov	("ebx",&DWP(4*1,"ebp"));
-	&mov	("ecx",&DWP(4*2,"ebp"));
-	&mov	("edx",&DWP(4*3,"ebp"));
-	&mov	("esi",&DWP(4*4,"ebp"));
-
-	&add	("eax",5);			# compare to modulus
-	&adc	("ebx",0);
-	&adc	("ecx",0);
-	&adc	("edx",0);
-	&adc	("esi",0);
-	&shr	("esi",2);			# did it carry/borrow?
-	&neg	("esi");			# do we choose hash-modulus?
-
-	&and	("eax","esi");
-	&and	("ebx","esi");
-	&and	("ecx","esi");
-	&and	("edx","esi");
-	&mov	(&DWP(4*0,"edi"),"eax");
-	&mov	(&DWP(4*1,"edi"),"ebx");
-	&mov	(&DWP(4*2,"edi"),"ecx");
-	&mov	(&DWP(4*3,"edi"),"edx");
-
-	&not	("esi");			# or original hash value?
-	&mov	("eax",&DWP(4*0,"ebp"));
-	&mov	("ebx",&DWP(4*1,"ebp"));
-	&mov	("ecx",&DWP(4*2,"ebp"));
-	&mov	("edx",&DWP(4*3,"ebp"));
-	&mov	("ebp",&wparam(2));
-	&and	("eax","esi");
-	&and	("ebx","esi");
-	&and	("ecx","esi");
-	&and	("edx","esi");
-	&or	("eax",&DWP(4*0,"edi"));
-	&or	("ebx",&DWP(4*1,"edi"));
-	&or	("ecx",&DWP(4*2,"edi"));
-	&or	("edx",&DWP(4*3,"edi"));
-
-	&add	("eax",&DWP(4*0,"ebp"));	# accumulate key
-	&adc	("ebx",&DWP(4*1,"ebp"));
-	&adc	("ecx",&DWP(4*2,"ebp"));
-	&adc	("edx",&DWP(4*3,"ebp"));
-
-	&mov	(&DWP(4*0,"edi"),"eax");
-	&mov	(&DWP(4*1,"edi"),"ebx");
-	&mov	(&DWP(4*2,"edi"),"ecx");
-	&mov	(&DWP(4*3,"edi"),"edx");
-&function_end("poly1305_emit");
-
-if ($sse2) {
-########################################################################
-# Layout of opaque area is following.
-#
-#	unsigned __int32 h[5];		# current hash value base 2^26
-#	unsigned __int32 is_base2_26;
-#	unsigned __int32 r[4];		# key value base 2^32
-#	unsigned __int32 pad[2];
-#	struct { unsigned __int32 r^4, r^3, r^2, r^1; } r[9];
-#
-# where r^n are base 2^26 digits of degrees of multiplier key. There are
-# 5 digits, but last four are interleaved with multiples of 5, totalling
-# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
-
-my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("xmm$_",(0..7));
-my $MASK=$T2;	# borrow and keep in mind
-
-&align	(32);
-&function_begin_B("_poly1305_init_sse2");
-	&movdqu		($D4,&QWP(4*6,"edi"));		# key base 2^32
-	&lea		("edi",&DWP(16*3,"edi"));	# size optimization
-	&mov		("ebp","esp");
-	&sub		("esp",16*(9+5));
-	&and		("esp",-16);
-
-	#&pand		($D4,&QWP(96,"ebx"));		# magic mask
-	&movq		($MASK,&QWP(64,"ebx"));
-
-	&movdqa		($D0,$D4);
-	&movdqa		($D1,$D4);
-	&movdqa		($D2,$D4);
-
-	&pand		($D0,$MASK);			# -> base 2^26
-	&psrlq		($D1,26);
-	&psrldq		($D2,6);
-	&pand		($D1,$MASK);
-	&movdqa		($D3,$D2);
-	&psrlq		($D2,4);
-	&psrlq		($D3,30);
-	&pand		($D2,$MASK);
-	&pand		($D3,$MASK);
-	&psrldq		($D4,13);
-
-	&lea		("edx",&DWP(16*9,"esp"));	# size optimization
-	&mov		("ecx",2);
-&set_label("square");
-	&movdqa		(&QWP(16*0,"esp"),$D0);
-	&movdqa		(&QWP(16*1,"esp"),$D1);
-	&movdqa		(&QWP(16*2,"esp"),$D2);
-	&movdqa		(&QWP(16*3,"esp"),$D3);
-	&movdqa		(&QWP(16*4,"esp"),$D4);
-
-	&movdqa		($T1,$D1);
-	&movdqa		($T0,$D2);
-	&pslld		($T1,2);
-	&pslld		($T0,2);
-	&paddd		($T1,$D1);			# *5
-	&paddd		($T0,$D2);			# *5
-	&movdqa		(&QWP(16*5,"esp"),$T1);
-	&movdqa		(&QWP(16*6,"esp"),$T0);
-	&movdqa		($T1,$D3);
-	&movdqa		($T0,$D4);
-	&pslld		($T1,2);
-	&pslld		($T0,2);
-	&paddd		($T1,$D3);			# *5
-	&paddd		($T0,$D4);			# *5
-	&movdqa		(&QWP(16*7,"esp"),$T1);
-	&movdqa		(&QWP(16*8,"esp"),$T0);
-
-	&pshufd		($T1,$D0,0b01000100);
-	&movdqa		($T0,$D1);
-	&pshufd		($D1,$D1,0b01000100);
-	&pshufd		($D2,$D2,0b01000100);
-	&pshufd		($D3,$D3,0b01000100);
-	&pshufd		($D4,$D4,0b01000100);
-	&movdqa		(&QWP(16*0,"edx"),$T1);
-	&movdqa		(&QWP(16*1,"edx"),$D1);
-	&movdqa		(&QWP(16*2,"edx"),$D2);
-	&movdqa		(&QWP(16*3,"edx"),$D3);
-	&movdqa		(&QWP(16*4,"edx"),$D4);
-
-	################################################################
-	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
-	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
-	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
-	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
-	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
-	&pmuludq	($D4,$D0);			# h4*r0
-	&pmuludq	($D3,$D0);			# h3*r0
-	&pmuludq	($D2,$D0);			# h2*r0
-	&pmuludq	($D1,$D0);			# h1*r0
-	&pmuludq	($D0,$T1);			# h0*r0
-
-sub pmuladd {
-my $load = shift;
-my $base = shift; $base = "esp" if (!defined($base));
-
-	################################################################
-	# As for choice to "rotate" $T0-$T2 in order to move paddq
-	# past next multiplication. While it makes code harder to read
-	# and doesn't have significant effect on most processors, it
-	# makes a lot of difference on Atom, up to 30% improvement.
-
-	&movdqa		($T1,$T0);
-	&pmuludq	($T0,&QWP(16*3,$base));		# r1*h3
-	&movdqa		($T2,$T1);
-	&pmuludq	($T1,&QWP(16*2,$base));		# r1*h2
-	&paddq		($D4,$T0);
-	&movdqa		($T0,$T2);
-	&pmuludq	($T2,&QWP(16*1,$base));		# r1*h1
-	&paddq		($D3,$T1);
-	&$load		($T1,5);			# s1
-	&pmuludq	($T0,&QWP(16*0,$base));		# r1*h0
-	&paddq		($D2,$T2);
-	&pmuludq	($T1,&QWP(16*4,$base));		# s1*h4
-	 &$load		($T2,2);			# r2^n
-	&paddq		($D1,$T0);
-
-	&movdqa		($T0,$T2);
-	&pmuludq	($T2,&QWP(16*2,$base));		# r2*h2
-	 &paddq		($D0,$T1);
-	&movdqa		($T1,$T0);
-	&pmuludq	($T0,&QWP(16*1,$base));		# r2*h1
-	&paddq		($D4,$T2);
-	&$load		($T2,6);			# s2^n
-	&pmuludq	($T1,&QWP(16*0,$base));		# r2*h0
-	&paddq		($D3,$T0);
-	&movdqa		($T0,$T2);
-	&pmuludq	($T2,&QWP(16*4,$base));		# s2*h4
-	&paddq		($D2,$T1);
-	&pmuludq	($T0,&QWP(16*3,$base));		# s2*h3
-	 &$load		($T1,3);			# r3^n
-	&paddq		($D1,$T2);
-
-	&movdqa		($T2,$T1);
-	&pmuludq	($T1,&QWP(16*1,$base));		# r3*h1
-	 &paddq		($D0,$T0);
-	&$load		($T0,7);			# s3^n
-	&pmuludq	($T2,&QWP(16*0,$base));		# r3*h0
-	&paddq		($D4,$T1);
-	&movdqa		($T1,$T0);
-	&pmuludq	($T0,&QWP(16*4,$base));		# s3*h4
-	&paddq		($D3,$T2);
-	&movdqa		($T2,$T1);
-	&pmuludq	($T1,&QWP(16*3,$base));		# s3*h3
-	&paddq		($D2,$T0);
-	&pmuludq	($T2,&QWP(16*2,$base));		# s3*h2
-	 &$load		($T0,4);			# r4^n
-	&paddq		($D1,$T1);
-
-	&$load		($T1,8);			# s4^n
-	&pmuludq	($T0,&QWP(16*0,$base));		# r4*h0
-	 &paddq		($D0,$T2);
-	&movdqa		($T2,$T1);
-	&pmuludq	($T1,&QWP(16*4,$base));		# s4*h4
-	&paddq		($D4,$T0);
-	&movdqa		($T0,$T2);
-	&pmuludq	($T2,&QWP(16*1,$base));		# s4*h1
-	&paddq		($D3,$T1);
-	&movdqa		($T1,$T0);
-	&pmuludq	($T0,&QWP(16*2,$base));		# s4*h2
-	&paddq		($D0,$T2);
-	&pmuludq	($T1,&QWP(16*3,$base));		# s4*h3
-	 &movdqa	($MASK,&QWP(64,"ebx"));
-	&paddq		($D1,$T0);
-	&paddq		($D2,$T1);
-}
-	&pmuladd	(sub {	my ($reg,$i)=@_;
-				&movdqa ($reg,&QWP(16*$i,"esp"));
-			     },"edx");
-
-sub lazy_reduction {
-my $extra = shift;
-my $paddx = defined($extra) ? paddq : paddd;
-
-	################################################################
-	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
-	# and P. Schwabe
-
-	 &movdqa	($T0,$D3);
-	 &pand		($D3,$MASK);
-	 &psrlq		($T0,26);
-	 &$extra	()				if (defined($extra));
-	 &paddq		($T0,$D4);			# h3 -> h4
-	&movdqa		($T1,$D0);
-	&pand		($D0,$MASK);
-	&psrlq		($T1,26);
-	 &movdqa	($D4,$T0);
-	&paddq		($T1,$D1);			# h0 -> h1
-	 &psrlq		($T0,26);
-	 &pand		($D4,$MASK);
-	&movdqa		($D1,$T1);
-	&psrlq		($T1,26);
-	 &paddd		($D0,$T0);			# favour paddd when
-							# possible, because
-							# paddq is "broken"
-							# on Atom
-	 &psllq		($T0,2);
-	&paddq		($T1,$D2);			# h1 -> h2
-	 &$paddx	($T0,$D0);			# h4 -> h0
-	&pand		($D1,$MASK);
-	&movdqa		($D2,$T1);
-	&psrlq		($T1,26);
-	&pand		($D2,$MASK);
-	&paddd		($T1,$D3);			# h2 -> h3
-	 &movdqa	($D0,$T0);
-	 &psrlq		($T0,26);
-	&movdqa		($D3,$T1);
-	&psrlq		($T1,26);
-	 &pand		($D0,$MASK);
-	 &paddd		($D1,$T0);			# h0 -> h1
-	&pand		($D3,$MASK);
-	&paddd		($D4,$T1);			# h3 -> h4
-}
-	&lazy_reduction	();
-
-	&dec		("ecx");
-	&jz		(&label("square_break"));
-
-	&punpcklqdq	($D0,&QWP(16*0,"esp"));		# 0:r^1:0:r^2
-	&punpcklqdq	($D1,&QWP(16*1,"esp"));
-	&punpcklqdq	($D2,&QWP(16*2,"esp"));
-	&punpcklqdq	($D3,&QWP(16*3,"esp"));
-	&punpcklqdq	($D4,&QWP(16*4,"esp"));
-	&jmp		(&label("square"));
-
-&set_label("square_break");
-	&psllq		($D0,32);			# -> r^3:0:r^4:0
-	&psllq		($D1,32);
-	&psllq		($D2,32);
-	&psllq		($D3,32);
-	&psllq		($D4,32);
-	&por		($D0,&QWP(16*0,"esp"));		# r^3:r^1:r^4:r^2
-	&por		($D1,&QWP(16*1,"esp"));
-	&por		($D2,&QWP(16*2,"esp"));
-	&por		($D3,&QWP(16*3,"esp"));
-	&por		($D4,&QWP(16*4,"esp"));
-
-	&pshufd		($D0,$D0,0b10001101);		# -> r^1:r^2:r^3:r^4
-	&pshufd		($D1,$D1,0b10001101);
-	&pshufd		($D2,$D2,0b10001101);
-	&pshufd		($D3,$D3,0b10001101);
-	&pshufd		($D4,$D4,0b10001101);
-
-	&movdqu		(&QWP(16*0,"edi"),$D0);		# save the table
-	&movdqu		(&QWP(16*1,"edi"),$D1);
-	&movdqu		(&QWP(16*2,"edi"),$D2);
-	&movdqu		(&QWP(16*3,"edi"),$D3);
-	&movdqu		(&QWP(16*4,"edi"),$D4);
-
-	&movdqa		($T1,$D1);
-	&movdqa		($T0,$D2);
-	&pslld		($T1,2);
-	&pslld		($T0,2);
-	&paddd		($T1,$D1);			# *5
-	&paddd		($T0,$D2);			# *5
-	&movdqu		(&QWP(16*5,"edi"),$T1);
-	&movdqu		(&QWP(16*6,"edi"),$T0);
-	&movdqa		($T1,$D3);
-	&movdqa		($T0,$D4);
-	&pslld		($T1,2);
-	&pslld		($T0,2);
-	&paddd		($T1,$D3);			# *5
-	&paddd		($T0,$D4);			# *5
-	&movdqu		(&QWP(16*7,"edi"),$T1);
-	&movdqu		(&QWP(16*8,"edi"),$T0);
-
-	&mov		("esp","ebp");
-	&lea		("edi",&DWP(-16*3,"edi"));	# size de-optimization
-	&ret		();
-&function_end_B("_poly1305_init_sse2");
-
-&align	(32);
-&function_begin("_poly1305_blocks_sse2");
-	&mov	("edi",&wparam(0));			# ctx
-	&mov	("esi",&wparam(1));			# inp
-	&mov	("ecx",&wparam(2));			# len
-
-	&mov	("eax",&DWP(4*5,"edi"));		# is_base2_26
-	&and	("ecx",-16);
-	&jz	(&label("nodata"));
-	&cmp	("ecx",64);
-	&jae	(&label("enter_sse2"));
-	&test	("eax","eax");				# is_base2_26?
-	&jz	(&label("enter_blocks"));
-
-&set_label("enter_sse2",16);
-	&call	(&label("pic_point"));
-&set_label("pic_point");
-	&blindpop("ebx");
-	&lea	("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx"));
-
-	&test	("eax","eax");				# is_base2_26?
-	&jnz	(&label("base2_26"));
-
-	&call	("_poly1305_init_sse2");
-
-	################################################# base 2^32 -> base 2^26
-	&mov	("eax",&DWP(0,"edi"));
-	&mov	("ecx",&DWP(3,"edi"));
-	&mov	("edx",&DWP(6,"edi"));
-	&mov	("esi",&DWP(9,"edi"));
-	&mov	("ebp",&DWP(13,"edi"));
-	&mov	(&DWP(4*5,"edi"),1);			# is_base2_26
-
-	&shr	("ecx",2);
-	&and	("eax",0x3ffffff);
-	&shr	("edx",4);
-	&and	("ecx",0x3ffffff);
-	&shr	("esi",6);
-	&and	("edx",0x3ffffff);
-
-	&movd	($D0,"eax");
-	&movd	($D1,"ecx");
-	&movd	($D2,"edx");
-	&movd	($D3,"esi");
-	&movd	($D4,"ebp");
-
-	&mov	("esi",&wparam(1));			# [reload] inp
-	&mov	("ecx",&wparam(2));			# [reload] len
-	&jmp	(&label("base2_32"));
-
-&set_label("base2_26",16);
-	&movd	($D0,&DWP(4*0,"edi"));			# load hash value
-	&movd	($D1,&DWP(4*1,"edi"));
-	&movd	($D2,&DWP(4*2,"edi"));
-	&movd	($D3,&DWP(4*3,"edi"));
-	&movd	($D4,&DWP(4*4,"edi"));
-	&movdqa	($MASK,&QWP(64,"ebx"));
-
-&set_label("base2_32");
-	&mov	("eax",&wparam(3));			# padbit
-	&mov	("ebp","esp");
-
-	&sub	("esp",16*(5+5+5+9+9));
-	&and	("esp",-16);
-
-	&lea	("edi",&DWP(16*3,"edi"));		# size optimization
-	&shl	("eax",24);				# padbit
-
-	&test	("ecx",31);
-	&jz	(&label("even"));
-
-	################################################################
-	# process single block, with SSE2, because it's still faster
-	# even though half of result is discarded
-
-	&movdqu		($T1,&QWP(0,"esi"));		# input
-	&lea		("esi",&DWP(16,"esi"));
-
-	&movdqa		($T0,$T1);			# -> base 2^26 ...
-	&pand		($T1,$MASK);
-	&paddd		($D0,$T1);			# ... and accumuate
-
-	&movdqa		($T1,$T0);
-	&psrlq		($T0,26);
-	&psrldq		($T1,6);
-	&pand		($T0,$MASK);
-	&paddd		($D1,$T0);
-
-	&movdqa		($T0,$T1);
-	&psrlq		($T1,4);
-	&pand		($T1,$MASK);
-	&paddd		($D2,$T1);
-
-	&movdqa		($T1,$T0);
-	&psrlq		($T0,30);
-	&pand		($T0,$MASK);
-	&psrldq		($T1,7);
-	&paddd		($D3,$T0);
-
-	&movd		($T0,"eax");			# padbit
-	&paddd		($D4,$T1);
-	 &movd		($T1,&DWP(16*0+12,"edi"));	# r0
-	&paddd		($D4,$T0);
-
-	&movdqa		(&QWP(16*0,"esp"),$D0);
-	&movdqa		(&QWP(16*1,"esp"),$D1);
-	&movdqa		(&QWP(16*2,"esp"),$D2);
-	&movdqa		(&QWP(16*3,"esp"),$D3);
-	&movdqa		(&QWP(16*4,"esp"),$D4);
-
-	################################################################
-	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
-	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
-	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
-	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
-	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
-	&pmuludq	($D0,$T1);			# h4*r0
-	&pmuludq	($D1,$T1);			# h3*r0
-	&pmuludq	($D2,$T1);			# h2*r0
-	 &movd		($T0,&DWP(16*1+12,"edi"));	# r1
-	&pmuludq	($D3,$T1);			# h1*r0
-	&pmuludq	($D4,$T1);			# h0*r0
-
-	&pmuladd	(sub {	my ($reg,$i)=@_;
-				&movd ($reg,&DWP(16*$i+12,"edi"));
-			     });
-
-	&lazy_reduction	();
-
-	&sub		("ecx",16);
-	&jz		(&label("done"));
-
-&set_label("even");
-	&lea		("edx",&DWP(16*(5+5+5+9),"esp"));# size optimization
-	&lea		("eax",&DWP(-16*2,"esi"));
-	&sub		("ecx",64);
-
-	################################################################
-	# expand and copy pre-calculated table to stack
-
-	&movdqu		($T0,&QWP(16*0,"edi"));		# r^1:r^2:r^3:r^4
-	&pshufd		($T1,$T0,0b01000100);		# duplicate r^3:r^4
-	&cmovb		("esi","eax");
-	&pshufd		($T0,$T0,0b11101110);		# duplicate r^1:r^2
-	&movdqa		(&QWP(16*0,"edx"),$T1);
-	&lea		("eax",&DWP(16*10,"esp"));
-	&movdqu		($T1,&QWP(16*1,"edi"));
-	&movdqa		(&QWP(16*(0-9),"edx"),$T0);
-	&pshufd		($T0,$T1,0b01000100);
-	&pshufd		($T1,$T1,0b11101110);
-	&movdqa		(&QWP(16*1,"edx"),$T0);
-	&movdqu		($T0,&QWP(16*2,"edi"));
-	&movdqa		(&QWP(16*(1-9),"edx"),$T1);
-	&pshufd		($T1,$T0,0b01000100);
-	&pshufd		($T0,$T0,0b11101110);
-	&movdqa		(&QWP(16*2,"edx"),$T1);
-	&movdqu		($T1,&QWP(16*3,"edi"));
-	&movdqa		(&QWP(16*(2-9),"edx"),$T0);
-	&pshufd		($T0,$T1,0b01000100);
-	&pshufd		($T1,$T1,0b11101110);
-	&movdqa		(&QWP(16*3,"edx"),$T0);
-	&movdqu		($T0,&QWP(16*4,"edi"));
-	&movdqa		(&QWP(16*(3-9),"edx"),$T1);
-	&pshufd		($T1,$T0,0b01000100);
-	&pshufd		($T0,$T0,0b11101110);
-	&movdqa		(&QWP(16*4,"edx"),$T1);
-	&movdqu		($T1,&QWP(16*5,"edi"));
-	&movdqa		(&QWP(16*(4-9),"edx"),$T0);
-	&pshufd		($T0,$T1,0b01000100);
-	&pshufd		($T1,$T1,0b11101110);
-	&movdqa		(&QWP(16*5,"edx"),$T0);
-	&movdqu		($T0,&QWP(16*6,"edi"));
-	&movdqa		(&QWP(16*(5-9),"edx"),$T1);
-	&pshufd		($T1,$T0,0b01000100);
-	&pshufd		($T0,$T0,0b11101110);
-	&movdqa		(&QWP(16*6,"edx"),$T1);
-	&movdqu		($T1,&QWP(16*7,"edi"));
-	&movdqa		(&QWP(16*(6-9),"edx"),$T0);
-	&pshufd		($T0,$T1,0b01000100);
-	&pshufd		($T1,$T1,0b11101110);
-	&movdqa		(&QWP(16*7,"edx"),$T0);
-	&movdqu		($T0,&QWP(16*8,"edi"));
-	&movdqa		(&QWP(16*(7-9),"edx"),$T1);
-	&pshufd		($T1,$T0,0b01000100);
-	&pshufd		($T0,$T0,0b11101110);
-	&movdqa		(&QWP(16*8,"edx"),$T1);
-	&movdqa		(&QWP(16*(8-9),"edx"),$T0);
-
-sub load_input {
-my ($inpbase,$offbase)=@_;
-
-	&movdqu		($T0,&QWP($inpbase+0,"esi"));	# load input
-	&movdqu		($T1,&QWP($inpbase+16,"esi"));
-	&lea		("esi",&DWP(16*2,"esi"));
-
-	&movdqa		(&QWP($offbase+16*2,"esp"),$D2);
-	&movdqa		(&QWP($offbase+16*3,"esp"),$D3);
-	&movdqa		(&QWP($offbase+16*4,"esp"),$D4);
-
-	&movdqa		($D2,$T0);			# splat input
-	&movdqa		($D3,$T1);
-	&psrldq		($D2,6);
-	&psrldq		($D3,6);
-	&movdqa		($D4,$T0);
-	&punpcklqdq	($D2,$D3);			# 2:3
-	&punpckhqdq	($D4,$T1);			# 4
-	&punpcklqdq	($T0,$T1);			# 0:1
-
-	&movdqa		($D3,$D2);
-	&psrlq		($D2,4);
-	&psrlq		($D3,30);
-	&movdqa		($T1,$T0);
-	&psrlq		($D4,40);			# 4
-	&psrlq		($T1,26);
-	&pand		($T0,$MASK);			# 0
-	&pand		($T1,$MASK);			# 1
-	&pand		($D2,$MASK);			# 2
-	&pand		($D3,$MASK);			# 3
-	&por		($D4,&QWP(0,"ebx"));		# padbit, yes, always
-
-	&movdqa		(&QWP($offbase+16*0,"esp"),$D0)	if ($offbase);
-	&movdqa		(&QWP($offbase+16*1,"esp"),$D1)	if ($offbase);
-}
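-
For reference, a scalar C sketch (illustration only, assuming little-endian loads; not code from this tree) of the per-block limb split that load_input performs: each 16-byte message block becomes five base 2^26 limbs, with the 2^128 pad bit ORed into the top limb.

#include <stdint.h>
#include <string.h>

static void block_to_base2_26(uint32_t t[5], const uint8_t in[16],
                              uint32_t padbit) {
  uint64_t lo, hi;
  memcpy(&lo, in, 8);      /* message words are read little-endian */
  memcpy(&hi, in + 8, 8);

  t[0] = lo & 0x3ffffff;
  t[1] = (lo >> 26) & 0x3ffffff;
  t[2] = ((lo >> 52) | (hi << 12)) & 0x3ffffff;
  t[3] = (hi >> 14) & 0x3ffffff;
  t[4] = (uint32_t)(hi >> 40) | (padbit << 24);  /* padbit, yes, always */
}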
-	&load_input	(16*2,16*5);
-
-	&jbe		(&label("skip_loop"));
-	&jmp		(&label("loop"));
-
-&set_label("loop",32);
-	################################################################
-	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
-	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
-	#   \___________________/
-	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
-	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
-	#   \___________________/ \____________________/
-	################################################################
-
-	&movdqa		($T2,&QWP(16*(0-9),"edx"));	# r0^2
-	&movdqa		(&QWP(16*1,"eax"),$T1);
-	&movdqa		(&QWP(16*2,"eax"),$D2);
-	&movdqa		(&QWP(16*3,"eax"),$D3);
-	&movdqa		(&QWP(16*4,"eax"),$D4);
-
-	################################################################
-	# d4 = h4*r0 + h0*r4   + h1*r3   + h2*r2   + h3*r1
-	# d3 = h3*r0 + h0*r3   + h1*r2   + h2*r1   + h4*5*r4
-	# d2 = h2*r0 + h0*r2   + h1*r1   + h3*5*r4 + h4*5*r3
-	# d1 = h1*r0 + h0*r1   + h2*5*r4 + h3*5*r3 + h4*5*r2
-	# d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
-
-	&movdqa		($D1,$T0);
-	&pmuludq	($T0,$T2);			# h0*r0
-	&movdqa		($D0,$T1);
-	&pmuludq	($T1,$T2);			# h1*r0
-	&pmuludq	($D2,$T2);			# h2*r0
-	&pmuludq	($D3,$T2);			# h3*r0
-	&pmuludq	($D4,$T2);			# h4*r0
-
-sub pmuladd_alt {
-my $addr = shift;
-
-	&pmuludq	($D0,&$addr(8));		# h1*s4
-	&movdqa		($T2,$D1);
-	&pmuludq	($D1,&$addr(1));		# h0*r1
-	&paddq		($D0,$T0);
-	&movdqa		($T0,$T2);
-	&pmuludq	($T2,&$addr(2));		# h0*r2
-	&paddq		($D1,$T1);
-	&movdqa		($T1,$T0);
-	&pmuludq	($T0,&$addr(3));		# h0*r3
-	&paddq		($D2,$T2);
-	 &movdqa	($T2,&QWP(16*1,"eax"));		# pull h1
-	&pmuludq	($T1,&$addr(4));		# h0*r4
-	&paddq		($D3,$T0);
-
-	&movdqa		($T0,$T2);
-	&pmuludq	($T2,&$addr(1));		# h1*r1
-	 &paddq		($D4,$T1);
-	&movdqa		($T1,$T0);
-	&pmuludq	($T0,&$addr(2));		# h1*r2
-	&paddq		($D2,$T2);
-	&movdqa		($T2,&QWP(16*2,"eax"));		# pull h2
-	&pmuludq	($T1,&$addr(3));		# h1*r3
-	&paddq		($D3,$T0);
-	&movdqa		($T0,$T2);
-	&pmuludq	($T2,&$addr(7));		# h2*s3
-	&paddq		($D4,$T1);
-	&movdqa		($T1,$T0);
-	&pmuludq	($T0,&$addr(8));		# h2*s4
-	&paddq		($D0,$T2);
-
-	&movdqa		($T2,$T1);
-	&pmuludq	($T1,&$addr(1));		# h2*r1
-	 &paddq		($D1,$T0);
-	&movdqa		($T0,&QWP(16*3,"eax"));		# pull h3
-	&pmuludq	($T2,&$addr(2));		# h2*r2
-	&paddq		($D3,$T1);
-	&movdqa		($T1,$T0);
-	&pmuludq	($T0,&$addr(6));		# h3*s2
-	&paddq		($D4,$T2);
-	&movdqa		($T2,$T1);
-	&pmuludq	($T1,&$addr(7));		# h3*s3
-	&paddq		($D0,$T0);
-	&movdqa		($T0,$T2);
-	&pmuludq	($T2,&$addr(8));		# h3*s4
-	&paddq		($D1,$T1);
-
-	&movdqa		($T1,&QWP(16*4,"eax"));		# pull h4
-	&pmuludq	($T0,&$addr(1));		# h3*r1
-	 &paddq		($D2,$T2);
-	&movdqa		($T2,$T1);
-	&pmuludq	($T1,&$addr(8));		# h4*s4
-	&paddq		($D4,$T0);
-	&movdqa		($T0,$T2);
-	&pmuludq	($T2,&$addr(5));		# h4*s1
-	&paddq		($D3,$T1);
-	&movdqa		($T1,$T0);
-	&pmuludq	($T0,&$addr(6));		# h4*s2
-	&paddq		($D0,$T2);
-	 &movdqa	($MASK,&QWP(64,"ebx"));
-	&pmuludq	($T1,&$addr(7));		# h4*s3
-	&paddq		($D1,$T0);
-	&paddq		($D2,$T1);
-}
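-
The d0..d4 comment blocks above (and the pmuladd/pmuladd_alt helpers) implement a plain 5x5 schoolbook product in base 2^26, with every term that would land at 2^130 or higher folded back in as a multiple of 5, which is why the table stores 5*r1..5*r4 alongside r0..r4. A scalar C sketch of the same products, for illustration only (each limb's five partial products fit comfortably in 64 bits):

#include <stdint.h>

/* h[] and r[] are base 2^26 limbs; s[i] = 5*r[i] as precomputed in the
 * table (s[0] is never needed).  d[] are the unreduced 64-bit sums d0..d4
 * from the comments above; a lazy reduction pass follows. */
static void mul_base2_26(uint64_t d[5], const uint32_t h[5],
                         const uint32_t r[5], const uint32_t s[5]) {
  d[0] = (uint64_t)h[0]*r[0] + (uint64_t)h[4]*s[1] + (uint64_t)h[3]*s[2] +
         (uint64_t)h[2]*s[3] + (uint64_t)h[1]*s[4];
  d[1] = (uint64_t)h[1]*r[0] + (uint64_t)h[0]*r[1] + (uint64_t)h[4]*s[2] +
         (uint64_t)h[3]*s[3] + (uint64_t)h[2]*s[4];
  d[2] = (uint64_t)h[2]*r[0] + (uint64_t)h[1]*r[1] + (uint64_t)h[0]*r[2] +
         (uint64_t)h[4]*s[3] + (uint64_t)h[3]*s[4];
  d[3] = (uint64_t)h[3]*r[0] + (uint64_t)h[2]*r[1] + (uint64_t)h[1]*r[2] +
         (uint64_t)h[0]*r[3] + (uint64_t)h[4]*s[4];
  d[4] = (uint64_t)h[4]*r[0] + (uint64_t)h[3]*r[1] + (uint64_t)h[2]*r[2] +
         (uint64_t)h[1]*r[3] + (uint64_t)h[0]*r[4];
}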
-	&pmuladd_alt	(sub {	my $i=shift; &QWP(16*($i-9),"edx");	});
-
-	&load_input	(-16*2,0);
-	&lea		("eax",&DWP(-16*2,"esi"));
-	&sub		("ecx",64);
-
-	&paddd		($T0,&QWP(16*(5+0),"esp"));	# add hash value
-	&paddd		($T1,&QWP(16*(5+1),"esp"));
-	&paddd		($D2,&QWP(16*(5+2),"esp"));
-	&paddd		($D3,&QWP(16*(5+3),"esp"));
-	&paddd		($D4,&QWP(16*(5+4),"esp"));
-
-	&cmovb		("esi","eax");
-	&lea		("eax",&DWP(16*10,"esp"));
-
-	&movdqa		($T2,&QWP(16*0,"edx"));		# r0^4
-	&movdqa		(&QWP(16*1,"esp"),$D1);
-	&movdqa		(&QWP(16*1,"eax"),$T1);
-	&movdqa		(&QWP(16*2,"eax"),$D2);
-	&movdqa		(&QWP(16*3,"eax"),$D3);
-	&movdqa		(&QWP(16*4,"eax"),$D4);
-
-	################################################################
-	# d4 += h4*r0 + h0*r4   + h1*r3   + h2*r2   + h3*r1
-	# d3 += h3*r0 + h0*r3   + h1*r2   + h2*r1   + h4*5*r4
-	# d2 += h2*r0 + h0*r2   + h1*r1   + h3*5*r4 + h4*5*r3
-	# d1 += h1*r0 + h0*r1   + h2*5*r4 + h3*5*r3 + h4*5*r2
-	# d0 += h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
-
-	&movdqa		($D1,$T0);
-	&pmuludq	($T0,$T2);			# h0*r0
-	&paddq		($T0,$D0);
-	&movdqa		($D0,$T1);
-	&pmuludq	($T1,$T2);			# h1*r0
-	&pmuludq	($D2,$T2);			# h2*r0
-	&pmuludq	($D3,$T2);			# h3*r0
-	&pmuludq	($D4,$T2);			# h4*r0
-
-	&paddq		($T1,&QWP(16*1,"esp"));
-	&paddq		($D2,&QWP(16*2,"esp"));
-	&paddq		($D3,&QWP(16*3,"esp"));
-	&paddq		($D4,&QWP(16*4,"esp"));
-
-	&pmuladd_alt	(sub {	my $i=shift; &QWP(16*$i,"edx");	});
-
-	&lazy_reduction	();
-
-	&load_input	(16*2,16*5);
-
-	&ja		(&label("loop"));
-
-&set_label("skip_loop");
-	################################################################
-	# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
-	 &pshufd	($T2,&QWP(16*(0-9),"edx"),0x10);# r0^n
-	&add		("ecx",32);
-	&jnz		(&label("long_tail"));
-
-	&paddd		($T0,$D0);			# add hash value
-	&paddd		($T1,$D1);
-	&paddd		($D2,&QWP(16*7,"esp"));
-	&paddd		($D3,&QWP(16*8,"esp"));
-	&paddd		($D4,&QWP(16*9,"esp"));
-
-&set_label("long_tail");
-
-	&movdqa		(&QWP(16*0,"eax"),$T0);
-	&movdqa		(&QWP(16*1,"eax"),$T1);
-	&movdqa		(&QWP(16*2,"eax"),$D2);
-	&movdqa		(&QWP(16*3,"eax"),$D3);
-	&movdqa		(&QWP(16*4,"eax"),$D4);
-
-	################################################################
-	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
-	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
-	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
-	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
-	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
-	&pmuludq	($T0,$T2);			# h0*r0
-	&pmuludq	($T1,$T2);			# h1*r0
-	&pmuludq	($D2,$T2);			# h2*r0
-	&movdqa		($D0,$T0);
-	 &pshufd	($T0,&QWP(16*(1-9),"edx"),0x10);# r1^n
-	&pmuludq	($D3,$T2);			# h3*r0
-	&movdqa		($D1,$T1);
-	&pmuludq	($D4,$T2);			# h4*r0
-
-	&pmuladd	(sub {	my ($reg,$i)=@_;
-				&pshufd ($reg,&QWP(16*($i-9),"edx"),0x10);
-			     },"eax");
-
-	&jz		(&label("short_tail"));
-
-	&load_input	(-16*2,0);
-
-	 &pshufd	($T2,&QWP(16*0,"edx"),0x10);	# r0^n
-	&paddd		($T0,&QWP(16*5,"esp"));		# add hash value
-	&paddd		($T1,&QWP(16*6,"esp"));
-	&paddd		($D2,&QWP(16*7,"esp"));
-	&paddd		($D3,&QWP(16*8,"esp"));
-	&paddd		($D4,&QWP(16*9,"esp"));
-
-	################################################################
-	# multiply inp[0:1] by r^4:r^3 and accumulate
-
-	&movdqa		(&QWP(16*0,"esp"),$T0);
-	&pmuludq	($T0,$T2);			# h0*r0
-	&movdqa		(&QWP(16*1,"esp"),$T1);
-	&pmuludq	($T1,$T2);			# h1*r0
-	&paddq		($D0,$T0);
-	&movdqa		($T0,$D2);
-	&pmuludq	($D2,$T2);			# h2*r0
-	&paddq		($D1,$T1);
-	&movdqa		($T1,$D3);
-	&pmuludq	($D3,$T2);			# h3*r0
-	&paddq		($D2,&QWP(16*2,"esp"));
-	&movdqa		(&QWP(16*2,"esp"),$T0);
-	 &pshufd	($T0,&QWP(16*1,"edx"),0x10);	# r1^n
-	&paddq		($D3,&QWP(16*3,"esp"));
-	&movdqa		(&QWP(16*3,"esp"),$T1);
-	&movdqa		($T1,$D4);
-	&pmuludq	($D4,$T2);			# h4*r0
-	&paddq		($D4,&QWP(16*4,"esp"));
-	&movdqa		(&QWP(16*4,"esp"),$T1);
-
-	&pmuladd	(sub {	my ($reg,$i)=@_;
-				&pshufd ($reg,&QWP(16*$i,"edx"),0x10);
-			     });
-
-&set_label("short_tail");
-
-	################################################################
-	# horizontal addition
-
-	&pshufd		($T1,$D4,0b01001110);
-	&pshufd		($T0,$D3,0b01001110);
-	&paddq		($D4,$T1);
-	&paddq		($D3,$T0);
-	&pshufd		($T1,$D0,0b01001110);
-	&pshufd		($T0,$D1,0b01001110);
-	&paddq		($D0,$T1);
-	&paddq		($D1,$T0);
-	&pshufd		($T1,$D2,0b01001110);
-	#&paddq		($D2,$T1);
-
-	&lazy_reduction	(sub { &paddq ($D2,$T1) });
-
-&set_label("done");
-	&movd		(&DWP(-16*3+4*0,"edi"),$D0);	# store hash value
-	&movd		(&DWP(-16*3+4*1,"edi"),$D1);
-	&movd		(&DWP(-16*3+4*2,"edi"),$D2);
-	&movd		(&DWP(-16*3+4*3,"edi"),$D3);
-	&movd		(&DWP(-16*3+4*4,"edi"),$D4);
-	&mov	("esp","ebp");
-&set_label("nodata");
-&function_end("_poly1305_blocks_sse2");
-
-&align	(32);
-&function_begin("_poly1305_emit_sse2");
-	&mov	("ebp",&wparam(0));		# context
-
-	&cmp	(&DWP(4*5,"ebp"),0);		# is_base2_26?
-	&je	(&label("enter_emit"));
-
-	&mov	("eax",&DWP(4*0,"ebp"));	# load hash value
-	&mov	("edi",&DWP(4*1,"ebp"));
-	&mov	("ecx",&DWP(4*2,"ebp"));
-	&mov	("edx",&DWP(4*3,"ebp"));
-	&mov	("esi",&DWP(4*4,"ebp"));
-
-	&mov	("ebx","edi");			# base 2^26 -> base 2^32
-	&shl	("edi",26);
-	&shr	("ebx",6);
-	&add	("eax","edi");
-	&mov	("edi","ecx");
-	&adc	("ebx",0);
-
-	&shl	("edi",20);
-	&shr	("ecx",12);
-	&add	("ebx","edi");
-	&mov	("edi","edx");
-	&adc	("ecx",0);
-
-	&shl	("edi",14);
-	&shr	("edx",18);
-	&add	("ecx","edi");
-	&mov	("edi","esi");
-	&adc	("edx",0);
-
-	&shl	("edi",8);
-	&shr	("esi",24);
-	&add	("edx","edi");
-	&adc	("esi",0);			# can be partially reduced
-
-	&mov	("edi","esi");			# final reduction
-	&and	("esi",3);
-	&shr	("edi",2);
-	&lea	("ebp",&DWP(0,"edi","edi",4));	# *5
-	 &mov	("edi",&wparam(1));		# output
-	&add	("eax","ebp");
-	 &mov	("ebp",&wparam(2));		# key
-	&adc	("ebx",0);
-	&adc	("ecx",0);
-	&adc	("edx",0);
-
-	&movd	($D0,"eax");			# offload original hash value
-	&add	("eax",5);			# compare to modulus
-	&movd	($D1,"ebx");
-	&adc	("ebx",0);
-	&movd	($D2,"ecx");
-	&adc	("ecx",0);
-	&movd	($D3,"edx");
-	&adc	("edx",0);
-	&adc	("esi",0);
-	&shr	("esi",2);			# did it carry/borrow?
-
-	&neg	("esi");			# do we choose (hash-modulus) ...
-	&and	("eax","esi");
-	&and	("ebx","esi");
-	&and	("ecx","esi");
-	&and	("edx","esi");
-	&mov	(&DWP(4*0,"edi"),"eax");
-	&movd	("eax",$D0);
-	&mov	(&DWP(4*1,"edi"),"ebx");
-	&movd	("ebx",$D1);
-	&mov	(&DWP(4*2,"edi"),"ecx");
-	&movd	("ecx",$D2);
-	&mov	(&DWP(4*3,"edi"),"edx");
-	&movd	("edx",$D3);
-
-	&not	("esi");			# ... or original hash value?
-	&and	("eax","esi");
-	&and	("ebx","esi");
-	&or	("eax",&DWP(4*0,"edi"));
-	&and	("ecx","esi");
-	&or	("ebx",&DWP(4*1,"edi"));
-	&and	("edx","esi");
-	&or	("ecx",&DWP(4*2,"edi"));
-	&or	("edx",&DWP(4*3,"edi"));
-
-	&add	("eax",&DWP(4*0,"ebp"));	# accumulate key
-	&adc	("ebx",&DWP(4*1,"ebp"));
-	&mov	(&DWP(4*0,"edi"),"eax");
-	&adc	("ecx",&DWP(4*2,"ebp"));
-	&mov	(&DWP(4*1,"edi"),"ebx");
-	&adc	("edx",&DWP(4*3,"ebp"));
-	&mov	(&DWP(4*2,"edi"),"ecx");
-	&mov	(&DWP(4*3,"edi"),"edx");
-&function_end("_poly1305_emit_sse2");
-
-if ($avx>1) {
-########################################################################
-# Note that poly1305_init_avx2 operates on %xmm; I could have used
-# poly1305_init_sse2...
-
-&align	(32);
-&function_begin_B("_poly1305_init_avx2");
-	&vmovdqu	($D4,&QWP(4*6,"edi"));		# key base 2^32
-	&lea		("edi",&DWP(16*3,"edi"));	# size optimization
-	&mov		("ebp","esp");
-	&sub		("esp",16*(9+5));
-	&and		("esp",-16);
-
-	#&vpand		($D4,$D4,&QWP(96,"ebx"));	# magic mask
-	&vmovdqa	($MASK,&QWP(64,"ebx"));
-
-	&vpand		($D0,$D4,$MASK);		# -> base 2^26
-	&vpsrlq		($D1,$D4,26);
-	&vpsrldq	($D3,$D4,6);
-	&vpand		($D1,$D1,$MASK);
-	&vpsrlq		($D2,$D3,4);
-	&vpsrlq		($D3,$D3,30);
-	&vpand		($D2,$D2,$MASK);
-	&vpand		($D3,$D3,$MASK);
-	&vpsrldq	($D4,$D4,13);
-
-	&lea		("edx",&DWP(16*9,"esp"));	# size optimization
-	&mov		("ecx",2);
-&set_label("square");
-	&vmovdqa	(&QWP(16*0,"esp"),$D0);
-	&vmovdqa	(&QWP(16*1,"esp"),$D1);
-	&vmovdqa	(&QWP(16*2,"esp"),$D2);
-	&vmovdqa	(&QWP(16*3,"esp"),$D3);
-	&vmovdqa	(&QWP(16*4,"esp"),$D4);
-
-	&vpslld		($T1,$D1,2);
-	&vpslld		($T0,$D2,2);
-	&vpaddd		($T1,$T1,$D1);			# *5
-	&vpaddd		($T0,$T0,$D2);			# *5
-	&vmovdqa	(&QWP(16*5,"esp"),$T1);
-	&vmovdqa	(&QWP(16*6,"esp"),$T0);
-	&vpslld		($T1,$D3,2);
-	&vpslld		($T0,$D4,2);
-	&vpaddd		($T1,$T1,$D3);			# *5
-	&vpaddd		($T0,$T0,$D4);			# *5
-	&vmovdqa	(&QWP(16*7,"esp"),$T1);
-	&vmovdqa	(&QWP(16*8,"esp"),$T0);
-
-	&vpshufd	($T0,$D0,0b01000100);
-	&vmovdqa	($T1,$D1);
-	&vpshufd	($D1,$D1,0b01000100);
-	&vpshufd	($D2,$D2,0b01000100);
-	&vpshufd	($D3,$D3,0b01000100);
-	&vpshufd	($D4,$D4,0b01000100);
-	&vmovdqa	(&QWP(16*0,"edx"),$T0);
-	&vmovdqa	(&QWP(16*1,"edx"),$D1);
-	&vmovdqa	(&QWP(16*2,"edx"),$D2);
-	&vmovdqa	(&QWP(16*3,"edx"),$D3);
-	&vmovdqa	(&QWP(16*4,"edx"),$D4);
-
-	################################################################
-	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
-	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
-	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
-	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
-	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
-	&vpmuludq	($D4,$D4,$D0);			# h4*r0
-	&vpmuludq	($D3,$D3,$D0);			# h3*r0
-	&vpmuludq	($D2,$D2,$D0);			# h2*r0
-	&vpmuludq	($D1,$D1,$D0);			# h1*r0
-	&vpmuludq	($D0,$T0,$D0);			# h0*r0
-
-	&vpmuludq	($T0,$T1,&QWP(16*3,"edx"));	# r1*h3
-	&vpaddq		($D4,$D4,$T0);
-	&vpmuludq	($T2,$T1,&QWP(16*2,"edx"));	# r1*h2
-	&vpaddq		($D3,$D3,$T2);
-	&vpmuludq	($T0,$T1,&QWP(16*1,"edx"));	# r1*h1
-	&vpaddq		($D2,$D2,$T0);
-	&vmovdqa	($T2,&QWP(16*5,"esp"));		# s1
-	&vpmuludq	($T1,$T1,&QWP(16*0,"edx"));	# r1*h0
-	&vpaddq		($D1,$D1,$T1);
-	 &vmovdqa	($T0,&QWP(16*2,"esp"));		# r2
-	&vpmuludq	($T2,$T2,&QWP(16*4,"edx"));	# s1*h4
-	&vpaddq		($D0,$D0,$T2);
-
-	&vpmuludq	($T1,$T0,&QWP(16*2,"edx"));	# r2*h2
-	&vpaddq		($D4,$D4,$T1);
-	&vpmuludq	($T2,$T0,&QWP(16*1,"edx"));	# r2*h1
-	&vpaddq		($D3,$D3,$T2);
-	&vmovdqa	($T1,&QWP(16*6,"esp"));		# s2
-	&vpmuludq	($T0,$T0,&QWP(16*0,"edx"));	# r2*h0
-	&vpaddq		($D2,$D2,$T0);
-	&vpmuludq	($T2,$T1,&QWP(16*4,"edx"));	# s2*h4
-	&vpaddq		($D1,$D1,$T2);
-	 &vmovdqa	($T0,&QWP(16*3,"esp"));		# r3
-	&vpmuludq	($T1,$T1,&QWP(16*3,"edx"));	# s2*h3
-	&vpaddq		($D0,$D0,$T1);
-
-	&vpmuludq	($T2,$T0,&QWP(16*1,"edx"));	# r3*h1
-	&vpaddq		($D4,$D4,$T2);
-	&vmovdqa	($T1,&QWP(16*7,"esp"));		# s3
-	&vpmuludq	($T0,$T0,&QWP(16*0,"edx"));	# r3*h0
-	&vpaddq		($D3,$D3,$T0);
-	&vpmuludq	($T2,$T1,&QWP(16*4,"edx"));	# s3*h4
-	&vpaddq		($D2,$D2,$T2);
-	&vpmuludq	($T0,$T1,&QWP(16*3,"edx"));	# s3*h3
-	&vpaddq		($D1,$D1,$T0);
-	 &vmovdqa	($T2,&QWP(16*4,"esp"));		# r4
-	&vpmuludq	($T1,$T1,&QWP(16*2,"edx"));	# s3*h2
-	&vpaddq		($D0,$D0,$T1);
-
-	&vmovdqa	($T0,&QWP(16*8,"esp"));		# s4
-	&vpmuludq	($T2,$T2,&QWP(16*0,"edx"));	# r4*h0
-	&vpaddq		($D4,$D4,$T2);
-	&vpmuludq	($T1,$T0,&QWP(16*4,"edx"));	# s4*h4
-	&vpaddq		($D3,$D3,$T1);
-	&vpmuludq	($T2,$T0,&QWP(16*1,"edx"));	# s4*h1
-	&vpaddq		($D0,$D0,$T2);
-	&vpmuludq	($T1,$T0,&QWP(16*2,"edx"));	# s4*h2
-	&vpaddq		($D1,$D1,$T1);
-	 &vmovdqa	($MASK,&QWP(64,"ebx"));
-	&vpmuludq	($T0,$T0,&QWP(16*3,"edx"));	# s4*h3
-	&vpaddq		($D2,$D2,$T0);
-
-	################################################################
-	# lazy reduction
-	 &vpsrlq	($T0,$D3,26);
-	 &vpand		($D3,$D3,$MASK);
-	&vpsrlq		($T1,$D0,26);
-	&vpand		($D0,$D0,$MASK);
-	 &vpaddq	($D4,$D4,$T0);			# h3 -> h4
-	&vpaddq		($D1,$D1,$T1);			# h0 -> h1
-	 &vpsrlq	($T0,$D4,26);
-	 &vpand		($D4,$D4,$MASK);
-	&vpsrlq		($T1,$D1,26);
-	&vpand		($D1,$D1,$MASK);
-	&vpaddq		($D2,$D2,$T1);			# h1 -> h2
-	 &vpaddd	($D0,$D0,$T0);
-	 &vpsllq	($T0,$T0,2);
-	&vpsrlq		($T1,$D2,26);
-	&vpand		($D2,$D2,$MASK);
-	 &vpaddd	($D0,$D0,$T0);			# h4 -> h0
-	&vpaddd		($D3,$D3,$T1);			# h2 -> h3
-	&vpsrlq		($T1,$D3,26);
-	 &vpsrlq	($T0,$D0,26);
-	 &vpand		($D0,$D0,$MASK);
-	&vpand		($D3,$D3,$MASK);
-	 &vpaddd	($D1,$D1,$T0);			# h0 -> h1
-	&vpaddd		($D4,$D4,$T1);			# h3 -> h4
-
-	&dec		("ecx");
-	&jz		(&label("square_break"));
-
-	&vpunpcklqdq	($D0,$D0,&QWP(16*0,"esp"));	# 0:r^1:0:r^2
-	&vpunpcklqdq	($D1,$D1,&QWP(16*1,"esp"));
-	&vpunpcklqdq	($D2,$D2,&QWP(16*2,"esp"));
-	&vpunpcklqdq	($D3,$D3,&QWP(16*3,"esp"));
-	&vpunpcklqdq	($D4,$D4,&QWP(16*4,"esp"));
-	&jmp		(&label("square"));
-
-&set_label("square_break");
-	&vpsllq		($D0,$D0,32);			# -> r^3:0:r^4:0
-	&vpsllq		($D1,$D1,32);
-	&vpsllq		($D2,$D2,32);
-	&vpsllq		($D3,$D3,32);
-	&vpsllq		($D4,$D4,32);
-	&vpor		($D0,$D0,&QWP(16*0,"esp"));	# r^3:r^1:r^4:r^2
-	&vpor		($D1,$D1,&QWP(16*1,"esp"));
-	&vpor		($D2,$D2,&QWP(16*2,"esp"));
-	&vpor		($D3,$D3,&QWP(16*3,"esp"));
-	&vpor		($D4,$D4,&QWP(16*4,"esp"));
-
-	&vpshufd	($D0,$D0,0b10001101);		# -> r^1:r^2:r^3:r^4
-	&vpshufd	($D1,$D1,0b10001101);
-	&vpshufd	($D2,$D2,0b10001101);
-	&vpshufd	($D3,$D3,0b10001101);
-	&vpshufd	($D4,$D4,0b10001101);
-
-	&vmovdqu	(&QWP(16*0,"edi"),$D0);		# save the table
-	&vmovdqu	(&QWP(16*1,"edi"),$D1);
-	&vmovdqu	(&QWP(16*2,"edi"),$D2);
-	&vmovdqu	(&QWP(16*3,"edi"),$D3);
-	&vmovdqu	(&QWP(16*4,"edi"),$D4);
-
-	&vpslld		($T1,$D1,2);
-	&vpslld		($T0,$D2,2);
-	&vpaddd		($T1,$T1,$D1);			# *5
-	&vpaddd		($T0,$T0,$D2);			# *5
-	&vmovdqu	(&QWP(16*5,"edi"),$T1);
-	&vmovdqu	(&QWP(16*6,"edi"),$T0);
-	&vpslld		($T1,$D3,2);
-	&vpslld		($T0,$D4,2);
-	&vpaddd		($T1,$T1,$D3);			# *5
-	&vpaddd		($T0,$T0,$D4);			# *5
-	&vmovdqu	(&QWP(16*7,"edi"),$T1);
-	&vmovdqu	(&QWP(16*8,"edi"),$T0);
-
-	&mov		("esp","ebp");
-	&lea		("edi",&DWP(-16*3,"edi"));	# size de-optimization
-	&ret		();
-&function_end_B("_poly1305_init_avx2");
-
-########################################################################
-# now it's time to switch to %ymm
-
-my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("ymm$_",(0..7));
-my $MASK=$T2;
-
-sub X { my $reg=shift; $reg=~s/^ymm/xmm/; $reg; }
-
-&align	(32);
-&function_begin("_poly1305_blocks_avx2");
-	&mov	("edi",&wparam(0));			# ctx
-	&mov	("esi",&wparam(1));			# inp
-	&mov	("ecx",&wparam(2));			# len
-
-	&mov	("eax",&DWP(4*5,"edi"));		# is_base2_26
-	&and	("ecx",-16);
-	&jz	(&label("nodata"));
-	&cmp	("ecx",64);
-	&jae	(&label("enter_avx2"));
-	&test	("eax","eax");				# is_base2_26?
-	&jz	(&label("enter_blocks"));
-
-&set_label("enter_avx2");
-	&vzeroupper	();
-
-	&call	(&label("pic_point"));
-&set_label("pic_point");
-	&blindpop("ebx");
-	&lea	("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx"));
-
-	&test	("eax","eax");				# is_base2_26?
-	&jnz	(&label("base2_26"));
-
-	&call	("_poly1305_init_avx2");
-
-	################################################# base 2^32 -> base 2^26
-	&mov	("eax",&DWP(0,"edi"));
-	&mov	("ecx",&DWP(3,"edi"));
-	&mov	("edx",&DWP(6,"edi"));
-	&mov	("esi",&DWP(9,"edi"));
-	&mov	("ebp",&DWP(13,"edi"));
-
-	&shr	("ecx",2);
-	&and	("eax",0x3ffffff);
-	&shr	("edx",4);
-	&and	("ecx",0x3ffffff);
-	&shr	("esi",6);
-	&and	("edx",0x3ffffff);
-
-	&mov	(&DWP(4*0,"edi"),"eax");
-	&mov	(&DWP(4*1,"edi"),"ecx");
-	&mov	(&DWP(4*2,"edi"),"edx");
-	&mov	(&DWP(4*3,"edi"),"esi");
-	&mov	(&DWP(4*4,"edi"),"ebp");
-	&mov	(&DWP(4*5,"edi"),1);			# is_base2_26
-
-	&mov	("esi",&wparam(1));			# [reload] inp
-	&mov	("ecx",&wparam(2));			# [reload] len
-
-&set_label("base2_26");
-	&mov	("eax",&wparam(3));			# padbit
-	&mov	("ebp","esp");
-
-	&sub	("esp",32*(5+9));
-	&and	("esp",-512);				# ensure that frame
-							# doesn't cross page
-							# boundary, which is
-							# essential for
-							# misaligned 32-byte
-							# loads
-
-	################################################################
-	# expand and copy pre-calculated table to stack
-
-	&vmovdqu	(&X($D0),&QWP(16*(3+0),"edi"));
-	&lea		("edx",&DWP(32*5+128,"esp"));	# +128 size optimization
-	&vmovdqu	(&X($D1),&QWP(16*(3+1),"edi"));
-	&vmovdqu	(&X($D2),&QWP(16*(3+2),"edi"));
-	&vmovdqu	(&X($D3),&QWP(16*(3+3),"edi"));
-	&vmovdqu	(&X($D4),&QWP(16*(3+4),"edi"));
-	&lea		("edi",&DWP(16*3,"edi"));	# size optimization
-	&vpermq		($D0,$D0,0b01000000);		# 00001234 -> 12343434
-	&vpermq		($D1,$D1,0b01000000);
-	&vpermq		($D2,$D2,0b01000000);
-	&vpermq		($D3,$D3,0b01000000);
-	&vpermq		($D4,$D4,0b01000000);
-	&vpshufd	($D0,$D0,0b11001000);		# 12343434 -> 14243444
-	&vpshufd	($D1,$D1,0b11001000);
-	&vpshufd	($D2,$D2,0b11001000);
-	&vpshufd	($D3,$D3,0b11001000);
-	&vpshufd	($D4,$D4,0b11001000);
-	&vmovdqa	(&QWP(32*0-128,"edx"),$D0);
-	&vmovdqu	(&X($D0),&QWP(16*5,"edi"));
-	&vmovdqa	(&QWP(32*1-128,"edx"),$D1);
-	&vmovdqu	(&X($D1),&QWP(16*6,"edi"));
-	&vmovdqa	(&QWP(32*2-128,"edx"),$D2);
-	&vmovdqu	(&X($D2),&QWP(16*7,"edi"));
-	&vmovdqa	(&QWP(32*3-128,"edx"),$D3);
-	&vmovdqu	(&X($D3),&QWP(16*8,"edi"));
-	&vmovdqa	(&QWP(32*4-128,"edx"),$D4);
-	&vpermq		($D0,$D0,0b01000000);
-	&vpermq		($D1,$D1,0b01000000);
-	&vpermq		($D2,$D2,0b01000000);
-	&vpermq		($D3,$D3,0b01000000);
-	&vpshufd	($D0,$D0,0b11001000);
-	&vpshufd	($D1,$D1,0b11001000);
-	&vpshufd	($D2,$D2,0b11001000);
-	&vpshufd	($D3,$D3,0b11001000);
-	&vmovdqa	(&QWP(32*5-128,"edx"),$D0);
-	&vmovd		(&X($D0),&DWP(-16*3+4*0,"edi"));# load hash value
-	&vmovdqa	(&QWP(32*6-128,"edx"),$D1);
-	&vmovd		(&X($D1),&DWP(-16*3+4*1,"edi"));
-	&vmovdqa	(&QWP(32*7-128,"edx"),$D2);
-	&vmovd		(&X($D2),&DWP(-16*3+4*2,"edi"));
-	&vmovdqa	(&QWP(32*8-128,"edx"),$D3);
-	&vmovd		(&X($D3),&DWP(-16*3+4*3,"edi"));
-	&vmovd		(&X($D4),&DWP(-16*3+4*4,"edi"));
-	&vmovdqa	($MASK,&QWP(64,"ebx"));
-	&neg		("eax");			# padbit
-
-	&test		("ecx",63);
-	&jz		(&label("even"));
-
-	&mov		("edx","ecx");
-	&and		("ecx",-64);
-	&and		("edx",63);
-
-	&vmovdqu	(&X($T0),&QWP(16*0,"esi"));
-	&cmp		("edx",32);
-	&jb		(&label("one"));
-
-	&vmovdqu	(&X($T1),&QWP(16*1,"esi"));
-	&je		(&label("two"));
-
-	&vinserti128	($T0,$T0,&QWP(16*2,"esi"),1);
-	&lea		("esi",&DWP(16*3,"esi"));
-	&lea		("ebx",&DWP(8,"ebx"));		# three padbits
-	&lea		("edx",&DWP(32*5+128+8,"esp"));	# --:r^1:r^2:r^3 (*)
-	&jmp		(&label("tail"));
-
-&set_label("two");
-	&lea		("esi",&DWP(16*2,"esi"));
-	&lea		("ebx",&DWP(16,"ebx"));		# two padbits
-	&lea		("edx",&DWP(32*5+128+16,"esp"));# --:--:r^1:r^2 (*)
-	&jmp		(&label("tail"));
-
-&set_label("one");
-	&lea		("esi",&DWP(16*1,"esi"));
-	&vpxor		($T1,$T1,$T1);
-	&lea		("ebx",&DWP(32,"ebx","eax",8));	# one or no padbits
-	&lea		("edx",&DWP(32*5+128+24,"esp"));# --:--:--:r^1 (*)
-	&jmp		(&label("tail"));
-
-# (*)	spots marked with '--' are data from next table entry, but they
-#	are multiplied by 0 and therefore rendered insignificant
-
-&set_label("even",32);
-	&vmovdqu	(&X($T0),&QWP(16*0,"esi"));	# load input
-	&vmovdqu	(&X($T1),&QWP(16*1,"esi"));
-	&vinserti128	($T0,$T0,&QWP(16*2,"esi"),1);
-	&vinserti128	($T1,$T1,&QWP(16*3,"esi"),1);
-	&lea		("esi",&DWP(16*4,"esi"));
-	&sub		("ecx",64);
-	&jz		(&label("tail"));
-
-&set_label("loop");
-	################################################################
-	# ((inp[0]*r^4+inp[4])*r^4+inp[8])*r^4
-	# ((inp[1]*r^4+inp[5])*r^4+inp[9])*r^3
-	# ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
-	# ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
-	#   \________/ \_______/
-	################################################################
-
-sub vsplat_input {
-	&vmovdqa	(&QWP(32*2,"esp"),$D2);
-	&vpsrldq	($D2,$T0,6);			# splat input
-	&vmovdqa	(&QWP(32*0,"esp"),$D0);
-	&vpsrldq	($D0,$T1,6);
-	&vmovdqa	(&QWP(32*1,"esp"),$D1);
-	&vpunpckhqdq	($D1,$T0,$T1);			# 4
-	&vpunpcklqdq	($T0,$T0,$T1);			# 0:1
-	&vpunpcklqdq	($D2,$D2,$D0);			# 2:3
-
-	&vpsrlq		($D0,$D2,30);
-	&vpsrlq		($D2,$D2,4);
-	&vpsrlq		($T1,$T0,26);
-	&vpsrlq		($D1,$D1,40);			# 4
-	&vpand		($D2,$D2,$MASK);		# 2
-	&vpand		($T0,$T0,$MASK);		# 0
-	&vpand		($T1,$T1,$MASK);		# 1
-	&vpand		($D0,$D0,$MASK);		# 3 (*)
-	&vpor		($D1,$D1,&QWP(0,"ebx"));	# padbit, yes, always
-
-	# (*)	note that output is counterintuitive, inp[3:4] is
-	#	returned in $D1-2, while $D3-4 are preserved;
-}
-	&vsplat_input	();
-
-sub vpmuladd {
-my $addr = shift;
-
-	&vpaddq		($D2,$D2,&QWP(32*2,"esp"));	# add hash value
-	&vpaddq		($T0,$T0,&QWP(32*0,"esp"));
-	&vpaddq		($T1,$T1,&QWP(32*1,"esp"));
-	&vpaddq		($D0,$D0,$D3);
-	&vpaddq		($D1,$D1,$D4);
-
-	################################################################
-	# d3 = h2*r1   + h0*r3 + h1*r2   + h3*r0   + h4*5*r4
-	# d4 = h2*r2   + h0*r4 + h1*r3   + h3*r1   + h4*r0
-	# d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
-	# d1 = h2*5*r4 + h0*r1 + h1*r0   + h3*5*r3 + h4*5*r2
-	# d2 = h2*r0   + h0*r2 + h1*r1   + h3*5*r4 + h4*5*r3
-
-	&vpmuludq	($D3,$D2,&$addr(1));		# d3 = h2*r1
-	 &vmovdqa	(QWP(32*1,"esp"),$T1);
-	&vpmuludq	($D4,$D2,&$addr(2));		# d4 = h2*r2
-	 &vmovdqa	(QWP(32*3,"esp"),$D0);
-	&vpmuludq	($D0,$D2,&$addr(7));		# d0 = h2*s3
-	 &vmovdqa	(QWP(32*4,"esp"),$D1);
-	&vpmuludq	($D1,$D2,&$addr(8));		# d1 = h2*s4
-	&vpmuludq	($D2,$D2,&$addr(0));		# d2 = h2*r0
-
-	&vpmuludq	($T2,$T0,&$addr(3));		# h0*r3
-	&vpaddq		($D3,$D3,$T2);			# d3 += h0*r3
-	&vpmuludq	($T1,$T0,&$addr(4));		# h0*r4
-	&vpaddq		($D4,$D4,$T1);			# d4 += h0*r4
-	&vpmuludq	($T2,$T0,&$addr(0));		# h0*r0
-	&vpaddq		($D0,$D0,$T2);			# d0 += h0*r0
-	 &vmovdqa	($T2,&QWP(32*1,"esp"));		# h1
-	&vpmuludq	($T1,$T0,&$addr(1));		# h0*r1
-	&vpaddq		($D1,$D1,$T1);			# d1 += h0*r1
-	&vpmuludq	($T0,$T0,&$addr(2));		# h0*r2
-	&vpaddq		($D2,$D2,$T0);			# d2 += h0*r2
-
-	&vpmuludq	($T1,$T2,&$addr(2));		# h1*r2
-	&vpaddq		($D3,$D3,$T1);			# d3 += h1*r2
-	&vpmuludq	($T0,$T2,&$addr(3));		# h1*r3
-	&vpaddq		($D4,$D4,$T0);			# d4 += h1*r3
-	&vpmuludq	($T1,$T2,&$addr(8));		# h1*s4
-	&vpaddq		($D0,$D0,$T1);			# d0 += h1*s4
-	 &vmovdqa	($T1,&QWP(32*3,"esp"));		# h3
-	&vpmuludq	($T0,$T2,&$addr(0));		# h1*r0
-	&vpaddq		($D1,$D1,$T0);			# d1 += h1*r0
-	&vpmuludq	($T2,$T2,&$addr(1));		# h1*r1
-	&vpaddq		($D2,$D2,$T2);			# d2 += h1*r1
-
-	&vpmuludq	($T0,$T1,&$addr(0));		# h3*r0
-	&vpaddq		($D3,$D3,$T0);			# d3 += h3*r0
-	&vpmuludq	($T2,$T1,&$addr(1));		# h3*r1
-	&vpaddq		($D4,$D4,$T2);			# d4 += h3*r1
-	&vpmuludq	($T0,$T1,&$addr(6));		# h3*s2
-	&vpaddq		($D0,$D0,$T0);			# d0 += h3*s2
-	 &vmovdqa	($T0,&QWP(32*4,"esp"));		# h4
-	&vpmuludq	($T2,$T1,&$addr(7));		# h3*s3
-	&vpaddq		($D1,$D1,$T2);			# d1 += h3*s3
-	&vpmuludq	($T1,$T1,&$addr(8));		# h3*s4
-	&vpaddq		($D2,$D2,$T1);			# d2 += h3*s4
-
-	&vpmuludq	($T2,$T0,&$addr(8));		# h4*s4
-	&vpaddq		($D3,$D3,$T2);			# d3 += h4*s4
-	&vpmuludq	($T1,$T0,&$addr(5));		# h4*s1
-	&vpaddq		($D0,$D0,$T1);			# d0 += h4*s1
-	&vpmuludq	($T2,$T0,&$addr(0));		# h4*r0
-	&vpaddq		($D4,$D4,$T2);			# d4 += h4*r0
-	 &vmovdqa	($MASK,&QWP(64,"ebx"));
-	&vpmuludq	($T1,$T0,&$addr(6));		# h4*s2
-	&vpaddq		($D1,$D1,$T1);			# d1 += h4*s2
-	&vpmuludq	($T0,$T0,&$addr(7));		# h4*s3
-	&vpaddq		($D2,$D2,$T0);			# d2 += h4*s3
-}
-	&vpmuladd	(sub {	my $i=shift; &QWP(32*$i-128,"edx");	});
-
-sub vlazy_reduction {
-	################################################################
-	# lazy reduction
-
-	 &vpsrlq	($T0,$D3,26);
-	 &vpand		($D3,$D3,$MASK);
-	&vpsrlq		($T1,$D0,26);
-	&vpand		($D0,$D0,$MASK);
-	 &vpaddq	($D4,$D4,$T0);			# h3 -> h4
-	&vpaddq		($D1,$D1,$T1);			# h0 -> h1
-	 &vpsrlq	($T0,$D4,26);
-	 &vpand		($D4,$D4,$MASK);
-	&vpsrlq		($T1,$D1,26);
-	&vpand		($D1,$D1,$MASK);
-	&vpaddq		($D2,$D2,$T1);			# h1 -> h2
-	 &vpaddq	($D0,$D0,$T0);
-	 &vpsllq	($T0,$T0,2);
-	&vpsrlq		($T1,$D2,26);
-	&vpand		($D2,$D2,$MASK);
-	 &vpaddq	($D0,$D0,$T0);			# h4 -> h0
-	&vpaddq		($D3,$D3,$T1);			# h2 -> h3
-	&vpsrlq		($T1,$D3,26);
-	 &vpsrlq	($T0,$D0,26);
-	 &vpand		($D0,$D0,$MASK);
-	&vpand		($D3,$D3,$MASK);
-	 &vpaddq	($D1,$D1,$T0);			# h0 -> h1
-	&vpaddq		($D4,$D4,$T1);			# h3 -> h4
-}
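-
A scalar C rendering of vlazy_reduction above (illustration only): limbs are left a few bits above 26 rather than fully normalized, which is enough headroom for the next round of multiplications, and the carry out of the top limb re-enters limb 0 multiplied by 5 because 2^130 == 5 (mod 2^130 - 5).

#include <stdint.h>

static void lazy_reduction(uint64_t d[5]) {
  const uint64_t M = (1u << 26) - 1;
  uint64_t c;

  c = d[3] >> 26; d[3] &= M; d[4] += c;      /* h3 -> h4 */
  c = d[0] >> 26; d[0] &= M; d[1] += c;      /* h0 -> h1 */
  c = d[4] >> 26; d[4] &= M; d[0] += c * 5;  /* h4 -> h0 */
  c = d[1] >> 26; d[1] &= M; d[2] += c;      /* h1 -> h2 */
  c = d[2] >> 26; d[2] &= M; d[3] += c;      /* h2 -> h3 */
  c = d[0] >> 26; d[0] &= M; d[1] += c;      /* h0 -> h1, second pass */
  c = d[3] >> 26; d[3] &= M; d[4] += c;      /* h3 -> h4, second pass */
}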
-	&vlazy_reduction();
-
-	&vmovdqu	(&X($T0),&QWP(16*0,"esi"));	# load input
-	&vmovdqu	(&X($T1),&QWP(16*1,"esi"));
-	&vinserti128	($T0,$T0,&QWP(16*2,"esi"),1);
-	&vinserti128	($T1,$T1,&QWP(16*3,"esi"),1);
-	&lea		("esi",&DWP(16*4,"esi"));
-	&sub		("ecx",64);
-	&jnz		(&label("loop"));
-
-&set_label("tail");
-	&vsplat_input	();
-	&and		("ebx",-64);			# restore pointer
-
-	&vpmuladd	(sub {	my $i=shift; &QWP(4+32*$i-128,"edx");	});
-
-	################################################################
-	# horizontal addition
-
-	&vpsrldq	($T0,$D4,8);
-	&vpsrldq	($T1,$D3,8);
-	&vpaddq		($D4,$D4,$T0);
-	&vpsrldq	($T0,$D0,8);
-	&vpaddq		($D3,$D3,$T1);
-	&vpsrldq	($T1,$D1,8);
-	&vpaddq		($D0,$D0,$T0);
-	&vpsrldq	($T0,$D2,8);
-	&vpaddq		($D1,$D1,$T1);
-	&vpermq		($T1,$D4,2);			# keep folding
-	&vpaddq		($D2,$D2,$T0);
-	&vpermq		($T0,$D3,2);
-	&vpaddq		($D4,$D4,$T1);
-	&vpermq		($T1,$D0,2);
-	&vpaddq		($D3,$D3,$T0);
-	&vpermq		($T0,$D1,2);
-	&vpaddq		($D0,$D0,$T1);
-	&vpermq		($T1,$D2,2);
-	&vpaddq		($D1,$D1,$T0);
-	&vpaddq		($D2,$D2,$T1);
-
-	&vlazy_reduction();
-
-	&cmp		("ecx",0);
-	&je		(&label("done"));
-
-	################################################################
-	# clear all but single word
-
-	&vpshufd	(&X($D0),&X($D0),0b11111100);
-	&lea		("edx",&DWP(32*5+128,"esp"));	# restore pointer
-	&vpshufd	(&X($D1),&X($D1),0b11111100);
-	&vpshufd	(&X($D2),&X($D2),0b11111100);
-	&vpshufd	(&X($D3),&X($D3),0b11111100);
-	&vpshufd	(&X($D4),&X($D4),0b11111100);
-	&jmp		(&label("even"));
-
-&set_label("done",16);
-	&vmovd		(&DWP(-16*3+4*0,"edi"),&X($D0));# store hash value
-	&vmovd		(&DWP(-16*3+4*1,"edi"),&X($D1));
-	&vmovd		(&DWP(-16*3+4*2,"edi"),&X($D2));
-	&vmovd		(&DWP(-16*3+4*3,"edi"),&X($D3));
-	&vmovd		(&DWP(-16*3+4*4,"edi"),&X($D4));
-	&vzeroupper	();
-	&mov	("esp","ebp");
-&set_label("nodata");
-&function_end("_poly1305_blocks_avx2");
-}
-&set_label("const_sse2",64);
-	&data_word(1<<24,0,	1<<24,0,	1<<24,0,	1<<24,0);
-	&data_word(0,0,		0,0,		0,0,		0,0);
-	&data_word(0x03ffffff,0,0x03ffffff,0,	0x03ffffff,0,	0x03ffffff,0);
-	&data_word(0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc);
-}
-&asciz	("Poly1305 for x86, CRYPTOGAMS by <appro\@openssl.org>");
-&align	(4);
-
-&asm_finish();
-
-close STDOUT;
diff --git a/src/crypto/poly1305/asm/poly1305-x86_64.pl b/src/crypto/poly1305/asm/poly1305-x86_64.pl
deleted file mode 100755
index 3630b47..0000000
--- a/src/crypto/poly1305/asm/poly1305-x86_64.pl
+++ /dev/null
@@ -1,2235 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# This module implements Poly1305 hash for x86_64.
-#
-# March 2015
-#
-# Numbers are cycles per processed byte with poly1305_blocks alone,
-# measured with rdtsc at fixed clock frequency.
-#
-#		IALU/gcc-4.8(*)	AVX(**)		AVX2
-# P4		4.90/+120%      -
-# Core 2	2.39/+90%	-
-# Westmere	1.86/+120%	-
-# Sandy Bridge	1.39/+140%	1.10
-# Haswell	1.10/+175%	1.11		0.65
-# Skylake	1.12/+120%	0.96		0.51
-# Silvermont	2.83/+95%	-
-# VIA Nano	1.82/+150%	-
-# Sledgehammer	1.38/+160%	-
-# Bulldozer	2.21/+130%	0.97
-#
-# (*)	improvement coefficients relative to clang are more modest and
-#	are ~50% on most processors; in both cases we are comparing to
-#	__int128 code;
-# (**)	an SSE2 implementation was attempted, but among non-AVX processors
-#	it was faster than the integer-only code only on older Intel P4 and
-#	Core processors, by 30-50% (the newer the processor, the smaller the
-#	gain), while on contemporary ones it was slower, for example almost
-#	2x slower on Atom; as the former are naturally disappearing, SSE2 is
-#	deemed unnecessary;
-
-$flavour = shift;
-$output  = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
-die "can't locate x86_64-xlate.pl";
-
-$avx = 2;
-
-open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
-*STDOUT=*OUT;
-
-my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
-my ($mac,$nonce)=($inp,$len);	# *_emit arguments
-my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
-my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
-
-sub poly1305_iteration {
-# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
-# output:	$h0-$h2 *= $r0-$r1
-$code.=<<___;
-	mulq	$h0			# h0*r1
-	mov	%rax,$d2
-	 mov	$r0,%rax
-	mov	%rdx,$d3
-
-	mulq	$h0			# h0*r0
-	mov	%rax,$h0		# future $h0
-	 mov	$r0,%rax
-	mov	%rdx,$d1
-
-	mulq	$h1			# h1*r0
-	add	%rax,$d2
-	 mov	$s1,%rax
-	adc	%rdx,$d3
-
-	mulq	$h1			# h1*s1
-	 mov	$h2,$h1			# borrow $h1
-	add	%rax,$h0
-	adc	%rdx,$d1
-
-	imulq	$s1,$h1			# h2*s1
-	add	$h1,$d2
-	 mov	$d1,$h1
-	adc	\$0,$d3
-
-	imulq	$r0,$h2			# h2*r0
-	add	$d2,$h1
-	mov	\$-4,%rax		# mask value
-	adc	$h2,$d3
-
-	and	$d3,%rax		# last reduction step
-	mov	$d3,$h2
-	shr	\$2,$d3
-	and	\$3,$h2
-	add	$d3,%rax
-	add	%rax,$h0
-	adc	\$0,$h1
-___
-}
-
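For reference, here is a minimal C sketch of the multiply-and-partially-reduce step that poly1305_iteration above emits (an illustration only, assuming a compiler with unsigned __int128; it is not code from this tree). The caller has already folded the next 16-byte block and the pad bit into h, as the .Loop code further below does.

#include <stdint.h>

typedef unsigned __int128 u128;

/* h = h2:h1:h0 is the ~130-bit accumulator, r = r1:r0 the clamped key.
 * s1 = r1 + (r1 >> 2); since the clamped r1 is a multiple of 4,
 * r1 * 2^128 == s1 (mod 2^130 - 5), which folds the 2^128 and 2^192
 * cross terms back into the low words. */
static void poly1305_iteration_c(uint64_t h[3], uint64_t r0, uint64_t r1,
                                 uint64_t s1) {
  uint64_t h0 = h[0], h1 = h[1], h2 = h[2];

  u128 lo  = (u128)h0 * r0 + (u128)h1 * s1;                  /* 2^0 term   */
  u128 mid = (u128)h0 * r1 + (u128)h1 * r0 + (u128)h2 * s1;  /* 2^64 term  */
  uint64_t hi = h2 * r0;                                     /* 2^128 term */

  h0   = (uint64_t)lo;
  mid += (uint64_t)(lo >> 64);
  h1   = (uint64_t)mid;
  hi  += (uint64_t)(mid >> 64);

  /* Last reduction step: bits at or above 2^130 are worth 5 at 2^0. */
  h2 = hi & 3;
  uint64_t c = (hi & ~(uint64_t)3) + (hi >> 2);  /* 5 * (hi >> 2) */
  h0 += c;
  h1 += (h0 < c);  /* carry, mirrors the final adc */

  h[0] = h0; h[1] = h1; h[2] = h2;
}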
-########################################################################
-# The layout of the opaque area is as follows.
-#
-#	unsigned __int64 h[3];		# current hash value base 2^64
-#	unsigned __int64 r[2];		# key value base 2^64
-
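As a C view of this state, together with the key clamping applied by poly1305_init below (a sketch for illustration; the struct and function names are invented here, and only the h/r setup is shown, not the function-pointer dispatch):

#include <stdint.h>
#include <string.h>

struct poly1305_state_base2_64 {
  uint64_t h[3];  /* current hash value, base 2^64; h[2] carries bits 128+ */
  uint64_t r[2];  /* clamped key value, base 2^64 */
};

static void poly1305_init_c(struct poly1305_state_base2_64 *st,
                            const uint8_t key[16]) {
  memset(st->h, 0, sizeof(st->h));
  memcpy(st->r, key, 16);  /* little-endian load of r */
  st->r[0] &= UINT64_C(0x0ffffffc0fffffff);  /* same masks as poly1305_init */
  st->r[1] &= UINT64_C(0x0ffffffc0ffffffc);
}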
-$code.=<<___;
-.text
-
-.extern	OPENSSL_ia32cap_P
-
-.globl	poly1305_init
-.globl	poly1305_blocks
-.globl	poly1305_emit
-.type	poly1305_init,\@function,3
-.align	32
-poly1305_init:
-	xor	%rax,%rax
-	mov	%rax,0($ctx)		# initialize hash value
-	mov	%rax,8($ctx)
-	mov	%rax,16($ctx)
-
-	cmp	\$0,$inp
-	je	.Lno_key
-
-	lea	poly1305_blocks(%rip),%r10
-	lea	poly1305_emit(%rip),%r11
-___
-$code.=<<___	if ($avx);
-	mov	OPENSSL_ia32cap_P+4(%rip),%r9
-	lea	poly1305_blocks_avx(%rip),%rax
-	lea	poly1305_emit_avx(%rip),%rcx
-	bt	\$`60-32`,%r9		# AVX?
-	cmovc	%rax,%r10
-	cmovc	%rcx,%r11
-___
-$code.=<<___	if ($avx>1);
-	lea	poly1305_blocks_avx2(%rip),%rax
-	bt	\$`5+32`,%r9		# AVX2?
-	cmovc	%rax,%r10
-___
-$code.=<<___;
-	mov	\$0x0ffffffc0fffffff,%rax
-	mov	\$0x0ffffffc0ffffffc,%rcx
-	and	0($inp),%rax
-	and	8($inp),%rcx
-	mov	%rax,24($ctx)
-	mov	%rcx,32($ctx)
-___
-$code.=<<___	if ($flavour !~ /elf32/);
-	mov	%r10,0(%rdx)
-	mov	%r11,8(%rdx)
-___
-$code.=<<___	if ($flavour =~ /elf32/);
-	mov	%r10d,0(%rdx)
-	mov	%r11d,4(%rdx)
-___
-$code.=<<___;
-	mov	\$1,%eax
-.Lno_key:
-	ret
-.size	poly1305_init,.-poly1305_init
-
-.type	poly1305_blocks,\@function,4
-.align	32
-poly1305_blocks:
-.Lblocks:
-	sub	\$16,$len		# too short?
-	jc	.Lno_data
-
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-.Lblocks_body:
-
-	mov	$len,%r15		# reassign $len
-
-	mov	24($ctx),$r0		# load r
-	mov	32($ctx),$s1
-
-	mov	0($ctx),$h0		# load hash value
-	mov	8($ctx),$h1
-	mov	16($ctx),$h2
-
-	mov	$s1,$r1
-	shr	\$2,$s1
-	mov	$r1,%rax
-	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
-	jmp	.Loop
-
-.align	32
-.Loop:
-	add	0($inp),$h0		# accumulate input
-	adc	8($inp),$h1
-	lea	16($inp),$inp
-	adc	$padbit,$h2
-___
-	&poly1305_iteration();
-$code.=<<___;
-	mov	$r1,%rax
-	sub	\$16,%r15		# len-=16
-	jnc	.Loop
-
-	mov	$h0,0($ctx)		# store hash value
-	mov	$h1,8($ctx)
-	mov	$h2,16($ctx)
-
-	mov	0(%rsp),%r15
-	mov	8(%rsp),%r14
-	mov	16(%rsp),%r13
-	mov	24(%rsp),%r12
-	mov	32(%rsp),%rbp
-	mov	40(%rsp),%rbx
-	lea	48(%rsp),%rsp
-.Lno_data:
-.Lblocks_epilogue:
-	ret
-.size	poly1305_blocks,.-poly1305_blocks
-
-.type	poly1305_emit,\@function,3
-.align	32
-poly1305_emit:
-.Lemit:
-	mov	0($ctx),%r8	# load hash value
-	mov	8($ctx),%r9
-	mov	16($ctx),%r10
-
-	mov	%r8,%rax
-	add	\$5,%r8		# compare to modulus
-	mov	%r9,%rcx
-	adc	\$0,%r9
-	adc	\$0,%r10
-	shr	\$2,%r10	# did 130-bit value overflow?
-	cmovnz	%r8,%rax
-	cmovnz	%r9,%rcx
-
-	add	0($nonce),%rax	# accumulate nonce
-	adc	8($nonce),%rcx
-	mov	%rax,0($mac)	# write result
-	mov	%rcx,8($mac)
-
-	ret
-.size	poly1305_emit,.-poly1305_emit
-___
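For reference, a C sketch of what poly1305_emit above computes (illustration only, assuming a little-endian target; not code from this tree): the accumulator needs at most one conditional subtraction of 2^130-5 before the encrypted nonce is added mod 2^128.

#include <stdint.h>
#include <string.h>

static void poly1305_emit_c(uint8_t mac[16], const uint64_t h[3],
                            const uint64_t nonce[2]) {
  uint64_t h0 = h[0], h1 = h[1], h2 = h[2];

  /* g = h + 5; if that reaches 2^130, then h >= 2^130 - 5 and the
   * reduced value is g mod 2^128, otherwise it is h itself. */
  uint64_t g0 = h0 + 5;
  uint64_t c  = g0 < 5;
  uint64_t g1 = h1 + c;
  c = g1 < c;
  uint64_t g2 = h2 + c;

  uint64_t mask = 0 - (g2 >> 2);  /* all-ones iff the 130-bit value overflowed */
  h0 = (g0 & mask) | (h0 & ~mask);
  h1 = (g1 & mask) | (h1 & ~mask);

  /* Accumulate the nonce mod 2^128 and write the tag out little-endian. */
  h0 += nonce[0];
  h1 += nonce[1] + (h0 < nonce[0]);
  memcpy(mac, &h0, 8);
  memcpy(mac + 8, &h1, 8);
}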
-if ($avx) {
-
-########################################################################
-# The layout of the opaque area is as follows.
-#
-#	unsigned __int32 h[5];		# current hash value base 2^26
-#	unsigned __int32 is_base2_26;
-#	unsigned __int64 r[2];		# key value base 2^64
-#	unsigned __int64 pad;
-#	struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
-#
-# where r^n are the base 2^26 digits of the powers of the multiplier key.
-# There are 5 digits, but the last four are interleaved with their
-# multiples of 5, for a total of 9 elements: r0, r1, 5*r1, r2, 5*r2, r3,
-# 5*r3, r4, 5*r4.
-
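A C rendering of the same opaque area (illustration only; the field names are invented here). Each 16-byte table row carries one base 2^26 digit for r^2, r^1, r^4 and r^3, so a single aligned load feeds all four SIMD lanes:

#include <stdint.h>

struct poly1305_state_avx {
  uint32_t h[5];         /* current hash value, base 2^26 */
  uint32_t is_base2_26;  /* which radix h[] currently uses */
  uint64_t r[2];         /* key value, base 2^64 */
  uint64_t pad;
  struct { uint32_t r2, r1, r4, r3; } table[9];
  /* table rows, in order: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4 */
};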
-my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
-    map("%xmm$_",(0..15));
-
-$code.=<<___;
-.type	__poly1305_block,\@abi-omnipotent
-.align	32
-__poly1305_block:
-___
-	&poly1305_iteration();
-$code.=<<___;
-	ret
-.size	__poly1305_block,.-__poly1305_block
-
-.type	__poly1305_init_avx,\@abi-omnipotent
-.align	32
-__poly1305_init_avx:
-	mov	$r0,$h0
-	mov	$r1,$h1
-	xor	$h2,$h2
-
-	lea	48+64($ctx),$ctx	# size optimization
-
-	mov	$r1,%rax
-	call	__poly1305_block	# r^2
-
-	mov	\$0x3ffffff,%eax	# save interleaved r^2 and r base 2^26
-	mov	\$0x3ffffff,%edx
-	mov	$h0,$d1
-	and	$h0#d,%eax
-	mov	$r0,$d2
-	and	$r0#d,%edx
-	mov	%eax,`16*0+0-64`($ctx)
-	shr	\$26,$d1
-	mov	%edx,`16*0+4-64`($ctx)
-	shr	\$26,$d2
-
-	mov	\$0x3ffffff,%eax
-	mov	\$0x3ffffff,%edx
-	and	$d1#d,%eax
-	and	$d2#d,%edx
-	mov	%eax,`16*1+0-64`($ctx)
-	lea	(%rax,%rax,4),%eax	# *5
-	mov	%edx,`16*1+4-64`($ctx)
-	lea	(%rdx,%rdx,4),%edx	# *5
-	mov	%eax,`16*2+0-64`($ctx)
-	shr	\$26,$d1
-	mov	%edx,`16*2+4-64`($ctx)
-	shr	\$26,$d2
-
-	mov	$h1,%rax
-	mov	$r1,%rdx
-	shl	\$12,%rax
-	shl	\$12,%rdx
-	or	$d1,%rax
-	or	$d2,%rdx
-	and	\$0x3ffffff,%eax
-	and	\$0x3ffffff,%edx
-	mov	%eax,`16*3+0-64`($ctx)
-	lea	(%rax,%rax,4),%eax	# *5
-	mov	%edx,`16*3+4-64`($ctx)
-	lea	(%rdx,%rdx,4),%edx	# *5
-	mov	%eax,`16*4+0-64`($ctx)
-	mov	$h1,$d1
-	mov	%edx,`16*4+4-64`($ctx)
-	mov	$r1,$d2
-
-	mov	\$0x3ffffff,%eax
-	mov	\$0x3ffffff,%edx
-	shr	\$14,$d1
-	shr	\$14,$d2
-	and	$d1#d,%eax
-	and	$d2#d,%edx
-	mov	%eax,`16*5+0-64`($ctx)
-	lea	(%rax,%rax,4),%eax	# *5
-	mov	%edx,`16*5+4-64`($ctx)
-	lea	(%rdx,%rdx,4),%edx	# *5
-	mov	%eax,`16*6+0-64`($ctx)
-	shr	\$26,$d1
-	mov	%edx,`16*6+4-64`($ctx)
-	shr	\$26,$d2
-
-	mov	$h2,%rax
-	shl	\$24,%rax
-	or	%rax,$d1
-	mov	$d1#d,`16*7+0-64`($ctx)
-	lea	($d1,$d1,4),$d1		# *5
-	mov	$d2#d,`16*7+4-64`($ctx)
-	lea	($d2,$d2,4),$d2		# *5
-	mov	$d1#d,`16*8+0-64`($ctx)
-	mov	$d2#d,`16*8+4-64`($ctx)
-
-	mov	$r1,%rax
-	call	__poly1305_block	# r^3
-
-	mov	\$0x3ffffff,%eax	# save r^3 base 2^26
-	mov	$h0,$d1
-	and	$h0#d,%eax
-	shr	\$26,$d1
-	mov	%eax,`16*0+12-64`($ctx)
-
-	mov	\$0x3ffffff,%edx
-	and	$d1#d,%edx
-	mov	%edx,`16*1+12-64`($ctx)
-	lea	(%rdx,%rdx,4),%edx	# *5
-	shr	\$26,$d1
-	mov	%edx,`16*2+12-64`($ctx)
-
-	mov	$h1,%rax
-	shl	\$12,%rax
-	or	$d1,%rax
-	and	\$0x3ffffff,%eax
-	mov	%eax,`16*3+12-64`($ctx)
-	lea	(%rax,%rax,4),%eax	# *5
-	mov	$h1,$d1
-	mov	%eax,`16*4+12-64`($ctx)
-
-	mov	\$0x3ffffff,%edx
-	shr	\$14,$d1
-	and	$d1#d,%edx
-	mov	%edx,`16*5+12-64`($ctx)
-	lea	(%rdx,%rdx,4),%edx	# *5
-	shr	\$26,$d1
-	mov	%edx,`16*6+12-64`($ctx)
-
-	mov	$h2,%rax
-	shl	\$24,%rax
-	or	%rax,$d1
-	mov	$d1#d,`16*7+12-64`($ctx)
-	lea	($d1,$d1,4),$d1		# *5
-	mov	$d1#d,`16*8+12-64`($ctx)
-
-	mov	$r1,%rax
-	call	__poly1305_block	# r^4
-
-	mov	\$0x3ffffff,%eax	# save r^4 base 2^26
-	mov	$h0,$d1
-	and	$h0#d,%eax
-	shr	\$26,$d1
-	mov	%eax,`16*0+8-64`($ctx)
-
-	mov	\$0x3ffffff,%edx
-	and	$d1#d,%edx
-	mov	%edx,`16*1+8-64`($ctx)
-	lea	(%rdx,%rdx,4),%edx	# *5
-	shr	\$26,$d1
-	mov	%edx,`16*2+8-64`($ctx)
-
-	mov	$h1,%rax
-	shl	\$12,%rax
-	or	$d1,%rax
-	and	\$0x3ffffff,%eax
-	mov	%eax,`16*3+8-64`($ctx)
-	lea	(%rax,%rax,4),%eax	# *5
-	mov	$h1,$d1
-	mov	%eax,`16*4+8-64`($ctx)
-
-	mov	\$0x3ffffff,%edx
-	shr	\$14,$d1
-	and	$d1#d,%edx
-	mov	%edx,`16*5+8-64`($ctx)
-	lea	(%rdx,%rdx,4),%edx	# *5
-	shr	\$26,$d1
-	mov	%edx,`16*6+8-64`($ctx)
-
-	mov	$h2,%rax
-	shl	\$24,%rax
-	or	%rax,$d1
-	mov	$d1#d,`16*7+8-64`($ctx)
-	lea	($d1,$d1,4),$d1		# *5
-	mov	$d1#d,`16*8+8-64`($ctx)
-
-	lea	-48-64($ctx),$ctx	# size [de-]optimization
-	ret
-.size	__poly1305_init_avx,.-__poly1305_init_avx
-
-.type	poly1305_blocks_avx,\@function,4
-.align	32
-poly1305_blocks_avx:
-	mov	20($ctx),%r8d		# is_base2_26
-	cmp	\$128,$len
-	jae	.Lblocks_avx
-	test	%r8d,%r8d
-	jz	.Lblocks
-
-.Lblocks_avx:
-	and	\$-16,$len
-	jz	.Lno_data_avx
-
-	vzeroupper
-
-	test	%r8d,%r8d
-	jz	.Lbase2_64_avx
-
-	test	\$31,$len
-	jz	.Leven_avx
-
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-.Lblocks_avx_body:
-
-	mov	$len,%r15		# reassign $len
-
-	mov	0($ctx),$d1		# load hash value
-	mov	8($ctx),$d2
-	mov	16($ctx),$h2#d
-
-	mov	24($ctx),$r0		# load r
-	mov	32($ctx),$s1
-
-	################################# base 2^26 -> base 2^64
-	mov	$d1#d,$h0#d
-	and	\$-1<<31,$d1
-	mov	$d2,$r1			# borrow $r1
-	mov	$d2#d,$h1#d
-	and	\$-1<<31,$d2
-
-	shr	\$6,$d1
-	shl	\$52,$r1
-	add	$d1,$h0
-	shr	\$12,$h1
-	shr	\$18,$d2
-	add	$r1,$h0
-	adc	$d2,$h1
-
-	mov	$h2,$d1
-	shl	\$40,$d1
-	shr	\$24,$h2
-	add	$d1,$h1
-	adc	\$0,$h2			# can be partially reduced...
-
-	mov	\$-4,$d2		# ... so reduce
-	mov	$h2,$d1
-	and	$h2,$d2
-	shr	\$2,$d1
-	and	\$3,$h2
-	add	$d2,$d1			# =*5
-	add	$d1,$h0
-	adc	\$0,$h1
-
-	mov	$s1,$r1
-	mov	$s1,%rax
-	shr	\$2,$s1
-	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
-
-	add	0($inp),$h0		# accumulate input
-	adc	8($inp),$h1
-	lea	16($inp),$inp
-	adc	$padbit,$h2
-
-	call	__poly1305_block
-
-	test	$padbit,$padbit		# if $padbit is zero,
-	jz	.Lstore_base2_64_avx	# store hash in base 2^64 format
-
-	################################# base 2^64 -> base 2^26
-	mov	$h0,%rax
-	mov	$h0,%rdx
-	shr	\$52,$h0
-	mov	$h1,$r0
-	mov	$h1,$r1
-	shr	\$26,%rdx
-	and	\$0x3ffffff,%rax	# h[0]
-	shl	\$12,$r0
-	and	\$0x3ffffff,%rdx	# h[1]
-	shr	\$14,$h1
-	or	$r0,$h0
-	shl	\$24,$h2
-	and	\$0x3ffffff,$h0		# h[2]
-	shr	\$40,$r1
-	and	\$0x3ffffff,$h1		# h[3]
-	or	$r1,$h2			# h[4]
-
-	sub	\$16,%r15
-	jz	.Lstore_base2_26_avx
-
-	vmovd	%rax#d,$H0
-	vmovd	%rdx#d,$H1
-	vmovd	$h0#d,$H2
-	vmovd	$h1#d,$H3
-	vmovd	$h2#d,$H4
-	jmp	.Lproceed_avx
-
-.align	32
-.Lstore_base2_64_avx:
-	mov	$h0,0($ctx)
-	mov	$h1,8($ctx)
-	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
-	jmp	.Ldone_avx
-
-.align	16
-.Lstore_base2_26_avx:
-	mov	%rax#d,0($ctx)		# store hash value base 2^26
-	mov	%rdx#d,4($ctx)
-	mov	$h0#d,8($ctx)
-	mov	$h1#d,12($ctx)
-	mov	$h2#d,16($ctx)
-.align	16
-.Ldone_avx:
-	mov	0(%rsp),%r15
-	mov	8(%rsp),%r14
-	mov	16(%rsp),%r13
-	mov	24(%rsp),%r12
-	mov	32(%rsp),%rbp
-	mov	40(%rsp),%rbx
-	lea	48(%rsp),%rsp
-.Lno_data_avx:
-.Lblocks_avx_epilogue:
-	ret
-
-.align	32
-.Lbase2_64_avx:
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-.Lbase2_64_avx_body:
-
-	mov	$len,%r15		# reassign $len
-
-	mov	24($ctx),$r0		# load r
-	mov	32($ctx),$s1
-
-	mov	0($ctx),$h0		# load hash value
-	mov	8($ctx),$h1
-	mov	16($ctx),$h2#d
-
-	mov	$s1,$r1
-	mov	$s1,%rax
-	shr	\$2,$s1
-	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
-
-	test	\$31,$len
-	jz	.Linit_avx
-
-	add	0($inp),$h0		# accumulate input
-	adc	8($inp),$h1
-	lea	16($inp),$inp
-	adc	$padbit,$h2
-	sub	\$16,%r15
-
-	call	__poly1305_block
-
-.Linit_avx:
-	################################# base 2^64 -> base 2^26
-	mov	$h0,%rax
-	mov	$h0,%rdx
-	shr	\$52,$h0
-	mov	$h1,$d1
-	mov	$h1,$d2
-	shr	\$26,%rdx
-	and	\$0x3ffffff,%rax	# h[0]
-	shl	\$12,$d1
-	and	\$0x3ffffff,%rdx	# h[1]
-	shr	\$14,$h1
-	or	$d1,$h0
-	shl	\$24,$h2
-	and	\$0x3ffffff,$h0		# h[2]
-	shr	\$40,$d2
-	and	\$0x3ffffff,$h1		# h[3]
-	or	$d2,$h2			# h[4]
-
-	vmovd	%rax#d,$H0
-	vmovd	%rdx#d,$H1
-	vmovd	$h0#d,$H2
-	vmovd	$h1#d,$H3
-	vmovd	$h2#d,$H4
-	movl	\$1,20($ctx)		# set is_base2_26
-
-	call	__poly1305_init_avx
-
-.Lproceed_avx:
-	mov	%r15,$len
-
-	mov	0(%rsp),%r15
-	mov	8(%rsp),%r14
-	mov	16(%rsp),%r13
-	mov	24(%rsp),%r12
-	mov	32(%rsp),%rbp
-	mov	40(%rsp),%rbx
-	lea	48(%rsp),%rax
-	lea	48(%rsp),%rsp
-.Lbase2_64_avx_epilogue:
-	jmp	.Ldo_avx
-
-.align	32
-.Leven_avx:
-	vmovd		4*0($ctx),$H0		# load hash value
-	vmovd		4*1($ctx),$H1
-	vmovd		4*2($ctx),$H2
-	vmovd		4*3($ctx),$H3
-	vmovd		4*4($ctx),$H4
-
-.Ldo_avx:
-___
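The base 2^64 -> base 2^26 repacking performed above (at .Lstore_base2_26_avx and .Linit_avx) splits the three 64-bit words into five 26-bit limbs. A scalar C sketch, for illustration only:

#include <stdint.h>

static void base2_64_to_2_26(uint32_t d[5], const uint64_t h[3]) {
  const uint64_t M = 0x3ffffff;  /* 26-bit mask */
  d[0] = (uint32_t)(h[0] & M);
  d[1] = (uint32_t)((h[0] >> 26) & M);
  d[2] = (uint32_t)(((h[0] >> 52) | (h[1] << 12)) & M);
  d[3] = (uint32_t)((h[1] >> 14) & M);
  d[4] = (uint32_t)((h[1] >> 40) | (h[2] << 24));  /* includes bits 128+ */
}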
-$code.=<<___	if (!$win64);
-	lea		-0x58(%rsp),%r11
-	sub		\$0x178,%rsp
-___
-$code.=<<___	if ($win64);
-	lea		-0xf8(%rsp),%r11
-	sub		\$0x218,%rsp
-	vmovdqa		%xmm6,0x50(%r11)
-	vmovdqa		%xmm7,0x60(%r11)
-	vmovdqa		%xmm8,0x70(%r11)
-	vmovdqa		%xmm9,0x80(%r11)
-	vmovdqa		%xmm10,0x90(%r11)
-	vmovdqa		%xmm11,0xa0(%r11)
-	vmovdqa		%xmm12,0xb0(%r11)
-	vmovdqa		%xmm13,0xc0(%r11)
-	vmovdqa		%xmm14,0xd0(%r11)
-	vmovdqa		%xmm15,0xe0(%r11)
-.Ldo_avx_body:
-___
-$code.=<<___;
-	sub		\$64,$len
-	lea		-32($inp),%rax
-	cmovc		%rax,$inp
-
-	vmovdqu		`16*3`($ctx),$D4	# preload r0^2
-	lea		`16*3+64`($ctx),$ctx	# size optimization
-	lea		.Lconst(%rip),%rcx
-
-	################################################################
-	# load input
-	vmovdqu		16*2($inp),$T0
-	vmovdqu		16*3($inp),$T1
-	vmovdqa		64(%rcx),$MASK		# .Lmask26
-
-	vpsrldq		\$6,$T0,$T2		# splat input
-	vpsrldq		\$6,$T1,$T3
-	vpunpckhqdq	$T1,$T0,$T4		# 4
-	vpunpcklqdq	$T1,$T0,$T0		# 0:1
-	vpunpcklqdq	$T3,$T2,$T3		# 2:3
-
-	vpsrlq		\$40,$T4,$T4		# 4
-	vpsrlq		\$26,$T0,$T1
-	vpand		$MASK,$T0,$T0		# 0
-	vpsrlq		\$4,$T3,$T2
-	vpand		$MASK,$T1,$T1		# 1
-	vpsrlq		\$30,$T3,$T3
-	vpand		$MASK,$T2,$T2		# 2
-	vpand		$MASK,$T3,$T3		# 3
-	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
-
-	jbe		.Lskip_loop_avx
-
-	# expand and copy pre-calculated table to stack
-	vmovdqu		`16*1-64`($ctx),$D1
-	vmovdqu		`16*2-64`($ctx),$D2
-	vpshufd		\$0xEE,$D4,$D3		# 34xx -> 3434
-	vpshufd		\$0x44,$D4,$D0		# xx12 -> 1212
-	vmovdqa		$D3,-0x90(%r11)
-	vmovdqa		$D0,0x00(%rsp)
-	vpshufd		\$0xEE,$D1,$D4
-	vmovdqu		`16*3-64`($ctx),$D0
-	vpshufd		\$0x44,$D1,$D1
-	vmovdqa		$D4,-0x80(%r11)
-	vmovdqa		$D1,0x10(%rsp)
-	vpshufd		\$0xEE,$D2,$D3
-	vmovdqu		`16*4-64`($ctx),$D1
-	vpshufd		\$0x44,$D2,$D2
-	vmovdqa		$D3,-0x70(%r11)
-	vmovdqa		$D2,0x20(%rsp)
-	vpshufd		\$0xEE,$D0,$D4
-	vmovdqu		`16*5-64`($ctx),$D2
-	vpshufd		\$0x44,$D0,$D0
-	vmovdqa		$D4,-0x60(%r11)
-	vmovdqa		$D0,0x30(%rsp)
-	vpshufd		\$0xEE,$D1,$D3
-	vmovdqu		`16*6-64`($ctx),$D0
-	vpshufd		\$0x44,$D1,$D1
-	vmovdqa		$D3,-0x50(%r11)
-	vmovdqa		$D1,0x40(%rsp)
-	vpshufd		\$0xEE,$D2,$D4
-	vmovdqu		`16*7-64`($ctx),$D1
-	vpshufd		\$0x44,$D2,$D2
-	vmovdqa		$D4,-0x40(%r11)
-	vmovdqa		$D2,0x50(%rsp)
-	vpshufd		\$0xEE,$D0,$D3
-	vmovdqu		`16*8-64`($ctx),$D2
-	vpshufd		\$0x44,$D0,$D0
-	vmovdqa		$D3,-0x30(%r11)
-	vmovdqa		$D0,0x60(%rsp)
-	vpshufd		\$0xEE,$D1,$D4
-	vpshufd		\$0x44,$D1,$D1
-	vmovdqa		$D4,-0x20(%r11)
-	vmovdqa		$D1,0x70(%rsp)
-	vpshufd		\$0xEE,$D2,$D3
-	 vmovdqa	0x00(%rsp),$D4		# preload r0^2
-	vpshufd		\$0x44,$D2,$D2
-	vmovdqa		$D3,-0x10(%r11)
-	vmovdqa		$D2,0x80(%rsp)
-
-	jmp		.Loop_avx
-
-.align	32
-.Loop_avx:
-	################################################################
-	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
-	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
-	#   \___________________/
-	# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
-	# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
-	#   \___________________/ \____________________/
-	#
-	# Note that we start with inp[2:3]*r^2. This is because it
-	# doesn't depend on the reduction in the previous iteration.
-	################################################################
-	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
-	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
-	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
-	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
-	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-	#
-	# though note that $Tx and $Hx are "reversed" in this section,
-	# and $D4 is preloaded with r0^2...
-
-	vpmuludq	$T0,$D4,$D0		# d0 = h0*r0
-	vpmuludq	$T1,$D4,$D1		# d1 = h1*r0
-	  vmovdqa	$H2,0x20(%r11)				# offload hash
-	vpmuludq	$T2,$D4,$D2		# d3 = h2*r0
-	 vmovdqa	0x10(%rsp),$H2		# r1^2
-	vpmuludq	$T3,$D4,$D3		# d3 = h3*r0
-	vpmuludq	$T4,$D4,$D4		# d4 = h4*r0
-
-	  vmovdqa	$H0,0x00(%r11)				#
-	vpmuludq	0x20(%rsp),$T4,$H0	# h4*s1
-	  vmovdqa	$H1,0x10(%r11)				#
-	vpmuludq	$T3,$H2,$H1		# h3*r1
-	vpaddq		$H0,$D0,$D0		# d0 += h4*s1
-	vpaddq		$H1,$D4,$D4		# d4 += h3*r1
-	  vmovdqa	$H3,0x30(%r11)				#
-	vpmuludq	$T2,$H2,$H0		# h2*r1
-	vpmuludq	$T1,$H2,$H1		# h1*r1
-	vpaddq		$H0,$D3,$D3		# d3 += h2*r1
-	 vmovdqa	0x30(%rsp),$H3		# r2^2
-	vpaddq		$H1,$D2,$D2		# d2 += h1*r1
-	  vmovdqa	$H4,0x40(%r11)				#
-	vpmuludq	$T0,$H2,$H2		# h0*r1
-	 vpmuludq	$T2,$H3,$H0		# h2*r2
-	vpaddq		$H2,$D1,$D1		# d1 += h0*r1
-
-	 vmovdqa	0x40(%rsp),$H4		# s2^2
-	vpaddq		$H0,$D4,$D4		# d4 += h2*r2
-	vpmuludq	$T1,$H3,$H1		# h1*r2
-	vpmuludq	$T0,$H3,$H3		# h0*r2
-	vpaddq		$H1,$D3,$D3		# d3 += h1*r2
-	 vmovdqa	0x50(%rsp),$H2		# r3^2
-	vpaddq		$H3,$D2,$D2		# d2 += h0*r2
-	vpmuludq	$T4,$H4,$H0		# h4*s2
-	vpmuludq	$T3,$H4,$H4		# h3*s2
-	vpaddq		$H0,$D1,$D1		# d1 += h4*s2
-	 vmovdqa	0x60(%rsp),$H3		# s3^2
-	vpaddq		$H4,$D0,$D0		# d0 += h3*s2
-
-	 vmovdqa	0x80(%rsp),$H4		# s4^2
-	vpmuludq	$T1,$H2,$H1		# h1*r3
-	vpmuludq	$T0,$H2,$H2		# h0*r3
-	vpaddq		$H1,$D4,$D4		# d4 += h1*r3
-	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
-	vpmuludq	$T4,$H3,$H0		# h4*s3
-	vpmuludq	$T3,$H3,$H1		# h3*s3
-	vpaddq		$H0,$D2,$D2		# d2 += h4*s3
-	 vmovdqu	16*0($inp),$H0				# load input
-	vpaddq		$H1,$D1,$D1		# d1 += h3*s3
-	vpmuludq	$T2,$H3,$H3		# h2*s3
-	 vpmuludq	$T2,$H4,$T2		# h2*s4
-	vpaddq		$H3,$D0,$D0		# d0 += h2*s3
-
-	 vmovdqu	16*1($inp),$H1				#
-	vpaddq		$T2,$D1,$D1		# d1 += h2*s4
-	vpmuludq	$T3,$H4,$T3		# h3*s4
-	vpmuludq	$T4,$H4,$T4		# h4*s4
-	 vpsrldq	\$6,$H0,$H2				# splat input
-	vpaddq		$T3,$D2,$D2		# d2 += h3*s4
-	vpaddq		$T4,$D3,$D3		# d3 += h4*s4
-	 vpsrldq	\$6,$H1,$H3				#
-	vpmuludq	0x70(%rsp),$T0,$T4	# h0*r4
-	vpmuludq	$T1,$H4,$T0		# h1*s4
-	 vpunpckhqdq	$H1,$H0,$H4		# 4
-	vpaddq		$T4,$D4,$D4		# d4 += h0*r4
-	 vmovdqa	-0x90(%r11),$T4		# r0^4
-	vpaddq		$T0,$D0,$D0		# d0 += h1*s4
-
-	vpunpcklqdq	$H1,$H0,$H0		# 0:1
-	vpunpcklqdq	$H3,$H2,$H3		# 2:3
-
-	#vpsrlq		\$40,$H4,$H4		# 4
-	vpsrldq		\$`40/8`,$H4,$H4	# 4
-	vpsrlq		\$26,$H0,$H1
-	vpand		$MASK,$H0,$H0		# 0
-	vpsrlq		\$4,$H3,$H2
-	vpand		$MASK,$H1,$H1		# 1
-	vpand		0(%rcx),$H4,$H4		# .Lmask24
-	vpsrlq		\$30,$H3,$H3
-	vpand		$MASK,$H2,$H2		# 2
-	vpand		$MASK,$H3,$H3		# 3
-	vpor		32(%rcx),$H4,$H4	# padbit, yes, always
-
-	vpaddq		0x00(%r11),$H0,$H0	# add hash value
-	vpaddq		0x10(%r11),$H1,$H1
-	vpaddq		0x20(%r11),$H2,$H2
-	vpaddq		0x30(%r11),$H3,$H3
-	vpaddq		0x40(%r11),$H4,$H4
-
-	lea		16*2($inp),%rax
-	lea		16*4($inp),$inp
-	sub		\$64,$len
-	cmovc		%rax,$inp
-
-	################################################################
-	# Now we accumulate (inp[0:1]+hash)*r^4
-	################################################################
-	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
-	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
-	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
-	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
-	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
-	vpmuludq	$H0,$T4,$T0		# h0*r0
-	vpmuludq	$H1,$T4,$T1		# h1*r0
-	vpaddq		$T0,$D0,$D0
-	vpaddq		$T1,$D1,$D1
-	 vmovdqa	-0x80(%r11),$T2		# r1^4
-	vpmuludq	$H2,$T4,$T0		# h2*r0
-	vpmuludq	$H3,$T4,$T1		# h3*r0
-	vpaddq		$T0,$D2,$D2
-	vpaddq		$T1,$D3,$D3
-	vpmuludq	$H4,$T4,$T4		# h4*r0
-	 vpmuludq	-0x70(%r11),$H4,$T0	# h4*s1
-	vpaddq		$T4,$D4,$D4
-
-	vpaddq		$T0,$D0,$D0		# d0 += h4*s1
-	vpmuludq	$H2,$T2,$T1		# h2*r1
-	vpmuludq	$H3,$T2,$T0		# h3*r1
-	vpaddq		$T1,$D3,$D3		# d3 += h2*r1
-	 vmovdqa	-0x60(%r11),$T3		# r2^4
-	vpaddq		$T0,$D4,$D4		# d4 += h3*r1
-	vpmuludq	$H1,$T2,$T1		# h1*r1
-	vpmuludq	$H0,$T2,$T2		# h0*r1
-	vpaddq		$T1,$D2,$D2		# d2 += h1*r1
-	vpaddq		$T2,$D1,$D1		# d1 += h0*r1
-
-	 vmovdqa	-0x50(%r11),$T4		# s2^4
-	vpmuludq	$H2,$T3,$T0		# h2*r2
-	vpmuludq	$H1,$T3,$T1		# h1*r2
-	vpaddq		$T0,$D4,$D4		# d4 += h2*r2
-	vpaddq		$T1,$D3,$D3		# d3 += h1*r2
-	 vmovdqa	-0x40(%r11),$T2		# r3^4
-	vpmuludq	$H0,$T3,$T3		# h0*r2
-	vpmuludq	$H4,$T4,$T0		# h4*s2
-	vpaddq		$T3,$D2,$D2		# d2 += h0*r2
-	vpaddq		$T0,$D1,$D1		# d1 += h4*s2
-	 vmovdqa	-0x30(%r11),$T3		# s3^4
-	vpmuludq	$H3,$T4,$T4		# h3*s2
-	 vpmuludq	$H1,$T2,$T1		# h1*r3
-	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
-
-	 vmovdqa	-0x10(%r11),$T4		# s4^4
-	vpaddq		$T1,$D4,$D4		# d4 += h1*r3
-	vpmuludq	$H0,$T2,$T2		# h0*r3
-	vpmuludq	$H4,$T3,$T0		# h4*s3
-	vpaddq		$T2,$D3,$D3		# d3 += h0*r3
-	vpaddq		$T0,$D2,$D2		# d2 += h4*s3
-	 vmovdqu	16*2($inp),$T0				# load input
-	vpmuludq	$H3,$T3,$T2		# h3*s3
-	vpmuludq	$H2,$T3,$T3		# h2*s3
-	vpaddq		$T2,$D1,$D1		# d1 += h3*s3
-	 vmovdqu	16*3($inp),$T1				#
-	vpaddq		$T3,$D0,$D0		# d0 += h2*s3
-
-	vpmuludq	$H2,$T4,$H2		# h2*s4
-	vpmuludq	$H3,$T4,$H3		# h3*s4
-	 vpsrldq	\$6,$T0,$T2				# splat input
-	vpaddq		$H2,$D1,$D1		# d1 += h2*s4
-	vpmuludq	$H4,$T4,$H4		# h4*s4
-	 vpsrldq	\$6,$T1,$T3				#
-	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*s4
-	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*s4
-	vpmuludq	-0x20(%r11),$H0,$H4	# h0*r4
-	vpmuludq	$H1,$T4,$H0
-	 vpunpckhqdq	$T1,$T0,$T4		# 4
-	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
-	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
-
-	vpunpcklqdq	$T1,$T0,$T0		# 0:1
-	vpunpcklqdq	$T3,$T2,$T3		# 2:3
-
-	#vpsrlq		\$40,$T4,$T4		# 4
-	vpsrldq		\$`40/8`,$T4,$T4	# 4
-	vpsrlq		\$26,$T0,$T1
-	 vmovdqa	0x00(%rsp),$D4		# preload r0^2
-	vpand		$MASK,$T0,$T0		# 0
-	vpsrlq		\$4,$T3,$T2
-	vpand		$MASK,$T1,$T1		# 1
-	vpand		0(%rcx),$T4,$T4		# .Lmask24
-	vpsrlq		\$30,$T3,$T3
-	vpand		$MASK,$T2,$T2		# 2
-	vpand		$MASK,$T3,$T3		# 3
-	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
-
-	################################################################
-	# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
-	# and P. Schwabe
-
-	vpsrlq		\$26,$H3,$D3
-	vpand		$MASK,$H3,$H3
-	vpaddq		$D3,$H4,$H4		# h3 -> h4
-
-	vpsrlq		\$26,$H0,$D0
-	vpand		$MASK,$H0,$H0
-	vpaddq		$D0,$D1,$H1		# h0 -> h1
-
-	vpsrlq		\$26,$H4,$D0
-	vpand		$MASK,$H4,$H4
-
-	vpsrlq		\$26,$H1,$D1
-	vpand		$MASK,$H1,$H1
-	vpaddq		$D1,$H2,$H2		# h1 -> h2
-
-	vpaddq		$D0,$H0,$H0
-	vpsllq		\$2,$D0,$D0
-	vpaddq		$D0,$H0,$H0		# h4 -> h0
-
-	vpsrlq		\$26,$H2,$D2
-	vpand		$MASK,$H2,$H2
-	vpaddq		$D2,$H3,$H3		# h2 -> h3
-
-	vpsrlq		\$26,$H0,$D0
-	vpand		$MASK,$H0,$H0
-	vpaddq		$D0,$H1,$H1		# h0 -> h1
-
-	vpsrlq		\$26,$H3,$D3
-	vpand		$MASK,$H3,$H3
-	vpaddq		$D3,$H4,$H4		# h3 -> h4
-
-	ja		.Loop_avx
-
-.Lskip_loop_avx:
-	################################################################
-	# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
-
-	vpshufd		\$0x10,$D4,$D4		# r0^n, xx12 -> x1x2
-	add		\$32,$len
-	jnz		.Long_tail_avx
-
-	vpaddq		$H2,$T2,$T2
-	vpaddq		$H0,$T0,$T0
-	vpaddq		$H1,$T1,$T1
-	vpaddq		$H3,$T3,$T3
-	vpaddq		$H4,$T4,$T4
-
-.Long_tail_avx:
-	vmovdqa		$H2,0x20(%r11)
-	vmovdqa		$H0,0x00(%r11)
-	vmovdqa		$H1,0x10(%r11)
-	vmovdqa		$H3,0x30(%r11)
-	vmovdqa		$H4,0x40(%r11)
-
-	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
-	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
-	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
-	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
-	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-
-	vpmuludq	$T2,$D4,$D2		# d2 = h2*r0
-	vpmuludq	$T0,$D4,$D0		# d0 = h0*r0
-	 vpshufd	\$0x10,`16*1-64`($ctx),$H2		# r1^n
-	vpmuludq	$T1,$D4,$D1		# d1 = h1*r0
-	vpmuludq	$T3,$D4,$D3		# d3 = h3*r0
-	vpmuludq	$T4,$D4,$D4		# d4 = h4*r0
-
-	vpmuludq	$T3,$H2,$H0		# h3*r1
-	vpaddq		$H0,$D4,$D4		# d4 += h3*r1
-	 vpshufd	\$0x10,`16*2-64`($ctx),$H3		# s1^n
-	vpmuludq	$T2,$H2,$H1		# h2*r1
-	vpaddq		$H1,$D3,$D3		# d3 += h2*r1
-	 vpshufd	\$0x10,`16*3-64`($ctx),$H4		# r2^n
-	vpmuludq	$T1,$H2,$H0		# h1*r1
-	vpaddq		$H0,$D2,$D2		# d2 += h1*r1
-	vpmuludq	$T0,$H2,$H2		# h0*r1
-	vpaddq		$H2,$D1,$D1		# d1 += h0*r1
-	vpmuludq	$T4,$H3,$H3		# h4*s1
-	vpaddq		$H3,$D0,$D0		# d0 += h4*s1
-
-	 vpshufd	\$0x10,`16*4-64`($ctx),$H2		# s2^n
-	vpmuludq	$T2,$H4,$H1		# h2*r2
-	vpaddq		$H1,$D4,$D4		# d4 += h2*r2
-	vpmuludq	$T1,$H4,$H0		# h1*r2
-	vpaddq		$H0,$D3,$D3		# d3 += h1*r2
-	 vpshufd	\$0x10,`16*5-64`($ctx),$H3		# r3^n
-	vpmuludq	$T0,$H4,$H4		# h0*r2
-	vpaddq		$H4,$D2,$D2		# d2 += h0*r2
-	vpmuludq	$T4,$H2,$H1		# h4*s2
-	vpaddq		$H1,$D1,$D1		# d1 += h4*s2
-	 vpshufd	\$0x10,`16*6-64`($ctx),$H4		# s3^n
-	vpmuludq	$T3,$H2,$H2		# h3*s2
-	vpaddq		$H2,$D0,$D0		# d0 += h3*s2
-
-	vpmuludq	$T1,$H3,$H0		# h1*r3
-	vpaddq		$H0,$D4,$D4		# d4 += h1*r3
-	vpmuludq	$T0,$H3,$H3		# h0*r3
-	vpaddq		$H3,$D3,$D3		# d3 += h0*r3
-	 vpshufd	\$0x10,`16*7-64`($ctx),$H2		# r4^n
-	vpmuludq	$T4,$H4,$H1		# h4*s3
-	vpaddq		$H1,$D2,$D2		# d2 += h4*s3
-	 vpshufd	\$0x10,`16*8-64`($ctx),$H3		# s4^n
-	vpmuludq	$T3,$H4,$H0		# h3*s3
-	vpaddq		$H0,$D1,$D1		# d1 += h3*s3
-	vpmuludq	$T2,$H4,$H4		# h2*s3
-	vpaddq		$H4,$D0,$D0		# d0 += h2*s3
-
-	vpmuludq	$T0,$H2,$H2		# h0*r4
-	vpaddq		$H2,$D4,$D4		# h4 = d4 + h0*r4
-	vpmuludq	$T4,$H3,$H1		# h4*s4
-	vpaddq		$H1,$D3,$D3		# h3 = d3 + h4*s4
-	vpmuludq	$T3,$H3,$H0		# h3*s4
-	vpaddq		$H0,$D2,$D2		# h2 = d2 + h3*s4
-	vpmuludq	$T2,$H3,$H1		# h2*s4
-	vpaddq		$H1,$D1,$D1		# h1 = d1 + h2*s4
-	vpmuludq	$T1,$H3,$H3		# h1*s4
-	vpaddq		$H3,$D0,$D0		# h0 = d0 + h1*s4
-
-	jz		.Lshort_tail_avx
-
-	vmovdqu		16*0($inp),$H0		# load input
-	vmovdqu		16*1($inp),$H1
-
-	vpsrldq		\$6,$H0,$H2		# splat input
-	vpsrldq		\$6,$H1,$H3
-	vpunpckhqdq	$H1,$H0,$H4		# 4
-	vpunpcklqdq	$H1,$H0,$H0		# 0:1
-	vpunpcklqdq	$H3,$H2,$H3		# 2:3
-
-	vpsrlq		\$40,$H4,$H4		# 4
-	vpsrlq		\$26,$H0,$H1
-	vpand		$MASK,$H0,$H0		# 0
-	vpsrlq		\$4,$H3,$H2
-	vpand		$MASK,$H1,$H1		# 1
-	vpsrlq		\$30,$H3,$H3
-	vpand		$MASK,$H2,$H2		# 2
-	vpand		$MASK,$H3,$H3		# 3
-	vpor		32(%rcx),$H4,$H4	# padbit, yes, always
-
-	vpshufd		\$0x32,`16*0-64`($ctx),$T4	# r0^n, 34xx -> x3x4
-	vpaddq		0x00(%r11),$H0,$H0
-	vpaddq		0x10(%r11),$H1,$H1
-	vpaddq		0x20(%r11),$H2,$H2
-	vpaddq		0x30(%r11),$H3,$H3
-	vpaddq		0x40(%r11),$H4,$H4
-
-	################################################################
-	# multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
-
-	vpmuludq	$H0,$T4,$T0		# h0*r0
-	vpaddq		$T0,$D0,$D0		# d0 += h0*r0
-	vpmuludq	$H1,$T4,$T1		# h1*r0
-	vpaddq		$T1,$D1,$D1		# d1 += h1*r0
-	vpmuludq	$H2,$T4,$T0		# h2*r0
-	vpaddq		$T0,$D2,$D2		# d2 += h2*r0
-	 vpshufd	\$0x32,`16*1-64`($ctx),$T2		# r1^n
-	vpmuludq	$H3,$T4,$T1		# h3*r0
-	vpaddq		$T1,$D3,$D3		# d3 += h3*r0
-	vpmuludq	$H4,$T4,$T4		# h4*r0
-	vpaddq		$T4,$D4,$D4		# d4 += h4*r0
-
-	vpmuludq	$H3,$T2,$T0		# h3*r1
-	vpaddq		$T0,$D4,$D4		# d4 += h3*r1
-	 vpshufd	\$0x32,`16*2-64`($ctx),$T3		# s1
-	vpmuludq	$H2,$T2,$T1		# h2*r1
-	vpaddq		$T1,$D3,$D3		# d3 += h2*r1
-	 vpshufd	\$0x32,`16*3-64`($ctx),$T4		# r2
-	vpmuludq	$H1,$T2,$T0		# h1*r1
-	vpaddq		$T0,$D2,$D2		# d2 += h1*r1
-	vpmuludq	$H0,$T2,$T2		# h0*r1
-	vpaddq		$T2,$D1,$D1		# d1 += h0*r1
-	vpmuludq	$H4,$T3,$T3		# h4*s1
-	vpaddq		$T3,$D0,$D0		# d0 += h4*s1
-
-	 vpshufd	\$0x32,`16*4-64`($ctx),$T2		# s2
-	vpmuludq	$H2,$T4,$T1		# h2*r2
-	vpaddq		$T1,$D4,$D4		# d4 += h2*r2
-	vpmuludq	$H1,$T4,$T0		# h1*r2
-	vpaddq		$T0,$D3,$D3		# d3 += h1*r2
-	 vpshufd	\$0x32,`16*5-64`($ctx),$T3		# r3
-	vpmuludq	$H0,$T4,$T4		# h0*r2
-	vpaddq		$T4,$D2,$D2		# d2 += h0*r2
-	vpmuludq	$H4,$T2,$T1		# h4*s2
-	vpaddq		$T1,$D1,$D1		# d1 += h4*s2
-	 vpshufd	\$0x32,`16*6-64`($ctx),$T4		# s3
-	vpmuludq	$H3,$T2,$T2		# h3*s2
-	vpaddq		$T2,$D0,$D0		# d0 += h3*s2
-
-	vpmuludq	$H1,$T3,$T0		# h1*r3
-	vpaddq		$T0,$D4,$D4		# d4 += h1*r3
-	vpmuludq	$H0,$T3,$T3		# h0*r3
-	vpaddq		$T3,$D3,$D3		# d3 += h0*r3
-	 vpshufd	\$0x32,`16*7-64`($ctx),$T2		# r4
-	vpmuludq	$H4,$T4,$T1		# h4*s3
-	vpaddq		$T1,$D2,$D2		# d2 += h4*s3
-	 vpshufd	\$0x32,`16*8-64`($ctx),$T3		# s4
-	vpmuludq	$H3,$T4,$T0		# h3*s3
-	vpaddq		$T0,$D1,$D1		# d1 += h3*s3
-	vpmuludq	$H2,$T4,$T4		# h2*s3
-	vpaddq		$T4,$D0,$D0		# d0 += h2*s3
-
-	vpmuludq	$H0,$T2,$T2		# h0*r4
-	vpaddq		$T2,$D4,$D4		# d4 += h0*r4
-	vpmuludq	$H4,$T3,$T1		# h4*s4
-	vpaddq		$T1,$D3,$D3		# d3 += h4*s4
-	vpmuludq	$H3,$T3,$T0		# h3*s4
-	vpaddq		$T0,$D2,$D2		# d2 += h3*s4
-	vpmuludq	$H2,$T3,$T1		# h2*s4
-	vpaddq		$T1,$D1,$D1		# d1 += h2*s4
-	vpmuludq	$H1,$T3,$T3		# h1*s4
-	vpaddq		$T3,$D0,$D0		# d0 += h1*s4
-
-.Lshort_tail_avx:
-	################################################################
-	# horizontal addition
-
-	vpsrldq		\$8,$D4,$T4
-	vpsrldq		\$8,$D3,$T3
-	vpsrldq		\$8,$D1,$T1
-	vpsrldq		\$8,$D0,$T0
-	vpsrldq		\$8,$D2,$T2
-	vpaddq		$T3,$D3,$D3
-	vpaddq		$T4,$D4,$D4
-	vpaddq		$T0,$D0,$D0
-	vpaddq		$T1,$D1,$D1
-	vpaddq		$T2,$D2,$D2
-
-	################################################################
-	# lazy reduction
-
-	vpsrlq		\$26,$D3,$H3
-	vpand		$MASK,$D3,$D3
-	vpaddq		$H3,$D4,$D4		# h3 -> h4
-
-	vpsrlq		\$26,$D0,$H0
-	vpand		$MASK,$D0,$D0
-	vpaddq		$H0,$D1,$D1		# h0 -> h1
-
-	vpsrlq		\$26,$D4,$H4
-	vpand		$MASK,$D4,$D4
-
-	vpsrlq		\$26,$D1,$H1
-	vpand		$MASK,$D1,$D1
-	vpaddq		$H1,$D2,$D2		# h1 -> h2
-
-	vpaddq		$H4,$D0,$D0
-	vpsllq		\$2,$H4,$H4
-	vpaddq		$H4,$D0,$D0		# h4 -> h0
-
-	vpsrlq		\$26,$D2,$H2
-	vpand		$MASK,$D2,$D2
-	vpaddq		$H2,$D3,$D3		# h2 -> h3
-
-	vpsrlq		\$26,$D0,$H0
-	vpand		$MASK,$D0,$D0
-	vpaddq		$H0,$D1,$D1		# h0 -> h1
-
-	vpsrlq		\$26,$D3,$H3
-	vpand		$MASK,$D3,$D3
-	vpaddq		$H3,$D4,$D4		# h3 -> h4
-
-	vmovd		$D0,`4*0-48-64`($ctx)	# save partially reduced
-	vmovd		$D1,`4*1-48-64`($ctx)
-	vmovd		$D2,`4*2-48-64`($ctx)
-	vmovd		$D3,`4*3-48-64`($ctx)
-	vmovd		$D4,`4*4-48-64`($ctx)
-___
-$code.=<<___	if ($win64);
-	vmovdqa		0x50(%r11),%xmm6
-	vmovdqa		0x60(%r11),%xmm7
-	vmovdqa		0x70(%r11),%xmm8
-	vmovdqa		0x80(%r11),%xmm9
-	vmovdqa		0x90(%r11),%xmm10
-	vmovdqa		0xa0(%r11),%xmm11
-	vmovdqa		0xb0(%r11),%xmm12
-	vmovdqa		0xc0(%r11),%xmm13
-	vmovdqa		0xd0(%r11),%xmm14
-	vmovdqa		0xe0(%r11),%xmm15
-	lea		0xf8(%r11),%rsp
-.Ldo_avx_epilogue:
-___
-$code.=<<___	if (!$win64);
-	lea		0x58(%r11),%rsp
-___
-$code.=<<___;
-	vzeroupper
-	ret
-.size	poly1305_blocks_avx,.-poly1305_blocks_avx
-
-.type	poly1305_emit_avx,\@function,3
-.align	32
-poly1305_emit_avx:
-	cmpl	\$0,20($ctx)	# is_base2_26?
-	je	.Lemit
-
-	mov	0($ctx),%eax	# load hash value base 2^26
-	mov	4($ctx),%ecx
-	mov	8($ctx),%r8d
-	mov	12($ctx),%r11d
-	mov	16($ctx),%r10d
-
-	shl	\$26,%rcx	# base 2^26 -> base 2^64
-	mov	%r8,%r9
-	shl	\$52,%r8
-	add	%rcx,%rax
-	shr	\$12,%r9
-	add	%rax,%r8	# h0
-	adc	\$0,%r9
-
-	shl	\$14,%r11
-	mov	%r10,%rax
-	shr	\$24,%r10
-	add	%r11,%r9
-	shl	\$40,%rax
-	add	%rax,%r9	# h1
-	adc	\$0,%r10	# h2
-
-	mov	%r10,%rax	# could be partially reduced, so reduce
-	mov	%r10,%rcx
-	and	\$3,%r10
-	shr	\$2,%rax
-	and	\$-4,%rcx
-	add	%rcx,%rax
-	add	%rax,%r8
-	adc	\$0,%r9
-
-	mov	%r8,%rax
-	add	\$5,%r8		# compare to modulus
-	mov	%r9,%rcx
-	adc	\$0,%r9
-	adc	\$0,%r10
-	shr	\$2,%r10	# did 130-bit value overfow?
-	cmovnz	%r8,%rax
-	cmovnz	%r9,%rcx
-
-	add	0($nonce),%rax	# accumulate nonce
-	adc	8($nonce),%rcx
-	mov	%rax,0($mac)	# write result
-	mov	%rcx,8($mac)
-
-	ret
-.size	poly1305_emit_avx,.-poly1305_emit_avx
-___
-
-if ($avx>1) {
-my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
-    map("%ymm$_",(0..15));
-my $S4=$MASK;
-
-$code.=<<___;
-.type	poly1305_blocks_avx2,\@function,4
-.align	32
-poly1305_blocks_avx2:
-	mov	20($ctx),%r8d		# is_base2_26
-	cmp	\$128,$len
-	jae	.Lblocks_avx2
-	test	%r8d,%r8d
-	jz	.Lblocks
-
-.Lblocks_avx2:
-	and	\$-16,$len
-	jz	.Lno_data_avx2
-
-	vzeroupper
-
-	test	%r8d,%r8d
-	jz	.Lbase2_64_avx2
-
-	test	\$63,$len
-	jz	.Leven_avx2
-
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-.Lblocks_avx2_body:
-
-	mov	$len,%r15		# reassign $len
-
-	mov	0($ctx),$d1		# load hash value
-	mov	8($ctx),$d2
-	mov	16($ctx),$h2#d
-
-	mov	24($ctx),$r0		# load r
-	mov	32($ctx),$s1
-
-	################################# base 2^26 -> base 2^64
-	mov	$d1#d,$h0#d
-	and	\$-1<<31,$d1
-	mov	$d2,$r1			# borrow $r1
-	mov	$d2#d,$h1#d
-	and	\$-1<<31,$d2
-
-	shr	\$6,$d1
-	shl	\$52,$r1
-	add	$d1,$h0
-	shr	\$12,$h1
-	shr	\$18,$d2
-	add	$r1,$h0
-	adc	$d2,$h1
-
-	mov	$h2,$d1
-	shl	\$40,$d1
-	shr	\$24,$h2
-	add	$d1,$h1
-	adc	\$0,$h2			# can be partially reduced...
-
-	mov	\$-4,$d2		# ... so reduce
-	mov	$h2,$d1
-	and	$h2,$d2
-	shr	\$2,$d1
-	and	\$3,$h2
-	add	$d2,$d1			# =*5
-	add	$d1,$h0
-	adc	\$0,$h1
-
-	mov	$s1,$r1
-	mov	$s1,%rax
-	shr	\$2,$s1
-	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
-
-.Lbase2_26_pre_avx2:
-	add	0($inp),$h0		# accumulate input
-	adc	8($inp),$h1
-	lea	16($inp),$inp
-	adc	$padbit,$h2
-	sub	\$16,%r15
-
-	call	__poly1305_block
-	mov	$r1,%rax
-
-	test	\$63,%r15
-	jnz	.Lbase2_26_pre_avx2
-
-	test	$padbit,$padbit		# if $padbit is zero,
-	jz	.Lstore_base2_64_avx2	# store hash in base 2^64 format
-
-	################################# base 2^64 -> base 2^26
-	mov	$h0,%rax
-	mov	$h0,%rdx
-	shr	\$52,$h0
-	mov	$h1,$r0
-	mov	$h1,$r1
-	shr	\$26,%rdx
-	and	\$0x3ffffff,%rax	# h[0]
-	shl	\$12,$r0
-	and	\$0x3ffffff,%rdx	# h[1]
-	shr	\$14,$h1
-	or	$r0,$h0
-	shl	\$24,$h2
-	and	\$0x3ffffff,$h0		# h[2]
-	shr	\$40,$r1
-	and	\$0x3ffffff,$h1		# h[3]
-	or	$r1,$h2			# h[4]
-
-	test	%r15,%r15
-	jz	.Lstore_base2_26_avx2
-
-	vmovd	%rax#d,%x#$H0
-	vmovd	%rdx#d,%x#$H1
-	vmovd	$h0#d,%x#$H2
-	vmovd	$h1#d,%x#$H3
-	vmovd	$h2#d,%x#$H4
-	jmp	.Lproceed_avx2
-
-.align	32
-.Lstore_base2_64_avx2:
-	mov	$h0,0($ctx)
-	mov	$h1,8($ctx)
-	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
-	jmp	.Ldone_avx2
-
-.align	16
-.Lstore_base2_26_avx2:
-	mov	%rax#d,0($ctx)		# store hash value base 2^26
-	mov	%rdx#d,4($ctx)
-	mov	$h0#d,8($ctx)
-	mov	$h1#d,12($ctx)
-	mov	$h2#d,16($ctx)
-.align	16
-.Ldone_avx2:
-	mov	0(%rsp),%r15
-	mov	8(%rsp),%r14
-	mov	16(%rsp),%r13
-	mov	24(%rsp),%r12
-	mov	32(%rsp),%rbp
-	mov	40(%rsp),%rbx
-	lea	48(%rsp),%rsp
-.Lno_data_avx2:
-.Lblocks_avx2_epilogue:
-	ret
-
-.align	32
-.Lbase2_64_avx2:
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-.Lbase2_64_avx2_body:
-
-	mov	$len,%r15		# reassign $len
-
-	mov	24($ctx),$r0		# load r
-	mov	32($ctx),$s1
-
-	mov	0($ctx),$h0		# load hash value
-	mov	8($ctx),$h1
-	mov	16($ctx),$h2#d
-
-	mov	$s1,$r1
-	mov	$s1,%rax
-	shr	\$2,$s1
-	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
-
-	test	\$63,$len
-	jz	.Linit_avx2
-
-.Lbase2_64_pre_avx2:
-	add	0($inp),$h0		# accumulate input
-	adc	8($inp),$h1
-	lea	16($inp),$inp
-	adc	$padbit,$h2
-	sub	\$16,%r15
-
-	call	__poly1305_block
-	mov	$r1,%rax
-
-	test	\$63,%r15
-	jnz	.Lbase2_64_pre_avx2
-
-.Linit_avx2:
-	################################# base 2^64 -> base 2^26
-	mov	$h0,%rax
-	mov	$h0,%rdx
-	shr	\$52,$h0
-	mov	$h1,$d1
-	mov	$h1,$d2
-	shr	\$26,%rdx
-	and	\$0x3ffffff,%rax	# h[0]
-	shl	\$12,$d1
-	and	\$0x3ffffff,%rdx	# h[1]
-	shr	\$14,$h1
-	or	$d1,$h0
-	shl	\$24,$h2
-	and	\$0x3ffffff,$h0		# h[2]
-	shr	\$40,$d2
-	and	\$0x3ffffff,$h1		# h[3]
-	or	$d2,$h2			# h[4]
-
-	vmovd	%rax#d,%x#$H0
-	vmovd	%rdx#d,%x#$H1
-	vmovd	$h0#d,%x#$H2
-	vmovd	$h1#d,%x#$H3
-	vmovd	$h2#d,%x#$H4
-	movl	\$1,20($ctx)		# set is_base2_26
-
-	call	__poly1305_init_avx
-
-.Lproceed_avx2:
-	mov	%r15,$len
-
-	mov	0(%rsp),%r15
-	mov	8(%rsp),%r14
-	mov	16(%rsp),%r13
-	mov	24(%rsp),%r12
-	mov	32(%rsp),%rbp
-	mov	40(%rsp),%rbx
-	lea	48(%rsp),%rax
-	lea	48(%rsp),%rsp
-.Lbase2_64_avx2_epilogue:
-	jmp	.Ldo_avx2
-
-.align	32
-.Leven_avx2:
-	vmovd		4*0($ctx),%x#$H0	# load hash value base 2^26
-	vmovd		4*1($ctx),%x#$H1
-	vmovd		4*2($ctx),%x#$H2
-	vmovd		4*3($ctx),%x#$H3
-	vmovd		4*4($ctx),%x#$H4
-
-.Ldo_avx2:
-___
-$code.=<<___	if (!$win64);
-	lea		-8(%rsp),%r11
-	sub		\$0x128,%rsp
-___
-$code.=<<___	if ($win64);
-	lea		-0xf8(%rsp),%r11
-	sub		\$0x1c8,%rsp
-	vmovdqa		%xmm6,0x50(%r11)
-	vmovdqa		%xmm7,0x60(%r11)
-	vmovdqa		%xmm8,0x70(%r11)
-	vmovdqa		%xmm9,0x80(%r11)
-	vmovdqa		%xmm10,0x90(%r11)
-	vmovdqa		%xmm11,0xa0(%r11)
-	vmovdqa		%xmm12,0xb0(%r11)
-	vmovdqa		%xmm13,0xc0(%r11)
-	vmovdqa		%xmm14,0xd0(%r11)
-	vmovdqa		%xmm15,0xe0(%r11)
-.Ldo_avx2_body:
-___
-$code.=<<___;
-	lea		48+64($ctx),$ctx	# size optimization
-	lea		.Lconst(%rip),%rcx
-
-	# expand and copy pre-calculated table to stack
-	vmovdqu		`16*0-64`($ctx),%x#$T2
-	and		\$-512,%rsp
-	vmovdqu		`16*1-64`($ctx),%x#$T3
-	vmovdqu		`16*2-64`($ctx),%x#$T4
-	vmovdqu		`16*3-64`($ctx),%x#$D0
-	vmovdqu		`16*4-64`($ctx),%x#$D1
-	vmovdqu		`16*5-64`($ctx),%x#$D2
-	vmovdqu		`16*6-64`($ctx),%x#$D3
-	vpermq		\$0x15,$T2,$T2		# 00003412 -> 12343434
-	vmovdqu		`16*7-64`($ctx),%x#$D4
-	vpermq		\$0x15,$T3,$T3
-	vpshufd		\$0xc8,$T2,$T2		# 12343434 -> 14243444
-	vmovdqu		`16*8-64`($ctx),%x#$MASK
-	vpermq		\$0x15,$T4,$T4
-	vpshufd		\$0xc8,$T3,$T3
-	vmovdqa		$T2,0x00(%rsp)
-	vpermq		\$0x15,$D0,$D0
-	vpshufd		\$0xc8,$T4,$T4
-	vmovdqa		$T3,0x20(%rsp)
-	vpermq		\$0x15,$D1,$D1
-	vpshufd		\$0xc8,$D0,$D0
-	vmovdqa		$T4,0x40(%rsp)
-	vpermq		\$0x15,$D2,$D2
-	vpshufd		\$0xc8,$D1,$D1
-	vmovdqa		$D0,0x60(%rsp)
-	vpermq		\$0x15,$D3,$D3
-	vpshufd		\$0xc8,$D2,$D2
-	vmovdqa		$D1,0x80(%rsp)
-	vpermq		\$0x15,$D4,$D4
-	vpshufd		\$0xc8,$D3,$D3
-	vmovdqa		$D2,0xa0(%rsp)
-	vpermq		\$0x15,$MASK,$MASK
-	vpshufd		\$0xc8,$D4,$D4
-	vmovdqa		$D3,0xc0(%rsp)
-	vpshufd		\$0xc8,$MASK,$MASK
-	vmovdqa		$D4,0xe0(%rsp)
-	vmovdqa		$MASK,0x100(%rsp)
-	vmovdqa		64(%rcx),$MASK		# .Lmask26
-
-	################################################################
-	# load input
-	vmovdqu		16*0($inp),%x#$T0
-	vmovdqu		16*1($inp),%x#$T1
-	vinserti128	\$1,16*2($inp),$T0,$T0
-	vinserti128	\$1,16*3($inp),$T1,$T1
-	lea		16*4($inp),$inp
-
-	vpsrldq		\$6,$T0,$T2		# splat input
-	vpsrldq		\$6,$T1,$T3
-	vpunpckhqdq	$T1,$T0,$T4		# 4
-	vpunpcklqdq	$T3,$T2,$T2		# 2:3
-	vpunpcklqdq	$T1,$T0,$T0		# 0:1
-
-	vpsrlq		\$30,$T2,$T3
-	vpsrlq		\$4,$T2,$T2
-	vpsrlq		\$26,$T0,$T1
-	vpsrlq		\$40,$T4,$T4		# 4
-	vpand		$MASK,$T2,$T2		# 2
-	vpand		$MASK,$T0,$T0		# 0
-	vpand		$MASK,$T1,$T1		# 1
-	vpand		$MASK,$T3,$T3		# 3
-	vpor		32(%rcx),$T4,$T4	# padbit, yes, always
-
-	lea		0x90(%rsp),%rax		# size optimization
-	vpaddq		$H2,$T2,$H2		# accumulate input
-	sub		\$64,$len
-	jz		.Ltail_avx2
-	jmp		.Loop_avx2
-
-.align	32
-.Loop_avx2:
-	################################################################
-	# ((inp[0]*r^4+r[4])*r^4+r[8])*r^4
-	# ((inp[1]*r^4+r[5])*r^4+r[9])*r^3
-	# ((inp[2]*r^4+r[6])*r^4+r[10])*r^2
-	# ((inp[3]*r^4+r[7])*r^4+r[11])*r^1
-	#   \________/\________/
-	################################################################
-	#vpaddq		$H2,$T2,$H2		# accumulate input
-	vpaddq		$H0,$T0,$H0
-	vmovdqa		`32*0`(%rsp),$T0	# r0^4
-	vpaddq		$H1,$T1,$H1
-	vmovdqa		`32*1`(%rsp),$T1	# r1^4
-	vpaddq		$H3,$T3,$H3
-	vmovdqa		`32*3`(%rsp),$T2	# r2^4
-	vpaddq		$H4,$T4,$H4
-	vmovdqa		`32*6-0x90`(%rax),$T3	# s3^4
-	vmovdqa		`32*8-0x90`(%rax),$S4	# s4^4
-
-	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
-	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
-	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
-	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
-	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
-	#
-	# however, as h2 is "chronologically" first one available pull
-	# corresponding operations up, so it's
-	#
-	# d4 = h2*r2   + h4*r0 + h3*r1             + h1*r3   + h0*r4
-	# d3 = h2*r1   + h3*r0           + h1*r2   + h0*r3   + h4*5*r4
-	# d2 = h2*r0           + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
-	# d1 = h2*5*r4 + h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3
-	# d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2           + h1*5*r4
-
-	vpmuludq	$H2,$T0,$D2		# d2 = h2*r0
-	vpmuludq	$H2,$T1,$D3		# d3 = h2*r1
-	vpmuludq	$H2,$T2,$D4		# d4 = h2*r2
-	vpmuludq	$H2,$T3,$D0		# d0 = h2*s3
-	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
-
-	vpmuludq	$H0,$T1,$T4		# h0*r1
-	vpmuludq	$H1,$T1,$H2		# h1*r1, borrow $H2 as temp
-	vpaddq		$T4,$D1,$D1		# d1 += h0*r1
-	vpaddq		$H2,$D2,$D2		# d2 += h1*r1
-	vpmuludq	$H3,$T1,$T4		# h3*r1
-	vpmuludq	`32*2`(%rsp),$H4,$H2	# h4*s1
-	vpaddq		$T4,$D4,$D4		# d4 += h3*r1
-	vpaddq		$H2,$D0,$D0		# d0 += h4*s1
-	 vmovdqa	`32*4-0x90`(%rax),$T1	# s2
-
-	vpmuludq	$H0,$T0,$T4		# h0*r0
-	vpmuludq	$H1,$T0,$H2		# h1*r0
-	vpaddq		$T4,$D0,$D0		# d0 += h0*r0
-	vpaddq		$H2,$D1,$D1		# d1 += h1*r0
-	vpmuludq	$H3,$T0,$T4		# h3*r0
-	vpmuludq	$H4,$T0,$H2		# h4*r0
-	 vmovdqu	16*0($inp),%x#$T0	# load input
-	vpaddq		$T4,$D3,$D3		# d3 += h3*r0
-	vpaddq		$H2,$D4,$D4		# d4 += h4*r0
-	 vinserti128	\$1,16*2($inp),$T0,$T0
-
-	vpmuludq	$H3,$T1,$T4		# h3*s2
-	vpmuludq	$H4,$T1,$H2		# h4*s2
-	 vmovdqu	16*1($inp),%x#$T1
-	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
-	vpaddq		$H2,$D1,$D1		# d1 += h4*s2
-	 vmovdqa	`32*5-0x90`(%rax),$H2	# r3
-	vpmuludq	$H1,$T2,$T4		# h1*r2
-	vpmuludq	$H0,$T2,$T2		# h0*r2
-	vpaddq		$T4,$D3,$D3		# d3 += h1*r2
-	vpaddq		$T2,$D2,$D2		# d2 += h0*r2
-	 vinserti128	\$1,16*3($inp),$T1,$T1
-	 lea		16*4($inp),$inp
-
-	vpmuludq	$H1,$H2,$T4		# h1*r3
-	vpmuludq	$H0,$H2,$H2		# h0*r3
-	 vpsrldq	\$6,$T0,$T2		# splat input
-	vpaddq		$T4,$D4,$D4		# d4 += h1*r3
-	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
-	vpmuludq	$H3,$T3,$T4		# h3*s3
-	vpmuludq	$H4,$T3,$H2		# h4*s3
-	 vpsrldq	\$6,$T1,$T3
-	vpaddq		$T4,$D1,$D1		# d1 += h3*s3
-	vpaddq		$H2,$D2,$D2		# d2 += h4*s3
-	 vpunpckhqdq	$T1,$T0,$T4		# 4
-
-	vpmuludq	$H3,$S4,$H3		# h3*s4
-	vpmuludq	$H4,$S4,$H4		# h4*s4
-	 vpunpcklqdq	$T1,$T0,$T0		# 0:1
-	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*r4
-	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*r4
-	 vpunpcklqdq	$T3,$T2,$T3		# 2:3
-	vpmuludq	`32*7-0x90`(%rax),$H0,$H4	# h0*r4
-	vpmuludq	$H1,$S4,$H0		# h1*s4
-	vmovdqa		64(%rcx),$MASK		# .Lmask26
-	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
-	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
-
-	################################################################
-	# lazy reduction (interleaved with tail of input splat)
-
-	vpsrlq		\$26,$H3,$D3
-	vpand		$MASK,$H3,$H3
-	vpaddq		$D3,$H4,$H4		# h3 -> h4
-
-	vpsrlq		\$26,$H0,$D0
-	vpand		$MASK,$H0,$H0
-	vpaddq		$D0,$D1,$H1		# h0 -> h1
-
-	vpsrlq		\$26,$H4,$D4
-	vpand		$MASK,$H4,$H4
-
-	 vpsrlq		\$4,$T3,$T2
-
-	vpsrlq		\$26,$H1,$D1
-	vpand		$MASK,$H1,$H1
-	vpaddq		$D1,$H2,$H2		# h1 -> h2
-
-	vpaddq		$D4,$H0,$H0
-	vpsllq		\$2,$D4,$D4
-	vpaddq		$D4,$H0,$H0		# h4 -> h0
-
-	 vpand		$MASK,$T2,$T2		# 2
-	 vpsrlq		\$26,$T0,$T1
-
-	vpsrlq		\$26,$H2,$D2
-	vpand		$MASK,$H2,$H2
-	vpaddq		$D2,$H3,$H3		# h2 -> h3
-
-	 vpaddq		$T2,$H2,$H2		# modulo-scheduled
-	 vpsrlq		\$30,$T3,$T3
-
-	vpsrlq		\$26,$H0,$D0
-	vpand		$MASK,$H0,$H0
-	vpaddq		$D0,$H1,$H1		# h0 -> h1
-
-	 vpsrlq		\$40,$T4,$T4		# 4
-
-	vpsrlq		\$26,$H3,$D3
-	vpand		$MASK,$H3,$H3
-	vpaddq		$D3,$H4,$H4		# h3 -> h4
-
-	 vpand		$MASK,$T0,$T0		# 0
-	 vpand		$MASK,$T1,$T1		# 1
-	 vpand		$MASK,$T3,$T3		# 3
-	 vpor		32(%rcx),$T4,$T4	# padbit, yes, always
-
-	sub		\$64,$len
-	jnz		.Loop_avx2
-
-	.byte		0x66,0x90
-.Ltail_avx2:
-	################################################################
-	# while above multiplications were by r^4 in all lanes, in last
-	# iteration we multiply least significant lane by r^4 and most
-	# significant one by r, so copy of above except that references
-	# to the precomputed table are displaced by 4...
-
-	#vpaddq		$H2,$T2,$H2		# accumulate input
-	vpaddq		$H0,$T0,$H0
-	vmovdqu		`32*0+4`(%rsp),$T0	# r0^4
-	vpaddq		$H1,$T1,$H1
-	vmovdqu		`32*1+4`(%rsp),$T1	# r1^4
-	vpaddq		$H3,$T3,$H3
-	vmovdqu		`32*3+4`(%rsp),$T2	# r2^4
-	vpaddq		$H4,$T4,$H4
-	vmovdqu		`32*6+4-0x90`(%rax),$T3	# s3^4
-	vmovdqu		`32*8+4-0x90`(%rax),$S4	# s4^4
-
-	vpmuludq	$H2,$T0,$D2		# d2 = h2*r0
-	vpmuludq	$H2,$T1,$D3		# d3 = h2*r1
-	vpmuludq	$H2,$T2,$D4		# d4 = h2*r2
-	vpmuludq	$H2,$T3,$D0		# d0 = h2*s3
-	vpmuludq	$H2,$S4,$D1		# d1 = h2*s4
-
-	vpmuludq	$H0,$T1,$T4		# h0*r1
-	vpmuludq	$H1,$T1,$H2		# h1*r1
-	vpaddq		$T4,$D1,$D1		# d1 += h0*r1
-	vpaddq		$H2,$D2,$D2		# d2 += h1*r1
-	vpmuludq	$H3,$T1,$T4		# h3*r1
-	vpmuludq	`32*2+4`(%rsp),$H4,$H2	# h4*s1
-	vpaddq		$T4,$D4,$D4		# d4 += h3*r1
-	vpaddq		$H2,$D0,$D0		# d0 += h4*s1
-
-	vpmuludq	$H0,$T0,$T4		# h0*r0
-	vpmuludq	$H1,$T0,$H2		# h1*r0
-	vpaddq		$T4,$D0,$D0		# d0 += h0*r0
-	 vmovdqu	`32*4+4-0x90`(%rax),$T1	# s2
-	vpaddq		$H2,$D1,$D1		# d1 += h1*r0
-	vpmuludq	$H3,$T0,$T4		# h3*r0
-	vpmuludq	$H4,$T0,$H2		# h4*r0
-	vpaddq		$T4,$D3,$D3		# d3 += h3*r0
-	vpaddq		$H2,$D4,$D4		# d4 += h4*r0
-
-	vpmuludq	$H3,$T1,$T4		# h3*s2
-	vpmuludq	$H4,$T1,$H2		# h4*s2
-	vpaddq		$T4,$D0,$D0		# d0 += h3*s2
-	vpaddq		$H2,$D1,$D1		# d1 += h4*s2
-	 vmovdqu	`32*5+4-0x90`(%rax),$H2	# r3
-	vpmuludq	$H1,$T2,$T4		# h1*r2
-	vpmuludq	$H0,$T2,$T2		# h0*r2
-	vpaddq		$T4,$D3,$D3		# d3 += h1*r2
-	vpaddq		$T2,$D2,$D2		# d2 += h0*r2
-
-	vpmuludq	$H1,$H2,$T4		# h1*r3
-	vpmuludq	$H0,$H2,$H2		# h0*r3
-	vpaddq		$T4,$D4,$D4		# d4 += h1*r3
-	vpaddq		$H2,$D3,$D3		# d3 += h0*r3
-	vpmuludq	$H3,$T3,$T4		# h3*s3
-	vpmuludq	$H4,$T3,$H2		# h4*s3
-	vpaddq		$T4,$D1,$D1		# d1 += h3*s3
-	vpaddq		$H2,$D2,$D2		# d2 += h4*s3
-
-	vpmuludq	$H3,$S4,$H3		# h3*s4
-	vpmuludq	$H4,$S4,$H4		# h4*s4
-	vpaddq		$H3,$D2,$H2		# h2 = d2 + h3*r4
-	vpaddq		$H4,$D3,$H3		# h3 = d3 + h4*r4
-	vpmuludq	`32*7+4-0x90`(%rax),$H0,$H4		# h0*r4
-	vpmuludq	$H1,$S4,$H0		# h1*s4
-	vmovdqa		64(%rcx),$MASK		# .Lmask26
-	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
-	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
-
-	################################################################
-	# horizontal addition
-
-	vpsrldq		\$8,$D1,$T1
-	vpsrldq		\$8,$H2,$T2
-	vpsrldq		\$8,$H3,$T3
-	vpsrldq		\$8,$H4,$T4
-	vpsrldq		\$8,$H0,$T0
-	vpaddq		$T1,$D1,$D1
-	vpaddq		$T2,$H2,$H2
-	vpaddq		$T3,$H3,$H3
-	vpaddq		$T4,$H4,$H4
-	vpaddq		$T0,$H0,$H0
-
-	vpermq		\$0x2,$H3,$T3
-	vpermq		\$0x2,$H4,$T4
-	vpermq		\$0x2,$H0,$T0
-	vpermq		\$0x2,$D1,$T1
-	vpermq		\$0x2,$H2,$T2
-	vpaddq		$T3,$H3,$H3
-	vpaddq		$T4,$H4,$H4
-	vpaddq		$T0,$H0,$H0
-	vpaddq		$T1,$D1,$D1
-	vpaddq		$T2,$H2,$H2
-
-	################################################################
-	# lazy reduction
-
-	vpsrlq		\$26,$H3,$D3
-	vpand		$MASK,$H3,$H3
-	vpaddq		$D3,$H4,$H4		# h3 -> h4
-
-	vpsrlq		\$26,$H0,$D0
-	vpand		$MASK,$H0,$H0
-	vpaddq		$D0,$D1,$H1		# h0 -> h1
-
-	vpsrlq		\$26,$H4,$D4
-	vpand		$MASK,$H4,$H4
-
-	vpsrlq		\$26,$H1,$D1
-	vpand		$MASK,$H1,$H1
-	vpaddq		$D1,$H2,$H2		# h1 -> h2
-
-	vpaddq		$D4,$H0,$H0
-	vpsllq		\$2,$D4,$D4
-	vpaddq		$D4,$H0,$H0		# h4 -> h0
-
-	vpsrlq		\$26,$H2,$D2
-	vpand		$MASK,$H2,$H2
-	vpaddq		$D2,$H3,$H3		# h2 -> h3
-
-	vpsrlq		\$26,$H0,$D0
-	vpand		$MASK,$H0,$H0
-	vpaddq		$D0,$H1,$H1		# h0 -> h1
-
-	vpsrlq		\$26,$H3,$D3
-	vpand		$MASK,$H3,$H3
-	vpaddq		$D3,$H4,$H4		# h3 -> h4
-
-	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
-	vmovd		%x#$H1,`4*1-48-64`($ctx)
-	vmovd		%x#$H2,`4*2-48-64`($ctx)
-	vmovd		%x#$H3,`4*3-48-64`($ctx)
-	vmovd		%x#$H4,`4*4-48-64`($ctx)
-___
-$code.=<<___	if ($win64);
-	vmovdqa		0x50(%r11),%xmm6
-	vmovdqa		0x60(%r11),%xmm7
-	vmovdqa		0x70(%r11),%xmm8
-	vmovdqa		0x80(%r11),%xmm9
-	vmovdqa		0x90(%r11),%xmm10
-	vmovdqa		0xa0(%r11),%xmm11
-	vmovdqa		0xb0(%r11),%xmm12
-	vmovdqa		0xc0(%r11),%xmm13
-	vmovdqa		0xd0(%r11),%xmm14
-	vmovdqa		0xe0(%r11),%xmm15
-	lea		0xf8(%r11),%rsp
-.Ldo_avx2_epilogue:
-___
-$code.=<<___	if (!$win64);
-	lea		8(%r11),%rsp
-___
-$code.=<<___;
-	vzeroupper
-	ret
-.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2
-___
-}
-$code.=<<___;
-.align	64
-.Lconst:
-.Lmask24:
-.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
-.L129:
-.long	1<<24,0,1<<24,0,1<<24,0,1<<24,0
-.Lmask26:
-.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
-.Lfive:
-.long	5,0,5,0,5,0,5,0
-___
-}
-
-$code.=<<___;
-.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-.align	16
-___
-
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {
-$rec="%rcx";
-$frame="%rdx";
-$context="%r8";
-$disp="%r9";
-
-$code.=<<___;
-.extern	__imp_RtlVirtualUnwind
-.type	se_handler,\@abi-omnipotent
-.align	16
-se_handler:
-	push	%rsi
-	push	%rdi
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-	pushfq
-	sub	\$64,%rsp
-
-	mov	120($context),%rax	# pull context->Rax
-	mov	248($context),%rbx	# pull context->Rip
-
-	mov	8($disp),%rsi		# disp->ImageBase
-	mov	56($disp),%r11		# disp->HandlerData
-
-	mov	0(%r11),%r10d		# HandlerData[0]
-	lea	(%rsi,%r10),%r10	# prologue label
-	cmp	%r10,%rbx		# context->Rip<.Lprologue
-	jb	.Lcommon_seh_tail
-
-	mov	152($context),%rax	# pull context->Rsp
-
-	mov	4(%r11),%r10d		# HandlerData[1]
-	lea	(%rsi,%r10),%r10	# epilogue label
-	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
-	jae	.Lcommon_seh_tail
-
-	lea	48(%rax),%rax
-
-	mov	-8(%rax),%rbx
-	mov	-16(%rax),%rbp
-	mov	-24(%rax),%r12
-	mov	-32(%rax),%r13
-	mov	-40(%rax),%r14
-	mov	-48(%rax),%r15
-	mov	%rbx,144($context)	# restore context->Rbx
-	mov	%rbp,160($context)	# restore context->Rbp
-	mov	%r12,216($context)	# restore context->R12
-	mov	%r13,224($context)	# restore context->R13
-	mov	%r14,232($context)	# restore context->R14
-	mov	%r15,240($context)	# restore context->R14
-
-	jmp	.Lcommon_seh_tail
-.size	se_handler,.-se_handler
-
-.type	avx_handler,\@abi-omnipotent
-.align	16
-avx_handler:
-	push	%rsi
-	push	%rdi
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-	pushfq
-	sub	\$64,%rsp
-
-	mov	120($context),%rax	# pull context->Rax
-	mov	248($context),%rbx	# pull context->Rip
-
-	mov	8($disp),%rsi		# disp->ImageBase
-	mov	56($disp),%r11		# disp->HandlerData
-
-	mov	0(%r11),%r10d		# HandlerData[0]
-	lea	(%rsi,%r10),%r10	# prologue label
-	cmp	%r10,%rbx		# context->Rip<prologue label
-	jb	.Lcommon_seh_tail
-
-	mov	152($context),%rax	# pull context->Rsp
-
-	mov	4(%r11),%r10d		# HandlerData[1]
-	lea	(%rsi,%r10),%r10	# epilogue label
-	cmp	%r10,%rbx		# context->Rip>=epilogue label
-	jae	.Lcommon_seh_tail
-
-	mov	208($context),%rax	# pull context->R11
-
-	lea	0x50(%rax),%rsi
-	lea	0xf8(%rax),%rax
-	lea	512($context),%rdi	# &context.Xmm6
-	mov	\$20,%ecx
-	.long	0xa548f3fc		# cld; rep movsq
-
-.Lcommon_seh_tail:
-	mov	8(%rax),%rdi
-	mov	16(%rax),%rsi
-	mov	%rax,152($context)	# restore context->Rsp
-	mov	%rsi,168($context)	# restore context->Rsi
-	mov	%rdi,176($context)	# restore context->Rdi
-
-	mov	40($disp),%rdi		# disp->ContextRecord
-	mov	$context,%rsi		# context
-	mov	\$154,%ecx		# sizeof(CONTEXT)
-	.long	0xa548f3fc		# cld; rep movsq
-
-	mov	$disp,%rsi
-	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
-	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
-	mov	0(%rsi),%r8		# arg3, disp->ControlPc
-	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
-	mov	40(%rsi),%r10		# disp->ContextRecord
-	lea	56(%rsi),%r11		# &disp->HandlerData
-	lea	24(%rsi),%r12		# &disp->EstablisherFrame
-	mov	%r10,32(%rsp)		# arg5
-	mov	%r11,40(%rsp)		# arg6
-	mov	%r12,48(%rsp)		# arg7
-	mov	%rcx,56(%rsp)		# arg8, (NULL)
-	call	*__imp_RtlVirtualUnwind(%rip)
-
-	mov	\$1,%eax		# ExceptionContinueSearch
-	add	\$64,%rsp
-	popfq
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
-	pop	%rdi
-	pop	%rsi
-	ret
-.size	avx_handler,.-avx_handler
-
-.section	.pdata
-.align	4
-	.rva	.LSEH_begin_poly1305_init
-	.rva	.LSEH_end_poly1305_init
-	.rva	.LSEH_info_poly1305_init
-
-	.rva	.LSEH_begin_poly1305_blocks
-	.rva	.LSEH_end_poly1305_blocks
-	.rva	.LSEH_info_poly1305_blocks
-
-	.rva	.LSEH_begin_poly1305_emit
-	.rva	.LSEH_end_poly1305_emit
-	.rva	.LSEH_info_poly1305_emit
-___
-$code.=<<___ if ($avx);
-	.rva	.LSEH_begin_poly1305_blocks_avx
-	.rva	.Lbase2_64_avx
-	.rva	.LSEH_info_poly1305_blocks_avx_1
-
-	.rva	.Lbase2_64_avx
-	.rva	.Leven_avx
-	.rva	.LSEH_info_poly1305_blocks_avx_2
-
-	.rva	.Leven_avx
-	.rva	.LSEH_end_poly1305_blocks_avx
-	.rva	.LSEH_info_poly1305_blocks_avx_3
-
-	.rva	.LSEH_begin_poly1305_emit_avx
-	.rva	.LSEH_end_poly1305_emit_avx
-	.rva	.LSEH_info_poly1305_emit_avx
-___
-$code.=<<___ if ($avx>1);
-	.rva	.LSEH_begin_poly1305_blocks_avx2
-	.rva	.Lbase2_64_avx2
-	.rva	.LSEH_info_poly1305_blocks_avx2_1
-
-	.rva	.Lbase2_64_avx2
-	.rva	.Leven_avx2
-	.rva	.LSEH_info_poly1305_blocks_avx2_2
-
-	.rva	.Leven_avx2
-	.rva	.LSEH_end_poly1305_blocks_avx2
-	.rva	.LSEH_info_poly1305_blocks_avx2_3
-___
-$code.=<<___;
-.section	.xdata
-.align	8
-.LSEH_info_poly1305_init:
-	.byte	9,0,0,0
-	.rva	se_handler
-	.rva	.LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
-
-.LSEH_info_poly1305_blocks:
-	.byte	9,0,0,0
-	.rva	se_handler
-	.rva	.Lblocks_body,.Lblocks_epilogue
-
-.LSEH_info_poly1305_emit:
-	.byte	9,0,0,0
-	.rva	se_handler
-	.rva	.LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
-___
-$code.=<<___ if ($avx);
-.LSEH_info_poly1305_blocks_avx_1:
-	.byte	9,0,0,0
-	.rva	se_handler
-	.rva	.Lblocks_avx_body,.Lblocks_avx_epilogue		# HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx_2:
-	.byte	9,0,0,0
-	.rva	se_handler
-	.rva	.Lbase2_64_avx_body,.Lbase2_64_avx_epilogue	# HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx_3:
-	.byte	9,0,0,0
-	.rva	avx_handler
-	.rva	.Ldo_avx_body,.Ldo_avx_epilogue			# HandlerData[]
-
-.LSEH_info_poly1305_emit_avx:
-	.byte	9,0,0,0
-	.rva	se_handler
-	.rva	.LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
-___
-$code.=<<___ if ($avx>1);
-.LSEH_info_poly1305_blocks_avx2_1:
-	.byte	9,0,0,0
-	.rva	se_handler
-	.rva	.Lblocks_avx2_body,.Lblocks_avx2_epilogue	# HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx2_2:
-	.byte	9,0,0,0
-	.rva	se_handler
-	.rva	.Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue	# HandlerData[]
-
-.LSEH_info_poly1305_blocks_avx2_3:
-	.byte	9,0,0,0
-	.rva	avx_handler
-	.rva	.Ldo_avx2_body,.Ldo_avx2_epilogue		# HandlerData[]
-___
-}
-
-foreach (split('\n',$code)) {
-	s/\`([^\`]*)\`/eval($1)/ge;
-	s/%r([a-z]+)#d/%e$1/g;
-	s/%r([0-9]+)#d/%r$1d/g;
-	s/%x#%y/%x/g;
-
-	print $_,"\n";
-}
-close STDOUT;
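
For reference, the scalar tail of poly1305_emit_avx above converts the hash from base 2^26 back to base 2^64 and performs the final reduction mod 2^130 - 5 before adding the nonce. A minimal C++ sketch of that last step, assuming a little-endian target and a compiler with the unsigned __int128 extension (names here are illustrative, not BoringSSL's API):

#include <stdint.h>
#include <string.h>

// Illustrative sketch of the final Poly1305 reduction and tag write-out.
// h0, h1 hold the low 128 bits of the accumulator, h2 the bits above 2^128.
void poly1305_emit_sketch(uint64_t h0, uint64_t h1, uint64_t h2,
                          const uint64_t nonce[2], uint8_t mac[16]) {
  // Fold the part above 2^130 back in: 2^130 == 5 (mod 2^130 - 5).
  uint64_t c = 5 * (h2 >> 2);
  h2 &= 3;
  unsigned __int128 t = (unsigned __int128)h0 + c;
  h0 = (uint64_t)t;
  t = (unsigned __int128)h1 + (uint64_t)(t >> 64);
  h1 = (uint64_t)t;
  h2 += (uint64_t)(t >> 64);

  // Compare against the modulus by computing h + 5 and checking bit 130.
  unsigned __int128 g = (unsigned __int128)h0 + 5;
  uint64_t g0 = (uint64_t)g;
  g = (unsigned __int128)h1 + (uint64_t)(g >> 64);
  uint64_t g1 = (uint64_t)g;
  uint64_t g2 = h2 + (uint64_t)(g >> 64);
  if (g2 >> 2) {  // h >= 2^130 - 5, so the reduced value is h + 5 mod 2^128.
    h0 = g0;
    h1 = g1;
  }

  // Add the nonce modulo 2^128 and write the 16-byte tag little-endian.
  t = (unsigned __int128)h0 + nonce[0];
  h0 = (uint64_t)t;
  h1 = h1 + nonce[1] + (uint64_t)(t >> 64);
  memcpy(mac, &h0, 8);
  memcpy(mac + 8, &h1, 8);
}
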
diff --git a/src/crypto/test/CMakeLists.txt b/src/crypto/test/CMakeLists.txt
index 8c75314..8857913 100644
--- a/src/crypto/test/CMakeLists.txt
+++ b/src/crypto/test/CMakeLists.txt
@@ -7,3 +7,11 @@
   malloc.cc
   test_util.cc
 )
+
+add_library(
+  gtest_main
+
+  OBJECT
+
+  gtest_main.cc
+)
diff --git a/src/crypto/test/gtest_main.cc b/src/crypto/test/gtest_main.cc
new file mode 100644
index 0000000..50147bc
--- /dev/null
+++ b/src/crypto/test/gtest_main.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <gtest/gtest.h>
+
+#include <openssl/crypto.h>
+
+int main(int argc, char **argv) {
+  CRYPTO_library_init();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
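
A test built on this runner follows the usual GTest pattern. A minimal C++ sketch, assuming <openssl/bn.h> and bssl::UniquePtr are available in this revision (the test name and body are illustrative, not taken from the tree):

#include <gtest/gtest.h>

#include <openssl/bn.h>

// Illustrative only: a trivial test case that gtest_main.cc above would run.
TEST(ExampleBNTest, AddsSmallWords) {
  bssl::UniquePtr<BIGNUM> a(BN_new()), b(BN_new()), sum(BN_new());
  ASSERT_TRUE(a && b && sum);
  ASSERT_TRUE(BN_set_word(a.get(), 2));
  ASSERT_TRUE(BN_set_word(b.get(), 3));
  ASSERT_TRUE(BN_add(sum.get(), a.get(), b.get()));
  EXPECT_TRUE(BN_is_word(sum.get(), 5));
}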