ROUNDZ evaluation stubs

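Adds round-toward-zero (truncation) math stubs for the scalar, PSIMD,
NEON, NEONv8, SSE, SSE2, and SSE4.1 backends, together with bit-exact
evaluation tests that sweep the fp32 bit space in 1024-element blocks
and compare every kernel output against std::trunc.

The *-addsub kernels are built on the usual magic-number trick. As a
rough sketch of the idea only (the helper name and structure below are
illustrative, not the committed kernel code):

    #include <math.h>

    // Adding and subtracting 2**23 rounds |x| to the nearest integer
    // under the default rounding mode; stepping back by 1 when that
    // overshoots |x| turns round-to-nearest into round-toward-zero.
    // Inputs with |x| >= 2**23 are already integral (or Inf/NaN) and
    // pass through unchanged.
    static float roundz_addsub(float x) {
      const float magic = 0x1.0p+23f;
      const float absx = fabsf(x);
      if (!(absx < magic)) {
        return x;  // integral, infinite, or NaN
      }
      float rounded = (absx + magic) - magic;  // round |x| to nearest
      if (rounded > absx) {
        rounded -= 1.0f;  // nearest overshot: step back toward zero
      }
      return copysignf(rounded, x);  // restore sign, keeping -0.0f
    }

The *-cvt kernels instead truncate through the float->int conversion
(e.g. _mm_cvttps_epi32 on SSE2, vcvtq_s32_f32 on NEON), which already
rounds toward zero, and select the original input where the conversion
would overflow (|x| >= 2**31) or the input is NaN.
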
PiperOrigin-RevId: 311256662
diff --git a/BUILD.bazel b/BUILD.bazel
index 96fc30e..13f52ca 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -236,6 +236,8 @@
     "src/math/expminus-scalar-lut64-p2.c",
     "src/math/expminus-scalar-p5.c",
     "src/math/roundne-scalar-addsub.c",
+    "src/math/roundz-scalar-addsub.c",
+    "src/math/roundz-scalar-cvt.c",
     "src/math/sigmoid-scalar-lut2048-p1-div.c",
     "src/math/sigmoid-scalar-lut64-p2-div.c",
     "src/math/sigmoid-scalar-p5-div.c",
@@ -498,6 +500,7 @@
     "src/f32-sigmoid/gen/psimd-p5-div-x20.c",
     "src/f32-sigmoid/gen/psimd-p5-div-x24.c",
     "src/math/roundne-psimd-addsub.c",
+    "src/math/roundz-psimd-addsub.c",
     "src/math/sigmoid-psimd-p5-div.c",
 ]
 
@@ -673,6 +676,8 @@
     "src/x8-zip/x4-neon.c",
     "src/x8-zip/xm-neon.c",
     "src/math/roundne-neon-addsub.c",
+    "src/math/roundz-neon-addsub.c",
+    "src/math/roundz-neon-cvt.c",
     "src/math/sigmoid-neon-frac-p9-p10-nr1recps.c",
     "src/math/sigmoid-neon-rr1-lut2048-p1-nr2recps.c",
     "src/math/sigmoid-neon-rr1-lut64-p2-nr2recps.c",
@@ -917,6 +922,7 @@
 
 NEONV8_UKERNELS = [
     "src/math/roundne-neonv8.c",
+    "src/math/roundz-neonv8.c",
 ]
 
 AARCH64_NEONFP16ARITH_UKERNELS = [
@@ -1073,6 +1079,7 @@
     "src/f32-vmulcaddc/gen/c8-minmax-sse-2x.c",
     "src/x32-packx/x4-sse.c",
     "src/math/roundne-sse-addsub.c",
+    "src/math/roundz-sse-addsub.c",
 ]
 
 SSE2_UKERNELS = [
@@ -1124,6 +1131,7 @@
     "src/math/exp-sse2-p5.c",
     "src/math/expminus-sse2-p5.c",
     "src/math/roundne-sse2-cvt.c",
+    "src/math/roundz-sse2-cvt.c",
     "src/math/sigmoid-sse2-p5-div.c",
     "src/requantization/precise-sse2.c",
     "src/requantization/fp32-sse2.c",
@@ -1147,6 +1155,7 @@
     "src/f32-sigmoid/gen/sse41-p5-div-x20.c",
     "src/f32-sigmoid/gen/sse41-p5-div-x24.c",
     "src/math/roundne-sse41.c",
+    "src/math/roundz-sse41.c",
     "src/requantization/precise-sse4.c",
     "src/requantization/q31-sse4.c",
     "src/requantization/gemmlowp-sse4.c",
@@ -2603,6 +2612,19 @@
     deps = MICROKERNEL_TEST_DEPS,
 )
 
+xnnpack_unit_test(
+    name = "f32_roundz_eval",
+    srcs = [
+        "eval/f32-roundz.cc",
+        "src/xnnpack/AlignedAllocator.h",
+        "src/xnnpack/math-stubs.h",
+    ] + MICROKERNEL_TEST_HDRS,
+    tags = [
+        "notap",
+    ],
+    deps = MICROKERNEL_TEST_DEPS,
+)
+
 xnnpack_benchmark(
     name = "f32_sigmoid_eval",
     srcs = [
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5aee75c..a54578b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -348,6 +348,8 @@
   src/math/expminus-scalar-lut64-p2.c
   src/math/expminus-scalar-p5.c
   src/math/roundne-scalar-addsub.c
+  src/math/roundz-scalar-addsub.c
+  src/math/roundz-scalar-cvt.c
   src/math/sigmoid-scalar-lut2048-p1-div.c
   src/math/sigmoid-scalar-lut64-p2-div.c
   src/math/sigmoid-scalar-p5-div.c
@@ -504,6 +506,7 @@
   src/f32-sigmoid/gen/psimd-p5-div-x20.c
   src/f32-sigmoid/gen/psimd-p5-div-x24.c
   src/math/roundne-psimd-addsub.c
+  src/math/roundz-psimd-addsub.c
   src/math/sigmoid-psimd-p5-div.c)
 
 SET(XNNPACK_NEON_MICROKERNEL_SRCS
@@ -677,6 +680,8 @@
   src/x8-zip/x4-neon.c
   src/x8-zip/xm-neon.c
   src/math/roundne-neon-addsub.c
+  src/math/roundz-neon-addsub.c
+  src/math/roundz-neon-cvt.c
   src/math/sigmoid-neon-frac-p9-p10-nr1recps.c
   src/math/sigmoid-neon-rr1-lut2048-p1-nr2recps.c
   src/math/sigmoid-neon-rr1-lut64-p2-nr2recps.c
@@ -840,7 +845,8 @@
   src/math/sigmoid-neonfma-rr2-p5-nr2recps.c)
 
 SET(XNNPACK_NEONV8_MICROKERNEL_SRCS
-  src/math/roundne-neonv8.c)
+  src/math/roundne-neonv8.c
+  src/math/roundz-neonv8.c)
 
 SET(XNNPACK_AARCH64_NEONFMA_MICROKERNEL_SRCS
   src/f32-vbinary/gen/vdiv-minmax-neon-x4.c
@@ -1071,7 +1077,8 @@
   src/f32-vmulcaddc/gen/c4-minmax-sse-2x.c
   src/f32-vmulcaddc/gen/c8-minmax-sse-2x.c
   src/x32-packx/x4-sse.c
-  src/math/roundne-sse-addsub.c)
+  src/math/roundne-sse-addsub.c
+  src/math/roundz-sse-addsub.c)
 
 SET(XNNPACK_SSE2_MICROKERNEL_SRCS
   src/f32-argmaxpool/4x-sse2-c4.c
@@ -1122,6 +1129,7 @@
   src/math/exp-sse2-p5.c
   src/math/expminus-sse2-p5.c
   src/math/roundne-sse2-cvt.c
+  src/math/roundz-sse2-cvt.c
   src/math/sigmoid-sse2-p5-div.c
   src/requantization/precise-sse2.c
   src/requantization/fp32-sse2.c
@@ -1143,6 +1151,7 @@
   src/f32-sigmoid/gen/sse41-p5-div-x20.c
   src/f32-sigmoid/gen/sse41-p5-div-x24.c
   src/math/roundne-sse41.c
+  src/math/roundz-sse41.c
   src/requantization/precise-sse4.c
   src/requantization/q31-sse4.c
   src/requantization/gemmlowp-sse4.c)
@@ -2895,6 +2904,14 @@
   TARGET_INCLUDE_DIRECTORIES(f32-roundne-eval PRIVATE src)
   TARGET_LINK_LIBRARIES(f32-roundne-eval PRIVATE XNNPACK fp16 gtest gtest_main)
 
+  ADD_EXECUTABLE(f32-roundz-eval eval/f32-roundz.cc)
+  SET_TARGET_PROPERTIES(f32-roundz-eval PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS NO)
+  TARGET_INCLUDE_DIRECTORIES(f32-roundz-eval PRIVATE src)
+  TARGET_LINK_LIBRARIES(f32-roundz-eval PRIVATE XNNPACK fp16 gtest gtest_main)
+
   ADD_EXECUTABLE(f32-sigmoid-eval eval/f32-sigmoid.cc)
   SET_TARGET_PROPERTIES(f32-sigmoid-eval PROPERTIES
     CXX_STANDARD 11
diff --git a/eval/f32-roundz.cc b/eval/f32-roundz.cc
new file mode 100644
index 0000000..dd288b3
--- /dev/null
+++ b/eval/f32-roundz.cc
@@ -0,0 +1,1874 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <iomanip>
+#include <ios>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include <fp16.h>
+
+#include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
+#include <xnnpack/math-stubs.h>
+
+
+constexpr int kBlockSize = 1024;
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(ROUNDZ__SSE_ADDSUB, positive_normal) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE_ADDSUB, negative_normal) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE_ADDSUB, positive_integral) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE_ADDSUB, negative_integral) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE_ADDSUB, positive_infinity) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    std::fill(inputs.begin(), inputs.end(), fp32_from_bits(UINT32_C(0x7F800000)));
+    xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
+    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
+      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
+      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
+  }
+
+  TEST(ROUNDZ__SSE_ADDSUB, negative_infinity) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    std::fill(inputs.begin(), inputs.end(), fp32_from_bits(UINT32_C(0xFF800000)));
+    xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
+    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
+      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
+      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
+  }
+
+  TEST(ROUNDZ__SSE_ADDSUB, positive_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE_ADDSUB, negative_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
+      }
+      xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE_ADDSUB, positive_snan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE_ADDSUB, negative_snan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE_ADDSUB, positive_snan_to_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE_ADDSUB, negative_snan_to_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(ROUNDZ__SSE2_CVT, positive_normal) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE2_CVT, negative_normal) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE2_CVT, positive_integral) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE2_CVT, negative_integral) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE2_CVT, positive_infinity) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    std::fill(inputs.begin(), inputs.end(), fp32_from_bits(UINT32_C(0x7F800000)));
+    xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
+    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
+      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
+      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
+  }
+
+  TEST(ROUNDZ__SSE2_CVT, negative_infinity) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    std::fill(inputs.begin(), inputs.end(), fp32_from_bits(UINT32_C(0xFF800000)));
+    xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
+    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
+      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
+      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
+  }
+
+  TEST(ROUNDZ__SSE2_CVT, positive_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE2_CVT, negative_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
+      }
+      xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE2_CVT, positive_snan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE2_CVT, negative_snan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE2_CVT, DISABLED_positive_snan_to_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE2_CVT, DISABLED_negative_snan_to_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(ROUNDZ__SSE41, positive_normal) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE41, negative_normal) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE41, positive_integral) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE41, negative_integral) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE41, positive_infinity) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    std::fill(inputs.begin(), inputs.end(), fp32_from_bits(UINT32_C(0x7F800000)));
+    xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
+    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
+      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
+      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
+  }
+
+  TEST(ROUNDZ__SSE41, negative_infinity) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    std::fill(inputs.begin(), inputs.end(), fp32_from_bits(UINT32_C(0xFF800000)));
+    xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
+    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
+      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
+      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
+  }
+
+  TEST(ROUNDZ__SSE41, positive_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE41, negative_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
+      }
+      xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE41, positive_snan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE41, negative_snan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE41, positive_snan_to_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__SSE41, negative_snan_to_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(ROUNDZ__NEON_ADDSUB, positive_normal) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEON_ADDSUB, negative_normal) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEON_ADDSUB, positive_integral) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEON_ADDSUB, negative_integral) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEON_ADDSUB, positive_infinity) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    std::fill(inputs.begin(), inputs.end(), fp32_from_bits(UINT32_C(0x7F800000)));
+    xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
+    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
+      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
+      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
+  }
+
+  TEST(ROUNDZ__NEON_ADDSUB, negative_infinity) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    std::fill(inputs.begin(), inputs.end(), fp32_from_bits(UINT32_C(0xFF800000)));
+    xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
+    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
+      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
+      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
+  }
+
+  TEST(ROUNDZ__NEON_ADDSUB, positive_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEON_ADDSUB, negative_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
+      }
+      xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEON_ADDSUB, positive_snan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEON_ADDSUB, negative_snan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEON_ADDSUB, positive_snan_to_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEON_ADDSUB, negative_snan_to_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(ROUNDZ__NEON_CVT, positive_normal) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEON_CVT, negative_normal) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEON_CVT, positive_integral) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEON_CVT, negative_integral) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEON_CVT, positive_infinity) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    std::fill(inputs.begin(), inputs.end(), fp32_from_bits(UINT32_C(0x7F800000)));
+    xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
+    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
+      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
+      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
+  }
+
+  TEST(ROUNDZ__NEON_CVT, negative_infinity) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    std::fill(inputs.begin(), inputs.end(), fp32_from_bits(UINT32_C(0xFF800000)));
+    xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
+    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
+      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
+      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
+  }
+
+  TEST(ROUNDZ__NEON_CVT, positive_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEON_CVT, negative_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
+      }
+      xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEON_CVT, positive_snan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
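+        // Mask off bit 22 (the NaN "quiet" bit, 0x00400000) on both sides: this check validates
+        // sign and payload propagation only, while quieting is covered by the snan_to_qnan tests.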
+        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEON_CVT, negative_snan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEON_CVT, positive_snan_to_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
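+        // std::trunc quiets signaling NaNs on IEEE-754-conforming hardware, so exact bit equality
+        // verifies that the kernel likewise sets the quiet bit (0x00400000) on SNaN inputs.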
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEON_CVT, negative_snan_to_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
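+  // Note: the NEONv8 variant is expected to use the ARMv8 directed-rounding instruction
+  // (FRINTZ, round towards zero) rather than the addition-subtraction emulation.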
+  TEST(ROUNDZ__NEONV8, positive_normal) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEONV8, negative_normal) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEONV8, positive_integral) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEONV8, negative_integral) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEONV8, positive_infinity) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    std::fill(inputs.begin(), inputs.end(), fp32_from_bits(UINT32_C(0x7F800000)));
+    xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
+    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
+      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
+      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
+  }
+
+  TEST(ROUNDZ__NEONV8, negative_infinity) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    std::fill(inputs.begin(), inputs.end(), fp32_from_bits(UINT32_C(0xFF800000)));
+    xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
+    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
+      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
+      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
+  }
+
+  TEST(ROUNDZ__NEONV8, positive_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEONV8, negative_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
+      }
+      xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEONV8, positive_snan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEONV8, negative_snan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEONV8, positive_snan_to_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__NEONV8, negative_snan_to_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
+  TEST(ROUNDZ__PSIMD_ADDSUB, positive_normal) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__psimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__PSIMD_ADDSUB, negative_normal) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__psimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__PSIMD_ADDSUB, positive_integral) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__psimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__PSIMD_ADDSUB, negative_integral) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__psimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__PSIMD_ADDSUB, positive_infinity) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    std::fill(inputs.begin(), inputs.end(), fp32_from_bits(UINT32_C(0x7F800000)));
+    xnn_math_f32_roundz__psimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
+    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
+      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
+      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
+  }
+
+  TEST(ROUNDZ__PSIMD_ADDSUB, negative_infinity) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    std::fill(inputs.begin(), inputs.end(), fp32_from_bits(UINT32_C(0xFF800000)));
+    xnn_math_f32_roundz__psimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
+    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
+      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
+      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
+  }
+
+  TEST(ROUNDZ__PSIMD_ADDSUB, positive_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(n + i);
+      }
+      xnn_math_f32_roundz__psimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__PSIMD_ADDSUB, negative_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
+      }
+      xnn_math_f32_roundz__psimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__PSIMD_ADDSUB, positive_snan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__psimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__PSIMD_ADDSUB, negative_snan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__psimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__PSIMD_ADDSUB, positive_snan_to_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__psimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+
+  TEST(ROUNDZ__PSIMD_ADDSUB, negative_snan_to_qnan) {
+    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+      }
+      xnn_math_f32_roundz__psimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+      for (uint32_t i = 0; i < kBlockSize; i++) {
+        const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+      }
+    }
+  }
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
+
+TEST(ROUNDZ__SCALAR_ADDSUB, positive_normal) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(n + i);
+    }
+    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
+TEST(ROUNDZ__SCALAR_ADDSUB, negative_normal) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(n + i);
+    }
+    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
+TEST(ROUNDZ__SCALAR_ADDSUB, positive_integral) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(n + i);
+    }
+    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
+TEST(ROUNDZ__SCALAR_ADDSUB, negative_integral) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(n + i);
+    }
+    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
+TEST(ROUNDZ__SCALAR_ADDSUB, positive_infinity) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  std::fill(inputs.begin(), inputs.end(), fp32_from_bits(UINT32_C(0x7F800000)));
+  xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+  const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
+  ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
+    << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
+    << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+    << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
+}
+
+TEST(ROUNDZ__SCALAR_ADDSUB, negative_infinity) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  std::fill(inputs.begin(), inputs.end(), fp32_from_bits(UINT32_C(0xFF800000)));
+  xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+  const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
+  ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
+    << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
+    << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+    << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
+}
+
+TEST(ROUNDZ__SCALAR_ADDSUB, positive_qnan) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(n + i);
+    }
+    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
+TEST(ROUNDZ__SCALAR_ADDSUB, negative_qnan) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
+    }
+    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
+TEST(ROUNDZ__SCALAR_ADDSUB, positive_snan) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+    }
+    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
+TEST(ROUNDZ__SCALAR_ADDSUB, negative_snan) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+    }
+    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
+TEST(ROUNDZ__SCALAR_ADDSUB, positive_snan_to_qnan) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+    }
+    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
+TEST(ROUNDZ__SCALAR_ADDSUB, negative_snan_to_qnan) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+    }
+    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
+TEST(ROUNDZ__SCALAR_CVT, positive_normal) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(n + i);
+    }
+    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
+TEST(ROUNDZ__SCALAR_CVT, negative_normal) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(n + i);
+    }
+    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
+TEST(ROUNDZ__SCALAR_CVT, positive_integral) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(n + i);
+    }
+    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
+TEST(ROUNDZ__SCALAR_CVT, negative_integral) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(n + i);
+    }
+    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
+TEST(ROUNDZ__SCALAR_CVT, positive_infinity) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  std::fill(inputs.begin(), inputs.end(), fp32_from_bits(UINT32_C(0x7F800000)));
+  xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+  const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
+  ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
+    << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
+    << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+    << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
+}
+
+TEST(ROUNDZ__SCALAR_CVT, negative_infinity) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  std::fill(inputs.begin(), inputs.end(), fp32_from_bits(UINT32_C(0xFF800000)));
+  xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+  const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
+  ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
+    << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
+    << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+    << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
+}
+
+TEST(ROUNDZ__SCALAR_CVT, positive_qnan) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(n + i);
+    }
+    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
+TEST(ROUNDZ__SCALAR_CVT, negative_qnan) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
+    }
+    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
+TEST(ROUNDZ__SCALAR_CVT, positive_snan) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+    }
+    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
+TEST(ROUNDZ__SCALAR_CVT, negative_snan) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+    }
+    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
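+// googletest's DISABLED_ prefix compiles these checks but skips them by default; they can still be
+// run explicitly with --gtest_also_run_disabled_tests.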
+TEST(ROUNDZ__SCALAR_CVT, DISABLED_positive_snan_to_qnan) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+    }
+    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
+
+TEST(ROUNDZ__SCALAR_CVT, DISABLED_negative_snan_to_qnan) {
+  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
+  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
+  for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
+    }
+    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
+    for (uint32_t i = 0; i < kBlockSize; i++) {
+      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
+      ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
+        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
+        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
+        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
+    }
+  }
+}
diff --git a/src/math/roundz-neon-addsub.c b/src/math/roundz-neon-addsub.c
new file mode 100644
index 0000000..c60ab00
--- /dev/null
+++ b/src/math/roundz-neon-addsub.c
@@ -0,0 +1,70 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_f32_roundz__neon_addsub(
+    size_t n,
+    const float* input,
+    float* output)
+{
+  assert(n % (4 * sizeof(float)) == 0);
+
+  // Addition of this number to a floating-point number x causes rounding of the result to an integer. Then this magic
+  // number is subtracted back from the result to get the original x rounded to an integer. This trick works only for
+  // 0 <= x < 2**24, but all numbers in the 2**23 <= x < 2**24 range are integers, so we can further restrict it to
+  // 0 <= x < 2**23. Then the upper bound of the validity interval is conveniently the same as the magic number.
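+  // For example, for x = 2.5f: 2.5 + 2**23 = 8388610.5, which rounds to 8388610.0 under ties-to-even (the ULP in
+  // [2**23, 2**24) is 1.0), and 8388610.0 - 2**23 = 2.0, i.e. 2.5 rounded to the nearest-even integer.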
+  const float32x4_t vmagic_number = vmovq_n_f32(0x1.000000p+23f);
+  // Unit constant to decrement absolute values rounded "wrong way" (i.e. away from zero) in the round-to-nearest-even
+  // operation.
+  const float32x4_t vone = vmovq_n_f32(1.0f);
+
+  for (; n != 0; n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(input); input += 4;
+
+    // The rounding trick works only for x >= 0, so we compute the absolute value of x, round it, and restore the sign
+    // at the end. This method works for round-towards-zero because it is an odd function.
+    const float32x4_t vabsx = vabsq_f32(vx);
+    // Compute bitmask for selection between the value rounded with the addition-subtraction trick and the input x.
+    // We use the result of the addition-subtraction trick only on its validity interval, i.e. 0 <= abs(x) < 2**23.
+    // Note: we do vcaltq_f32(vmagic_number, vx) instead of vcltq_f32(vmagic_number, vabsx) to shorten the dependency chain.
+    const uint32x4_t vrndmask = vcaltq_f32(vmagic_number, vx);
+
+    // Addition-subtraction trick with the magic number to cause rounding to the nearest-even integer for abs(x).
+    // Note: the result is valid only for 0 <= abs(x) < 2**23.
+    // Note: addition-subtraction implicitly converts SNaN inputs to QNaNs.
+    const float32x4_t vrndabsx = vsubq_f32(vaddq_f32(vabsx, vmagic_number), vmagic_number);
+    // Extract bitmask for the sign of x.
+    // The bitmask is 0x00000000 when x is positive (including +0) and 0x80000000 when x is negative (including -0).
+    const uint32x4_t vsignx = veorq_u32(vreinterpretq_u32_f32(vabsx), vreinterpretq_u32_f32(vx));
+
+    // Compute adjustment to be subtracted from the rounded-to-nearest-even abs(x) value.
+    // Adjustment is one if the rounded value is greater than the abs(x) value and zero otherwise (including NaN input).
+    const float32x4_t vadjustment =
+      vreinterpretq_f32_u32(vandq_u32(vcgtq_f32(vrndabsx, vabsx), vreinterpretq_u32_f32(vone)));
+    // Adjust abs(x) rounded to nearest-even via the addition-subtraction trick to get abs(x) rounded down.
+    // Note: subtraction implicitly converts SNaN inputs to QNaNs.
+    const float32x4_t vflrabsx = vsubq_f32(vrndabsx, vadjustment);
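+    // E.g. for abs(x) = 2.7, the addition-subtraction trick yields 3.0; since 3.0 > 2.7, the adjustment is 1.0 and
+    // vflrabsx = 2.0 = trunc(2.7).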
+
+    // Combine abs(x) rounded down via addition-subtraction trick with adjustment and the input x value.
+    // For 0.0 <= x < 2**23, the result is abs(x) rounded via addition-subtraction trick.
+    // For -2**23 < x <= -0.0, the result is abs(x) rounded via addition-subtraction trick with the sign of x.
+    // For NaN inputs, the result is x converted to QNaN as a side-effect of addition-subtraction and adjustment.
+    // For abs(x) >= 2**23, the result is x itself.
+    const float32x4_t vy = vbslq_f32(vorrq_u32(vrndmask, vsignx), vx, vflrabsx);
+
+    vst1q_f32(output, vy); output += 4;
+  }
+}
diff --git a/src/math/roundz-neon-cvt.c b/src/math/roundz-neon-cvt.c
new file mode 100644
index 0000000..99a7a7c
--- /dev/null
+++ b/src/math/roundz-neon-cvt.c
@@ -0,0 +1,51 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_f32_roundz__neon_cvt(
+    size_t n,
+    const float* input,
+    float* output)
+{
+  assert(n % (4 * sizeof(float)) == 0);
+
+  // Threshold of non-integral values in single-precision floating-point representation.
+  // All inputs above this threshold (by absolute value) are integer numbers.
+  const float32x4_t vintegral_threshold = vmovq_n_f32(0x1.000000p+23f);
+  // Mask for the sign of a single-precision floating-point number.
+  const uint32x4_t vsign_mask = vmovq_n_u32(UINT32_C(0x80000000));
+
+  for (; n != 0; n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(input); input += 4;
+
+    // Convert floating-point value x to integer, with rounding towards zero, and then back to floating-point.
+    // Note: the result is valid only for abs(x) < 2**31, but we further restrict its use to abs(x) < 2**23.
+    const float32x4_t vrndx = vcvtq_f32_s32(vcvtq_s32_f32(vx));
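+    // E.g. -2.7f converts to the integer -2 and back to -2.0f, but -0.5f converts to 0 and back to +0.0f, losing
+    // the sign of negative zero; the sign is restored via the bitmask selection below.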
+    // Extract the sign of the input.
+    // We need the sign to preserve negative zero value, which would otherwise get lost in FP->INT->FP conversion.
+    const uint32x4_t vsignx = vandq_u32(vreinterpretq_u32_f32(vx), vsign_mask);
+
+    // Compute bitmask for non-integral input.
+    // The bitmask is all ones when abs(x) < 2**23, i.e. when x is potentially non-integral and is rounded via the FP->INT->FP conversion.
+    const uint32x4_t vrndmask = vcaltq_f32(vx, vintegral_threshold);
+
+    // Combine x rounded towards zero via FP->INT->FP conversion and the input x value.
+    // For 0.0 <= x < 2**23, the result is x rounded via FP->INT->FP conversion.
+    // For -2**23 < x <= -0.0, the result is abs(x) rounded via FP->INT->FP conversion with the sign of x.
+    // For abs(x) >= 2**23 or NaN inputs, the result is x itself.
+    const float32x4_t vy = vbslq_f32(vbicq_u32(vrndmask, vsignx), vrndx, vx);
+
+    vst1q_f32(output, vy); output += 4;
+  }
+}
diff --git a/src/math/roundz-neonv8.c b/src/math/roundz-neonv8.c
new file mode 100644
index 0000000..92f31d5
--- /dev/null
+++ b/src/math/roundz-neonv8.c
@@ -0,0 +1,29 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_f32_roundz__neonv8(
+    size_t n,
+    const float* input,
+    float* output)
+{
+  assert(n % (4 * sizeof(float)) == 0);
+
+  for (; n != 0; n -= 4 * sizeof(float)) {
+    const float32x4_t vx = vld1q_f32(input); input += 4;
+
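+    // vrndq_f32 rounds towards zero (truncates); on ARMv8 it maps to a single FRINTZ instruction.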
+    const float32x4_t vy = vrndq_f32(vx);
+
+    vst1q_f32(output, vy); output += 4;
+  }
+}
diff --git a/src/math/roundz-psimd-addsub.c b/src/math/roundz-psimd-addsub.c
new file mode 100644
index 0000000..14312e9
--- /dev/null
+++ b/src/math/roundz-psimd-addsub.c
@@ -0,0 +1,68 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <psimd.h>
+
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_f32_roundz__psimd_addsub(
+    size_t n,
+    const float* input,
+    float* output)
+{
+  assert(n % (4 * sizeof(float)) == 0);
+
+  // Mask for the sign bit of a floating-point number.
+  const psimd_s32 vsign_mask = psimd_splat_s32(INT32_C(0x80000000));
+  // Addition of this number to a floating-point number x causes rounding of the result to an integer. Then this magic
+  // number is subtracted back from the result to get the original x rounded to an integer. This trick works only for
+  // 0 <= x < 2**24, but all numbers in the 2**23 <= x < 2**24 range are integers, so we can further restrict it to
+  // 0 <= x < 2**23. Then the upper bound of the validity interval is conveniently the same as the magic number.
+  const psimd_f32 vmagic_number = psimd_splat_f32(0x1.000000p+23f);
+  // Unit constant to decrement absolute values rounded "wrong way" (i.e. away from zero) in the round-to-nearest-even
+  // operation.
+  const psimd_f32 vone = psimd_splat_f32(1.0f);
+
+  for (; n != 0; n -= 4 * sizeof(float)) {
+    const psimd_f32 vx = psimd_load_f32(input);
+    input += 4;
+
+    // The rounding trick works only for x >= 0, so we compute the absolute value of x, round it, and restore the sign
+    // at the end. This method works for round-towards-zero because it is an odd function.
+    const psimd_f32 vabsx = psimd_andnotmask_f32(vsign_mask, vx);
+
+    // Compute bitmask for the bits we want to copy from x. Other bits will be copied from the rounded abs(x).
+    // If abs(x) < 2**23 or x is NaN, we want the sign bit from x and the rest from the rounded abs(x).
+    // Otherwise (abs(x) >= 2**23), we want all bits from x.
+    const psimd_s32 vrndmask = vsign_mask | (vabsx >= vmagic_number);
+    // Addition-subtraction trick with the magic number to cause rounding to integer for abs(x).
+    // Note: the result is valid only for 0 <= abs(x) < 2**23.
+    // Note: addition-subtraction implicitly converts SNaN inputs to QNaNs.
+    const psimd_f32 vrndabsx = psimd_sub_f32(psimd_add_f32(vabsx, vmagic_number), vmagic_number);
+
+    // Compute adjustment to be subtracted from the rounded-to-nearest-even abs(x) value.
+    // Adjustment is one if the rounded value is greater than the abs(x) value and zero otherwise (including NaN input).
+    const psimd_f32 vadjustment = psimd_andmask_f32(vrndabsx > vabsx, vone);
+    // Adjust abs(x) rounded to nearest-even via the addition-subtraction trick to get abs(x) rounded down.
+    // Note: subtraction implicitly converts SNaN inputs to QNaNs.
+    const psimd_f32 vflrabsx = psimd_sub_f32(vrndabsx, vadjustment);
+
+    // Combine abs(x) rounded down via addition-subtraction trick with adjustment and the input x value.
+    // For abs(x) < 2**23, the result is abs(x) rounded via addition-subtraction trick with the sign of x.
+    // For NaN inputs, the result is x converted to QNaN as a side-effect of addition-subtraction and adjustment.
+    // For abs(x) >= 2**23, the result is x itself.
+    const psimd_f32 vy = psimd_blend_f32(vrndmask, vx, vflrabsx);
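+    // E.g. for x = -2.7f: vflrabsx = 2.0f and vrndmask has only the sign bit set, so the blend takes the sign bit
+    // from x and the magnitude bits from vflrabsx, producing -2.0f = trunc(-2.7f).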
+
+    psimd_store_f32(output, vy);
+    output += 4;
+  }
+}
diff --git a/src/math/roundz-scalar-addsub.c b/src/math/roundz-scalar-addsub.c
new file mode 100644
index 0000000..f036d0f
--- /dev/null
+++ b/src/math/roundz-scalar-addsub.c
@@ -0,0 +1,57 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+#include <math.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_f32_roundz__scalar_addsub(
+    size_t n,
+    const float* input,
+    float* output)
+{
+  assert(n % sizeof(float) == 0);
+
+  // Addition of this number to a floating-point number x causes rounding of the result to an integer. Then this magic
+  // number is subtracted back from the result to get the original x rounded to an integer. This trick works only for
+  // 0 <= x < 2**24, but all numbers in the 2**23 <= x < 2**24 range are integers, so we can further restrict it to
+  // 0 <= x < 2**23. Then the upper bound of the validity interval is conveniently the same as the magic number.
+  const float vmagic_number = 0x1.000000p+23f;
+  // Unit constant to decrement absolute values rounded "wrong way" (i.e. away from zero) in the round-to-nearest-even
+  // operation.
+  const float vone = 1.0f;
+
+  for (; n != 0; n -= sizeof(float)) {
+    const float vx = *input++;
+
+    // The rounding trick works only for x >= 0, so we compute the absolute value of x, round it, and restore the sign
+    // at the end. This method works for round-towards-zero because it is an odd function.
+    const float vabsx = fabsf(vx);
+    // Addition-subtraction trick with the magic number to cause rounding to the nearest-even integer for abs(x).
+    // Note: the result is valid only for 0 <= abs(x) < 2**23.
+    // Note: addition-subtraction implicitly converts SNaN inputs to QNaNs.
+    const float vrndabsx = (vabsx + vmagic_number) - vmagic_number;
+
+    // Adjust abs(x) rounded to nearest-even via the addition-subtraction trick to get abs(x) rounded down.
+    // Note: subtraction implicitly converts SNaN inputs to QNaNs.
+    const float vflrabsx = XNN_UNPREDICTABLE(vrndabsx <= vabsx) ? vrndabsx : vrndabsx - vone;
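+    // E.g. abs(x) = 2.7f rounds to 3.0f under round-to-nearest-even; since 3.0 > 2.7, one is subtracted to get 2.0f.
+    // XNN_UNPREDICTABLE marks the predicate as having no dominant outcome, steering codegen towards a branchless select.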
+
+    // Select between the abs(x) rounded down using addition-subtraction trick with adjustment and the abs(x) value.
+    // For abs(x) < 2**23, the result is abs(x) rounded via addition-subtraction trick.
+    // For abs(x) >= 2**23, the result is abs(x) itself (already an integer).
+    // For NaN inputs, the result is abs(x) converted to QNaN as a side-effect of addition-subtraction.
+    const float vabsy = XNN_UNPREDICTABLE(vabsx >= vmagic_number) ? vabsx : vflrabsx;
+    // Restore the sign of the rounded value.
+    const float vy = copysignf(vabsy, vx);
+
+    *output++ = vy;
+  }
+}
diff --git a/src/math/roundz-scalar-cvt.c b/src/math/roundz-scalar-cvt.c
new file mode 100644
index 0000000..2694600
--- /dev/null
+++ b/src/math/roundz-scalar-cvt.c
@@ -0,0 +1,44 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <math.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_f32_roundz__scalar_cvt(
+    size_t n,
+    const float* input,
+    float* output)
+{
+  assert(n % sizeof(float) == 0);
+
+  // Threshold of non-integral values in single-precision floating-point representation.
+  // All inputs above this threshold (by absolute value) are integer numbers.
+  const float vintegral_threshold = 0x1.000000p+23f;
+
+  for (; n != 0; n -= sizeof(float)) {
+    const float vx = *input++;
+
+    // Convert floating-point value x to integer, with rounding towards zero, and then back to floating-point.
+    // Note: the result is valid only for abs(x) < 2**31, but we further restrict its use to abs(x) < 2**23.
+    const float vrndx = (float) (int32_t) vx;
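+    // E.g. (float) (int32_t) -2.7f == -2.0f, while (float) (int32_t) -0.5f == +0.0f: the sign of negative zero is
+    // lost and is restored via copysignf below.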
+    // Compute abs(x) to check if the FP->INT->FP conversion result is valid.
+    const float vabsx = fabsf(vx);
+
+    // Select between the x rounded via FP->INT->FP conversion and the original x value.
+    const float vprey = XNN_UNPREDICTABLE(vabsx < vintegral_threshold) ? vrndx : vx;
+    // Restore the sign of -0.0f lost in the FP->INT->FP conversion.
+    const float vy = copysignf(vprey, vx);
+
+    *output++ = vy;
+  }
+}
diff --git a/src/math/roundz-sse-addsub.c b/src/math/roundz-sse-addsub.c
new file mode 100644
index 0000000..af1a0be
--- /dev/null
+++ b/src/math/roundz-sse-addsub.c
@@ -0,0 +1,68 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <xmmintrin.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_f32_roundz__sse_addsub(
+    size_t n,
+    const float* input,
+    float* output)
+{
+  assert(n % (4 * sizeof(float)) == 0);
+
+  // Mask for all bits of a floating-point number except the sign bit.
+  const __m128 vnonsign_mask = _mm_set1_ps(math_nonsign_mask_f32());
+  // Addition of this number to a floating-point number x causes rounding of the result to an integer. Then this magic
+  // number is subtracted back from the result to get the original x rounded to an integer. This trick works only for
+  // 0 <= x < 2**24, but all numbers in the 2**23 <= x < 2**24 range are integers, so we can further restrict it to
+  // 0 <= x < 2**23. Then the upper bound of the validity interval is conveniently the same as the magic number.
+  const __m128 vmagic_number = _mm_set1_ps(0x1.000000p+23f);
+  // Unit constant to decrement absolute values rounded "wrong way" (i.e. away from zero) in the round-to-nearest-even
+  // operation.
+  const __m128 vone = _mm_set1_ps(1.0f);
+
+  for (; n != 0; n -= 4 * sizeof(float)) {
+    const __m128 vx = _mm_load_ps(input);
+    input += 4;
+
+    // The rounding trick works only for x >= 0, so we compute the absolute value of x, round it, and restore the sign
+    // at the end. This method works for round-towards-zero because it is an odd function.
+    const __m128 vabsx = _mm_and_ps(vx, vnonsign_mask);
+
+    // Compute bitmask for the bits we want to copy from the rounded abs(x). Other bits will be copied from x.
+    // If abs(x) >= 2**23, we want all bits from x.
+    // If abs(x) < 2**23 or x is NaN, we want all but the sign bit from the rounded abs(x) and the sign bit from x.
+    const __m128 vrndmask = _mm_andnot_ps(_mm_cmpge_ps(vabsx, vmagic_number), vnonsign_mask);
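+    // E.g. for x = -2.7f, vrndmask = 0x7FFFFFFF, so the combination below takes the magnitude bits from the rounded
+    // value and the sign bit from x.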
+    // Addition-subtraction trick with the magic number to cause rounding to the nearest-even integer for abs(x).
+    // Note: the result is valid only for 0 <= abs(x) < 2**23.
+    // Note: addition-subtraction implicitly converts SNaN inputs to QNaNs.
+    const __m128 vrndabsx = _mm_sub_ps(_mm_add_ps(vabsx, vmagic_number), vmagic_number);
+
+    // Compute adjustment to be subtracted from the rounded-to-nearest-even abs(x) value.
+    // Adjustment is one if the rounded value is greater than the abs(x) value and zero otherwise (including NaN input).
+    const __m128 vadjustment = _mm_and_ps(vone, _mm_cmpgt_ps(vrndabsx, vabsx));
+    // Adjust abs(x) rounded to nearest-even via the addition-subtraction trick to get abs(x) rounded down.
+    // Note: subtraction implicitly converts SNaN inputs to QNaNs.
+    const __m128 vflrabsx = _mm_sub_ps(vrndabsx, vadjustment);
+
+    // Combine abs(x) rounded down via addition-subtraction trick with adjustment and the input x value.
+    // For abs(x) < 2**23, the result is abs(x) rounded via addition-subtraction trick with the sign of x.
+    // For NaN inputs, the result is x converted to QNaN as a side-effect of addition-subtraction and adjustment.
+    // For abs(x) >= 2**23, the result is x itself.
+    const __m128 vy = _mm_or_ps(_mm_and_ps(vflrabsx, vrndmask), _mm_andnot_ps(vrndmask, vx));
+
+    _mm_store_ps(output, vy);
+    output += 4;
+  }
+}
diff --git a/src/math/roundz-sse2-cvt.c b/src/math/roundz-sse2-cvt.c
new file mode 100644
index 0000000..a4a3bde
--- /dev/null
+++ b/src/math/roundz-sse2-cvt.c
@@ -0,0 +1,56 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_f32_roundz__sse2_cvt(
+    size_t n,
+    const float* input,
+    float* output)
+{
+  assert(n % (4 * sizeof(float)) == 0);
+
+  // This magic number with the bit representation 0x80000000 serves two purposes:
+  // 1. Extract the sign of a floating-point number.
+  // 2. Check whether the input to CVTTPS2DQ (_mm_cvttps_epi32) is out-of-range, which results in a 0x80000000 output.
+  const __m128 vmagic = _mm_set1_ps(-0.0f);
+
+  for (; n != 0; n -= 4 * sizeof(float)) {
+    const __m128 vx = _mm_load_ps(input);
+    input += 4;
+
+    // Extract the sign of the input.
+    // We need the sign to preserve negative zero value, which would otherwise get lost in FP->INT->FP conversion.
+    const __m128 vsignx = _mm_and_ps(vx, vmagic);
+    // Convert floating-point value x to integer, with rounding towards zero.
+    // If x is beyond the [-2**31, 2**31-1] range, or x is NaN, the result is -2**31 (0x80000000).
+    const __m128i vintx = _mm_cvttps_epi32(vx);
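+    // E.g. -2.7f truncates to the integer -2. NaN and out-of-range inputs produce the "integer indefinite" value
+    // 0x80000000, which the bitmask below detects so that such inputs pass through unchanged.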
+
+    // Compute bitmask for out-of-range conversion input.
+    // The bitmask is set to all ones when x is out-of-range for CVTTPS2DQ, and also when x == -2**31. The latter case
+    // is ok, because this x is already an integer, and can be passed to output as is.
+    const __m128 vrndmask = _mm_castsi128_ps(_mm_cmpeq_epi32(vintx, _mm_castps_si128(vmagic)));
+
+    // Convert integer back to floating-point.
+    // We binary OR the result with the sign of x to restore the sign of negative zero.
+    const __m128 vrndx = _mm_or_ps(_mm_cvtepi32_ps(vintx), vsignx);
+
+    // Combine x rounded via conversion to integer and the initial x value.
+    // For -2**31 < x < 2**31, the result is x rounded via conversion to integer.
+    // Otherwise (including NaN inputs), the result is x itself.
+    const __m128 vy = _mm_or_ps(_mm_and_ps(vx, vrndmask), _mm_andnot_ps(vrndmask, vrndx));
+
+    _mm_store_ps(output, vy);
+    output += 4;
+  }
+}
diff --git a/src/math/roundz-sse41.c b/src/math/roundz-sse41.c
new file mode 100644
index 0000000..78cbefe
--- /dev/null
+++ b/src/math/roundz-sse41.c
@@ -0,0 +1,32 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stddef.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/math-stubs.h>
+
+
+void xnn_math_f32_roundz__sse41(
+    size_t n,
+    const float* input,
+    float* output)
+{
+  assert(n % (4 * sizeof(float)) == 0);
+
+  for (; n != 0; n -= 4 * sizeof(float)) {
+    const __m128 vx = _mm_load_ps(input);
+    input += 4;
+
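+    // _MM_FROUND_TO_ZERO selects truncation; _MM_FROUND_NO_EXC suppresses the inexact (precision) exception that
+    // ROUNDPS could otherwise raise for non-integral inputs.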
+    const __m128 vy = _mm_round_ps(vx, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+
+    _mm_store_ps(output, vy);
+    output += 4;
+  }
+}
diff --git a/src/xnnpack/math-stubs.h b/src/xnnpack/math-stubs.h
index 34773f3..a13dac5 100644
--- a/src/xnnpack/math-stubs.h
+++ b/src/xnnpack/math-stubs.h
@@ -44,6 +44,16 @@
 DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_roundne__psimd_addsub)
 DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_roundne__scalar_addsub)
 
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_roundz__neon_addsub)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_roundz__neon_cvt)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_roundz__neonv8)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_roundz__sse_addsub)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_roundz__sse2_cvt)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_roundz__sse41)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_roundz__psimd_addsub)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_roundz__scalar_addsub)
+DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_roundz__scalar_cvt)
+
 DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__neonfma_lut64_p2)
 DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__neonfma_p5)
 DECLARE_F32_UNARY_MATH_FUNCTION(xnn_math_f32_exp__sse2_p5)