QU8 VADD/VADDC microkernel benchmarks

PiperOrigin-RevId: 387702745
diff --git a/BUILD.bazel b/BUILD.bazel
index 9fb5d7f..f3ea058 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -7279,6 +7279,24 @@
 )
 
 xnnpack_benchmark(
+    name = "qu8_vadd_bench",
+    srcs = [
+        "bench/qu8-vadd.cc",
+        "src/xnnpack/AlignedAllocator.h",
+    ] + MICROKERNEL_BENCHMARK_HDRS,
+    deps = MICROKERNEL_BENCHMARK_DEPS,
+)
+
+xnnpack_benchmark(
+    name = "qu8_vaddc_bench",
+    srcs = [
+        "bench/qu8-vaddc.cc",
+        "src/xnnpack/AlignedAllocator.h",
+    ] + MICROKERNEL_BENCHMARK_HDRS,
+    deps = MICROKERNEL_BENCHMARK_DEPS,
+)
+
+xnnpack_benchmark(
     name = "f16_igemm_bench",
     srcs = [
         "bench/f16-igemm.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 20c49a9..6208234 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7239,6 +7239,22 @@
   TARGET_INCLUDE_DIRECTORIES(qu8-requantization-bench PRIVATE . include src)
   TARGET_LINK_LIBRARIES(qu8-requantization-bench PRIVATE benchmark bench-utils cpuinfo fp16 pthreadpool)
 
+  ADD_EXECUTABLE(qu8-vadd-bench bench/qu8-vadd.cc $<TARGET_OBJECTS:all_microkernels>)
+  SET_TARGET_PROPERTIES(qu8-vadd-bench PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)
+  TARGET_INCLUDE_DIRECTORIES(qu8-vadd-bench PRIVATE . include src)
+  TARGET_LINK_LIBRARIES(qu8-vadd-bench PRIVATE benchmark bench-utils cpuinfo fp16 pthreadpool)
+
+  ADD_EXECUTABLE(qu8-vaddc-bench bench/qu8-vaddc.cc $<TARGET_OBJECTS:all_microkernels>)
+  SET_TARGET_PROPERTIES(qu8-vaddc-bench PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)
+  TARGET_INCLUDE_DIRECTORIES(qu8-vaddc-bench PRIVATE . include src)
+  TARGET_LINK_LIBRARIES(qu8-vaddc-bench PRIVATE benchmark bench-utils cpuinfo fp16 pthreadpool)
+
   ADD_EXECUTABLE(rounding-bench bench/rounding.cc $<TARGET_OBJECTS:all_microkernels>)
   SET_TARGET_PROPERTIES(rounding-bench PROPERTIES
     CXX_STANDARD 11
diff --git a/bench/qs8-vadd.cc b/bench/qs8-vadd.cc
index adcf05d..0823874 100644
--- a/bench/qs8-vadd.cc
+++ b/bench/qs8-vadd.cc
@@ -33,7 +33,9 @@
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
-  auto i8rng = std::bind(std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), std::ref(rng));
+  auto i8rng = std::bind(
+    std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
+    std::ref(rng));
 
   std::vector<int8_t, AlignedAllocator<int8_t, 64>> a(num_elements);
   std::vector<int8_t, AlignedAllocator<int8_t, 64>> b(num_elements);
@@ -182,25 +184,25 @@
 
   BENCHMARK_CAPTURE(qs8_vadd, avx_mul32_ld32_x8,
                     xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
-                    xnn_init_qs8_add_minmax_sse4_mul16_params,
+                    xnn_init_qs8_add_minmax_sse4_mul32_params,
                     benchmark::utils::CheckAVX)
     ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)
     ->UseRealTime();
   BENCHMARK_CAPTURE(qs8_vadd, avx_mul32_ld32_x16,
                     xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x16,
-                    xnn_init_qs8_add_minmax_sse4_mul16_params,
+                    xnn_init_qs8_add_minmax_sse4_mul32_params,
                     benchmark::utils::CheckAVX)
     ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)
     ->UseRealTime();
   BENCHMARK_CAPTURE(qs8_vadd, avx_mul32_ld32_x24,
                     xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x24,
-                    xnn_init_qs8_add_minmax_sse4_mul16_params,
+                    xnn_init_qs8_add_minmax_sse4_mul32_params,
                     benchmark::utils::CheckAVX)
     ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)
     ->UseRealTime();
   BENCHMARK_CAPTURE(qs8_vadd, avx_mul32_ld32_x32,
                     xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x32,
-                    xnn_init_qs8_add_minmax_sse4_mul16_params,
+                    xnn_init_qs8_add_minmax_sse4_mul32_params,
                     benchmark::utils::CheckAVX)
     ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)
     ->UseRealTime();
@@ -232,25 +234,25 @@
 
   BENCHMARK_CAPTURE(qs8_vadd, sse41_mul32_ld32_x8,
                     xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x8,
-                    xnn_init_qs8_add_minmax_sse4_mul16_params,
+                    xnn_init_qs8_add_minmax_sse4_mul32_params,
                     benchmark::utils::CheckSSE41)
     ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)
     ->UseRealTime();
   BENCHMARK_CAPTURE(qs8_vadd, sse41_mul32_ld32_x16,
                     xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x16,
-                    xnn_init_qs8_add_minmax_sse4_mul16_params,
+                    xnn_init_qs8_add_minmax_sse4_mul32_params,
                     benchmark::utils::CheckSSE41)
     ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)
     ->UseRealTime();
   BENCHMARK_CAPTURE(qs8_vadd, sse41_mul32_ld32_x24,
                     xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x24,
-                    xnn_init_qs8_add_minmax_sse4_mul16_params,
+                    xnn_init_qs8_add_minmax_sse4_mul32_params,
                     benchmark::utils::CheckSSE41)
     ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)
     ->UseRealTime();
   BENCHMARK_CAPTURE(qs8_vadd, sse41_mul32_ld32_x32,
                     xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x32,
-                    xnn_init_qs8_add_minmax_sse4_mul16_params,
+                    xnn_init_qs8_add_minmax_sse4_mul32_params,
                     benchmark::utils::CheckSSE41)
     ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)
     ->UseRealTime();
diff --git a/bench/qs8-vaddc.cc b/bench/qs8-vaddc.cc
index b9e0d39..2d40bde 100644
--- a/bench/qs8-vaddc.cc
+++ b/bench/qs8-vaddc.cc
@@ -33,7 +33,9 @@
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
-  auto i8rng = std::bind(std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), std::ref(rng));
+  auto i8rng = std::bind(
+    std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
+    std::ref(rng));
 
   std::vector<int8_t, AlignedAllocator<int8_t, 64>> a(num_elements);
   std::vector<int8_t, AlignedAllocator<int8_t, 64>> sum(num_elements);
@@ -181,25 +183,25 @@
 
   BENCHMARK_CAPTURE(qs8_vaddc, avx_mul32_ld32_x8,
                     xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
-                    xnn_init_qs8_add_minmax_sse4_mul16_params,
+                    xnn_init_qs8_add_minmax_sse4_mul32_params,
                     benchmark::utils::CheckAVX)
     ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
     ->UseRealTime();
   BENCHMARK_CAPTURE(qs8_vaddc, avx_mul32_ld32_x16,
                     xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x16,
-                    xnn_init_qs8_add_minmax_sse4_mul16_params,
+                    xnn_init_qs8_add_minmax_sse4_mul32_params,
                     benchmark::utils::CheckAVX)
     ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
     ->UseRealTime();
   BENCHMARK_CAPTURE(qs8_vaddc, avx_mul32_ld32_x24,
                     xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x24,
-                    xnn_init_qs8_add_minmax_sse4_mul16_params,
+                    xnn_init_qs8_add_minmax_sse4_mul32_params,
                     benchmark::utils::CheckAVX)
     ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
     ->UseRealTime();
   BENCHMARK_CAPTURE(qs8_vaddc, avx_mul32_ld32_x32,
                     xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x32,
-                    xnn_init_qs8_add_minmax_sse4_mul16_params,
+                    xnn_init_qs8_add_minmax_sse4_mul32_params,
                     benchmark::utils::CheckAVX)
     ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
     ->UseRealTime();
@@ -231,25 +233,25 @@
 
   BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul32_ld32_x8,
                     xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x8,
-                    xnn_init_qs8_add_minmax_sse4_mul16_params,
+                    xnn_init_qs8_add_minmax_sse4_mul32_params,
                     benchmark::utils::CheckSSE41)
     ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
     ->UseRealTime();
   BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul32_ld32_x16,
                     xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x16,
-                    xnn_init_qs8_add_minmax_sse4_mul16_params,
+                    xnn_init_qs8_add_minmax_sse4_mul32_params,
                     benchmark::utils::CheckSSE41)
     ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
     ->UseRealTime();
   BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul32_ld32_x24,
                     xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x24,
-                    xnn_init_qs8_add_minmax_sse4_mul16_params,
+                    xnn_init_qs8_add_minmax_sse4_mul32_params,
                     benchmark::utils::CheckSSE41)
     ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
     ->UseRealTime();
   BENCHMARK_CAPTURE(qs8_vaddc, sse41_mul32_ld32_x32,
                     xnn_qs8_vaddc_minmax_ukernel__sse41_mul32_ld32_x32,
-                    xnn_init_qs8_add_minmax_sse4_mul16_params,
+                    xnn_init_qs8_add_minmax_sse4_mul32_params,
                     benchmark::utils::CheckSSE41)
     ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
     ->UseRealTime();
diff --git a/bench/qu8-vadd.cc b/bench/qu8-vadd.cc
new file mode 100644
index 0000000..a9378c3
--- /dev/null
+++ b/bench/qu8-vadd.cc
@@ -0,0 +1,219 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <random>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+#include "bench/utils.h"
+
+#include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
+#include <xnnpack/params.h>
+#include <xnnpack/params-init.h>
+#include <xnnpack/vadd.h>
+
+// Benchmarks one QU8 VADD microkernel: each call adds state.range(0) random uint8_t pairs.
+static void qu8_vadd(
+  benchmark::State& state,
+  xnn_qu8_vadd_minmax_ukernel_function vadd,
+  xnn_init_qu8_add_minmax_params_fn init_params,
+  benchmark::utils::IsaCheckFunction isa_check = nullptr)
+{
+  if (isa_check && !isa_check(state)) {  // optional gate: skip the run when the check fails
+    return;
+  }
+  // The first benchmark range argument is the number of elements per kernel call.
+  const size_t num_elements = state.range(0);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto u8rng = std::bind(
+    std::uniform_int_distribution<uint32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
+    std::ref(rng));
+
+  std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> a(num_elements);
+  std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> b(num_elements);
+  std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> sum(num_elements);
+  std::generate(a.begin(), a.end(), std::ref(u8rng));
+  std::generate(b.begin(), b.end(), std::ref(u8rng));
+  // Clamp bounds come from uint8_t: the int8_t limits (-127, 126) are not valid unsigned outputs.
+  union xnn_qu8_add_minmax_params params;
+  init_params(&params,
+    1 /* a zero point */, 1 /* b zero point */, 1 /* output zero point */,
+    0.5f /* a-output scale */, 0.75f /* b-output scale */,
+    std::numeric_limits<uint8_t>::min() + 1, std::numeric_limits<uint8_t>::max() - 1);
+  for (auto _ : state) {
+    vadd(num_elements * sizeof(uint8_t), a.data(), b.data(), sum.data(), &params);
+  }
+
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+  // Elements and bytes are reported as rate counters (benchmark::Counter::kIsRate).
+  const size_t num_elements_per_iteration = num_elements;
+  state.counters["num_elements"] =
+    benchmark::Counter(uint64_t(state.iterations()) * num_elements_per_iteration, benchmark::Counter::kIsRate);
+  // Three uint8_t streams are counted per call: inputs a and b, plus the output sum.
+  const size_t bytes_per_iteration = 3 * num_elements * sizeof(uint8_t);
+  state.counters["bytes"] =
+    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+}
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  BENCHMARK_CAPTURE(qu8_vadd, neon_ld64_x8,
+                    xnn_qu8_vadd_minmax_ukernel__neon_ld64_x8,
+                    xnn_init_qu8_add_minmax_neon_params,
+                    benchmark::utils::CheckNEON)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vadd, neon_ld64_x16,
+                    xnn_qu8_vadd_minmax_ukernel__neon_ld64_x16,
+                    xnn_init_qu8_add_minmax_neon_params,
+                    benchmark::utils::CheckNEON)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  BENCHMARK_CAPTURE(qu8_vadd, avx512skx_mul32_ld128_x16,
+                    xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16,
+                    xnn_init_qu8_add_minmax_avx512_params,
+                    benchmark::utils::CheckAVX512SKX)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vadd, avx512skx_mul32_ld128_x32,
+                    xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x32,
+                    xnn_init_qu8_add_minmax_avx512_params,
+                    benchmark::utils::CheckAVX512SKX)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(qu8_vadd, avx2_mul32_ld64_x8,
+                    xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_x8,
+                    xnn_init_qu8_add_minmax_avx2_params,
+                    benchmark::utils::CheckAVX2)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vadd, avx2_mul32_ld64_x16,
+                    xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_x16,
+                    xnn_init_qu8_add_minmax_avx2_params,
+                    benchmark::utils::CheckAVX2)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(qu8_vadd, xop_mul32_ld32_x8,
+                    xnn_qu8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
+                    xnn_init_qu8_add_minmax_sse4_params,
+                    benchmark::utils::CheckXOP)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vadd, xop_mul32_ld32_x16,
+                    xnn_qu8_vadd_minmax_ukernel__xop_mul32_ld32_x16,
+                    xnn_init_qu8_add_minmax_sse4_params,
+                    benchmark::utils::CheckXOP)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(qu8_vadd, avx_mul16_ld64_x8,
+                    xnn_qu8_vadd_minmax_ukernel__avx_mul16_ld64_x8,
+                    xnn_init_qu8_add_minmax_sse2_params,
+                    benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vadd, avx_mul16_ld64_x16,
+                    xnn_qu8_vadd_minmax_ukernel__avx_mul16_ld64_x16,
+                    xnn_init_qu8_add_minmax_sse2_params,
+                    benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(qu8_vadd, avx_mul32_ld32_x8,
+                    xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
+                    xnn_init_qu8_add_minmax_sse4_params,
+                    benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vadd, avx_mul32_ld32_x16,
+                    xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_x16,
+                    xnn_init_qu8_add_minmax_sse4_params,
+                    benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(qu8_vadd, sse41_mul16_ld64_x8,
+                    xnn_qu8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
+                    xnn_init_qu8_add_minmax_sse2_params,
+                    benchmark::utils::CheckSSE41)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vadd, sse41_mul16_ld64_x16,
+                    xnn_qu8_vadd_minmax_ukernel__sse41_mul16_ld64_x16,
+                    xnn_init_qu8_add_minmax_sse2_params,
+                    benchmark::utils::CheckSSE41)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(qu8_vadd, sse41_mul32_ld32_x8,
+                    xnn_qu8_vadd_minmax_ukernel__sse41_mul32_ld32_x8,
+                    xnn_init_qu8_add_minmax_sse4_params,
+                    benchmark::utils::CheckSSE41)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vadd, sse41_mul32_ld32_x16,
+                    xnn_qu8_vadd_minmax_ukernel__sse41_mul32_ld32_x16,
+                    xnn_init_qu8_add_minmax_sse4_params,
+                    benchmark::utils::CheckSSE41)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(qu8_vadd, sse2_mul16_ld64_x8,
+                    xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
+                    xnn_init_qu8_add_minmax_sse2_params)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vadd, sse2_mul16_ld64_x16,
+                    xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_x16,
+                    xnn_init_qu8_add_minmax_sse2_params)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_WASMSIMD
+  BENCHMARK_CAPTURE(qu8_vadd, wasmsimd_x8,
+                    xnn_qu8_vadd_minmax_ukernel__wasmsimd_x8,
+                    xnn_init_qu8_add_minmax_wasmsimd_params)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vadd, wasmsimd_x16,
+                    xnn_qu8_vadd_minmax_ukernel__wasmsimd_x16,
+                    xnn_init_qu8_add_minmax_wasmsimd_params)
+    ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+#endif  // XNN_ARCH_WASMSIMD
+
+BENCHMARK_CAPTURE(qu8_vadd, scalar_x1,
+                  xnn_qu8_vadd_minmax_ukernel__scalar_x1,
+                  xnn_init_qu8_add_minmax_scalar_params)
+  ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+  ->UseRealTime();
+BENCHMARK_CAPTURE(qu8_vadd, scalar_x2,
+                  xnn_qu8_vadd_minmax_ukernel__scalar_x2,
+                  xnn_init_qu8_add_minmax_scalar_params)
+  ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+  ->UseRealTime();
+BENCHMARK_CAPTURE(qu8_vadd, scalar_x4,
+                  xnn_qu8_vadd_minmax_ukernel__scalar_x4,
+                  xnn_init_qu8_add_minmax_scalar_params)
+  ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+  ->UseRealTime();
+
+#ifndef XNNPACK_BENCHMARK_NO_MAIN
+BENCHMARK_MAIN();
+#endif  // XNNPACK_BENCHMARK_NO_MAIN
diff --git a/bench/qu8-vaddc.cc b/bench/qu8-vaddc.cc
new file mode 100644
index 0000000..119508b
--- /dev/null
+++ b/bench/qu8-vaddc.cc
@@ -0,0 +1,218 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <random>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+#include "bench/utils.h"
+
+#include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
+#include <xnnpack/params.h>
+#include <xnnpack/params-init.h>
+#include <xnnpack/vadd.h>
+
+// Benchmarks one QU8 VADDC microkernel: each call adds one random uint8_t to state.range(0) elements.
+static void qu8_vaddc(
+  benchmark::State& state,
+  xnn_qu8_vadd_minmax_ukernel_function vaddc,
+  xnn_init_qu8_add_minmax_params_fn init_params,
+  benchmark::utils::IsaCheckFunction isa_check = nullptr)
+{
+  if (isa_check && !isa_check(state)) {  // optional gate: skip the run when the check fails
+    return;
+  }
+  // The first benchmark range argument is the number of elements per kernel call.
+  const size_t num_elements = state.range(0);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto u8rng = std::bind(
+    std::uniform_int_distribution<uint32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
+    std::ref(rng));
+
+  std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> a(num_elements);
+  std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> sum(num_elements);
+  std::generate(a.begin(), a.end(), std::ref(u8rng));
+  const uint8_t b = u8rng();
+  // Clamp bounds come from uint8_t: the int8_t limits (-127, 126) are not valid unsigned outputs.
+  union xnn_qu8_add_minmax_params params;
+  init_params(&params,
+    1 /* a zero point */, 1 /* b zero point */, 1 /* output zero point */,
+    0.5f /* a-output scale */, 0.75f /* b-output scale */,
+    std::numeric_limits<uint8_t>::min() + 1, std::numeric_limits<uint8_t>::max() - 1);
+  for (auto _ : state) {
+    vaddc(num_elements * sizeof(uint8_t), a.data(), &b, sum.data(), &params);
+  }
+
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+  // Elements and bytes are reported as rate counters (benchmark::Counter::kIsRate).
+  const size_t num_elements_per_iteration = num_elements;
+  state.counters["num_elements"] =
+    benchmark::Counter(uint64_t(state.iterations()) * num_elements_per_iteration, benchmark::Counter::kIsRate);
+  // Two uint8_t streams are counted per call: input a and output sum; the scalar b is not counted.
+  const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint8_t);
+  state.counters["bytes"] =
+    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+}
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  BENCHMARK_CAPTURE(qu8_vaddc, neon_ld64_x8,
+                    xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x8,
+                    xnn_init_qu8_add_minmax_neon_params,
+                    benchmark::utils::CheckNEON)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vaddc, neon_ld64_x16,
+                    xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16,
+                    xnn_init_qu8_add_minmax_neon_params,
+                    benchmark::utils::CheckNEON)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  BENCHMARK_CAPTURE(qu8_vaddc, avx512skx_mul32_ld128_x16,
+                    xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
+                    xnn_init_qu8_add_minmax_avx512_params,
+                    benchmark::utils::CheckAVX512SKX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vaddc, avx512skx_mul32_ld128_x32,
+                    xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x32,
+                    xnn_init_qu8_add_minmax_avx512_params,
+                    benchmark::utils::CheckAVX512SKX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(qu8_vaddc, avx2_mul32_ld64_x8,
+                    xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x8,
+                    xnn_init_qu8_add_minmax_avx2_params,
+                    benchmark::utils::CheckAVX2)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vaddc, avx2_mul32_ld64_x16,
+                    xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
+                    xnn_init_qu8_add_minmax_avx2_params,
+                    benchmark::utils::CheckAVX2)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(qu8_vaddc, xop_mul32_ld32_x8,
+                    xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
+                    xnn_init_qu8_add_minmax_sse4_params,
+                    benchmark::utils::CheckXOP)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vaddc, xop_mul32_ld32_x16,
+                    xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x16,
+                    xnn_init_qu8_add_minmax_sse4_params,
+                    benchmark::utils::CheckXOP)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(qu8_vaddc, avx_mul16_ld64_x8,
+                    xnn_qu8_vaddc_minmax_ukernel__avx_mul16_ld64_x8,
+                    xnn_init_qu8_add_minmax_sse2_params,
+                    benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vaddc, avx_mul16_ld64_x16,
+                    xnn_qu8_vaddc_minmax_ukernel__avx_mul16_ld64_x16,
+                    xnn_init_qu8_add_minmax_sse2_params,
+                    benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(qu8_vaddc, avx_mul32_ld32_x8,
+                    xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
+                    xnn_init_qu8_add_minmax_sse4_params,
+                    benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vaddc, avx_mul32_ld32_x16,
+                    xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x16,
+                    xnn_init_qu8_add_minmax_sse4_params,
+                    benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(qu8_vaddc, sse41_mul16_ld64_x8,
+                    xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
+                    xnn_init_qu8_add_minmax_sse2_params,
+                    benchmark::utils::CheckSSE41)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vaddc, sse41_mul16_ld64_x16,
+                    xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x16,
+                    xnn_init_qu8_add_minmax_sse2_params,
+                    benchmark::utils::CheckSSE41)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(qu8_vaddc, sse41_mul32_ld32_x8,
+                    xnn_qu8_vaddc_minmax_ukernel__sse41_mul32_ld32_x8,
+                    xnn_init_qu8_add_minmax_sse4_params,
+                    benchmark::utils::CheckSSE41)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vaddc, sse41_mul32_ld32_x16,
+                    xnn_qu8_vaddc_minmax_ukernel__sse41_mul32_ld32_x16,
+                    xnn_init_qu8_add_minmax_sse4_params,
+                    benchmark::utils::CheckSSE41)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(qu8_vaddc, sse2_mul16_ld64_x8,
+                    xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
+                    xnn_init_qu8_add_minmax_sse2_params)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vaddc, sse2_mul16_ld64_x16,
+                    xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16,
+                    xnn_init_qu8_add_minmax_sse2_params)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_WASMSIMD
+  BENCHMARK_CAPTURE(qu8_vaddc, wasmsimd_x8,
+                    xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x8,
+                    xnn_init_qu8_add_minmax_wasmsimd_params)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(qu8_vaddc, wasmsimd_x16,
+                    xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x16,
+                    xnn_init_qu8_add_minmax_wasmsimd_params)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+    ->UseRealTime();
+#endif  // XNN_ARCH_WASMSIMD
+
+BENCHMARK_CAPTURE(qu8_vaddc, scalar_x1,
+                  xnn_qu8_vaddc_minmax_ukernel__scalar_x1,
+                  xnn_init_qu8_add_minmax_scalar_params)
+  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+  ->UseRealTime();
+BENCHMARK_CAPTURE(qu8_vaddc, scalar_x2,
+                  xnn_qu8_vaddc_minmax_ukernel__scalar_x2,
+                  xnn_init_qu8_add_minmax_scalar_params)
+  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+  ->UseRealTime();
+BENCHMARK_CAPTURE(qu8_vaddc, scalar_x4,
+                  xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
+                  xnn_init_qu8_add_minmax_scalar_params)
+  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+  ->UseRealTime();
+
+#ifndef XNNPACK_BENCHMARK_NO_MAIN
+BENCHMARK_MAIN();
+#endif  // XNNPACK_BENCHMARK_NO_MAIN