FP16 DWCONV microkernel

PiperOrigin-RevId: 314854152
diff --git a/BUILD.bazel b/BUILD.bazel
index 4bb4c83..5c95329 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -998,6 +998,18 @@
 AARCH64_NEONFP16ARITH_UKERNELS = [
     "src/f16-clamp/gen/neonfp16arith-x16.c",
     "src/f16-clamp/gen/neonfp16arith-x8.c",
+    "src/f16-dwconv/gen/up16x25-minmax-neonfp16arith-acc2.c",
+    "src/f16-dwconv/gen/up16x25-minmax-neonfp16arith.c",
+    "src/f16-dwconv/gen/up16x4-minmax-neonfp16arith-acc2.c",
+    "src/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c",
+    "src/f16-dwconv/gen/up16x9-minmax-neonfp16arith-acc2.c",
+    "src/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c",
+    "src/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c",
+    "src/f16-dwconv/gen/up8x25-minmax-neonfp16arith.c",
+    "src/f16-dwconv/gen/up8x4-minmax-neonfp16arith-acc2.c",
+    "src/f16-dwconv/gen/up8x4-minmax-neonfp16arith.c",
+    "src/f16-dwconv/gen/up8x9-minmax-neonfp16arith-acc2.c",
+    "src/f16-dwconv/gen/up8x9-minmax-neonfp16arith.c",
     "src/f16-gavgpool/7p7x-minmax-neonfp16arith-c8.c",
     "src/f16-gavgpool/7x-minmax-neonfp16arith-c8.c",
     "src/f16-hswish/gen/neonfp16arith-x16.c",
@@ -2906,6 +2918,17 @@
 )
 
 xnnpack_benchmark(
+    name = "f16_dwconv_bench",
+    srcs = [
+        "bench/f16-dwconv.cc",
+        "bench/dwconv.h",
+        "bench/google/dwconv.h",
+        "src/xnnpack/AlignedAllocator.h",
+    ] + MICROKERNEL_BENCHMARK_HDRS,
+    deps = MICROKERNEL_BENCHMARK_DEPS + [":indirection"],
+)
+
+xnnpack_benchmark(
     name = "f32_dwconv_bench",
     srcs = [
         "bench/f32-dwconv.cc",
@@ -3433,6 +3456,16 @@
 )
 
 xnnpack_unit_test(
+    name = "f16_dwconv_minmax_test",
+    srcs = [
+        "test/f16-dwconv-minmax.cc",
+        "test/dwconv-microkernel-tester.h",
+        "src/xnnpack/AlignedAllocator.h",
+    ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
+    deps = MICROKERNEL_TEST_DEPS,
+)
+
+xnnpack_unit_test(
     name = "f32_dwconv_minmax_test",
     srcs = [
         "test/f32-dwconv-minmax.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ed8979d..dd9a8d4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1001,6 +1001,18 @@
 SET(XNNPACK_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS
   src/f16-clamp/gen/neonfp16arith-x16.c
   src/f16-clamp/gen/neonfp16arith-x8.c
+  src/f16-dwconv/gen/up16x25-minmax-neonfp16arith-acc2.c
+  src/f16-dwconv/gen/up16x25-minmax-neonfp16arith.c
+  src/f16-dwconv/gen/up16x4-minmax-neonfp16arith-acc2.c
+  src/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c
+  src/f16-dwconv/gen/up16x9-minmax-neonfp16arith-acc2.c
+  src/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c
+  src/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c
+  src/f16-dwconv/gen/up8x25-minmax-neonfp16arith.c
+  src/f16-dwconv/gen/up8x4-minmax-neonfp16arith-acc2.c
+  src/f16-dwconv/gen/up8x4-minmax-neonfp16arith.c
+  src/f16-dwconv/gen/up8x9-minmax-neonfp16arith-acc2.c
+  src/f16-dwconv/gen/up8x9-minmax-neonfp16arith.c
   src/f16-gavgpool/7p7x-minmax-neonfp16arith-c8.c
   src/f16-gavgpool/7x-minmax-neonfp16arith-c8.c
   src/f16-hswish/gen/neonfp16arith-x16.c
@@ -2304,6 +2316,15 @@
   TARGET_LINK_LIBRARIES(f32-dwconv-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
   ADD_TEST(f32-dwconv-test f32-dwconv-test)
 
+  ADD_EXECUTABLE(f16-dwconv-minmax-test test/f16-dwconv-minmax.cc)
+  SET_TARGET_PROPERTIES(f16-dwconv-minmax-test PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)
+  TARGET_INCLUDE_DIRECTORIES(f16-dwconv-minmax-test PRIVATE src test)
+  TARGET_LINK_LIBRARIES(f16-dwconv-minmax-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
+  ADD_TEST(f16-dwconv-minmax-test f16-dwconv-minmax-test)
+
   ADD_EXECUTABLE(f32-dwconv-minmax-test test/f32-dwconv-minmax.cc)
   SET_TARGET_PROPERTIES(f32-dwconv-minmax-test PROPERTIES
     CXX_STANDARD 11
@@ -3223,6 +3244,15 @@
   TARGET_INCLUDE_DIRECTORIES(f32-dwconv-chw-bench PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
   TARGET_LINK_LIBRARIES(f32-dwconv-chw-bench PRIVATE XNNPACK cpuinfo fp16 benchmark bench-utils)
 
+  ADD_EXECUTABLE(f16-dwconv-bench bench/f16-dwconv.cc)
+  SET_TARGET_PROPERTIES(f16-dwconv-bench PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS YES)
+  TARGET_INCLUDE_DIRECTORIES(f16-dwconv-bench PRIVATE src)
+  TARGET_INCLUDE_DIRECTORIES(f16-dwconv-bench PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
+  TARGET_LINK_LIBRARIES(f16-dwconv-bench PRIVATE XNNPACK cpuinfo fp16 benchmark bench-utils)
+
   ADD_EXECUTABLE(f32-dwconv-bench bench/f32-dwconv.cc)
   SET_TARGET_PROPERTIES(f32-dwconv-bench PROPERTIES
     CXX_STANDARD 11
diff --git a/bench/f16-dwconv.cc b/bench/f16-dwconv.cc
new file mode 100644
index 0000000..926a914
--- /dev/null
+++ b/bench/f16-dwconv.cc
@@ -0,0 +1,234 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <algorithm>
+#include <cfloat>
+#include <cmath>
+#include <functional>
+#include <random>
+#include <vector>
+
+#include <cpuinfo.h>
+
+#include <benchmark/benchmark.h>
+#include <fp16/fp16.h>
+#include "bench/dwconv.h"
+#include "bench/utils.h"
+#include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
+#include <xnnpack/dwconv.h>
+#include <xnnpack/indirection.h>
+#include <xnnpack/operator.h>
+#include <xnnpack/pack.h>
+#include <xnnpack/params-init.h>
+#include <xnnpack/params.h>
+
+
+static void DWConvBenchmark(benchmark::State& state,
+  xnn_f16_dwconv_minmax_unipass_ukernel_function dwconv,
+  uint32_t cr, uint32_t kr,
+  benchmark::utils::IsaCheckFunction isa_check = nullptr)
+{
+  if (!cpuinfo_initialize()) {
+    state.SkipWithError("cpuinfo initialization failed");
+    return;
+  }
+  if (isa_check && !isa_check(state)) {
+    return;
+  }
+
+  const size_t input_height = state.range(0);
+  const size_t input_width = state.range(1);
+  const size_t kernel_height = state.range(2);
+  const size_t kernel_width = state.range(3);
+  const size_t padding_height = state.range(4);
+  const size_t padding_width = state.range(5);
+  const size_t subsampling = state.range(6);
+  const size_t dilation = state.range(7);
+  const size_t channels = state.range(8);
+
+  const size_t kernel_size = kernel_height * kernel_width;
+  if (kernel_size != kr) {
+    state.SkipWithError("kernel size mismatch");
+    return;
+  }
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
+  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
+  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
+  const size_t padding_left = padding_width / 2;
+  const size_t padding_top = padding_height / 2;
+  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
+  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
+  const size_t output_size = output_height * output_width;
+  const size_t step_width = dilation == 1 ? subsampling : kernel_width;
+  const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
+
+  const size_t c_stride = benchmark::utils::RoundUp<size_t>(channels, cr);
+
+  std::vector<uint16_t> a(channels * input_height * input_width + XNN_EXTRA_BYTES / sizeof(uint16_t));
+  std::generate(a.begin(), a.end(), std::ref(f16rng));
+  std::vector<uint16_t> k(channels * kernel_height * kernel_width);
+  std::generate(k.begin(), k.end(), std::ref(f16rng));
+  std::vector<uint16_t> b(channels);
+  std::generate(b.begin(), b.end(), std::ref(f16rng));
+
+  std::vector<uint16_t> z(channels + XNN_EXTRA_BYTES / sizeof(uint16_t));
+
+  const size_t w_elements = (kernel_size + 1) * c_stride;
+  const size_t i_elements = output_height * step_height;
+  const size_t c_elements = output_size * channels;
+  const size_t num_buffers = 1 +
+    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
+      sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);
+
+  std::vector<uint16_t, AlignedAllocator<uint16_t, 32>> w(w_elements * num_buffers);
+  std::fill(w.begin(), w.end(), 0);
+  xnn_pack_f16_dwconv_ghw_w(kernel_height, kernel_width, channels, cr,
+      k.data(), b.data(), w.data());
+  for (size_t n = 1; n < num_buffers; n++) {
+    std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
+  }
+
+  std::vector<const uint16_t*> i(i_elements * num_buffers);
+  xnn_operator convolution_op = { };
+  convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
+  convolution_op.input              = a.data();
+  convolution_op.input_pixel_stride = channels;
+  convolution_op.zero_buffer        = z.data();
+  convolution_op.batch_size         = 1;
+  convolution_op.input_height       = input_height;
+  convolution_op.input_width        = input_width;
+  convolution_op.output_height      = output_height;
+  convolution_op.output_width       = output_width;
+  convolution_op.kernel_height      = kernel_height;
+  convolution_op.kernel_width       = kernel_width;
+  convolution_op.stride_height      = subsampling;
+  convolution_op.stride_width       = subsampling;
+  convolution_op.dilation_height    = dilation;
+  convolution_op.dilation_width     = dilation;
+  convolution_op.padding_top        = padding_top;
+  convolution_op.padding_left       = padding_left;
+
+  xnn_indirection_init_dwconv2d(&convolution_op, 0, step_height, step_width, 1 /* log2(sizeof(uint16_t)) */);
+  for (size_t n = 1; n < num_buffers; n++) {
+    std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
+  }
+
+  std::vector<uint16_t> c(c_elements * num_buffers);
+  std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);
+
+  xnn_f16_minmax_params params =
+    xnn_init_f16_minmax_params(UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* +inf */);
+
+  size_t buffer_index = 0;
+  for (auto _ : state) {
+    state.PauseTiming();
+    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
+    buffer_index = (buffer_index + 1) % num_buffers;
+    state.ResumeTiming();
+
+    for (uint32_t y = 0; y < output_height; y++) {
+      dwconv(channels, output_width,
+        reinterpret_cast<const void**>(i.data() + buffer_index * i_elements + step_height * y),
+        w.data() + buffer_index * w_elements,
+        c.data() + buffer_index * c_elements + y * output_width * channels,
+        kernel_height * step_width * sizeof(void*), 0,
+        0, z.data(), &params);
+    }
+  }
+
+  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+  state.counters["FLOPS"] = benchmark::Counter(
+    uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
+    benchmark::Counter::kIsRate);
+
+  state.counters["BYTES"] = benchmark::Counter(
+    uint64_t(state.iterations()) * (output_size + input_height * input_width + kernel_size + 1 /* bias */) * channels * sizeof(uint16_t),
+    benchmark::Counter::kIsRate);
+}
+
+#if XNN_ARCH_ARM64
+  static void f16_dwconv_8x25__neonfp16arith_acc2(benchmark::State& state, const char* net) {
+    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2, 8, 25,
+    benchmark::utils::CheckNEONFP16ARITH);
+  }
+
+  static void f16_dwconv_8x25__neonfp16arith(benchmark::State& state, const char* net) {
+    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith, 8, 25,
+    benchmark::utils::CheckNEONFP16ARITH);
+  }
+
+  static void f16_dwconv_8x4__neonfp16arith_acc2(benchmark::State& state, const char* net) {
+    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2, 8, 4,
+    benchmark::utils::CheckNEONFP16ARITH);
+  }
+
+  static void f16_dwconv_8x4__neonfp16arith(benchmark::State& state, const char* net) {
+    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith, 8, 4,
+    benchmark::utils::CheckNEONFP16ARITH);
+  }
+
+  static void f16_dwconv_8x9__neonfp16arith_acc2(benchmark::State& state, const char* net) {
+    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2, 8, 9,
+    benchmark::utils::CheckNEONFP16ARITH);
+  }
+
+  static void f16_dwconv_8x9__neonfp16arith(benchmark::State& state, const char* net) {
+    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith, 8, 9,
+    benchmark::utils::CheckNEONFP16ARITH);
+  }
+
+  static void f16_dwconv_16x25__neonfp16arith_acc2(benchmark::State& state, const char* net) {
+    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2, 16, 25,
+    benchmark::utils::CheckNEONFP16ARITH);
+  }
+
+  static void f16_dwconv_16x25__neonfp16arith(benchmark::State& state, const char* net) {
+    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith, 16, 25,
+    benchmark::utils::CheckNEONFP16ARITH);
+  }
+
+  static void f16_dwconv_16x4__neonfp16arith_acc2(benchmark::State& state, const char* net) {
+    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2, 16, 4,
+    benchmark::utils::CheckNEONFP16ARITH);
+  }
+
+  static void f16_dwconv_16x4__neonfp16arith(benchmark::State& state, const char* net) {
+    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith, 16, 4,
+    benchmark::utils::CheckNEONFP16ARITH);
+  }
+
+  static void f16_dwconv_16x9__neonfp16arith_acc2(benchmark::State& state, const char* net) {
+    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2, 16, 9,
+    benchmark::utils::CheckNEONFP16ARITH);
+  }
+
+  static void f16_dwconv_16x9__neonfp16arith(benchmark::State& state, const char* net) {
+    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith, 16, 9,
+    benchmark::utils::CheckNEONFP16ARITH);
+  }
+
+  BENCHMARK_DWCONV(f16_dwconv_8x25__neonfp16arith_acc2)
+  BENCHMARK_DWCONV(f16_dwconv_8x25__neonfp16arith)
+  BENCHMARK_DWCONV(f16_dwconv_8x4__neonfp16arith_acc2)
+  BENCHMARK_DWCONV(f16_dwconv_8x4__neonfp16arith)
+  BENCHMARK_DWCONV(f16_dwconv_8x9__neonfp16arith_acc2)
+  BENCHMARK_DWCONV(f16_dwconv_8x9__neonfp16arith)
+  BENCHMARK_DWCONV(f16_dwconv_16x25__neonfp16arith_acc2)
+  BENCHMARK_DWCONV(f16_dwconv_16x25__neonfp16arith)
+  BENCHMARK_DWCONV(f16_dwconv_16x4__neonfp16arith_acc2)
+  BENCHMARK_DWCONV(f16_dwconv_16x4__neonfp16arith)
+  BENCHMARK_DWCONV(f16_dwconv_16x9__neonfp16arith_acc2)
+  BENCHMARK_DWCONV(f16_dwconv_16x9__neonfp16arith)
+#endif  // XNN_ARCH_ARM64
+
+#ifndef XNNPACK_BENCHMARK_NO_MAIN
+BENCHMARK_MAIN();
+#endif
diff --git a/scripts/generate-f16-dwconv.sh b/scripts/generate-f16-dwconv.sh
new file mode 100755
index 0000000..c81a818
--- /dev/null
+++ b/scripts/generate-f16-dwconv.sh
@@ -0,0 +1,25 @@
+#!/bin/sh
+# Copyright 2020 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+################################### ARM NEON ##################################
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=4 -D ACCUMULATORS=1 -o src/f16-dwconv/gen/up8x4-minmax-neonfp16arith.c
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=4 -D ACCUMULATORS=2 -o src/f16-dwconv/gen/up8x4-minmax-neonfp16arith-acc2.c
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=4 -D ACCUMULATORS=1 -o src/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=4 -D ACCUMULATORS=2 -o src/f16-dwconv/gen/up16x4-minmax-neonfp16arith-acc2.c
+
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=9 -D ACCUMULATORS=1 -o src/f16-dwconv/gen/up8x9-minmax-neonfp16arith.c
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=9 -D ACCUMULATORS=2 -o src/f16-dwconv/gen/up8x9-minmax-neonfp16arith-acc2.c
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -o src/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -o src/f16-dwconv/gen/up16x9-minmax-neonfp16arith-acc2.c
+
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f16-dwconv/gen/up8x25-minmax-neonfp16arith.c
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=8  -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f16-dwconv/gen/up16x25-minmax-neonfp16arith.c
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f16-dwconv/gen/up16x25-minmax-neonfp16arith-acc2.c
+
+################################## Unit tests #################################
+tools/generate-dwconv-test.py --spec test/f16-dwconv-minmax.yaml --output test/f16-dwconv-minmax.cc
diff --git a/src/f16-dwconv/gen/up16x25-minmax-neonfp16arith-acc2.c b/src/f16-dwconv/gen/up16x25-minmax-neonfp16arith-acc2.c
new file mode 100644
index 0000000..162f3e6
--- /dev/null
+++ b/src/f16-dwconv/gen/up16x25-minmax-neonfp16arith-acc2.c
@@ -0,0 +1,596 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-dwconv/up-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output_ptr,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  __fp16* output = ( __fp16*) output_ptr;
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);
+  do {
+    const __fp16* i0 = (const __fp16*) input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+      i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+    }
+    const __fp16* i1 = (const __fp16*) input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+      i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+    }
+    const __fp16* i2 = (const __fp16*) input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+      i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+    }
+    const __fp16* i3 = (const __fp16*) input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+      i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+    }
+    const __fp16* i4 = (const __fp16*) input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != (const __fp16*) zero) {
+      i4 = (const __fp16*) ((uintptr_t) i4 + input_offset);
+    }
+    const __fp16* i5 = (const __fp16*) input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != (const __fp16*) zero) {
+      i5 = (const __fp16*) ((uintptr_t) i5 + input_offset);
+    }
+    const __fp16* i6 = (const __fp16*) input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != (const __fp16*) zero) {
+      i6 = (const __fp16*) ((uintptr_t) i6 + input_offset);
+    }
+    const __fp16* i7 = (const __fp16*) input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != (const __fp16*) zero) {
+      i7 = (const __fp16*) ((uintptr_t) i7 + input_offset);
+    }
+    const __fp16* i8 = (const __fp16*) input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != (const __fp16*) zero) {
+      i8 = (const __fp16*) ((uintptr_t) i8 + input_offset);
+    }
+    const __fp16* i9 = (const __fp16*) input[9];
+    assert(i9 != NULL);
+    if XNN_UNPREDICTABLE(i9 != (const __fp16*) zero) {
+      i9 = (const __fp16*) ((uintptr_t) i9 + input_offset);
+    }
+    const __fp16* i10 = (const __fp16*) input[10];
+    assert(i10 != NULL);
+    if XNN_UNPREDICTABLE(i10 != (const __fp16*) zero) {
+      i10 = (const __fp16*) ((uintptr_t) i10 + input_offset);
+    }
+    const __fp16* i11 = (const __fp16*) input[11];
+    assert(i11 != NULL);
+    if XNN_UNPREDICTABLE(i11 != (const __fp16*) zero) {
+      i11 = (const __fp16*) ((uintptr_t) i11 + input_offset);
+    }
+    const __fp16* i12 = (const __fp16*) input[12];
+    assert(i12 != NULL);
+    if XNN_UNPREDICTABLE(i12 != (const __fp16*) zero) {
+      i12 = (const __fp16*) ((uintptr_t) i12 + input_offset);
+    }
+    const __fp16* i13 = (const __fp16*) input[13];
+    assert(i13 != NULL);
+    if XNN_UNPREDICTABLE(i13 != (const __fp16*) zero) {
+      i13 = (const __fp16*) ((uintptr_t) i13 + input_offset);
+    }
+    const __fp16* i14 = (const __fp16*) input[14];
+    assert(i14 != NULL);
+    if XNN_UNPREDICTABLE(i14 != (const __fp16*) zero) {
+      i14 = (const __fp16*) ((uintptr_t) i14 + input_offset);
+    }
+    const __fp16* i15 = (const __fp16*) input[15];
+    assert(i15 != NULL);
+    if XNN_UNPREDICTABLE(i15 != (const __fp16*) zero) {
+      i15 = (const __fp16*) ((uintptr_t) i15 + input_offset);
+    }
+    const __fp16* i16 = (const __fp16*) input[16];
+    assert(i16 != NULL);
+    if XNN_UNPREDICTABLE(i16 != (const __fp16*) zero) {
+      i16 = (const __fp16*) ((uintptr_t) i16 + input_offset);
+    }
+    const __fp16* i17 = (const __fp16*) input[17];
+    assert(i17 != NULL);
+    if XNN_UNPREDICTABLE(i17 != (const __fp16*) zero) {
+      i17 = (const __fp16*) ((uintptr_t) i17 + input_offset);
+    }
+    const __fp16* i18 = (const __fp16*) input[18];
+    assert(i18 != NULL);
+    if XNN_UNPREDICTABLE(i18 != (const __fp16*) zero) {
+      i18 = (const __fp16*) ((uintptr_t) i18 + input_offset);
+    }
+    const __fp16* i19 = (const __fp16*) input[19];
+    assert(i19 != NULL);
+    if XNN_UNPREDICTABLE(i19 != (const __fp16*) zero) {
+      i19 = (const __fp16*) ((uintptr_t) i19 + input_offset);
+    }
+    const __fp16* i20 = (const __fp16*) input[20];
+    assert(i20 != NULL);
+    if XNN_UNPREDICTABLE(i20 != (const __fp16*) zero) {
+      i20 = (const __fp16*) ((uintptr_t) i20 + input_offset);
+    }
+    const __fp16* i21 = (const __fp16*) input[21];
+    assert(i21 != NULL);
+    if XNN_UNPREDICTABLE(i21 != (const __fp16*) zero) {
+      i21 = (const __fp16*) ((uintptr_t) i21 + input_offset);
+    }
+    const __fp16* i22 = (const __fp16*) input[22];
+    assert(i22 != NULL);
+    if XNN_UNPREDICTABLE(i22 != (const __fp16*) zero) {
+      i22 = (const __fp16*) ((uintptr_t) i22 + input_offset);
+    }
+    const __fp16* i23 = (const __fp16*) input[23];
+    assert(i23 != NULL);
+    if XNN_UNPREDICTABLE(i23 != (const __fp16*) zero) {
+      i23 = (const __fp16*) ((uintptr_t) i23 + input_offset);
+    }
+    const __fp16* i24 = (const __fp16*) input[24];
+    assert(i24 != NULL);
+    if XNN_UNPREDICTABLE(i24 != (const __fp16*) zero) {
+      i24 = (const __fp16*) ((uintptr_t) i24 + input_offset);
+    }
+
+    input = (const void**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const __fp16* w = (const __fp16*) weights;
+    for (; c >= 16; c -= 16) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+      float16x8_t vacc89ABCDEFp0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk0x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi0x89ABCDEF, vk0x89ABCDEF);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk1x89ABCDEF = vld1q_f16(w); w += 8;
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+      float16x8_t vacc89ABCDEFp1 = vmulq_f16(vi1x89ABCDEF, vk1x89ABCDEF);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk2x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi2x89ABCDEF, vk2x89ABCDEF);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk3x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi3x89ABCDEF, vk3x89ABCDEF);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+      const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk4x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi4x89ABCDEF, vk4x89ABCDEF);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+      const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk5x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi5x89ABCDEF, vk5x89ABCDEF);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+      const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk6x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi6x89ABCDEF, vk6x89ABCDEF);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+      const float16x8_t vi7x89ABCDEF = vld1q_f16(i7); i7 += 8;
+      const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk7x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi7x89ABCDEF, vk7x89ABCDEF);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+      const float16x8_t vi8x89ABCDEF = vld1q_f16(i8); i8 += 8;
+      const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk8x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi8x89ABCDEF, vk8x89ABCDEF);
+
+      const float16x8_t vi9x01234567 = vld1q_f16(i9); i9 += 8;
+      const float16x8_t vi9x89ABCDEF = vld1q_f16(i9); i9 += 8;
+      const float16x8_t vk9x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk9x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi9x01234567, vk9x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi9x89ABCDEF, vk9x89ABCDEF);
+
+      const float16x8_t vi10x01234567 = vld1q_f16(i10); i10 += 8;
+      const float16x8_t vi10x89ABCDEF = vld1q_f16(i10); i10 += 8;
+      const float16x8_t vk10x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk10x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi10x89ABCDEF, vk10x89ABCDEF);
+
+      const float16x8_t vi11x01234567 = vld1q_f16(i11); i11 += 8;
+      const float16x8_t vi11x89ABCDEF = vld1q_f16(i11); i11 += 8;
+      const float16x8_t vk11x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk11x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi11x01234567, vk11x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi11x89ABCDEF, vk11x89ABCDEF);
+
+      const float16x8_t vi12x01234567 = vld1q_f16(i12); i12 += 8;
+      const float16x8_t vi12x89ABCDEF = vld1q_f16(i12); i12 += 8;
+      const float16x8_t vk12x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk12x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi12x89ABCDEF, vk12x89ABCDEF);
+
+      const float16x8_t vi13x01234567 = vld1q_f16(i13); i13 += 8;
+      const float16x8_t vi13x89ABCDEF = vld1q_f16(i13); i13 += 8;
+      const float16x8_t vk13x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk13x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi13x01234567, vk13x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi13x89ABCDEF, vk13x89ABCDEF);
+
+      const float16x8_t vi14x01234567 = vld1q_f16(i14); i14 += 8;
+      const float16x8_t vi14x89ABCDEF = vld1q_f16(i14); i14 += 8;
+      const float16x8_t vk14x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk14x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi14x89ABCDEF, vk14x89ABCDEF);
+
+      const float16x8_t vi15x01234567 = vld1q_f16(i15); i15 += 8;
+      const float16x8_t vi15x89ABCDEF = vld1q_f16(i15); i15 += 8;
+      const float16x8_t vk15x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk15x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi15x01234567, vk15x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi15x89ABCDEF, vk15x89ABCDEF);
+
+      const float16x8_t vi16x01234567 = vld1q_f16(i16); i16 += 8;
+      const float16x8_t vi16x89ABCDEF = vld1q_f16(i16); i16 += 8;
+      const float16x8_t vk16x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk16x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi16x89ABCDEF, vk16x89ABCDEF);
+
+      const float16x8_t vi17x01234567 = vld1q_f16(i17); i17 += 8;
+      const float16x8_t vi17x89ABCDEF = vld1q_f16(i17); i17 += 8;
+      const float16x8_t vk17x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk17x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi17x01234567, vk17x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi17x89ABCDEF, vk17x89ABCDEF);
+
+      const float16x8_t vi18x01234567 = vld1q_f16(i18); i18 += 8;
+      const float16x8_t vi18x89ABCDEF = vld1q_f16(i18); i18 += 8;
+      const float16x8_t vk18x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk18x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi18x89ABCDEF, vk18x89ABCDEF);
+
+      const float16x8_t vi19x01234567 = vld1q_f16(i19); i19 += 8;
+      const float16x8_t vi19x89ABCDEF = vld1q_f16(i19); i19 += 8;
+      const float16x8_t vk19x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk19x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi19x01234567, vk19x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi19x89ABCDEF, vk19x89ABCDEF);
+
+      const float16x8_t vi20x01234567 = vld1q_f16(i20); i20 += 8;
+      const float16x8_t vi20x89ABCDEF = vld1q_f16(i20); i20 += 8;
+      const float16x8_t vk20x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk20x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi20x89ABCDEF, vk20x89ABCDEF);
+
+      const float16x8_t vi21x01234567 = vld1q_f16(i21); i21 += 8;
+      const float16x8_t vi21x89ABCDEF = vld1q_f16(i21); i21 += 8;
+      const float16x8_t vk21x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk21x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi21x01234567, vk21x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi21x89ABCDEF, vk21x89ABCDEF);
+
+      const float16x8_t vi22x01234567 = vld1q_f16(i22); i22 += 8;
+      const float16x8_t vi22x89ABCDEF = vld1q_f16(i22); i22 += 8;
+      const float16x8_t vk22x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk22x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi22x89ABCDEF, vk22x89ABCDEF);
+
+      const float16x8_t vi23x01234567 = vld1q_f16(i23); i23 += 8;
+      const float16x8_t vi23x89ABCDEF = vld1q_f16(i23); i23 += 8;
+      const float16x8_t vk23x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk23x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi23x01234567, vk23x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi23x89ABCDEF, vk23x89ABCDEF);
+
+      const float16x8_t vi24x01234567 = vld1q_f16(i24); i24 += 8;
+      const float16x8_t vi24x89ABCDEF = vld1q_f16(i24); i24 += 8;
+      const float16x8_t vk24x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk24x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi24x89ABCDEF, vk24x89ABCDEF);
+
+      // Add up all accumulators to vacc0123456789ABCDEFp0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+      vacc89ABCDEFp0 = vaddq_f16(vacc89ABCDEFp0, vacc89ABCDEFp1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      float16x8_t vacc89ABCDEF = vmaxq_f16(vacc89ABCDEFp0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+      vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+      vst1q_f16(output, vacc89ABCDEF); output += 8;
+    }
+    for (; c >= 8; c -= 8) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w + 8);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w + 24);
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w + 40);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w + 56);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      const float16x8_t vk4x01234567 = vld1q_f16(w + 72);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      const float16x8_t vk5x01234567 = vld1q_f16(w + 88);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      const float16x8_t vk6x01234567 = vld1q_f16(w + 104);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+      const float16x8_t vk7x01234567 = vld1q_f16(w + 120);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+      const float16x8_t vk8x01234567 = vld1q_f16(w + 136);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+      const float16x8_t vi9x01234567 = vld1q_f16(i9); i9 += 8;
+      const float16x8_t vk9x01234567 = vld1q_f16(w + 152);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi9x01234567, vk9x01234567);
+
+      const float16x8_t vi10x01234567 = vld1q_f16(i10); i10 += 8;
+      const float16x8_t vk10x01234567 = vld1q_f16(w + 168);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+
+      const float16x8_t vi11x01234567 = vld1q_f16(i11); i11 += 8;
+      const float16x8_t vk11x01234567 = vld1q_f16(w + 184);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi11x01234567, vk11x01234567);
+
+      const float16x8_t vi12x01234567 = vld1q_f16(i12); i12 += 8;
+      const float16x8_t vk12x01234567 = vld1q_f16(w + 200);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+
+      const float16x8_t vi13x01234567 = vld1q_f16(i13); i13 += 8;
+      const float16x8_t vk13x01234567 = vld1q_f16(w + 216);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi13x01234567, vk13x01234567);
+
+      const float16x8_t vi14x01234567 = vld1q_f16(i14); i14 += 8;
+      const float16x8_t vk14x01234567 = vld1q_f16(w + 232);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+
+      const float16x8_t vi15x01234567 = vld1q_f16(i15); i15 += 8;
+      const float16x8_t vk15x01234567 = vld1q_f16(w + 248);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi15x01234567, vk15x01234567);
+
+      const float16x8_t vi16x01234567 = vld1q_f16(i16); i16 += 8;
+      const float16x8_t vk16x01234567 = vld1q_f16(w + 264);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+
+      const float16x8_t vi17x01234567 = vld1q_f16(i17); i17 += 8;
+      const float16x8_t vk17x01234567 = vld1q_f16(w + 280);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi17x01234567, vk17x01234567);
+
+      const float16x8_t vi18x01234567 = vld1q_f16(i18); i18 += 8;
+      const float16x8_t vk18x01234567 = vld1q_f16(w + 296);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+
+      const float16x8_t vi19x01234567 = vld1q_f16(i19); i19 += 8;
+      const float16x8_t vk19x01234567 = vld1q_f16(w + 312);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi19x01234567, vk19x01234567);
+
+      const float16x8_t vi20x01234567 = vld1q_f16(i20); i20 += 8;
+      const float16x8_t vk20x01234567 = vld1q_f16(w + 328);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+
+      const float16x8_t vi21x01234567 = vld1q_f16(i21); i21 += 8;
+      const float16x8_t vk21x01234567 = vld1q_f16(w + 344);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi21x01234567, vk21x01234567);
+
+      const float16x8_t vi22x01234567 = vld1q_f16(i22); i22 += 8;
+      const float16x8_t vk22x01234567 = vld1q_f16(w + 360);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+
+      const float16x8_t vi23x01234567 = vld1q_f16(i23); i23 += 8;
+      const float16x8_t vk23x01234567 = vld1q_f16(w + 376);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi23x01234567, vk23x01234567);
+
+      const float16x8_t vi24x01234567 = vld1q_f16(i24); i24 += 8;
+      const float16x8_t vk24x01234567 = vld1q_f16(w + 392);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+
+      // Add up all accumulators to vacc01234567p0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w);
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0);
+      const float16x8_t vk0x01234567 = vld1q_f16(w + 16);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1);
+      const float16x8_t vk1x01234567 = vld1q_f16(w + 32);
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2);
+      const float16x8_t vk2x01234567 = vld1q_f16(w + 48);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3);
+      const float16x8_t vk3x01234567 = vld1q_f16(w + 64);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4);
+      const float16x8_t vk4x01234567 = vld1q_f16(w + 80);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5);
+      const float16x8_t vk5x01234567 = vld1q_f16(w + 96);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6);
+      const float16x8_t vk6x01234567 = vld1q_f16(w + 112);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7);
+      const float16x8_t vk7x01234567 = vld1q_f16(w + 128);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8);
+      const float16x8_t vk8x01234567 = vld1q_f16(w + 144);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+      const float16x8_t vi9x01234567 = vld1q_f16(i9);
+      const float16x8_t vk9x01234567 = vld1q_f16(w + 160);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi9x01234567, vk9x01234567);
+
+      const float16x8_t vi10x01234567 = vld1q_f16(i10);
+      const float16x8_t vk10x01234567 = vld1q_f16(w + 176);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+
+      const float16x8_t vi11x01234567 = vld1q_f16(i11);
+      const float16x8_t vk11x01234567 = vld1q_f16(w + 192);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi11x01234567, vk11x01234567);
+
+      const float16x8_t vi12x01234567 = vld1q_f16(i12);
+      const float16x8_t vk12x01234567 = vld1q_f16(w + 208);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+
+      const float16x8_t vi13x01234567 = vld1q_f16(i13);
+      const float16x8_t vk13x01234567 = vld1q_f16(w + 224);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi13x01234567, vk13x01234567);
+
+      const float16x8_t vi14x01234567 = vld1q_f16(i14);
+      const float16x8_t vk14x01234567 = vld1q_f16(w + 240);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+
+      const float16x8_t vi15x01234567 = vld1q_f16(i15);
+      const float16x8_t vk15x01234567 = vld1q_f16(w + 256);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi15x01234567, vk15x01234567);
+
+      const float16x8_t vi16x01234567 = vld1q_f16(i16);
+      const float16x8_t vk16x01234567 = vld1q_f16(w + 272);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+
+      const float16x8_t vi17x01234567 = vld1q_f16(i17);
+      const float16x8_t vk17x01234567 = vld1q_f16(w + 288);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi17x01234567, vk17x01234567);
+
+      const float16x8_t vi18x01234567 = vld1q_f16(i18);
+      const float16x8_t vk18x01234567 = vld1q_f16(w + 304);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+
+      const float16x8_t vi19x01234567 = vld1q_f16(i19);
+      const float16x8_t vk19x01234567 = vld1q_f16(w + 320);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi19x01234567, vk19x01234567);
+
+      const float16x8_t vi20x01234567 = vld1q_f16(i20);
+      const float16x8_t vk20x01234567 = vld1q_f16(w + 336);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+
+      const float16x8_t vi21x01234567 = vld1q_f16(i21);
+      const float16x8_t vk21x01234567 = vld1q_f16(w + 352);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi21x01234567, vk21x01234567);
+
+      const float16x8_t vi22x01234567 = vld1q_f16(i22);
+      const float16x8_t vk22x01234567 = vld1q_f16(w + 368);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+
+      const float16x8_t vi23x01234567 = vld1q_f16(i23);
+      const float16x8_t vk23x01234567 = vld1q_f16(w + 384);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi23x01234567, vk23x01234567);
+
+      const float16x8_t vi24x01234567 = vld1q_f16(i24);
+      const float16x8_t vk24x01234567 = vld1q_f16(w + 400);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+
+      // Add up all accumulators to vacc01234567p0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+      if (c & 4) {
+        vst1_f16(output, vacc0123); output += 4;
+        vacc0123 = vget_high_f16(vacc01234567);
+      }
+      if (c & 2) {
+        vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+        vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+      }
+      if (c & 1) {
+        vst1_lane_f16(output, vacc0123, 0); output += 1;
+      }
+    }
+
+    output = (__fp16*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up16x25-minmax-neonfp16arith.c b/src/f16-dwconv/gen/up16x25-minmax-neonfp16arith.c
new file mode 100644
index 0000000..83dd9d8
--- /dev/null
+++ b/src/f16-dwconv/gen/up16x25-minmax-neonfp16arith.c
@@ -0,0 +1,589 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-dwconv/up-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output_ptr,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  __fp16* output = ( __fp16*) output_ptr;
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);
+  do {
+    const __fp16* i0 = (const __fp16*) input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+      i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+    }
+    const __fp16* i1 = (const __fp16*) input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+      i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+    }
+    const __fp16* i2 = (const __fp16*) input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+      i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+    }
+    const __fp16* i3 = (const __fp16*) input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+      i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+    }
+    const __fp16* i4 = (const __fp16*) input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != (const __fp16*) zero) {
+      i4 = (const __fp16*) ((uintptr_t) i4 + input_offset);
+    }
+    const __fp16* i5 = (const __fp16*) input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != (const __fp16*) zero) {
+      i5 = (const __fp16*) ((uintptr_t) i5 + input_offset);
+    }
+    const __fp16* i6 = (const __fp16*) input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != (const __fp16*) zero) {
+      i6 = (const __fp16*) ((uintptr_t) i6 + input_offset);
+    }
+    const __fp16* i7 = (const __fp16*) input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != (const __fp16*) zero) {
+      i7 = (const __fp16*) ((uintptr_t) i7 + input_offset);
+    }
+    const __fp16* i8 = (const __fp16*) input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != (const __fp16*) zero) {
+      i8 = (const __fp16*) ((uintptr_t) i8 + input_offset);
+    }
+    const __fp16* i9 = (const __fp16*) input[9];
+    assert(i9 != NULL);
+    if XNN_UNPREDICTABLE(i9 != (const __fp16*) zero) {
+      i9 = (const __fp16*) ((uintptr_t) i9 + input_offset);
+    }
+    const __fp16* i10 = (const __fp16*) input[10];
+    assert(i10 != NULL);
+    if XNN_UNPREDICTABLE(i10 != (const __fp16*) zero) {
+      i10 = (const __fp16*) ((uintptr_t) i10 + input_offset);
+    }
+    const __fp16* i11 = (const __fp16*) input[11];
+    assert(i11 != NULL);
+    if XNN_UNPREDICTABLE(i11 != (const __fp16*) zero) {
+      i11 = (const __fp16*) ((uintptr_t) i11 + input_offset);
+    }
+    const __fp16* i12 = (const __fp16*) input[12];
+    assert(i12 != NULL);
+    if XNN_UNPREDICTABLE(i12 != (const __fp16*) zero) {
+      i12 = (const __fp16*) ((uintptr_t) i12 + input_offset);
+    }
+    const __fp16* i13 = (const __fp16*) input[13];
+    assert(i13 != NULL);
+    if XNN_UNPREDICTABLE(i13 != (const __fp16*) zero) {
+      i13 = (const __fp16*) ((uintptr_t) i13 + input_offset);
+    }
+    const __fp16* i14 = (const __fp16*) input[14];
+    assert(i14 != NULL);
+    if XNN_UNPREDICTABLE(i14 != (const __fp16*) zero) {
+      i14 = (const __fp16*) ((uintptr_t) i14 + input_offset);
+    }
+    const __fp16* i15 = (const __fp16*) input[15];
+    assert(i15 != NULL);
+    if XNN_UNPREDICTABLE(i15 != (const __fp16*) zero) {
+      i15 = (const __fp16*) ((uintptr_t) i15 + input_offset);
+    }
+    const __fp16* i16 = (const __fp16*) input[16];
+    assert(i16 != NULL);
+    if XNN_UNPREDICTABLE(i16 != (const __fp16*) zero) {
+      i16 = (const __fp16*) ((uintptr_t) i16 + input_offset);
+    }
+    const __fp16* i17 = (const __fp16*) input[17];
+    assert(i17 != NULL);
+    if XNN_UNPREDICTABLE(i17 != (const __fp16*) zero) {
+      i17 = (const __fp16*) ((uintptr_t) i17 + input_offset);
+    }
+    const __fp16* i18 = (const __fp16*) input[18];
+    assert(i18 != NULL);
+    if XNN_UNPREDICTABLE(i18 != (const __fp16*) zero) {
+      i18 = (const __fp16*) ((uintptr_t) i18 + input_offset);
+    }
+    const __fp16* i19 = (const __fp16*) input[19];
+    assert(i19 != NULL);
+    if XNN_UNPREDICTABLE(i19 != (const __fp16*) zero) {
+      i19 = (const __fp16*) ((uintptr_t) i19 + input_offset);
+    }
+    const __fp16* i20 = (const __fp16*) input[20];
+    assert(i20 != NULL);
+    if XNN_UNPREDICTABLE(i20 != (const __fp16*) zero) {
+      i20 = (const __fp16*) ((uintptr_t) i20 + input_offset);
+    }
+    const __fp16* i21 = (const __fp16*) input[21];
+    assert(i21 != NULL);
+    if XNN_UNPREDICTABLE(i21 != (const __fp16*) zero) {
+      i21 = (const __fp16*) ((uintptr_t) i21 + input_offset);
+    }
+    const __fp16* i22 = (const __fp16*) input[22];
+    assert(i22 != NULL);
+    if XNN_UNPREDICTABLE(i22 != (const __fp16*) zero) {
+      i22 = (const __fp16*) ((uintptr_t) i22 + input_offset);
+    }
+    const __fp16* i23 = (const __fp16*) input[23];
+    assert(i23 != NULL);
+    if XNN_UNPREDICTABLE(i23 != (const __fp16*) zero) {
+      i23 = (const __fp16*) ((uintptr_t) i23 + input_offset);
+    }
+    const __fp16* i24 = (const __fp16*) input[24];
+    assert(i24 != NULL);
+    if XNN_UNPREDICTABLE(i24 != (const __fp16*) zero) {
+      i24 = (const __fp16*) ((uintptr_t) i24 + input_offset);
+    }
+
+    input = (const void**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const __fp16* w = (const __fp16*) weights;
+    for (; c >= 16; c -= 16) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+      float16x8_t vacc89ABCDEFp0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk0x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi0x89ABCDEF, vk0x89ABCDEF);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk1x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi1x89ABCDEF, vk1x89ABCDEF);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk2x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi2x89ABCDEF, vk2x89ABCDEF);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk3x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi3x89ABCDEF, vk3x89ABCDEF);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+      const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk4x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi4x89ABCDEF, vk4x89ABCDEF);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+      const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk5x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi5x89ABCDEF, vk5x89ABCDEF);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+      const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk6x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi6x89ABCDEF, vk6x89ABCDEF);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+      const float16x8_t vi7x89ABCDEF = vld1q_f16(i7); i7 += 8;
+      const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk7x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi7x89ABCDEF, vk7x89ABCDEF);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+      const float16x8_t vi8x89ABCDEF = vld1q_f16(i8); i8 += 8;
+      const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk8x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi8x89ABCDEF, vk8x89ABCDEF);
+
+      const float16x8_t vi9x01234567 = vld1q_f16(i9); i9 += 8;
+      const float16x8_t vi9x89ABCDEF = vld1q_f16(i9); i9 += 8;
+      const float16x8_t vk9x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk9x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi9x01234567, vk9x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi9x89ABCDEF, vk9x89ABCDEF);
+
+      const float16x8_t vi10x01234567 = vld1q_f16(i10); i10 += 8;
+      const float16x8_t vi10x89ABCDEF = vld1q_f16(i10); i10 += 8;
+      const float16x8_t vk10x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk10x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi10x89ABCDEF, vk10x89ABCDEF);
+
+      const float16x8_t vi11x01234567 = vld1q_f16(i11); i11 += 8;
+      const float16x8_t vi11x89ABCDEF = vld1q_f16(i11); i11 += 8;
+      const float16x8_t vk11x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk11x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi11x01234567, vk11x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi11x89ABCDEF, vk11x89ABCDEF);
+
+      const float16x8_t vi12x01234567 = vld1q_f16(i12); i12 += 8;
+      const float16x8_t vi12x89ABCDEF = vld1q_f16(i12); i12 += 8;
+      const float16x8_t vk12x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk12x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi12x89ABCDEF, vk12x89ABCDEF);
+
+      const float16x8_t vi13x01234567 = vld1q_f16(i13); i13 += 8;
+      const float16x8_t vi13x89ABCDEF = vld1q_f16(i13); i13 += 8;
+      const float16x8_t vk13x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk13x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi13x01234567, vk13x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi13x89ABCDEF, vk13x89ABCDEF);
+
+      const float16x8_t vi14x01234567 = vld1q_f16(i14); i14 += 8;
+      const float16x8_t vi14x89ABCDEF = vld1q_f16(i14); i14 += 8;
+      const float16x8_t vk14x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk14x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi14x89ABCDEF, vk14x89ABCDEF);
+
+      const float16x8_t vi15x01234567 = vld1q_f16(i15); i15 += 8;
+      const float16x8_t vi15x89ABCDEF = vld1q_f16(i15); i15 += 8;
+      const float16x8_t vk15x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk15x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi15x01234567, vk15x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi15x89ABCDEF, vk15x89ABCDEF);
+
+      const float16x8_t vi16x01234567 = vld1q_f16(i16); i16 += 8;
+      const float16x8_t vi16x89ABCDEF = vld1q_f16(i16); i16 += 8;
+      const float16x8_t vk16x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk16x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi16x89ABCDEF, vk16x89ABCDEF);
+
+      const float16x8_t vi17x01234567 = vld1q_f16(i17); i17 += 8;
+      const float16x8_t vi17x89ABCDEF = vld1q_f16(i17); i17 += 8;
+      const float16x8_t vk17x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk17x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi17x01234567, vk17x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi17x89ABCDEF, vk17x89ABCDEF);
+
+      const float16x8_t vi18x01234567 = vld1q_f16(i18); i18 += 8;
+      const float16x8_t vi18x89ABCDEF = vld1q_f16(i18); i18 += 8;
+      const float16x8_t vk18x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk18x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi18x89ABCDEF, vk18x89ABCDEF);
+
+      const float16x8_t vi19x01234567 = vld1q_f16(i19); i19 += 8;
+      const float16x8_t vi19x89ABCDEF = vld1q_f16(i19); i19 += 8;
+      const float16x8_t vk19x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk19x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi19x01234567, vk19x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi19x89ABCDEF, vk19x89ABCDEF);
+
+      const float16x8_t vi20x01234567 = vld1q_f16(i20); i20 += 8;
+      const float16x8_t vi20x89ABCDEF = vld1q_f16(i20); i20 += 8;
+      const float16x8_t vk20x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk20x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi20x89ABCDEF, vk20x89ABCDEF);
+
+      const float16x8_t vi21x01234567 = vld1q_f16(i21); i21 += 8;
+      const float16x8_t vi21x89ABCDEF = vld1q_f16(i21); i21 += 8;
+      const float16x8_t vk21x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk21x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi21x01234567, vk21x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi21x89ABCDEF, vk21x89ABCDEF);
+
+      const float16x8_t vi22x01234567 = vld1q_f16(i22); i22 += 8;
+      const float16x8_t vi22x89ABCDEF = vld1q_f16(i22); i22 += 8;
+      const float16x8_t vk22x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk22x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi22x89ABCDEF, vk22x89ABCDEF);
+
+      const float16x8_t vi23x01234567 = vld1q_f16(i23); i23 += 8;
+      const float16x8_t vi23x89ABCDEF = vld1q_f16(i23); i23 += 8;
+      const float16x8_t vk23x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk23x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi23x01234567, vk23x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi23x89ABCDEF, vk23x89ABCDEF);
+
+      const float16x8_t vi24x01234567 = vld1q_f16(i24); i24 += 8;
+      const float16x8_t vi24x89ABCDEF = vld1q_f16(i24); i24 += 8;
+      const float16x8_t vk24x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk24x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi24x89ABCDEF, vk24x89ABCDEF);
+
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      float16x8_t vacc89ABCDEF = vmaxq_f16(vacc89ABCDEFp0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+      vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+      vst1q_f16(output, vacc89ABCDEF); output += 8;
+    }
+    for (; c >= 8; c -= 8) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w + 8);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w + 24);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w + 40);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w + 56);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      const float16x8_t vk4x01234567 = vld1q_f16(w + 72);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      const float16x8_t vk5x01234567 = vld1q_f16(w + 88);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      const float16x8_t vk6x01234567 = vld1q_f16(w + 104);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+      const float16x8_t vk7x01234567 = vld1q_f16(w + 120);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+      const float16x8_t vk8x01234567 = vld1q_f16(w + 136);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+      const float16x8_t vi9x01234567 = vld1q_f16(i9); i9 += 8;
+      const float16x8_t vk9x01234567 = vld1q_f16(w + 152);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi9x01234567, vk9x01234567);
+
+      const float16x8_t vi10x01234567 = vld1q_f16(i10); i10 += 8;
+      const float16x8_t vk10x01234567 = vld1q_f16(w + 168);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+
+      const float16x8_t vi11x01234567 = vld1q_f16(i11); i11 += 8;
+      const float16x8_t vk11x01234567 = vld1q_f16(w + 184);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi11x01234567, vk11x01234567);
+
+      const float16x8_t vi12x01234567 = vld1q_f16(i12); i12 += 8;
+      const float16x8_t vk12x01234567 = vld1q_f16(w + 200);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+
+      const float16x8_t vi13x01234567 = vld1q_f16(i13); i13 += 8;
+      const float16x8_t vk13x01234567 = vld1q_f16(w + 216);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi13x01234567, vk13x01234567);
+
+      const float16x8_t vi14x01234567 = vld1q_f16(i14); i14 += 8;
+      const float16x8_t vk14x01234567 = vld1q_f16(w + 232);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+
+      const float16x8_t vi15x01234567 = vld1q_f16(i15); i15 += 8;
+      const float16x8_t vk15x01234567 = vld1q_f16(w + 248);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi15x01234567, vk15x01234567);
+
+      const float16x8_t vi16x01234567 = vld1q_f16(i16); i16 += 8;
+      const float16x8_t vk16x01234567 = vld1q_f16(w + 264);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+
+      const float16x8_t vi17x01234567 = vld1q_f16(i17); i17 += 8;
+      const float16x8_t vk17x01234567 = vld1q_f16(w + 280);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi17x01234567, vk17x01234567);
+
+      const float16x8_t vi18x01234567 = vld1q_f16(i18); i18 += 8;
+      const float16x8_t vk18x01234567 = vld1q_f16(w + 296);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+
+      const float16x8_t vi19x01234567 = vld1q_f16(i19); i19 += 8;
+      const float16x8_t vk19x01234567 = vld1q_f16(w + 312);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi19x01234567, vk19x01234567);
+
+      const float16x8_t vi20x01234567 = vld1q_f16(i20); i20 += 8;
+      const float16x8_t vk20x01234567 = vld1q_f16(w + 328);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+
+      const float16x8_t vi21x01234567 = vld1q_f16(i21); i21 += 8;
+      const float16x8_t vk21x01234567 = vld1q_f16(w + 344);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi21x01234567, vk21x01234567);
+
+      const float16x8_t vi22x01234567 = vld1q_f16(i22); i22 += 8;
+      const float16x8_t vk22x01234567 = vld1q_f16(w + 360);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+
+      const float16x8_t vi23x01234567 = vld1q_f16(i23); i23 += 8;
+      const float16x8_t vk23x01234567 = vld1q_f16(w + 376);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi23x01234567, vk23x01234567);
+
+      const float16x8_t vi24x01234567 = vld1q_f16(i24); i24 += 8;
+      const float16x8_t vk24x01234567 = vld1q_f16(w + 392);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w);
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0);
+      const float16x8_t vk0x01234567 = vld1q_f16(w + 16);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1);
+      const float16x8_t vk1x01234567 = vld1q_f16(w + 32);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2);
+      const float16x8_t vk2x01234567 = vld1q_f16(w + 48);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3);
+      const float16x8_t vk3x01234567 = vld1q_f16(w + 64);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4);
+      const float16x8_t vk4x01234567 = vld1q_f16(w + 80);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5);
+      const float16x8_t vk5x01234567 = vld1q_f16(w + 96);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6);
+      const float16x8_t vk6x01234567 = vld1q_f16(w + 112);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7);
+      const float16x8_t vk7x01234567 = vld1q_f16(w + 128);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8);
+      const float16x8_t vk8x01234567 = vld1q_f16(w + 144);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+      const float16x8_t vi9x01234567 = vld1q_f16(i9);
+      const float16x8_t vk9x01234567 = vld1q_f16(w + 160);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi9x01234567, vk9x01234567);
+
+      const float16x8_t vi10x01234567 = vld1q_f16(i10);
+      const float16x8_t vk10x01234567 = vld1q_f16(w + 176);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+
+      const float16x8_t vi11x01234567 = vld1q_f16(i11);
+      const float16x8_t vk11x01234567 = vld1q_f16(w + 192);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi11x01234567, vk11x01234567);
+
+      const float16x8_t vi12x01234567 = vld1q_f16(i12);
+      const float16x8_t vk12x01234567 = vld1q_f16(w + 208);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+
+      const float16x8_t vi13x01234567 = vld1q_f16(i13);
+      const float16x8_t vk13x01234567 = vld1q_f16(w + 224);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi13x01234567, vk13x01234567);
+
+      const float16x8_t vi14x01234567 = vld1q_f16(i14);
+      const float16x8_t vk14x01234567 = vld1q_f16(w + 240);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+
+      const float16x8_t vi15x01234567 = vld1q_f16(i15);
+      const float16x8_t vk15x01234567 = vld1q_f16(w + 256);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi15x01234567, vk15x01234567);
+
+      const float16x8_t vi16x01234567 = vld1q_f16(i16);
+      const float16x8_t vk16x01234567 = vld1q_f16(w + 272);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+
+      const float16x8_t vi17x01234567 = vld1q_f16(i17);
+      const float16x8_t vk17x01234567 = vld1q_f16(w + 288);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi17x01234567, vk17x01234567);
+
+      const float16x8_t vi18x01234567 = vld1q_f16(i18);
+      const float16x8_t vk18x01234567 = vld1q_f16(w + 304);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+
+      const float16x8_t vi19x01234567 = vld1q_f16(i19);
+      const float16x8_t vk19x01234567 = vld1q_f16(w + 320);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi19x01234567, vk19x01234567);
+
+      const float16x8_t vi20x01234567 = vld1q_f16(i20);
+      const float16x8_t vk20x01234567 = vld1q_f16(w + 336);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+
+      const float16x8_t vi21x01234567 = vld1q_f16(i21);
+      const float16x8_t vk21x01234567 = vld1q_f16(w + 352);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi21x01234567, vk21x01234567);
+
+      const float16x8_t vi22x01234567 = vld1q_f16(i22);
+      const float16x8_t vk22x01234567 = vld1q_f16(w + 368);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+
+      const float16x8_t vi23x01234567 = vld1q_f16(i23);
+      const float16x8_t vk23x01234567 = vld1q_f16(w + 384);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi23x01234567, vk23x01234567);
+
+      const float16x8_t vi24x01234567 = vld1q_f16(i24);
+      const float16x8_t vk24x01234567 = vld1q_f16(w + 400);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+      if (c & 4) {
+        vst1_f16(output, vacc0123); output += 4;
+        vacc0123 = vget_high_f16(vacc01234567);
+      }
+      if (c & 2) {
+        vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+        vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+      }
+      if (c & 1) {
+        vst1_lane_f16(output, vacc0123, 0); output += 1;
+      }
+    }
+
+    output = (__fp16*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up16x4-minmax-neonfp16arith-acc2.c b/src/f16-dwconv/gen/up16x4-minmax-neonfp16arith-acc2.c
new file mode 100644
index 0000000..016daba
--- /dev/null
+++ b/src/f16-dwconv/gen/up16x4-minmax-neonfp16arith-acc2.c
@@ -0,0 +1,176 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-dwconv/up-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+// FP16 depthwise-convolution microkernel: 4 kernel taps, up to 16 channels per
+// main-loop iteration, two interleaved accumulator chains ("acc2") to break the
+// FMA dependency chain, with min/max output clamping.
+// NOTE(review): this file is generated from src/f16-dwconv/up-neonfp16arith.c.in
+// (see header above) — edit the template and regenerate rather than hand-editing.
+void xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output_ptr,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  __fp16* output = ( __fp16*) output_ptr;
+  // Broadcast the clamping bounds once per call.
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);
+  do {
+    // Resolve the 4 input row pointers for this output pixel. A row equal to
+    // `zero` is the shared zero buffer (implicit padding) and must not be
+    // shifted by input_offset.
+    const __fp16* i0 = (const __fp16*) input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+      i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+    }
+    const __fp16* i1 = (const __fp16*) input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+      i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+    }
+    const __fp16* i2 = (const __fp16*) input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+      i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+    }
+    const __fp16* i3 = (const __fp16*) input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+      i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+    }
+
+    input = (const void**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    // Packed weight layout per 16-channel group: bias[16], k0[16], k1[16],
+    // k2[16], k3[16] — the main loop below walks w linearly through it.
+    const __fp16* w = (const __fp16*) weights;
+    for (; c >= 16; c -= 16) {
+      // Start both accumulator chains from the bias.
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+      float16x8_t vacc89ABCDEFp0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk0x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi0x89ABCDEF, vk0x89ABCDEF);
+
+      // Tap 1 starts the second accumulator chain (p1) with a multiply.
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk1x89ABCDEF = vld1q_f16(w); w += 8;
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+      float16x8_t vacc89ABCDEFp1 = vmulq_f16(vi1x89ABCDEF, vk1x89ABCDEF);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk2x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi2x89ABCDEF, vk2x89ABCDEF);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk3x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi3x89ABCDEF, vk3x89ABCDEF);
+
+      // Fold the p1 accumulator chains into vacc01234567p0 / vacc89ABCDEFp0.
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+      vacc89ABCDEFp0 = vaddq_f16(vacc89ABCDEFp0, vacc89ABCDEFp1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      float16x8_t vacc89ABCDEF = vmaxq_f16(vacc89ABCDEFp0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+      vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+      vst1q_f16(output, vacc89ABCDEF); output += 8;
+    }
+    // 8-channel path (runs at most once after the 16-wide loop). Only w's bias
+    // advance moves the pointer; the taps are read at fixed offsets into the
+    // current 16-channel-packed group (spacing of 16 per tap).
+    for (; c >= 8; c -= 8) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w + 8);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w + 24);
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w + 40);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w + 56);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+      // Add up all accumulators to vacc01234567p0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+    }
+    // Remainder path (1..7 channels): compute a full 8-lane vector, then store
+    // it piecewise. Tap offsets (w + 16/32/48/64) stay consistent with the
+    // packed layout whether or not the 8-channel loop above ran (which only
+    // advanced w by 8).
+    if XNN_UNLIKELY(c != 0) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w);
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0);
+      const float16x8_t vk0x01234567 = vld1q_f16(w + 16);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1);
+      const float16x8_t vk1x01234567 = vld1q_f16(w + 32);
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2);
+      const float16x8_t vk2x01234567 = vld1q_f16(w + 48);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3);
+      const float16x8_t vk3x01234567 = vld1q_f16(w + 64);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+      // Add up all accumulators to vacc01234567p0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      // Store 4 / 2 / 1 lanes according to the low bits of c; the 2-lane store
+      // goes through u32 with an unaligned-store hint.
+      float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+      if (c & 4) {
+        vst1_f16(output, vacc0123); output += 4;
+        vacc0123 = vget_high_f16(vacc01234567);
+      }
+      if (c & 2) {
+        vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+        vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+      }
+      if (c & 1) {
+        vst1_lane_f16(output, vacc0123, 0); output += 1;
+      }
+    }
+
+    output = (__fp16*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c b/src/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c
new file mode 100644
index 0000000..a72d2e9
--- /dev/null
+++ b/src/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c
@@ -0,0 +1,169 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-dwconv/up-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+// FP16 depthwise-convolution microkernel: 4 kernel taps, up to 16 channels per
+// main-loop iteration, single accumulator chain per register, with min/max
+// output clamping. Same structure as the _acc2 variant but every tap FMAs into
+// the one accumulator pair.
+// NOTE(review): this file is generated from src/f16-dwconv/up-neonfp16arith.c.in
+// (see header above) — edit the template and regenerate rather than hand-editing.
+void xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output_ptr,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  __fp16* output = ( __fp16*) output_ptr;
+  // Broadcast the clamping bounds once per call.
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);
+  do {
+    // Resolve the 4 input row pointers for this output pixel. A row equal to
+    // `zero` is the shared zero buffer (implicit padding) and must not be
+    // shifted by input_offset.
+    const __fp16* i0 = (const __fp16*) input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+      i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+    }
+    const __fp16* i1 = (const __fp16*) input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+      i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+    }
+    const __fp16* i2 = (const __fp16*) input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+      i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+    }
+    const __fp16* i3 = (const __fp16*) input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+      i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+    }
+
+    input = (const void**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    // Packed weight layout per 16-channel group: bias[16], k0[16], k1[16],
+    // k2[16], k3[16] — the main loop below walks w linearly through it.
+    const __fp16* w = (const __fp16*) weights;
+    for (; c >= 16; c -= 16) {
+      // Start both accumulator registers from the bias.
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+      float16x8_t vacc89ABCDEFp0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk0x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi0x89ABCDEF, vk0x89ABCDEF);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk1x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi1x89ABCDEF, vk1x89ABCDEF);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk2x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi2x89ABCDEF, vk2x89ABCDEF);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk3x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi3x89ABCDEF, vk3x89ABCDEF);
+
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      float16x8_t vacc89ABCDEF = vmaxq_f16(vacc89ABCDEFp0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+      vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+      vst1q_f16(output, vacc89ABCDEF); output += 8;
+    }
+    // 8-channel path (runs at most once after the 16-wide loop). Only w's bias
+    // advance moves the pointer; the taps are read at fixed offsets into the
+    // current 16-channel-packed group (spacing of 16 per tap).
+    for (; c >= 8; c -= 8) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w + 8);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w + 24);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w + 40);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w + 56);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+    }
+    // Remainder path (1..7 channels): compute a full 8-lane vector, then store
+    // it piecewise. Tap offsets (w + 16/32/48/64) stay consistent with the
+    // packed layout whether or not the 8-channel loop above ran (which only
+    // advanced w by 8).
+    if XNN_UNLIKELY(c != 0) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w);
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0);
+      const float16x8_t vk0x01234567 = vld1q_f16(w + 16);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1);
+      const float16x8_t vk1x01234567 = vld1q_f16(w + 32);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2);
+      const float16x8_t vk2x01234567 = vld1q_f16(w + 48);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3);
+      const float16x8_t vk3x01234567 = vld1q_f16(w + 64);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      // Store 4 / 2 / 1 lanes according to the low bits of c; the 2-lane store
+      // goes through u32 with an unaligned-store hint.
+      float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+      if (c & 4) {
+        vst1_f16(output, vacc0123); output += 4;
+        vacc0123 = vget_high_f16(vacc01234567);
+      }
+      if (c & 2) {
+        vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+        vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+      }
+      if (c & 1) {
+        vst1_lane_f16(output, vacc0123, 0); output += 1;
+      }
+    }
+
+    output = (__fp16*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up16x9-minmax-neonfp16arith-acc2.c b/src/f16-dwconv/gen/up16x9-minmax-neonfp16arith-acc2.c
new file mode 100644
index 0000000..68687d5
--- /dev/null
+++ b/src/f16-dwconv/gen/up16x9-minmax-neonfp16arith-acc2.c
@@ -0,0 +1,276 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-dwconv/up-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output_ptr,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  __fp16* output = ( __fp16*) output_ptr;
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);
+  do {
+    const __fp16* i0 = (const __fp16*) input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+      i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+    }
+    const __fp16* i1 = (const __fp16*) input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+      i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+    }
+    const __fp16* i2 = (const __fp16*) input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+      i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+    }
+    const __fp16* i3 = (const __fp16*) input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+      i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+    }
+    const __fp16* i4 = (const __fp16*) input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != (const __fp16*) zero) {
+      i4 = (const __fp16*) ((uintptr_t) i4 + input_offset);
+    }
+    const __fp16* i5 = (const __fp16*) input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != (const __fp16*) zero) {
+      i5 = (const __fp16*) ((uintptr_t) i5 + input_offset);
+    }
+    const __fp16* i6 = (const __fp16*) input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != (const __fp16*) zero) {
+      i6 = (const __fp16*) ((uintptr_t) i6 + input_offset);
+    }
+    const __fp16* i7 = (const __fp16*) input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != (const __fp16*) zero) {
+      i7 = (const __fp16*) ((uintptr_t) i7 + input_offset);
+    }
+    const __fp16* i8 = (const __fp16*) input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != (const __fp16*) zero) {
+      i8 = (const __fp16*) ((uintptr_t) i8 + input_offset);
+    }
+
+    input = (const void**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const __fp16* w = (const __fp16*) weights;
+    for (; c >= 16; c -= 16) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+      float16x8_t vacc89ABCDEFp0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk0x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi0x89ABCDEF, vk0x89ABCDEF);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk1x89ABCDEF = vld1q_f16(w); w += 8;
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+      float16x8_t vacc89ABCDEFp1 = vmulq_f16(vi1x89ABCDEF, vk1x89ABCDEF);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk2x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi2x89ABCDEF, vk2x89ABCDEF);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk3x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi3x89ABCDEF, vk3x89ABCDEF);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+      const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk4x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi4x89ABCDEF, vk4x89ABCDEF);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+      const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk5x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi5x89ABCDEF, vk5x89ABCDEF);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+      const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk6x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi6x89ABCDEF, vk6x89ABCDEF);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+      const float16x8_t vi7x89ABCDEF = vld1q_f16(i7); i7 += 8;
+      const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk7x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi7x89ABCDEF, vk7x89ABCDEF);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+      const float16x8_t vi8x89ABCDEF = vld1q_f16(i8); i8 += 8;
+      const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk8x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi8x89ABCDEF, vk8x89ABCDEF);
+
+      // Add up all accumulators to vacc0123456789ABCDEFp0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+      vacc89ABCDEFp0 = vaddq_f16(vacc89ABCDEFp0, vacc89ABCDEFp1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      float16x8_t vacc89ABCDEF = vmaxq_f16(vacc89ABCDEFp0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+      vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+      vst1q_f16(output, vacc89ABCDEF); output += 8;
+    }
+    for (; c >= 8; c -= 8) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w + 8);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w + 24);
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w + 40);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w + 56);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      const float16x8_t vk4x01234567 = vld1q_f16(w + 72);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      const float16x8_t vk5x01234567 = vld1q_f16(w + 88);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      const float16x8_t vk6x01234567 = vld1q_f16(w + 104);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+      const float16x8_t vk7x01234567 = vld1q_f16(w + 120);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+      const float16x8_t vk8x01234567 = vld1q_f16(w + 136);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+      // Add up all accumulators to vacc01234567p0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w);
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0);
+      const float16x8_t vk0x01234567 = vld1q_f16(w + 16);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1);
+      const float16x8_t vk1x01234567 = vld1q_f16(w + 32);
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2);
+      const float16x8_t vk2x01234567 = vld1q_f16(w + 48);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3);
+      const float16x8_t vk3x01234567 = vld1q_f16(w + 64);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4);
+      const float16x8_t vk4x01234567 = vld1q_f16(w + 80);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5);
+      const float16x8_t vk5x01234567 = vld1q_f16(w + 96);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6);
+      const float16x8_t vk6x01234567 = vld1q_f16(w + 112);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7);
+      const float16x8_t vk7x01234567 = vld1q_f16(w + 128);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8);
+      const float16x8_t vk8x01234567 = vld1q_f16(w + 144);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+      // Add up all accumulators to vacc01234567p0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+      if (c & 4) {
+        vst1_f16(output, vacc0123); output += 4;
+        vacc0123 = vget_high_f16(vacc01234567);
+      }
+      if (c & 2) {
+        vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+        vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+      }
+      if (c & 1) {
+        vst1_lane_f16(output, vacc0123, 0); output += 1;
+      }
+    }
+
+    output = (__fp16*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c b/src/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c
new file mode 100644
index 0000000..e322eda
--- /dev/null
+++ b/src/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c
@@ -0,0 +1,269 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-dwconv/up-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith(  // f16 depthwise conv: 9 taps, up to 16 channels per iteration, min/max clamping
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output_ptr,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  __fp16* output = ( __fp16*) output_ptr;
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);  // upper clamp bound broadcast to all 8 lanes
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);  // lower clamp bound broadcast to all 8 lanes
+  do {
+    const __fp16* i0 = (const __fp16*) input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {  // rows aliasing the shared zero buffer are not offset
+      i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+    }
+    const __fp16* i1 = (const __fp16*) input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+      i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+    }
+    const __fp16* i2 = (const __fp16*) input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+      i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+    }
+    const __fp16* i3 = (const __fp16*) input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+      i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+    }
+    const __fp16* i4 = (const __fp16*) input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != (const __fp16*) zero) {
+      i4 = (const __fp16*) ((uintptr_t) i4 + input_offset);
+    }
+    const __fp16* i5 = (const __fp16*) input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != (const __fp16*) zero) {
+      i5 = (const __fp16*) ((uintptr_t) i5 + input_offset);
+    }
+    const __fp16* i6 = (const __fp16*) input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != (const __fp16*) zero) {
+      i6 = (const __fp16*) ((uintptr_t) i6 + input_offset);
+    }
+    const __fp16* i7 = (const __fp16*) input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != (const __fp16*) zero) {
+      i7 = (const __fp16*) ((uintptr_t) i7 + input_offset);
+    }
+    const __fp16* i8 = (const __fp16*) input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != (const __fp16*) zero) {
+      i8 = (const __fp16*) ((uintptr_t) i8 + input_offset);
+    }
+
+    input = (const void**) ((uintptr_t) input + input_stride);  // step to the indirection entries for the next output pixel
+
+    size_t c = channels;
+    const __fp16* w = (const __fp16*) weights;  // packed layout: 16 biases, then 16 weights per tap (9 taps)
+    for (; c >= 16; c -= 16) {  // main loop: 16 channels per iteration, two 8-lane vectors
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;  // bias, channels 0-7
+      float16x8_t vacc89ABCDEFp0 = vld1q_f16(w); w += 8;  // bias, channels 8-15
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;  // tap 0
+      const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk0x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi0x89ABCDEF, vk0x89ABCDEF);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;  // tap 1
+      const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk1x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi1x89ABCDEF, vk1x89ABCDEF);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;  // tap 2
+      const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk2x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi2x89ABCDEF, vk2x89ABCDEF);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;  // tap 3
+      const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk3x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi3x89ABCDEF, vk3x89ABCDEF);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;  // tap 4
+      const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+      const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk4x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi4x89ABCDEF, vk4x89ABCDEF);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;  // tap 5
+      const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+      const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk5x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi5x89ABCDEF, vk5x89ABCDEF);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;  // tap 6
+      const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+      const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk6x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi6x89ABCDEF, vk6x89ABCDEF);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;  // tap 7
+      const float16x8_t vi7x89ABCDEF = vld1q_f16(i7); i7 += 8;
+      const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk7x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi7x89ABCDEF, vk7x89ABCDEF);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;  // tap 8
+      const float16x8_t vi8x89ABCDEF = vld1q_f16(i8); i8 += 8;
+      const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk8x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi8x89ABCDEF, vk8x89ABCDEF);
+
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);  // clamp to [min, max]
+      float16x8_t vacc89ABCDEF = vmaxq_f16(vacc89ABCDEFp0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+      vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+      vst1q_f16(output, vacc89ABCDEF); output += 8;
+    }
+    for (; c >= 8; c -= 8) {  // remainder: one full vector of 8 channels
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;  // bias; taps below read at fixed offsets into the 16-wide packing
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w + 8);  // tap offsets step by 16 (w+8, w+24, ...): weights stay packed for the 16-channel tile
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w + 24);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w + 40);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w + 56);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      const float16x8_t vk4x01234567 = vld1q_f16(w + 72);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      const float16x8_t vk5x01234567 = vld1q_f16(w + 88);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      const float16x8_t vk6x01234567 = vld1q_f16(w + 104);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+      const float16x8_t vk7x01234567 = vld1q_f16(w + 120);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+      const float16x8_t vk8x01234567 = vld1q_f16(w + 136);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {  // final partial vector: 1-7 channels
+      float16x8_t vacc01234567p0 = vld1q_f16(w);  // NOTE(review): full 8-lane loads past `c` channels — presumably buffers are padded; confirm against packing code
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0);
+      const float16x8_t vk0x01234567 = vld1q_f16(w + 16);  // w was not advanced here, so tap offsets start at w+16
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1);
+      const float16x8_t vk1x01234567 = vld1q_f16(w + 32);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2);
+      const float16x8_t vk2x01234567 = vld1q_f16(w + 48);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3);
+      const float16x8_t vk3x01234567 = vld1q_f16(w + 64);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4);
+      const float16x8_t vk4x01234567 = vld1q_f16(w + 80);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5);
+      const float16x8_t vk5x01234567 = vld1q_f16(w + 96);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6);
+      const float16x8_t vk6x01234567 = vld1q_f16(w + 112);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7);
+      const float16x8_t vk7x01234567 = vld1q_f16(w + 128);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8);
+      const float16x8_t vk8x01234567 = vld1q_f16(w + 144);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+      if (c & 4) {
+        vst1_f16(output, vacc0123); output += 4;
+        vacc0123 = vget_high_f16(vacc01234567);  // continue with lanes 4-7
+      }
+      if (c & 2) {
+        vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;  // unaligned 32-bit (2-lane) store
+        vacc0123 = vext_f16(vacc0123, vacc0123, 2);  // rotate next lane into position 0
+      }
+      if (c & 1) {
+        vst1_lane_f16(output, vacc0123, 0); output += 1;
+      }
+    }
+
+    output = (__fp16*) ((uintptr_t) output + output_increment);  // caller-provided stride to the next output pixel
+  } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c b/src/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c
new file mode 100644
index 0000000..a67bb8d
--- /dev/null
+++ b/src/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c
@@ -0,0 +1,404 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-dwconv/up-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output_ptr,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  __fp16* output = ( __fp16*) output_ptr;
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);
+  do {
+    const __fp16* i0 = (const __fp16*) input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+      i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+    }
+    const __fp16* i1 = (const __fp16*) input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+      i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+    }
+    const __fp16* i2 = (const __fp16*) input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+      i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+    }
+    const __fp16* i3 = (const __fp16*) input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+      i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+    }
+    const __fp16* i4 = (const __fp16*) input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != (const __fp16*) zero) {
+      i4 = (const __fp16*) ((uintptr_t) i4 + input_offset);
+    }
+    const __fp16* i5 = (const __fp16*) input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != (const __fp16*) zero) {
+      i5 = (const __fp16*) ((uintptr_t) i5 + input_offset);
+    }
+    const __fp16* i6 = (const __fp16*) input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != (const __fp16*) zero) {
+      i6 = (const __fp16*) ((uintptr_t) i6 + input_offset);
+    }
+    const __fp16* i7 = (const __fp16*) input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != (const __fp16*) zero) {
+      i7 = (const __fp16*) ((uintptr_t) i7 + input_offset);
+    }
+    const __fp16* i8 = (const __fp16*) input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != (const __fp16*) zero) {
+      i8 = (const __fp16*) ((uintptr_t) i8 + input_offset);
+    }
+    const __fp16* i9 = (const __fp16*) input[9];
+    assert(i9 != NULL);
+    if XNN_UNPREDICTABLE(i9 != (const __fp16*) zero) {
+      i9 = (const __fp16*) ((uintptr_t) i9 + input_offset);
+    }
+    const __fp16* i10 = (const __fp16*) input[10];
+    assert(i10 != NULL);
+    if XNN_UNPREDICTABLE(i10 != (const __fp16*) zero) {
+      i10 = (const __fp16*) ((uintptr_t) i10 + input_offset);
+    }
+    const __fp16* i11 = (const __fp16*) input[11];
+    assert(i11 != NULL);
+    if XNN_UNPREDICTABLE(i11 != (const __fp16*) zero) {
+      i11 = (const __fp16*) ((uintptr_t) i11 + input_offset);
+    }
+    const __fp16* i12 = (const __fp16*) input[12];
+    assert(i12 != NULL);
+    if XNN_UNPREDICTABLE(i12 != (const __fp16*) zero) {
+      i12 = (const __fp16*) ((uintptr_t) i12 + input_offset);
+    }
+    const __fp16* i13 = (const __fp16*) input[13];
+    assert(i13 != NULL);
+    if XNN_UNPREDICTABLE(i13 != (const __fp16*) zero) {
+      i13 = (const __fp16*) ((uintptr_t) i13 + input_offset);
+    }
+    const __fp16* i14 = (const __fp16*) input[14];
+    assert(i14 != NULL);
+    if XNN_UNPREDICTABLE(i14 != (const __fp16*) zero) {
+      i14 = (const __fp16*) ((uintptr_t) i14 + input_offset);
+    }
+    const __fp16* i15 = (const __fp16*) input[15];
+    assert(i15 != NULL);
+    if XNN_UNPREDICTABLE(i15 != (const __fp16*) zero) {
+      i15 = (const __fp16*) ((uintptr_t) i15 + input_offset);
+    }
+    const __fp16* i16 = (const __fp16*) input[16];
+    assert(i16 != NULL);
+    if XNN_UNPREDICTABLE(i16 != (const __fp16*) zero) {
+      i16 = (const __fp16*) ((uintptr_t) i16 + input_offset);
+    }
+    const __fp16* i17 = (const __fp16*) input[17];
+    assert(i17 != NULL);
+    if XNN_UNPREDICTABLE(i17 != (const __fp16*) zero) {
+      i17 = (const __fp16*) ((uintptr_t) i17 + input_offset);
+    }
+    const __fp16* i18 = (const __fp16*) input[18];
+    assert(i18 != NULL);
+    if XNN_UNPREDICTABLE(i18 != (const __fp16*) zero) {
+      i18 = (const __fp16*) ((uintptr_t) i18 + input_offset);
+    }
+    const __fp16* i19 = (const __fp16*) input[19];
+    assert(i19 != NULL);
+    if XNN_UNPREDICTABLE(i19 != (const __fp16*) zero) {
+      i19 = (const __fp16*) ((uintptr_t) i19 + input_offset);
+    }
+    const __fp16* i20 = (const __fp16*) input[20];
+    assert(i20 != NULL);
+    if XNN_UNPREDICTABLE(i20 != (const __fp16*) zero) {
+      i20 = (const __fp16*) ((uintptr_t) i20 + input_offset);
+    }
+    const __fp16* i21 = (const __fp16*) input[21];
+    assert(i21 != NULL);
+    if XNN_UNPREDICTABLE(i21 != (const __fp16*) zero) {
+      i21 = (const __fp16*) ((uintptr_t) i21 + input_offset);
+    }
+    const __fp16* i22 = (const __fp16*) input[22];
+    assert(i22 != NULL);
+    if XNN_UNPREDICTABLE(i22 != (const __fp16*) zero) {
+      i22 = (const __fp16*) ((uintptr_t) i22 + input_offset);
+    }
+    const __fp16* i23 = (const __fp16*) input[23];
+    assert(i23 != NULL);
+    if XNN_UNPREDICTABLE(i23 != (const __fp16*) zero) {
+      i23 = (const __fp16*) ((uintptr_t) i23 + input_offset);
+    }
+    const __fp16* i24 = (const __fp16*) input[24];
+    assert(i24 != NULL);
+    if XNN_UNPREDICTABLE(i24 != (const __fp16*) zero) {
+      i24 = (const __fp16*) ((uintptr_t) i24 + input_offset);
+    }
+
+    input = (const void**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const __fp16* w = (const __fp16*) weights;
+    for (; c >= 8; c -= 8) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+      const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+      const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+      const float16x8_t vi9x01234567 = vld1q_f16(i9); i9 += 8;
+      const float16x8_t vk9x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi9x01234567, vk9x01234567);
+
+      const float16x8_t vi10x01234567 = vld1q_f16(i10); i10 += 8;
+      const float16x8_t vk10x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+
+      const float16x8_t vi11x01234567 = vld1q_f16(i11); i11 += 8;
+      const float16x8_t vk11x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi11x01234567, vk11x01234567);
+
+      const float16x8_t vi12x01234567 = vld1q_f16(i12); i12 += 8;
+      const float16x8_t vk12x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+
+      const float16x8_t vi13x01234567 = vld1q_f16(i13); i13 += 8;
+      const float16x8_t vk13x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi13x01234567, vk13x01234567);
+
+      const float16x8_t vi14x01234567 = vld1q_f16(i14); i14 += 8;
+      const float16x8_t vk14x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+
+      const float16x8_t vi15x01234567 = vld1q_f16(i15); i15 += 8;
+      const float16x8_t vk15x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi15x01234567, vk15x01234567);
+
+      const float16x8_t vi16x01234567 = vld1q_f16(i16); i16 += 8;
+      const float16x8_t vk16x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+
+      const float16x8_t vi17x01234567 = vld1q_f16(i17); i17 += 8;
+      const float16x8_t vk17x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi17x01234567, vk17x01234567);
+
+      const float16x8_t vi18x01234567 = vld1q_f16(i18); i18 += 8;
+      const float16x8_t vk18x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+
+      const float16x8_t vi19x01234567 = vld1q_f16(i19); i19 += 8;
+      const float16x8_t vk19x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi19x01234567, vk19x01234567);
+
+      const float16x8_t vi20x01234567 = vld1q_f16(i20); i20 += 8;
+      const float16x8_t vk20x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+
+      const float16x8_t vi21x01234567 = vld1q_f16(i21); i21 += 8;
+      const float16x8_t vk21x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi21x01234567, vk21x01234567);
+
+      const float16x8_t vi22x01234567 = vld1q_f16(i22); i22 += 8;
+      const float16x8_t vk22x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+
+      const float16x8_t vi23x01234567 = vld1q_f16(i23); i23 += 8;
+      const float16x8_t vk23x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi23x01234567, vk23x01234567);
+
+      const float16x8_t vi24x01234567 = vld1q_f16(i24); i24 += 8;
+      const float16x8_t vk24x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+
+      // Add up all accumulators to vacc01234567p0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0);
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1);
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2);
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3);
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4);
+      const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5);
+      const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6);
+      const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7);
+      const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8);
+      const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+      const float16x8_t vi9x01234567 = vld1q_f16(i9);
+      const float16x8_t vk9x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi9x01234567, vk9x01234567);
+
+      const float16x8_t vi10x01234567 = vld1q_f16(i10);
+      const float16x8_t vk10x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+
+      const float16x8_t vi11x01234567 = vld1q_f16(i11);
+      const float16x8_t vk11x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi11x01234567, vk11x01234567);
+
+      const float16x8_t vi12x01234567 = vld1q_f16(i12);
+      const float16x8_t vk12x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+
+      const float16x8_t vi13x01234567 = vld1q_f16(i13);
+      const float16x8_t vk13x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi13x01234567, vk13x01234567);
+
+      const float16x8_t vi14x01234567 = vld1q_f16(i14);
+      const float16x8_t vk14x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+
+      const float16x8_t vi15x01234567 = vld1q_f16(i15);
+      const float16x8_t vk15x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi15x01234567, vk15x01234567);
+
+      const float16x8_t vi16x01234567 = vld1q_f16(i16);
+      const float16x8_t vk16x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+
+      const float16x8_t vi17x01234567 = vld1q_f16(i17);
+      const float16x8_t vk17x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi17x01234567, vk17x01234567);
+
+      const float16x8_t vi18x01234567 = vld1q_f16(i18);
+      const float16x8_t vk18x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+
+      const float16x8_t vi19x01234567 = vld1q_f16(i19);
+      const float16x8_t vk19x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi19x01234567, vk19x01234567);
+
+      const float16x8_t vi20x01234567 = vld1q_f16(i20);
+      const float16x8_t vk20x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+
+      const float16x8_t vi21x01234567 = vld1q_f16(i21);
+      const float16x8_t vk21x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi21x01234567, vk21x01234567);
+
+      const float16x8_t vi22x01234567 = vld1q_f16(i22);
+      const float16x8_t vk22x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+
+      const float16x8_t vi23x01234567 = vld1q_f16(i23);
+      const float16x8_t vk23x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi23x01234567, vk23x01234567);
+
+      const float16x8_t vi24x01234567 = vld1q_f16(i24);
+      const float16x8_t vk24x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+
+      // Add up all accumulators to vacc01234567p0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+      if (c & 4) {
+        vst1_f16(output, vacc0123); output += 4;
+        vacc0123 = vget_high_f16(vacc01234567);
+      }
+      if (c & 2) {
+        vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+        vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+      }
+      if (c & 1) {
+        vst1_lane_f16(output, vacc0123, 0); output += 1;
+      }
+    }
+
+    output = (__fp16*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up8x25-minmax-neonfp16arith.c b/src/f16-dwconv/gen/up8x25-minmax-neonfp16arith.c
new file mode 100644
index 0000000..08939dc
--- /dev/null
+++ b/src/f16-dwconv/gen/up8x25-minmax-neonfp16arith.c
@@ -0,0 +1,400 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-dwconv/up-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+// Depthwise-convolution minmax microkernel for FP16: 25-tap filter (e.g. 5x5),
+// processing up to 8 channels per vector iteration with NEON FP16 FMA.
+// `input` is an indirection buffer of 25 row pointers per output pixel;
+// `weights` is packed per 8-channel group as [bias x8][k0 x8]...[k24 x8].
+void xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output_ptr,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  __fp16* output = ( __fp16*) output_ptr;
+  // Broadcast the output clamping bounds once per call.
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);
+  // One iteration per output pixel.
+  do {
+    // Load the 25 row pointers for this pixel. Entries equal to `zero`
+    // reference the shared zero buffer and must not be shifted by input_offset.
+    const __fp16* i0 = (const __fp16*) input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+      i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+    }
+    const __fp16* i1 = (const __fp16*) input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+      i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+    }
+    const __fp16* i2 = (const __fp16*) input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+      i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+    }
+    const __fp16* i3 = (const __fp16*) input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+      i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+    }
+    const __fp16* i4 = (const __fp16*) input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != (const __fp16*) zero) {
+      i4 = (const __fp16*) ((uintptr_t) i4 + input_offset);
+    }
+    const __fp16* i5 = (const __fp16*) input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != (const __fp16*) zero) {
+      i5 = (const __fp16*) ((uintptr_t) i5 + input_offset);
+    }
+    const __fp16* i6 = (const __fp16*) input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != (const __fp16*) zero) {
+      i6 = (const __fp16*) ((uintptr_t) i6 + input_offset);
+    }
+    const __fp16* i7 = (const __fp16*) input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != (const __fp16*) zero) {
+      i7 = (const __fp16*) ((uintptr_t) i7 + input_offset);
+    }
+    const __fp16* i8 = (const __fp16*) input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != (const __fp16*) zero) {
+      i8 = (const __fp16*) ((uintptr_t) i8 + input_offset);
+    }
+    const __fp16* i9 = (const __fp16*) input[9];
+    assert(i9 != NULL);
+    if XNN_UNPREDICTABLE(i9 != (const __fp16*) zero) {
+      i9 = (const __fp16*) ((uintptr_t) i9 + input_offset);
+    }
+    const __fp16* i10 = (const __fp16*) input[10];
+    assert(i10 != NULL);
+    if XNN_UNPREDICTABLE(i10 != (const __fp16*) zero) {
+      i10 = (const __fp16*) ((uintptr_t) i10 + input_offset);
+    }
+    const __fp16* i11 = (const __fp16*) input[11];
+    assert(i11 != NULL);
+    if XNN_UNPREDICTABLE(i11 != (const __fp16*) zero) {
+      i11 = (const __fp16*) ((uintptr_t) i11 + input_offset);
+    }
+    const __fp16* i12 = (const __fp16*) input[12];
+    assert(i12 != NULL);
+    if XNN_UNPREDICTABLE(i12 != (const __fp16*) zero) {
+      i12 = (const __fp16*) ((uintptr_t) i12 + input_offset);
+    }
+    const __fp16* i13 = (const __fp16*) input[13];
+    assert(i13 != NULL);
+    if XNN_UNPREDICTABLE(i13 != (const __fp16*) zero) {
+      i13 = (const __fp16*) ((uintptr_t) i13 + input_offset);
+    }
+    const __fp16* i14 = (const __fp16*) input[14];
+    assert(i14 != NULL);
+    if XNN_UNPREDICTABLE(i14 != (const __fp16*) zero) {
+      i14 = (const __fp16*) ((uintptr_t) i14 + input_offset);
+    }
+    const __fp16* i15 = (const __fp16*) input[15];
+    assert(i15 != NULL);
+    if XNN_UNPREDICTABLE(i15 != (const __fp16*) zero) {
+      i15 = (const __fp16*) ((uintptr_t) i15 + input_offset);
+    }
+    const __fp16* i16 = (const __fp16*) input[16];
+    assert(i16 != NULL);
+    if XNN_UNPREDICTABLE(i16 != (const __fp16*) zero) {
+      i16 = (const __fp16*) ((uintptr_t) i16 + input_offset);
+    }
+    const __fp16* i17 = (const __fp16*) input[17];
+    assert(i17 != NULL);
+    if XNN_UNPREDICTABLE(i17 != (const __fp16*) zero) {
+      i17 = (const __fp16*) ((uintptr_t) i17 + input_offset);
+    }
+    const __fp16* i18 = (const __fp16*) input[18];
+    assert(i18 != NULL);
+    if XNN_UNPREDICTABLE(i18 != (const __fp16*) zero) {
+      i18 = (const __fp16*) ((uintptr_t) i18 + input_offset);
+    }
+    const __fp16* i19 = (const __fp16*) input[19];
+    assert(i19 != NULL);
+    if XNN_UNPREDICTABLE(i19 != (const __fp16*) zero) {
+      i19 = (const __fp16*) ((uintptr_t) i19 + input_offset);
+    }
+    const __fp16* i20 = (const __fp16*) input[20];
+    assert(i20 != NULL);
+    if XNN_UNPREDICTABLE(i20 != (const __fp16*) zero) {
+      i20 = (const __fp16*) ((uintptr_t) i20 + input_offset);
+    }
+    const __fp16* i21 = (const __fp16*) input[21];
+    assert(i21 != NULL);
+    if XNN_UNPREDICTABLE(i21 != (const __fp16*) zero) {
+      i21 = (const __fp16*) ((uintptr_t) i21 + input_offset);
+    }
+    const __fp16* i22 = (const __fp16*) input[22];
+    assert(i22 != NULL);
+    if XNN_UNPREDICTABLE(i22 != (const __fp16*) zero) {
+      i22 = (const __fp16*) ((uintptr_t) i22 + input_offset);
+    }
+    const __fp16* i23 = (const __fp16*) input[23];
+    assert(i23 != NULL);
+    if XNN_UNPREDICTABLE(i23 != (const __fp16*) zero) {
+      i23 = (const __fp16*) ((uintptr_t) i23 + input_offset);
+    }
+    const __fp16* i24 = (const __fp16*) input[24];
+    assert(i24 != NULL);
+    if XNN_UNPREDICTABLE(i24 != (const __fp16*) zero) {
+      i24 = (const __fp16*) ((uintptr_t) i24 + input_offset);
+    }
+
+    // Advance the indirection buffer to the next output pixel's entries.
+    input = (const void**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const __fp16* w = (const __fp16*) weights;
+    // Main loop: 8 channels per iteration. The bias initializes the
+    // accumulator; each of the 25 taps contributes one FMA.
+    for (; c >= 8; c -= 8) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+      const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+      const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+      const float16x8_t vi9x01234567 = vld1q_f16(i9); i9 += 8;
+      const float16x8_t vk9x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi9x01234567, vk9x01234567);
+
+      const float16x8_t vi10x01234567 = vld1q_f16(i10); i10 += 8;
+      const float16x8_t vk10x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+
+      const float16x8_t vi11x01234567 = vld1q_f16(i11); i11 += 8;
+      const float16x8_t vk11x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi11x01234567, vk11x01234567);
+
+      const float16x8_t vi12x01234567 = vld1q_f16(i12); i12 += 8;
+      const float16x8_t vk12x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+
+      const float16x8_t vi13x01234567 = vld1q_f16(i13); i13 += 8;
+      const float16x8_t vk13x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi13x01234567, vk13x01234567);
+
+      const float16x8_t vi14x01234567 = vld1q_f16(i14); i14 += 8;
+      const float16x8_t vk14x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+
+      const float16x8_t vi15x01234567 = vld1q_f16(i15); i15 += 8;
+      const float16x8_t vk15x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi15x01234567, vk15x01234567);
+
+      const float16x8_t vi16x01234567 = vld1q_f16(i16); i16 += 8;
+      const float16x8_t vk16x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+
+      const float16x8_t vi17x01234567 = vld1q_f16(i17); i17 += 8;
+      const float16x8_t vk17x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi17x01234567, vk17x01234567);
+
+      const float16x8_t vi18x01234567 = vld1q_f16(i18); i18 += 8;
+      const float16x8_t vk18x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+
+      const float16x8_t vi19x01234567 = vld1q_f16(i19); i19 += 8;
+      const float16x8_t vk19x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi19x01234567, vk19x01234567);
+
+      const float16x8_t vi20x01234567 = vld1q_f16(i20); i20 += 8;
+      const float16x8_t vk20x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+
+      const float16x8_t vi21x01234567 = vld1q_f16(i21); i21 += 8;
+      const float16x8_t vk21x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi21x01234567, vk21x01234567);
+
+      const float16x8_t vi22x01234567 = vld1q_f16(i22); i22 += 8;
+      const float16x8_t vk22x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+
+      const float16x8_t vi23x01234567 = vld1q_f16(i23); i23 += 8;
+      const float16x8_t vk23x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi23x01234567, vk23x01234567);
+
+      const float16x8_t vi24x01234567 = vld1q_f16(i24); i24 += 8;
+      const float16x8_t vk24x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+
+
+      // Clamp to [min, max] before storing.
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      // Remainder: 1-7 channels left. Full 8-lane vectors are still loaded;
+      // NOTE(review): this assumes inputs and packed weights are padded so the
+      // over-read past the last `c` elements is safe -- confirm packing layout.
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0);
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1);
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2);
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3);
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4);
+      const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5);
+      const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6);
+      const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7);
+      const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8);
+      const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+      const float16x8_t vi9x01234567 = vld1q_f16(i9);
+      const float16x8_t vk9x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi9x01234567, vk9x01234567);
+
+      const float16x8_t vi10x01234567 = vld1q_f16(i10);
+      const float16x8_t vk10x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+
+      const float16x8_t vi11x01234567 = vld1q_f16(i11);
+      const float16x8_t vk11x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi11x01234567, vk11x01234567);
+
+      const float16x8_t vi12x01234567 = vld1q_f16(i12);
+      const float16x8_t vk12x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+
+      const float16x8_t vi13x01234567 = vld1q_f16(i13);
+      const float16x8_t vk13x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi13x01234567, vk13x01234567);
+
+      const float16x8_t vi14x01234567 = vld1q_f16(i14);
+      const float16x8_t vk14x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+
+      const float16x8_t vi15x01234567 = vld1q_f16(i15);
+      const float16x8_t vk15x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi15x01234567, vk15x01234567);
+
+      const float16x8_t vi16x01234567 = vld1q_f16(i16);
+      const float16x8_t vk16x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+
+      const float16x8_t vi17x01234567 = vld1q_f16(i17);
+      const float16x8_t vk17x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi17x01234567, vk17x01234567);
+
+      const float16x8_t vi18x01234567 = vld1q_f16(i18);
+      const float16x8_t vk18x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+
+      const float16x8_t vi19x01234567 = vld1q_f16(i19);
+      const float16x8_t vk19x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi19x01234567, vk19x01234567);
+
+      const float16x8_t vi20x01234567 = vld1q_f16(i20);
+      const float16x8_t vk20x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+
+      const float16x8_t vi21x01234567 = vld1q_f16(i21);
+      const float16x8_t vk21x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi21x01234567, vk21x01234567);
+
+      const float16x8_t vi22x01234567 = vld1q_f16(i22);
+      const float16x8_t vk22x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+
+      const float16x8_t vi23x01234567 = vld1q_f16(i23);
+      const float16x8_t vk23x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi23x01234567, vk23x01234567);
+
+      const float16x8_t vi24x01234567 = vld1q_f16(i24);
+      const float16x8_t vk24x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      // Store the remaining c (1-7) channels in 4-, 2-, then 1-lane pieces.
+      float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+      if (c & 4) {
+        vst1_f16(output, vacc0123); output += 4;
+        vacc0123 = vget_high_f16(vacc01234567);
+      }
+      if (c & 2) {
+        vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+        vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+      }
+      if (c & 1) {
+        vst1_lane_f16(output, vacc0123, 0); output += 1;
+      }
+    }
+
+    output = (__fp16*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up8x4-minmax-neonfp16arith-acc2.c b/src/f16-dwconv/gen/up8x4-minmax-neonfp16arith-acc2.c
new file mode 100644
index 0000000..6b6c040
--- /dev/null
+++ b/src/f16-dwconv/gen/up8x4-minmax-neonfp16arith-acc2.c
@@ -0,0 +1,131 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-dwconv/up-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+// Depthwise-convolution minmax microkernel for FP16: 4-tap filter, up to
+// 8 channels per vector iteration, NEON FP16 arithmetic. The "acc2" variant
+// splits the sum across two accumulators to shorten the FMA dependency chain.
+void xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output_ptr,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  __fp16* output = ( __fp16*) output_ptr;
+  // Broadcast the output clamping bounds once per call.
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);
+  // One iteration per output pixel.
+  do {
+    // Load the 4 row pointers; entries equal to `zero` reference the shared
+    // zero buffer and must not be shifted by input_offset.
+    const __fp16* i0 = (const __fp16*) input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+      i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+    }
+    const __fp16* i1 = (const __fp16*) input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+      i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+    }
+    const __fp16* i2 = (const __fp16*) input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+      i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+    }
+    const __fp16* i3 = (const __fp16*) input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+      i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+    }
+
+    input = (const void**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const __fp16* w = (const __fp16*) weights;
+    // Main loop: 8 channels per iteration; even taps accumulate into p0,
+    // odd taps into p1, merged after the last tap.
+    for (; c >= 8; c -= 8) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+      // Add up all accumulators to vacc01234567p0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      // Remainder: 1-7 channels left. Full 8-lane vectors are still loaded;
+      // NOTE(review): assumes inputs and packed weights are padded so the
+      // over-read past the last `c` elements is safe -- confirm packing layout.
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0);
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1);
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2);
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3);
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+      // Add up all accumulators to vacc01234567p0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      // Store the remaining c (1-7) channels in 4-, 2-, then 1-lane pieces.
+      float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+      if (c & 4) {
+        vst1_f16(output, vacc0123); output += 4;
+        vacc0123 = vget_high_f16(vacc01234567);
+      }
+      if (c & 2) {
+        vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+        vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+      }
+      if (c & 1) {
+        vst1_lane_f16(output, vacc0123, 0); output += 1;
+      }
+    }
+
+    output = (__fp16*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up8x4-minmax-neonfp16arith.c b/src/f16-dwconv/gen/up8x4-minmax-neonfp16arith.c
new file mode 100644
index 0000000..ae40230
--- /dev/null
+++ b/src/f16-dwconv/gen/up8x4-minmax-neonfp16arith.c
@@ -0,0 +1,127 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-dwconv/up-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+// Depthwise-convolution minmax microkernel for FP16: 4-tap filter, up to
+// 8 channels per vector iteration, NEON FP16 arithmetic with a single
+// accumulator (contrast with the _acc2 variant, which uses two).
+void xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output_ptr,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  __fp16* output = ( __fp16*) output_ptr;
+  // Broadcast the output clamping bounds once per call.
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);
+  // One iteration per output pixel.
+  do {
+    // Load the 4 row pointers; entries equal to `zero` reference the shared
+    // zero buffer and must not be shifted by input_offset.
+    const __fp16* i0 = (const __fp16*) input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+      i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+    }
+    const __fp16* i1 = (const __fp16*) input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+      i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+    }
+    const __fp16* i2 = (const __fp16*) input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+      i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+    }
+    const __fp16* i3 = (const __fp16*) input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+      i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+    }
+
+    input = (const void**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const __fp16* w = (const __fp16*) weights;
+    // Main loop: 8 channels per iteration; the bias initializes the
+    // accumulator and each of the 4 taps contributes one FMA.
+    for (; c >= 8; c -= 8) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+
+      // Clamp to [min, max] before storing.
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      // Remainder: 1-7 channels left. Full 8-lane vectors are still loaded;
+      // NOTE(review): assumes inputs and packed weights are padded so the
+      // over-read past the last `c` elements is safe -- confirm packing layout.
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0);
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1);
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2);
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3);
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      // Store the remaining c (1-7) channels in 4-, 2-, then 1-lane pieces.
+      float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+      if (c & 4) {
+        vst1_f16(output, vacc0123); output += 4;
+        vacc0123 = vget_high_f16(vacc01234567);
+      }
+      if (c & 2) {
+        vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+        vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+      }
+      if (c & 1) {
+        vst1_lane_f16(output, vacc0123, 0); output += 1;
+      }
+    }
+
+    output = (__fp16*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up8x9-minmax-neonfp16arith-acc2.c b/src/f16-dwconv/gen/up8x9-minmax-neonfp16arith-acc2.c
new file mode 100644
index 0000000..da9efa2
--- /dev/null
+++ b/src/f16-dwconv/gen/up8x9-minmax-neonfp16arith-acc2.c
@@ -0,0 +1,196 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-dwconv/up-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output_ptr,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  __fp16* output = ( __fp16*) output_ptr;
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);
+  do {
+    const __fp16* i0 = (const __fp16*) input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+      i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+    }
+    const __fp16* i1 = (const __fp16*) input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+      i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+    }
+    const __fp16* i2 = (const __fp16*) input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+      i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+    }
+    const __fp16* i3 = (const __fp16*) input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+      i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+    }
+    const __fp16* i4 = (const __fp16*) input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != (const __fp16*) zero) {
+      i4 = (const __fp16*) ((uintptr_t) i4 + input_offset);
+    }
+    const __fp16* i5 = (const __fp16*) input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != (const __fp16*) zero) {
+      i5 = (const __fp16*) ((uintptr_t) i5 + input_offset);
+    }
+    const __fp16* i6 = (const __fp16*) input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != (const __fp16*) zero) {
+      i6 = (const __fp16*) ((uintptr_t) i6 + input_offset);
+    }
+    const __fp16* i7 = (const __fp16*) input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != (const __fp16*) zero) {
+      i7 = (const __fp16*) ((uintptr_t) i7 + input_offset);
+    }
+    const __fp16* i8 = (const __fp16*) input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != (const __fp16*) zero) {
+      i8 = (const __fp16*) ((uintptr_t) i8 + input_offset);
+    }
+
+    input = (const void**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const __fp16* w = (const __fp16*) weights;
+    for (; c >= 8; c -= 8) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+      const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+      const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+      // Add up all accumulators to vacc01234567p0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0);
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1);
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2);
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3);
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4);
+      const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5);
+      const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6);
+      const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7);
+      const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8);
+      const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+      // Add up all accumulators to vacc01234567p0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+      if (c & 4) {
+        vst1_f16(output, vacc0123); output += 4;
+        vacc0123 = vget_high_f16(vacc01234567);
+      }
+      if (c & 2) {
+        vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+        vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+      }
+      if (c & 1) {
+        vst1_lane_f16(output, vacc0123, 0); output += 1;
+      }
+    }
+
+    output = (__fp16*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up8x9-minmax-neonfp16arith.c b/src/f16-dwconv/gen/up8x9-minmax-neonfp16arith.c
new file mode 100644
index 0000000..e9f60a2
--- /dev/null
+++ b/src/f16-dwconv/gen/up8x9-minmax-neonfp16arith.c
@@ -0,0 +1,192 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-dwconv/up-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output_ptr,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  __fp16* output = ( __fp16*) output_ptr;
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);
+  do {
+    const __fp16* i0 = (const __fp16*) input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+      i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+    }
+    const __fp16* i1 = (const __fp16*) input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+      i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+    }
+    const __fp16* i2 = (const __fp16*) input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+      i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+    }
+    const __fp16* i3 = (const __fp16*) input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+      i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+    }
+    const __fp16* i4 = (const __fp16*) input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != (const __fp16*) zero) {
+      i4 = (const __fp16*) ((uintptr_t) i4 + input_offset);
+    }
+    const __fp16* i5 = (const __fp16*) input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != (const __fp16*) zero) {
+      i5 = (const __fp16*) ((uintptr_t) i5 + input_offset);
+    }
+    const __fp16* i6 = (const __fp16*) input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != (const __fp16*) zero) {
+      i6 = (const __fp16*) ((uintptr_t) i6 + input_offset);
+    }
+    const __fp16* i7 = (const __fp16*) input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != (const __fp16*) zero) {
+      i7 = (const __fp16*) ((uintptr_t) i7 + input_offset);
+    }
+    const __fp16* i8 = (const __fp16*) input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != (const __fp16*) zero) {
+      i8 = (const __fp16*) ((uintptr_t) i8 + input_offset);
+    }
+
+    input = (const void**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const __fp16* w = (const __fp16*) weights;
+    for (; c >= 8; c -= 8) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+      const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+      const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0);
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1);
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2);
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3);
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4);
+      const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5);
+      const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6);
+      const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7);
+      const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8);
+      const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+      if (c & 4) {
+        vst1_f16(output, vacc0123); output += 4;
+        vacc0123 = vget_high_f16(vacc01234567);
+      }
+      if (c & 2) {
+        vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+        vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+      }
+      if (c & 1) {
+        vst1_lane_f16(output, vacc0123, 0); output += 1;
+      }
+    }
+
+    output = (__fp16*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/up-neonfp16arith.c.in b/src/f16-dwconv/up-neonfp16arith.c.in
new file mode 100644
index 0000000..d3d613e
--- /dev/null
+++ b/src/f16-dwconv/up-neonfp16arith.c.in
@@ -0,0 +1,154 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert CHANNEL_TILE % 8 == 0
+$assert KERNEL_TILE >= 2
+$assert ACCUMULATORS >= 1
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__neonfp16arith${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output_ptr,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  __fp16* output = ( __fp16*) output_ptr;
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);
+  do {
+    $for K in range(KERNEL_TILE):
+      const __fp16* i${K} = (const __fp16*) input[${K}];
+      assert(i${K} != NULL);
+      if XNN_UNPREDICTABLE(i${K} != (const __fp16*) zero) {
+        i${K} = (const __fp16*) ((uintptr_t) i${K} + input_offset);
+      }
+
+    input = (const void**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const __fp16* w = (const __fp16*) weights;
+    for (; c >= ${CHANNEL_TILE}; c -= ${CHANNEL_TILE}) {
+      $for C in range(0, CHANNEL_TILE, 8):
+        float16x8_t vacc${ABC[C:C+8]}p0 = vld1q_f16(w); w += 8;
+
+      $for K in range(KERNEL_TILE):
+
+        $for C in range(0, CHANNEL_TILE, 8):
+          const float16x8_t vi${K}x${ABC[C:C+8]} = vld1q_f16(i${K}); i${K} += 8;
+        $for C in range(0, CHANNEL_TILE, 8):
+          const float16x8_t vk${K}x${ABC[C:C+8]} = vld1q_f16(w); w += 8;
+        $for C in range(0, CHANNEL_TILE, 8):
+          $if 1 <= K < ACCUMULATORS:
+            float16x8_t vacc${ABC[C:C+8]}p${K} = vmulq_f16(vi${K}x${ABC[C:C+8]}, vk${K}x${ABC[C:C+8]});
+          $else:
+            vacc${ABC[C:C+8]}p${K % ACCUMULATORS} = vfmaq_f16(vacc${ABC[C:C+8]}p${K % ACCUMULATORS}, vi${K}x${ABC[C:C+8]}, vk${K}x${ABC[C:C+8]});
+
+      $if ACCUMULATORS > 1:
+        // Add up all accumulators to vacc${ABC[0:CHANNEL_TILE]}p0
+        $ACC_STEP = 1
+        $while ACC_STEP < ACCUMULATORS:
+          $for A in range(0, ACCUMULATORS, ACC_STEP * 2):
+            $if A + ACC_STEP < ACCUMULATORS:
+              $for C in range(0, CHANNEL_TILE, 8):
+                vacc${ABC[C:C+8]}p${A} = vaddq_f16(vacc${ABC[C:C+8]}p${A}, vacc${ABC[C:C+8]}p${A + ACC_STEP});
+          $ACC_STEP *= 2
+
+      $for C in range(0, CHANNEL_TILE, 8):
+        float16x8_t vacc${ABC[C:C+8]} = vmaxq_f16(vacc${ABC[C:C+8]}p0, vmin);
+      $for C in range(0, CHANNEL_TILE, 8):
+        vacc${ABC[C:C+8]} = vminq_f16(vacc${ABC[C:C+8]}, vmax);
+
+      $for C in range(0, CHANNEL_TILE, 8):
+        vst1q_f16(output, vacc${ABC[C:C+8]}); output += 8;
+    }
+    $if CHANNEL_TILE > 8:
+      for (; c >= 8; c -= 8) {
+        float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+        $for K in range(KERNEL_TILE):
+
+          const float16x8_t vi${K}x01234567 = vld1q_f16(i${K}); i${K} += 8;
+          const float16x8_t vk${K}x01234567 = vld1q_f16(w + ${(K + 1) * CHANNEL_TILE - 8});
+          $if 1 <= K < ACCUMULATORS:
+            float16x8_t vacc01234567p${K} = vmulq_f16(vi${K}x01234567, vk${K}x01234567);
+          $else:
+            vacc01234567p${K % ACCUMULATORS} = vfmaq_f16(vacc01234567p${K % ACCUMULATORS}, vi${K}x01234567, vk${K}x01234567);
+
+        $if ACCUMULATORS > 1:
+          // Add up all accumulators to vacc01234567p0
+          $ACC_STEP = 1
+          $while ACC_STEP < ACCUMULATORS:
+            $for A in range(0, ACCUMULATORS, ACC_STEP * 2):
+              $if A + ACC_STEP < ACCUMULATORS:
+                vacc01234567p${A} = vaddq_f16(vacc01234567p${A}, vacc01234567p${A + ACC_STEP});
+            $ACC_STEP *= 2
+
+        float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+        vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+        vst1q_f16(output, vacc01234567); output += 8;
+      }
+    if XNN_UNLIKELY(c != 0) {
+      $if CHANNEL_TILE == 8:
+        float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+      $else:
+        float16x8_t vacc01234567p0 = vld1q_f16(w);
+
+      $for K in range(KERNEL_TILE):
+
+        const float16x8_t vi${K}x01234567 = vld1q_f16(i${K});
+        $if CHANNEL_TILE == 8:
+          const float16x8_t vk${K}x01234567 = vld1q_f16(w); w += 8;
+        $else:
+          const float16x8_t vk${K}x01234567 = vld1q_f16(w + ${(K + 1) * CHANNEL_TILE});
+        $if 1 <= K < ACCUMULATORS:
+          float16x8_t vacc01234567p${K} = vmulq_f16(vi${K}x01234567, vk${K}x01234567);
+        $else:
+          vacc01234567p${K % ACCUMULATORS} = vfmaq_f16(vacc01234567p${K % ACCUMULATORS}, vi${K}x01234567, vk${K}x01234567);
+
+      $if ACCUMULATORS > 1:
+        // Add up all accumulators to vacc01234567p0
+        $ACC_STEP = 1
+        $while ACC_STEP < ACCUMULATORS:
+          $for A in range(0, ACCUMULATORS, ACC_STEP * 2):
+            $if A + ACC_STEP < ACCUMULATORS:
+              vacc01234567p${A} = vaddq_f16(vacc01234567p${A}, vacc01234567p${A + ACC_STEP});
+          $ACC_STEP *= 2
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+      if (c & 4) {
+        vst1_f16(output, vacc0123); output += 4;
+        vacc0123 = vget_high_f16(vacc01234567);
+      }
+      if (c & 2) {
+        vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+        vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+      }
+      if (c & 1) {
+        vst1_lane_f16(output, vacc0123, 0); output += 1;
+      }
+    }
+
+    output = (__fp16*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/xnnpack/dwconv.h b/src/xnnpack/dwconv.h
index ed11f29..d316a24 100644
--- a/src/xnnpack/dwconv.h
+++ b/src/xnnpack/dwconv.h
@@ -193,6 +193,44 @@
 DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_up2x25__scalar)
 DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_up2x25__scalar_acc2)
 
+#define DECLARE_F16_DWCONV_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                                 \
+    size_t channels,                                         \
+    size_t output_width,                                     \
+    const void** input,                                      \
+    const void* weights,                                     \
+    void* output,                                            \
+    size_t input_stride,                                     \
+    size_t output_increment,                                 \
+    size_t input_offset,                                     \
+    const void* zero,                                        \
+    const struct xnn_f16_default_params* params);
+
+#define DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                                        \
+    size_t channels,                                                \
+    size_t output_width,                                            \
+    const void** input,                                             \
+    const void* weights,                                            \
+    void* output,                                                   \
+    size_t input_stride,                                            \
+    size_t output_increment,                                        \
+    size_t input_offset,                                            \
+    const void* zero,                                               \
+    const struct xnn_f16_minmax_params* params);
+
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2)
 
 #define DECLARE_Q8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(fn_name) \
   XNN_INTERNAL void fn_name(                                       \
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 952e2cc..2b4e877 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -837,6 +837,18 @@
     const float* zero,
     const union xnn_f32_minmax_params* params);
 
+typedef void (*xnn_f16_dwconv_minmax_unipass_ukernel_function)(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const struct xnn_f16_minmax_params* params);
+
 typedef void (*xnn_q8_dwconv_minmax_unipass_ukernel_function)(
     size_t channels,
     size_t output_width,
diff --git a/test/dwconv-microkernel-tester.h b/test/dwconv-microkernel-tester.h
index bbfddff..f81c229 100644
--- a/test/dwconv-microkernel-tester.h
+++ b/test/dwconv-microkernel-tester.h
@@ -20,6 +20,8 @@
 #include <random>
 #include <vector>
 
+#include <fp16.h>
+
 #include <xnnpack.h>
 #include <xnnpack/AlignedAllocator.h>
 #include <xnnpack/pack.h>
@@ -283,6 +285,99 @@
     }
   }
 
+  void Test(xnn_f16_dwconv_minmax_unipass_ukernel_function dwconv_minmax, Variant variant = Variant::Native) const {
+    std::random_device random_device;
+    auto rng = std::mt19937(random_device());
+    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), rng);
+    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+    std::vector<const uint16_t*> indirection((width() - 1) * step() + kr());
+    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + indirection.size() * channels());
+    std::vector<uint16_t> kernel(channels() * kr());
+    std::vector<uint16_t> bias(channels());
+    std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights((kr() + 1) * packed_channels());
+    std::vector<uint16_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
+    std::vector<uint16_t> output((width() - 1) * output_stride() + channels());
+    std::vector<float> output_ref(width() * channels());
+
+    for (size_t iteration = 0; iteration < iterations(); iteration++) {
+      std::generate(input.begin(), input.end(), std::ref(f16rng));
+      std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
+      std::generate(bias.begin(), bias.end(), std::ref(f16rng));
+      std::fill(zero.begin(), zero.end(), 0);
+      std::fill(output_ref.begin(), output_ref.end(), 0.0f);
+      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
+
+      std::fill(packed_weights.begin(), packed_weights.end(), 0);
+      xnn_pack_f16_dwconv_ghw_w(
+        kr(), 1, channels(), cr(),
+        kernel.data(), bias.data(), packed_weights.data());
+      for (size_t i = 0; i < indirection.size(); i++) {
+        indirection[i] = input.data() + i * channels() - input_offset();
+      }
+      std::shuffle(indirection.begin(), indirection.end(), rng);
+      if (zero_index() != SIZE_MAX) {
+        for (size_t i = 0; i < indirection.size(); i += kr()) {
+          indirection[i + zero_index()] = zero.data();
+        }
+      }
+
+      // Compute reference results, without clamping.
+      for (size_t x = 0; x < width(); x++) {
+        for (size_t c = 0; c < channels(); c++) {
+          float acc = fp16_ieee_to_fp32_value(bias[c]);
+          for (size_t k = 0; k < kr(); k++) {
+            if (indirection[x * step() + k] != zero.data()) {
+              acc += fp16_ieee_to_fp32_value(indirection[x * step() + k][c + input_offset()]) * fp16_ieee_to_fp32_value(kernel[c * kr() + k]);
+            }
+          }
+          output_ref[x * channels() + c] = acc;
+        }
+      }
+
+      // Compute clamping parameters.
+      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
+      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
+      const float accumulated_range = accumulated_max - accumulated_min;
+      const float output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin())));
+      const float output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax())));
+
+      // Prepare parameters.
+      xnn_f16_minmax_params params = xnn_init_f16_minmax_params(
+        fp16_ieee_from_fp32_value(output_min),
+        fp16_ieee_from_fp32_value(output_max));
+
+      // Clamp reference results.
+      for (float& output_val : output_ref) {
+        output_val = std::max(std::min(output_val, output_max), output_min);
+      }
+
+      // Call optimized micro-kernel.
+      dwconv_minmax(
+        channels(), width(),
+        reinterpret_cast<const void**>(indirection.data()), packed_weights.data(), output.data(),
+        step() * sizeof(void*),
+        (output_stride() - channels()) * sizeof(uint16_t),
+        input_offset() * sizeof(uint16_t), zero.data(),
+        &params);
+
+      // Verify results.
+      for (size_t x = 0; x < width(); x++) {
+        for (size_t c = 0; c < channels(); c++) {
+          ASSERT_GE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_min)
+            << "x = " << x << ", channel = " << c;
+          ASSERT_LE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_max)
+            << "x = " << x << ", channel = " << c;
+          ASSERT_NEAR(
+              output_ref[x * channels() + c],
+              fp16_ieee_to_fp32_value(output[x * output_stride() + c]),
+              std::abs(output_ref[x * channels() + c]) * 1.0e-2)
+            << "x = " << x << ", channel = " << c;
+        }
+      }
+    }
+  }
+
   void Test(xnn_f32_dwconv_unipass_ukernel_function dwconv) const {
     std::random_device random_device;
     auto rng = std::mt19937(random_device());
diff --git a/test/f16-dwconv-minmax.cc b/test/f16-dwconv-minmax.cc
new file mode 100644
index 0000000..06c5f63
--- /dev/null
+++ b/test/f16-dwconv-minmax.cc
@@ -0,0 +1,2252 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+//   Specification: test/f16-dwconv-minmax.yaml
+//   Generator: tools/generate-dwconv-test.py
+
+
+#include <gtest/gtest.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
+#include <xnnpack/dwconv.h>
+#include "dwconv-microkernel-tester.h"
+
+
+#if XNN_ARCH_ARM64  // FP16 arithmetic microkernels are only built for AArch64
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, c_eq_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(25)
+      .channels(8)
+      .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, c_div_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, c_div_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, c_div_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, c_lt_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, c_gt_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, c_gt_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, c_gt_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, multipixel) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, multipixel_with_step) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 25; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(25)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+      }
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, multipixel_with_output_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)  // was hard-coded to 8, leaving the loop variable unused; fix tools/generate-dwconv-test.py to match
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, multipixel_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, multipixel_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, input_offset) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .input_offset(176)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, zero) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t mz = 0; mz < 25; mz++) {
+      for (uint32_t channels = 16; channels < 128; channels += 24) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(25)
+          .channels(channels)
+          .input_offset(176)
+          .zero_index(mz)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64  // FP16 arithmetic microkernels are only built for AArch64
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, c_eq_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(25)
+      .channels(8)
+      .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, c_div_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, c_div_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, c_div_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, c_lt_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, c_gt_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, c_gt_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, c_gt_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, multipixel) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, multipixel_with_step) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 25; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(25)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+      }
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, multipixel_with_output_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)  // was hard-coded to 8, leaving the loop variable unused; fix tools/generate-dwconv-test.py to match
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, multipixel_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, input_offset) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .input_offset(176)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, zero) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t mz = 0; mz < 25; mz++) {
+      for (uint32_t channels = 16; channels < 128; channels += 24) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(25)
+          .channels(channels)
+          .input_offset(176)
+          .zero_index(mz)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64  // FP16 arithmetic microkernels are only built for AArch64
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, c_eq_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    DWConvMicrokernelTester()
+      .cr(16)
+      .kr(25)
+      .channels(16)
+      .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, c_div_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, c_div_16_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, c_div_16_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, c_lt_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 1; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, c_gt_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, c_gt_16_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, c_gt_16_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, multipixel) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, multipixel_with_step) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      for (size_t step = 2; step <= 25; step++) {
+        DWConvMicrokernelTester()
+          .cr(16)
+          .kr(25)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
+      }
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, multipixel_with_output_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)  // was hard-coded to 16, leaving the loop variable unused; fix tools/generate-dwconv-test.py to match
+        .width(5)
+        .output_stride(83)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, multipixel_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, multipixel_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, input_offset) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .input_offset(304)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, zero) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t mz = 0; mz < 25; mz++) {
+      for (uint32_t channels = 32; channels < 256; channels += 48) {
+        DWConvMicrokernelTester()
+          .cr(16)
+          .kr(25)
+          .channels(channels)
+          .input_offset(304)
+          .zero_index(mz)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64  // FP16 arithmetic microkernels are only built for AArch64
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, c_eq_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    DWConvMicrokernelTester()
+      .cr(16)
+      .kr(25)
+      .channels(16)
+      .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, c_div_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, c_div_16_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, c_div_16_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, c_lt_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 1; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, c_gt_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, c_gt_16_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, c_gt_16_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, multipixel) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, multipixel_with_step) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      for (size_t step = 2; step <= 25; step++) {
+        DWConvMicrokernelTester()
+          .cr(16)
+          .kr(25)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
+      }
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, multipixel_with_output_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)  // was hard-coded to 16, leaving the loop variable unused; fix tools/generate-dwconv-test.py to match
+        .width(5)
+        .output_stride(83)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, multipixel_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, input_offset) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(25)
+        .channels(channels)
+        .input_offset(304)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, zero) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t mz = 0; mz < 25; mz++) {
+      for (uint32_t channels = 32; channels < 256; channels += 48) {
+        DWConvMicrokernelTester()
+          .cr(16)
+          .kr(25)
+          .channels(channels)
+          .input_offset(304)
+          .zero_index(mz)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64  // FP16 arithmetic microkernels are only built for AArch64
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, c_eq_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(9)
+      .channels(8)
+      .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, c_div_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, c_div_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, c_div_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, c_lt_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, c_gt_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, c_gt_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, c_gt_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, multipixel) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, multipixel_with_step) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 9; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(9)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
+      }
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, multipixel_with_output_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)  // was hard-coded to 8, leaving the loop variable unused; fix tools/generate-dwconv-test.py to match
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, multipixel_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, multipixel_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, input_offset) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .input_offset(176)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, zero) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t mz = 0; mz < 9; mz++) {
+      for (uint32_t channels = 16; channels < 128; channels += 24) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(9)
+          .channels(channels)
+          .input_offset(176)
+          .zero_index(mz)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, c_eq_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(9)
+      .channels(8)
+      .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, c_div_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, c_div_8_with_qmin) {  // channel counts divisible by the 8-wide tile, output clamped from below
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;  // skip at runtime if the CPU lacks FP16 arithmetic
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, c_div_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, c_lt_8) {  // remainder-only path: fewer channels than one tile
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, c_gt_8) {  // one full tile plus a partial remainder tile
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, c_gt_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, c_gt_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, multipixel) {  // width > 1: several output pixels per microkernel call
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, multipixel_with_step) {  // non-unit step through the indirection buffer
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 9; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(9)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
+      }
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, multipixel_with_output_stride) {  // output rows separated by a stride larger than the channel count
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)  // was hard-coded to 8, leaving the loop variable unused
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, multipixel_with_qmin) {  // multi-pixel path combined with lower-bound clamping
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;  // skip at runtime if the CPU lacks FP16 arithmetic
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, input_offset) {  // non-zero offset applied to every indirection pointer
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .input_offset(176)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, zero) {  // each of the 9 taps in turn redirected to the zero buffer
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t mz = 0; mz < 9; mz++) {
+      for (uint32_t channels = 16; channels < 128; channels += 24) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(9)
+          .channels(channels)
+          .input_offset(176)
+          .zero_index(mz)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64  // NEON FP16-arithmetic microkernels are built for AArch64 only
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, c_eq_16) {  // exactly one 16-channel tile, 9-tap kernel
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;  // skip at runtime if the CPU lacks FP16 arithmetic
+    DWConvMicrokernelTester()
+      .cr(16)
+      .kr(9)
+      .channels(16)
+      .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, c_div_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, c_div_16_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, c_div_16_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, c_lt_16) {  // remainder-only path: fewer channels than one tile
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 1; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, c_gt_16) {  // one full tile plus a partial remainder tile
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, c_gt_16_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, c_gt_16_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, multipixel) {  // width > 1: several output pixels per microkernel call
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, multipixel_with_step) {  // non-unit step through the indirection buffer
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      for (size_t step = 2; step <= 9; step++) {
+        DWConvMicrokernelTester()
+          .cr(16)
+          .kr(9)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
+      }
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, multipixel_with_output_stride) {  // output rows separated by a stride larger than the channel count
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)  // was hard-coded to 16, leaving the loop variable unused
+        .width(5)
+        .output_stride(83)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, multipixel_with_qmin) {  // multi-pixel path combined with lower-bound clamping
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;  // skip at runtime if the CPU lacks FP16 arithmetic
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, multipixel_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, input_offset) {  // non-zero offset applied to every indirection pointer
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .input_offset(304)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, zero) {  // each of the 9 taps in turn redirected to the zero buffer
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t mz = 0; mz < 9; mz++) {
+      for (uint32_t channels = 32; channels < 256; channels += 48) {
+        DWConvMicrokernelTester()
+          .cr(16)
+          .kr(9)
+          .channels(channels)
+          .input_offset(304)
+          .zero_index(mz)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64  // NEON FP16-arithmetic microkernels are built for AArch64 only
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, c_eq_16) {  // exactly one 16-channel tile; ACC2 variant uses two accumulators
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;  // skip at runtime if the CPU lacks FP16 arithmetic
+    DWConvMicrokernelTester()
+      .cr(16)
+      .kr(9)
+      .channels(16)
+      .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, c_div_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, c_div_16_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, c_div_16_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, c_lt_16) {  // remainder-only path: fewer channels than one tile
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 1; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, c_gt_16) {  // one full tile plus a partial remainder tile
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, c_gt_16_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, c_gt_16_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, multipixel) {  // width > 1: several output pixels per microkernel call
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, multipixel_with_step) {  // non-unit step through the indirection buffer
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      for (size_t step = 2; step <= 9; step++) {
+        DWConvMicrokernelTester()
+          .cr(16)
+          .kr(9)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
+      }
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, multipixel_with_output_stride) {  // output rows separated by a stride larger than the channel count
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)  // was hard-coded to 16, leaving the loop variable unused
+        .width(5)
+        .output_stride(83)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, multipixel_with_qmin) {  // multi-pixel path combined with lower-bound clamping
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;  // skip at runtime if the CPU lacks FP16 arithmetic
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, input_offset) {  // non-zero offset applied to every indirection pointer
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(9)
+        .channels(channels)
+        .input_offset(304)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, zero) {  // each of the 9 taps in turn redirected to the zero buffer
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t mz = 0; mz < 9; mz++) {
+      for (uint32_t channels = 32; channels < 256; channels += 48) {
+        DWConvMicrokernelTester()
+          .cr(16)
+          .kr(9)
+          .channels(channels)
+          .input_offset(304)
+          .zero_index(mz)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64  // NEON FP16-arithmetic microkernels are built for AArch64 only
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, c_eq_8) {  // exactly one 8-channel tile, 4-tap kernel
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;  // skip at runtime if the CPU lacks FP16 arithmetic
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(4)
+      .channels(8)
+      .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, c_div_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, c_div_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, c_div_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, c_lt_8) {  // remainder-only path: fewer channels than one tile
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, c_gt_8) {  // one full tile plus a partial remainder tile
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, c_gt_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, c_gt_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, multipixel) {  // width > 1: several output pixels per microkernel call
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, multipixel_with_step) {  // non-unit step through the indirection buffer
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 4; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(4)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+      }
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, multipixel_with_output_stride) {  // output rows separated by a stride larger than the channel count
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)  // was hard-coded to 8, leaving the loop variable unused
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, multipixel_with_qmin) {  // multi-pixel path combined with lower-bound clamping
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;  // skip at runtime if the CPU lacks FP16 arithmetic
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, multipixel_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, input_offset) {  // non-zero offset applied to every indirection pointer
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .input_offset(176)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, zero) {  // each of the 4 taps in turn redirected to the zero buffer
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (uint32_t channels = 16; channels < 128; channels += 24) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(4)
+          .channels(channels)
+          .input_offset(176)
+          .zero_index(mz)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64  // NEON FP16-arithmetic microkernels are built for AArch64 only
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, c_eq_8) {  // exactly one 8-channel tile; ACC2 variant uses two accumulators
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;  // skip at runtime if the CPU lacks FP16 arithmetic
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(4)
+      .channels(8)
+      .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, c_div_8) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, c_div_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, c_div_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, c_lt_8) {  // remainder-only path: fewer channels than one tile
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, c_gt_8) {  // one full tile plus a partial remainder tile
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, c_gt_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, c_gt_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, multipixel) {  // width > 1: several output pixels per microkernel call
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, multipixel_with_step) {  // non-unit step through the indirection buffer
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 4; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(4)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+      }
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, multipixel_with_output_stride) {  // output rows separated by a stride larger than the channel count
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)  // was hard-coded to 8, leaving the loop variable unused
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, multipixel_with_qmin) {  // multi-pixel path combined with lower-bound clamping
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;  // skip at runtime if the CPU lacks FP16 arithmetic
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, input_offset) {  // non-zero offset applied to every indirection pointer
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .input_offset(176)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, zero) {  // each of the 4 taps in turn redirected to the zero buffer
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (uint32_t channels = 16; channels < 128; channels += 24) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(4)
+          .channels(channels)
+          .input_offset(176)
+          .zero_index(mz)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64  // NEON FP16-arithmetic microkernels are built for AArch64 only
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, c_eq_16) {  // exactly one 16-channel tile, 4-tap kernel
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;  // skip at runtime if the CPU lacks FP16 arithmetic
+    DWConvMicrokernelTester()
+      .cr(16)
+      .kr(4)
+      .channels(16)
+      .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, c_div_16) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, c_div_16_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, c_div_16_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, c_lt_16) {  // remainder-only path: fewer channels than one tile
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 1; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, c_gt_16) {  // one full tile plus a partial remainder tile
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, c_gt_16_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, c_gt_16_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, multipixel) {  // width > 1: several output pixels per microkernel call
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, multipixel_with_step) {  // non-unit step through the indirection buffer
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      for (size_t step = 2; step <= 4; step++) {
+        DWConvMicrokernelTester()
+          .cr(16)
+          .kr(4)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+      }
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, multipixel_with_output_stride) {  // output rows separated by a stride larger than the channel count
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)  // was hard-coded to 16, leaving the loop variable unused
+        .width(5)
+        .output_stride(83)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, multipixel_with_qmin) {  // multi-pixel path combined with lower-bound clamping
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;  // skip at runtime if the CPU lacks FP16 arithmetic
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, multipixel_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, input_offset) {  // non-zero offset applied to every indirection pointer
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .input_offset(304)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, zero) {  // each of the 4 taps in turn redirected to the zero buffer
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (uint32_t channels = 32; channels < 256; channels += 48) {
+        DWConvMicrokernelTester()
+          .cr(16)
+          .kr(4)
+          .channels(channels)
+          .input_offset(304)
+          .zero_index(mz)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, c_eq_16) {  // channels exactly one 16-wide tile: main loop only, no remainder
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    DWConvMicrokernelTester()
+      .cr(16)
+      .kr(4)
+      .channels(16)
+      .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, c_div_16) {  // channel counts that are whole multiples of the tile
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {  // 32, 80, 128, 176, 224 -- all divisible by 16
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, c_div_16_with_qmin) {  // tile-multiple channels plus active qmin clamp
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)  // raised lower bound so min clamping is exercised
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, c_div_16_with_qmax) {  // tile-multiple channels plus active qmax clamp
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)  // lowered upper bound so max clamping is exercised
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, c_lt_16) {  // fewer channels than one tile: remainder path only
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 1; channels < 16; channels++) {  // every sub-tile count 1..15
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, c_gt_16) {  // one full tile plus a partial remainder tile
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 17; channels < 32; channels++) {  // every count 17..31: main loop then remainder
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, c_gt_16_with_qmin) {  // tile + remainder with active qmin clamp
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)  // clamp must also apply on the remainder path
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, c_gt_16_with_qmax) {  // tile + remainder with active qmax clamp
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 17; channels < 32; channels++) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)  // clamp must also apply on the remainder path
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, multipixel) {  // several output pixels per call (width > 1)
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {  // covers sub-tile, exact-tile, and multi-tile counts
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, multipixel_with_step) {  // non-unit advance between output pixels
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      for (size_t step = 2; step <= 4; step++) {  // tester-defined stepping of the indirection buffer -- see DWConvMicrokernelTester
+        DWConvMicrokernelTester()
+          .cr(16)
+          .kr(4)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+      }
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, multipixel_with_output_stride) {  // non-contiguous output rows (stride > channels)
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)  // was hard-coded 16, leaving the loop variable unused and all 6 iterations identical
+        .width(5)
+        .output_stride(83)  // prime > max channels (80), so the stride is valid for every loop iteration
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, multipixel_with_qmin) {  // qmin clamping path across a multi-pixel row
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmin(128)  // raised lower bound so min clamping is exercised
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, multipixel_with_qmax) {  // qmax clamping path across a multi-pixel row
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels <= 80; channels += 15) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmax(128)  // lowered upper bound so max clamping is exercised
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, input_offset) {  // indirection pointers with a nonzero base offset
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t channels = 32; channels < 256; channels += 48) {
+      DWConvMicrokernelTester()
+        .cr(16)
+        .kr(4)
+        .channels(channels)
+        .input_offset(304)  // offset > max channels (256) so offset handling is distinguishable from channel indexing
+        .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+    }
+  }
+
+  TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, zero) {  // zero-buffer handling for each kernel tap position
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (uint32_t mz = 0; mz < 4; mz++) {  // one pass per tap of the 4-tap kernel
+      for (uint32_t channels = 32; channels < 256; channels += 48) {
+        DWConvMicrokernelTester()
+          .cr(16)
+          .kr(4)
+          .channels(channels)
+          .input_offset(304)
+          .zero_index(mz)  // presumably routes tap mz through the zero buffer -- semantics defined by the tester
+          .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
diff --git a/test/f16-dwconv-minmax.yaml b/test/f16-dwconv-minmax.yaml
new file mode 100644
index 0000000..114c736
--- /dev/null
+++ b/test/f16-dwconv-minmax.yaml
@@ -0,0 +1,40 @@
+# Copyright 2020 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+- name: xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith
+  arch:
+    - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2
+  arch:
+    - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith
+  arch:
+    - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2
+  arch:
+    - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith
+  arch:
+    - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2
+  arch:
+    - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith
+  arch:
+    - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2
+  arch:
+    - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith
+  arch:
+    - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2
+  arch:
+    - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith
+  arch:
+    - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2
+  arch:
+    - aarch64