FP16 DWCONV microkernel
PiperOrigin-RevId: 314854152
diff --git a/BUILD.bazel b/BUILD.bazel
index 4bb4c83..5c95329 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -998,6 +998,18 @@
AARCH64_NEONFP16ARITH_UKERNELS = [
"src/f16-clamp/gen/neonfp16arith-x16.c",
"src/f16-clamp/gen/neonfp16arith-x8.c",
+ "src/f16-dwconv/gen/up16x25-minmax-neonfp16arith-acc2.c",
+ "src/f16-dwconv/gen/up16x25-minmax-neonfp16arith.c",
+ "src/f16-dwconv/gen/up16x4-minmax-neonfp16arith-acc2.c",
+ "src/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c",
+ "src/f16-dwconv/gen/up16x9-minmax-neonfp16arith-acc2.c",
+ "src/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c",
+ "src/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c",
+ "src/f16-dwconv/gen/up8x25-minmax-neonfp16arith.c",
+ "src/f16-dwconv/gen/up8x4-minmax-neonfp16arith-acc2.c",
+ "src/f16-dwconv/gen/up8x4-minmax-neonfp16arith.c",
+ "src/f16-dwconv/gen/up8x9-minmax-neonfp16arith-acc2.c",
+ "src/f16-dwconv/gen/up8x9-minmax-neonfp16arith.c",
"src/f16-gavgpool/7p7x-minmax-neonfp16arith-c8.c",
"src/f16-gavgpool/7x-minmax-neonfp16arith-c8.c",
"src/f16-hswish/gen/neonfp16arith-x16.c",
@@ -2906,6 +2918,17 @@
)
xnnpack_benchmark(
+ name = "f16_dwconv_bench",
+ srcs = [
+ "bench/f16-dwconv.cc",
+ "bench/dwconv.h",
+ "bench/google/dwconv.h",
+ "src/xnnpack/AlignedAllocator.h",
+ ] + MICROKERNEL_BENCHMARK_HDRS,
+ deps = MICROKERNEL_BENCHMARK_DEPS + [":indirection"],
+)
+
+xnnpack_benchmark(
name = "f32_dwconv_bench",
srcs = [
"bench/f32-dwconv.cc",
@@ -3433,6 +3456,16 @@
)
xnnpack_unit_test(
+ name = "f16_dwconv_minmax_test",
+ srcs = [
+ "test/f16-dwconv-minmax.cc",
+ "test/dwconv-microkernel-tester.h",
+ "src/xnnpack/AlignedAllocator.h",
+ ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
+ deps = MICROKERNEL_TEST_DEPS,
+)
+
+xnnpack_unit_test(
name = "f32_dwconv_minmax_test",
srcs = [
"test/f32-dwconv-minmax.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ed8979d..dd9a8d4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1001,6 +1001,18 @@
SET(XNNPACK_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS
src/f16-clamp/gen/neonfp16arith-x16.c
src/f16-clamp/gen/neonfp16arith-x8.c
+ src/f16-dwconv/gen/up16x25-minmax-neonfp16arith-acc2.c
+ src/f16-dwconv/gen/up16x25-minmax-neonfp16arith.c
+ src/f16-dwconv/gen/up16x4-minmax-neonfp16arith-acc2.c
+ src/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c
+ src/f16-dwconv/gen/up16x9-minmax-neonfp16arith-acc2.c
+ src/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c
+ src/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c
+ src/f16-dwconv/gen/up8x25-minmax-neonfp16arith.c
+ src/f16-dwconv/gen/up8x4-minmax-neonfp16arith-acc2.c
+ src/f16-dwconv/gen/up8x4-minmax-neonfp16arith.c
+ src/f16-dwconv/gen/up8x9-minmax-neonfp16arith-acc2.c
+ src/f16-dwconv/gen/up8x9-minmax-neonfp16arith.c
src/f16-gavgpool/7p7x-minmax-neonfp16arith-c8.c
src/f16-gavgpool/7x-minmax-neonfp16arith-c8.c
src/f16-hswish/gen/neonfp16arith-x16.c
@@ -2304,6 +2316,15 @@
TARGET_LINK_LIBRARIES(f32-dwconv-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
ADD_TEST(f32-dwconv-test f32-dwconv-test)
+ ADD_EXECUTABLE(f16-dwconv-minmax-test test/f16-dwconv-minmax.cc)
+ SET_TARGET_PROPERTIES(f16-dwconv-minmax-test PROPERTIES
+ CXX_STANDARD 11
+ CXX_STANDARD_REQUIRED YES
+ CXX_EXTENSIONS YES)
+ TARGET_INCLUDE_DIRECTORIES(f16-dwconv-minmax-test PRIVATE src test)
+ TARGET_LINK_LIBRARIES(f16-dwconv-minmax-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
+ ADD_TEST(f16-dwconv-minmax-test f16-dwconv-minmax-test)
+
ADD_EXECUTABLE(f32-dwconv-minmax-test test/f32-dwconv-minmax.cc)
SET_TARGET_PROPERTIES(f32-dwconv-minmax-test PROPERTIES
CXX_STANDARD 11
@@ -3223,6 +3244,15 @@
TARGET_INCLUDE_DIRECTORIES(f32-dwconv-chw-bench PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
TARGET_LINK_LIBRARIES(f32-dwconv-chw-bench PRIVATE XNNPACK cpuinfo fp16 benchmark bench-utils)
+ ADD_EXECUTABLE(f16-dwconv-bench bench/f16-dwconv.cc)
+ SET_TARGET_PROPERTIES(f16-dwconv-bench PROPERTIES
+ CXX_STANDARD 11
+ CXX_STANDARD_REQUIRED YES
+ CXX_EXTENSIONS YES)
+ TARGET_INCLUDE_DIRECTORIES(f16-dwconv-bench PRIVATE src)
+ TARGET_INCLUDE_DIRECTORIES(f16-dwconv-bench PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
+ TARGET_LINK_LIBRARIES(f16-dwconv-bench PRIVATE XNNPACK cpuinfo fp16 benchmark bench-utils)
+
ADD_EXECUTABLE(f32-dwconv-bench bench/f32-dwconv.cc)
SET_TARGET_PROPERTIES(f32-dwconv-bench PROPERTIES
CXX_STANDARD 11
diff --git a/bench/f16-dwconv.cc b/bench/f16-dwconv.cc
new file mode 100644
index 0000000..926a914
--- /dev/null
+++ b/bench/f16-dwconv.cc
@@ -0,0 +1,234 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <algorithm>
+#include <cfloat>
+#include <cmath>
+#include <functional>
+#include <random>
+#include <vector>
+
+#include <cpuinfo.h>
+
+#include <benchmark/benchmark.h>
+#include <fp16/fp16.h>
+#include "bench/dwconv.h"
+#include "bench/utils.h"
+#include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
+#include <xnnpack/dwconv.h>
+#include <xnnpack/indirection.h>
+#include <xnnpack/operator.h>
+#include <xnnpack/pack.h>
+#include <xnnpack/params-init.h>
+#include <xnnpack/params.h>
+
+
+static void DWConvBenchmark(benchmark::State& state,
+ xnn_f16_dwconv_minmax_unipass_ukernel_function dwconv,
+ uint32_t cr, uint32_t kr,
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
+{
+ if (!cpuinfo_initialize()) {
+ state.SkipWithError("cpuinfo initialization failed");
+ return;
+ }
+ if (isa_check && !isa_check(state)) {
+ return;
+ }
+
+ const size_t input_height = state.range(0);
+ const size_t input_width = state.range(1);
+ const size_t kernel_height = state.range(2);
+ const size_t kernel_width = state.range(3);
+ const size_t padding_height = state.range(4);
+ const size_t padding_width = state.range(5);
+ const size_t subsampling = state.range(6);
+ const size_t dilation = state.range(7);
+ const size_t channels = state.range(8);
+
+ const size_t kernel_size = kernel_height * kernel_width;
+ if (kernel_size != kr) {
+ state.SkipWithError("kernel size mismatch");
+ return;
+ }
+
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+ const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
+ const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
+ const size_t padding_left = padding_width / 2;
+ const size_t padding_top = padding_height / 2;
+ const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
+ const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
+ const size_t output_size = output_height * output_width;
+ const size_t step_width = dilation == 1 ? subsampling : kernel_width;
+ const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
+
+ const size_t c_stride = benchmark::utils::RoundUp<size_t>(channels, cr);
+
+ std::vector<uint16_t> a(channels * input_height * input_width + XNN_EXTRA_BYTES / sizeof(uint16_t));
+ std::generate(a.begin(), a.end(), std::ref(f16rng));
+ std::vector<uint16_t> k(channels * kernel_height * kernel_width);
+ std::generate(k.begin(), k.end(), std::ref(f16rng));
+ std::vector<uint16_t> b(channels);
+ std::generate(b.begin(), b.end(), std::ref(f16rng));
+
+ std::vector<uint16_t> z(channels + XNN_EXTRA_BYTES / sizeof(uint16_t));
+
+ const size_t w_elements = (kernel_size + 1) * c_stride;
+ const size_t i_elements = output_height * step_height;
+ const size_t c_elements = output_size * channels;
+ const size_t num_buffers = 1 +
+ benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
+ sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);
+
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 32>> w(w_elements * num_buffers);
+  std::fill(w.begin(), w.end(), UINT16_C(0));
+ xnn_pack_f16_dwconv_ghw_w(kernel_height, kernel_width, channels, cr,
+ k.data(), b.data(), w.data());
+ for (size_t n = 1; n < num_buffers; n++) {
+ std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
+ }
+
+ std::vector<const uint16_t*> i(i_elements * num_buffers);
+ xnn_operator convolution_op = { };
+ convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
+ convolution_op.input = a.data();
+ convolution_op.input_pixel_stride = channels;
+ convolution_op.zero_buffer = z.data();
+ convolution_op.batch_size = 1;
+ convolution_op.input_height = input_height;
+ convolution_op.input_width = input_width;
+ convolution_op.output_height = output_height;
+ convolution_op.output_width = output_width;
+ convolution_op.kernel_height = kernel_height;
+ convolution_op.kernel_width = kernel_width;
+ convolution_op.stride_height = subsampling;
+ convolution_op.stride_width = subsampling;
+ convolution_op.dilation_height = dilation;
+ convolution_op.dilation_width = dilation;
+ convolution_op.padding_top = padding_top;
+ convolution_op.padding_left = padding_left;
+
+ xnn_indirection_init_dwconv2d(&convolution_op, 0, step_height, step_width, 1 /* log2(sizeof(uint16_t)) */);
+ for (size_t n = 1; n < num_buffers; n++) {
+ std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
+ }
+
+ std::vector<uint16_t> c(c_elements * num_buffers);
+  std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);
+
+  xnn_f16_minmax_params params =
+    xnn_init_f16_minmax_params(UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* +inf */);
+
+ size_t buffer_index = 0;
+ for (auto _ : state) {
+ state.PauseTiming();
+ benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
+ buffer_index = (buffer_index + 1) % num_buffers;
+ state.ResumeTiming();
+
+ for (uint32_t y = 0; y < output_height; y++) {
+ dwconv(channels, output_width,
+ reinterpret_cast<const void**>(i.data() + buffer_index * i_elements + step_height * y),
+ w.data() + buffer_index * w_elements,
+ c.data() + buffer_index * c_elements + y * output_width * channels,
+ kernel_height * step_width * sizeof(void*), 0,
+      0, z.data(), &params);
+ }
+ }
+
+ state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
+ state.counters["FLOPS"] = benchmark::Counter(
+ uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
+ benchmark::Counter::kIsRate);
+
+ state.counters["BYTES"] = benchmark::Counter(
+ uint64_t(state.iterations()) * (output_size + input_height * input_width + kernel_size + 1 /* bias */) * channels * sizeof(uint16_t),
+ benchmark::Counter::kIsRate);
+}
+
+#if XNN_ARCH_ARM64
+ static void f16_dwconv_8x25__neonfp16arith_acc2(benchmark::State& state, const char* net) {
+ DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2, 8, 25,
+ benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ static void f16_dwconv_8x25__neonfp16arith(benchmark::State& state, const char* net) {
+ DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith, 8, 25,
+ benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ static void f16_dwconv_8x4__neonfp16arith_acc2(benchmark::State& state, const char* net) {
+ DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2, 8, 4,
+ benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ static void f16_dwconv_8x4__neonfp16arith(benchmark::State& state, const char* net) {
+ DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith, 8, 4,
+ benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ static void f16_dwconv_8x9__neonfp16arith_acc2(benchmark::State& state, const char* net) {
+ DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2, 8, 9,
+ benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ static void f16_dwconv_8x9__neonfp16arith(benchmark::State& state, const char* net) {
+ DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith, 8, 9,
+ benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ static void f16_dwconv_16x25__neonfp16arith_acc2(benchmark::State& state, const char* net) {
+ DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2, 16, 25,
+ benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ static void f16_dwconv_16x25__neonfp16arith(benchmark::State& state, const char* net) {
+ DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith, 16, 25,
+ benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ static void f16_dwconv_16x4__neonfp16arith_acc2(benchmark::State& state, const char* net) {
+ DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2, 16, 4,
+ benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ static void f16_dwconv_16x4__neonfp16arith(benchmark::State& state, const char* net) {
+ DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith, 16, 4,
+ benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ static void f16_dwconv_16x9__neonfp16arith_acc2(benchmark::State& state, const char* net) {
+ DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2, 16, 9,
+ benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ static void f16_dwconv_16x9__neonfp16arith(benchmark::State& state, const char* net) {
+ DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith, 16, 9,
+ benchmark::utils::CheckNEONFP16ARITH);
+ }
+
+ BENCHMARK_DWCONV(f16_dwconv_8x25__neonfp16arith_acc2)
+ BENCHMARK_DWCONV(f16_dwconv_8x25__neonfp16arith)
+ BENCHMARK_DWCONV(f16_dwconv_8x4__neonfp16arith_acc2)
+ BENCHMARK_DWCONV(f16_dwconv_8x4__neonfp16arith)
+ BENCHMARK_DWCONV(f16_dwconv_8x9__neonfp16arith_acc2)
+ BENCHMARK_DWCONV(f16_dwconv_8x9__neonfp16arith)
+ BENCHMARK_DWCONV(f16_dwconv_16x25__neonfp16arith_acc2)
+ BENCHMARK_DWCONV(f16_dwconv_16x25__neonfp16arith)
+ BENCHMARK_DWCONV(f16_dwconv_16x4__neonfp16arith_acc2)
+ BENCHMARK_DWCONV(f16_dwconv_16x4__neonfp16arith)
+ BENCHMARK_DWCONV(f16_dwconv_16x9__neonfp16arith_acc2)
+ BENCHMARK_DWCONV(f16_dwconv_16x9__neonfp16arith)
+#endif // XNN_ARCH_ARM64
+
+#ifndef XNNPACK_BENCHMARK_NO_MAIN
+BENCHMARK_MAIN();
+#endif
diff --git a/scripts/generate-f16-dwconv.sh b/scripts/generate-f16-dwconv.sh
new file mode 100755
index 0000000..c81a818
--- /dev/null
+++ b/scripts/generate-f16-dwconv.sh
@@ -0,0 +1,25 @@
+#!/bin/sh
+# Copyright 2020 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+################################### ARM NEON ##################################
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D ACCUMULATORS=1 -o src/f16-dwconv/gen/up8x4-minmax-neonfp16arith.c
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D ACCUMULATORS=2 -o src/f16-dwconv/gen/up8x4-minmax-neonfp16arith-acc2.c
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=4 -D ACCUMULATORS=1 -o src/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=4 -D ACCUMULATORS=2 -o src/f16-dwconv/gen/up16x4-minmax-neonfp16arith-acc2.c
+
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -o src/f16-dwconv/gen/up8x9-minmax-neonfp16arith.c
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -o src/f16-dwconv/gen/up8x9-minmax-neonfp16arith-acc2.c
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -o src/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -o src/f16-dwconv/gen/up16x9-minmax-neonfp16arith-acc2.c
+
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f16-dwconv/gen/up8x25-minmax-neonfp16arith.c
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f16-dwconv/gen/up16x25-minmax-neonfp16arith.c
+tools/xngen src/f16-dwconv/up-neonfp16arith.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f16-dwconv/gen/up16x25-minmax-neonfp16arith-acc2.c
+
+################################## Unit tests #################################
+tools/generate-dwconv-test.py --spec test/f16-dwconv-minmax.yaml --output test/f16-dwconv-minmax.cc
diff --git a/src/f16-dwconv/gen/up16x25-minmax-neonfp16arith-acc2.c b/src/f16-dwconv/gen/up16x25-minmax-neonfp16arith-acc2.c
new file mode 100644
index 0000000..162f3e6
--- /dev/null
+++ b/src/f16-dwconv/gen/up16x25-minmax-neonfp16arith-acc2.c
@@ -0,0 +1,596 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-dwconv/up-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2(
+ size_t channels,
+ size_t output_width,
+ const void** input,
+ const void* weights,
+ void* output_ptr,
+ size_t input_stride,
+ size_t output_increment,
+ size_t input_offset,
+ const void* zero,
+ const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(channels != 0);
+ assert(output_width != 0);
+
+ __fp16* output = ( __fp16*) output_ptr;
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);
+ do {
+ const __fp16* i0 = (const __fp16*) input[0];
+ assert(i0 != NULL);
+ if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+ i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+ }
+ const __fp16* i1 = (const __fp16*) input[1];
+ assert(i1 != NULL);
+ if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+ i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+ }
+ const __fp16* i2 = (const __fp16*) input[2];
+ assert(i2 != NULL);
+ if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+ i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+ }
+ const __fp16* i3 = (const __fp16*) input[3];
+ assert(i3 != NULL);
+ if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+ i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+ }
+ const __fp16* i4 = (const __fp16*) input[4];
+ assert(i4 != NULL);
+ if XNN_UNPREDICTABLE(i4 != (const __fp16*) zero) {
+ i4 = (const __fp16*) ((uintptr_t) i4 + input_offset);
+ }
+ const __fp16* i5 = (const __fp16*) input[5];
+ assert(i5 != NULL);
+ if XNN_UNPREDICTABLE(i5 != (const __fp16*) zero) {
+ i5 = (const __fp16*) ((uintptr_t) i5 + input_offset);
+ }
+ const __fp16* i6 = (const __fp16*) input[6];
+ assert(i6 != NULL);
+ if XNN_UNPREDICTABLE(i6 != (const __fp16*) zero) {
+ i6 = (const __fp16*) ((uintptr_t) i6 + input_offset);
+ }
+ const __fp16* i7 = (const __fp16*) input[7];
+ assert(i7 != NULL);
+ if XNN_UNPREDICTABLE(i7 != (const __fp16*) zero) {
+ i7 = (const __fp16*) ((uintptr_t) i7 + input_offset);
+ }
+ const __fp16* i8 = (const __fp16*) input[8];
+ assert(i8 != NULL);
+ if XNN_UNPREDICTABLE(i8 != (const __fp16*) zero) {
+ i8 = (const __fp16*) ((uintptr_t) i8 + input_offset);
+ }
+ const __fp16* i9 = (const __fp16*) input[9];
+ assert(i9 != NULL);
+ if XNN_UNPREDICTABLE(i9 != (const __fp16*) zero) {
+ i9 = (const __fp16*) ((uintptr_t) i9 + input_offset);
+ }
+ const __fp16* i10 = (const __fp16*) input[10];
+ assert(i10 != NULL);
+ if XNN_UNPREDICTABLE(i10 != (const __fp16*) zero) {
+ i10 = (const __fp16*) ((uintptr_t) i10 + input_offset);
+ }
+ const __fp16* i11 = (const __fp16*) input[11];
+ assert(i11 != NULL);
+ if XNN_UNPREDICTABLE(i11 != (const __fp16*) zero) {
+ i11 = (const __fp16*) ((uintptr_t) i11 + input_offset);
+ }
+ const __fp16* i12 = (const __fp16*) input[12];
+ assert(i12 != NULL);
+ if XNN_UNPREDICTABLE(i12 != (const __fp16*) zero) {
+ i12 = (const __fp16*) ((uintptr_t) i12 + input_offset);
+ }
+ const __fp16* i13 = (const __fp16*) input[13];
+ assert(i13 != NULL);
+ if XNN_UNPREDICTABLE(i13 != (const __fp16*) zero) {
+ i13 = (const __fp16*) ((uintptr_t) i13 + input_offset);
+ }
+ const __fp16* i14 = (const __fp16*) input[14];
+ assert(i14 != NULL);
+ if XNN_UNPREDICTABLE(i14 != (const __fp16*) zero) {
+ i14 = (const __fp16*) ((uintptr_t) i14 + input_offset);
+ }
+ const __fp16* i15 = (const __fp16*) input[15];
+ assert(i15 != NULL);
+ if XNN_UNPREDICTABLE(i15 != (const __fp16*) zero) {
+ i15 = (const __fp16*) ((uintptr_t) i15 + input_offset);
+ }
+ const __fp16* i16 = (const __fp16*) input[16];
+ assert(i16 != NULL);
+ if XNN_UNPREDICTABLE(i16 != (const __fp16*) zero) {
+ i16 = (const __fp16*) ((uintptr_t) i16 + input_offset);
+ }
+ const __fp16* i17 = (const __fp16*) input[17];
+ assert(i17 != NULL);
+ if XNN_UNPREDICTABLE(i17 != (const __fp16*) zero) {
+ i17 = (const __fp16*) ((uintptr_t) i17 + input_offset);
+ }
+ const __fp16* i18 = (const __fp16*) input[18];
+ assert(i18 != NULL);
+ if XNN_UNPREDICTABLE(i18 != (const __fp16*) zero) {
+ i18 = (const __fp16*) ((uintptr_t) i18 + input_offset);
+ }
+ const __fp16* i19 = (const __fp16*) input[19];
+ assert(i19 != NULL);
+ if XNN_UNPREDICTABLE(i19 != (const __fp16*) zero) {
+ i19 = (const __fp16*) ((uintptr_t) i19 + input_offset);
+ }
+ const __fp16* i20 = (const __fp16*) input[20];
+ assert(i20 != NULL);
+ if XNN_UNPREDICTABLE(i20 != (const __fp16*) zero) {
+ i20 = (const __fp16*) ((uintptr_t) i20 + input_offset);
+ }
+ const __fp16* i21 = (const __fp16*) input[21];
+ assert(i21 != NULL);
+ if XNN_UNPREDICTABLE(i21 != (const __fp16*) zero) {
+ i21 = (const __fp16*) ((uintptr_t) i21 + input_offset);
+ }
+ const __fp16* i22 = (const __fp16*) input[22];
+ assert(i22 != NULL);
+ if XNN_UNPREDICTABLE(i22 != (const __fp16*) zero) {
+ i22 = (const __fp16*) ((uintptr_t) i22 + input_offset);
+ }
+ const __fp16* i23 = (const __fp16*) input[23];
+ assert(i23 != NULL);
+ if XNN_UNPREDICTABLE(i23 != (const __fp16*) zero) {
+ i23 = (const __fp16*) ((uintptr_t) i23 + input_offset);
+ }
+ const __fp16* i24 = (const __fp16*) input[24];
+ assert(i24 != NULL);
+ if XNN_UNPREDICTABLE(i24 != (const __fp16*) zero) {
+ i24 = (const __fp16*) ((uintptr_t) i24 + input_offset);
+ }
+
+ input = (const void**) ((uintptr_t) input + input_stride);
+
+ size_t c = channels;
+ const __fp16* w = (const __fp16*) weights;
+ for (; c >= 16; c -= 16) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+ float16x8_t vacc89ABCDEFp0 = vld1q_f16(w); w += 8;
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk0x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi0x89ABCDEF, vk0x89ABCDEF);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk1x89ABCDEF = vld1q_f16(w); w += 8;
+ float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+ float16x8_t vacc89ABCDEFp1 = vmulq_f16(vi1x89ABCDEF, vk1x89ABCDEF);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk2x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi2x89ABCDEF, vk2x89ABCDEF);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+ const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk3x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+ vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi3x89ABCDEF, vk3x89ABCDEF);
+
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+ const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk4x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi4x89ABCDEF, vk4x89ABCDEF);
+
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+ const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk5x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+ vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi5x89ABCDEF, vk5x89ABCDEF);
+
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+ const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk6x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi6x89ABCDEF, vk6x89ABCDEF);
+
+ const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+ const float16x8_t vi7x89ABCDEF = vld1q_f16(i7); i7 += 8;
+ const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk7x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+ vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi7x89ABCDEF, vk7x89ABCDEF);
+
+ const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+ const float16x8_t vi8x89ABCDEF = vld1q_f16(i8); i8 += 8;
+ const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk8x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi8x89ABCDEF, vk8x89ABCDEF);
+
+ const float16x8_t vi9x01234567 = vld1q_f16(i9); i9 += 8;
+ const float16x8_t vi9x89ABCDEF = vld1q_f16(i9); i9 += 8;
+ const float16x8_t vk9x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk9x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi9x01234567, vk9x01234567);
+ vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi9x89ABCDEF, vk9x89ABCDEF);
+
+ const float16x8_t vi10x01234567 = vld1q_f16(i10); i10 += 8;
+ const float16x8_t vi10x89ABCDEF = vld1q_f16(i10); i10 += 8;
+ const float16x8_t vk10x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk10x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi10x89ABCDEF, vk10x89ABCDEF);
+
+ const float16x8_t vi11x01234567 = vld1q_f16(i11); i11 += 8;
+ const float16x8_t vi11x89ABCDEF = vld1q_f16(i11); i11 += 8;
+ const float16x8_t vk11x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk11x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi11x01234567, vk11x01234567);
+ vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi11x89ABCDEF, vk11x89ABCDEF);
+
+ const float16x8_t vi12x01234567 = vld1q_f16(i12); i12 += 8;
+ const float16x8_t vi12x89ABCDEF = vld1q_f16(i12); i12 += 8;
+ const float16x8_t vk12x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk12x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi12x89ABCDEF, vk12x89ABCDEF);
+
+ const float16x8_t vi13x01234567 = vld1q_f16(i13); i13 += 8;
+ const float16x8_t vi13x89ABCDEF = vld1q_f16(i13); i13 += 8;
+ const float16x8_t vk13x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk13x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi13x01234567, vk13x01234567);
+ vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi13x89ABCDEF, vk13x89ABCDEF);
+
+ const float16x8_t vi14x01234567 = vld1q_f16(i14); i14 += 8;
+ const float16x8_t vi14x89ABCDEF = vld1q_f16(i14); i14 += 8;
+ const float16x8_t vk14x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk14x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi14x89ABCDEF, vk14x89ABCDEF);
+
+ const float16x8_t vi15x01234567 = vld1q_f16(i15); i15 += 8;
+ const float16x8_t vi15x89ABCDEF = vld1q_f16(i15); i15 += 8;
+ const float16x8_t vk15x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk15x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi15x01234567, vk15x01234567);
+ vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi15x89ABCDEF, vk15x89ABCDEF);
+
+ const float16x8_t vi16x01234567 = vld1q_f16(i16); i16 += 8;
+ const float16x8_t vi16x89ABCDEF = vld1q_f16(i16); i16 += 8;
+ const float16x8_t vk16x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk16x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi16x89ABCDEF, vk16x89ABCDEF);
+
+ const float16x8_t vi17x01234567 = vld1q_f16(i17); i17 += 8;
+ const float16x8_t vi17x89ABCDEF = vld1q_f16(i17); i17 += 8;
+ const float16x8_t vk17x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk17x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi17x01234567, vk17x01234567);
+ vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi17x89ABCDEF, vk17x89ABCDEF);
+
+ const float16x8_t vi18x01234567 = vld1q_f16(i18); i18 += 8;
+ const float16x8_t vi18x89ABCDEF = vld1q_f16(i18); i18 += 8;
+ const float16x8_t vk18x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk18x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi18x89ABCDEF, vk18x89ABCDEF);
+
+ const float16x8_t vi19x01234567 = vld1q_f16(i19); i19 += 8;
+ const float16x8_t vi19x89ABCDEF = vld1q_f16(i19); i19 += 8;
+ const float16x8_t vk19x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk19x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi19x01234567, vk19x01234567);
+ vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi19x89ABCDEF, vk19x89ABCDEF);
+
+ const float16x8_t vi20x01234567 = vld1q_f16(i20); i20 += 8;
+ const float16x8_t vi20x89ABCDEF = vld1q_f16(i20); i20 += 8;
+ const float16x8_t vk20x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk20x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi20x89ABCDEF, vk20x89ABCDEF);
+
+ const float16x8_t vi21x01234567 = vld1q_f16(i21); i21 += 8;
+ const float16x8_t vi21x89ABCDEF = vld1q_f16(i21); i21 += 8;
+ const float16x8_t vk21x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk21x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi21x01234567, vk21x01234567);
+ vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi21x89ABCDEF, vk21x89ABCDEF);
+
+ const float16x8_t vi22x01234567 = vld1q_f16(i22); i22 += 8;
+ const float16x8_t vi22x89ABCDEF = vld1q_f16(i22); i22 += 8;
+ const float16x8_t vk22x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk22x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi22x89ABCDEF, vk22x89ABCDEF);
+
+ const float16x8_t vi23x01234567 = vld1q_f16(i23); i23 += 8;
+ const float16x8_t vi23x89ABCDEF = vld1q_f16(i23); i23 += 8;
+ const float16x8_t vk23x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk23x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi23x01234567, vk23x01234567);
+ vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi23x89ABCDEF, vk23x89ABCDEF);
+
+ const float16x8_t vi24x01234567 = vld1q_f16(i24); i24 += 8;
+ const float16x8_t vi24x89ABCDEF = vld1q_f16(i24); i24 += 8;
+ const float16x8_t vk24x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk24x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi24x89ABCDEF, vk24x89ABCDEF);
+
+ // Add up all accumulators to vacc0123456789ABCDEFp0
+ vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+ vacc89ABCDEFp0 = vaddq_f16(vacc89ABCDEFp0, vacc89ABCDEFp1);
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ float16x8_t vacc89ABCDEF = vmaxq_f16(vacc89ABCDEFp0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+ vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+
+ vst1q_f16(output, vacc01234567); output += 8;
+ vst1q_f16(output, vacc89ABCDEF); output += 8;
+ }
+ for (; c >= 8; c -= 8) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vk0x01234567 = vld1q_f16(w + 8);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vk1x01234567 = vld1q_f16(w + 24);
+ float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vk2x01234567 = vld1q_f16(w + 40);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ const float16x8_t vk3x01234567 = vld1q_f16(w + 56);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ const float16x8_t vk4x01234567 = vld1q_f16(w + 72);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ const float16x8_t vk5x01234567 = vld1q_f16(w + 88);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ const float16x8_t vk6x01234567 = vld1q_f16(w + 104);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+ const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+ const float16x8_t vk7x01234567 = vld1q_f16(w + 120);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+
+ const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+ const float16x8_t vk8x01234567 = vld1q_f16(w + 136);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+ const float16x8_t vi9x01234567 = vld1q_f16(i9); i9 += 8;
+ const float16x8_t vk9x01234567 = vld1q_f16(w + 152);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi9x01234567, vk9x01234567);
+
+ const float16x8_t vi10x01234567 = vld1q_f16(i10); i10 += 8;
+ const float16x8_t vk10x01234567 = vld1q_f16(w + 168);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+
+ const float16x8_t vi11x01234567 = vld1q_f16(i11); i11 += 8;
+ const float16x8_t vk11x01234567 = vld1q_f16(w + 184);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi11x01234567, vk11x01234567);
+
+ const float16x8_t vi12x01234567 = vld1q_f16(i12); i12 += 8;
+ const float16x8_t vk12x01234567 = vld1q_f16(w + 200);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+
+ const float16x8_t vi13x01234567 = vld1q_f16(i13); i13 += 8;
+ const float16x8_t vk13x01234567 = vld1q_f16(w + 216);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi13x01234567, vk13x01234567);
+
+ const float16x8_t vi14x01234567 = vld1q_f16(i14); i14 += 8;
+ const float16x8_t vk14x01234567 = vld1q_f16(w + 232);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+
+ const float16x8_t vi15x01234567 = vld1q_f16(i15); i15 += 8;
+ const float16x8_t vk15x01234567 = vld1q_f16(w + 248);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi15x01234567, vk15x01234567);
+
+ const float16x8_t vi16x01234567 = vld1q_f16(i16); i16 += 8;
+ const float16x8_t vk16x01234567 = vld1q_f16(w + 264);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+
+ const float16x8_t vi17x01234567 = vld1q_f16(i17); i17 += 8;
+ const float16x8_t vk17x01234567 = vld1q_f16(w + 280);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi17x01234567, vk17x01234567);
+
+ const float16x8_t vi18x01234567 = vld1q_f16(i18); i18 += 8;
+ const float16x8_t vk18x01234567 = vld1q_f16(w + 296);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+
+ const float16x8_t vi19x01234567 = vld1q_f16(i19); i19 += 8;
+ const float16x8_t vk19x01234567 = vld1q_f16(w + 312);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi19x01234567, vk19x01234567);
+
+ const float16x8_t vi20x01234567 = vld1q_f16(i20); i20 += 8;
+ const float16x8_t vk20x01234567 = vld1q_f16(w + 328);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+
+ const float16x8_t vi21x01234567 = vld1q_f16(i21); i21 += 8;
+ const float16x8_t vk21x01234567 = vld1q_f16(w + 344);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi21x01234567, vk21x01234567);
+
+ const float16x8_t vi22x01234567 = vld1q_f16(i22); i22 += 8;
+ const float16x8_t vk22x01234567 = vld1q_f16(w + 360);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+
+ const float16x8_t vi23x01234567 = vld1q_f16(i23); i23 += 8;
+ const float16x8_t vk23x01234567 = vld1q_f16(w + 376);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi23x01234567, vk23x01234567);
+
+ const float16x8_t vi24x01234567 = vld1q_f16(i24); i24 += 8;
+ const float16x8_t vk24x01234567 = vld1q_f16(w + 392);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+
+ // Add up all accumulators to vacc01234567p0
+ vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ vst1q_f16(output, vacc01234567); output += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w);
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0);
+ const float16x8_t vk0x01234567 = vld1q_f16(w + 16);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1);
+ const float16x8_t vk1x01234567 = vld1q_f16(w + 32);
+ float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2);
+ const float16x8_t vk2x01234567 = vld1q_f16(w + 48);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3);
+ const float16x8_t vk3x01234567 = vld1q_f16(w + 64);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+ const float16x8_t vi4x01234567 = vld1q_f16(i4);
+ const float16x8_t vk4x01234567 = vld1q_f16(w + 80);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+ const float16x8_t vi5x01234567 = vld1q_f16(i5);
+ const float16x8_t vk5x01234567 = vld1q_f16(w + 96);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+
+ const float16x8_t vi6x01234567 = vld1q_f16(i6);
+ const float16x8_t vk6x01234567 = vld1q_f16(w + 112);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+ const float16x8_t vi7x01234567 = vld1q_f16(i7);
+ const float16x8_t vk7x01234567 = vld1q_f16(w + 128);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+
+ const float16x8_t vi8x01234567 = vld1q_f16(i8);
+ const float16x8_t vk8x01234567 = vld1q_f16(w + 144);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+ const float16x8_t vi9x01234567 = vld1q_f16(i9);
+ const float16x8_t vk9x01234567 = vld1q_f16(w + 160);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi9x01234567, vk9x01234567);
+
+ const float16x8_t vi10x01234567 = vld1q_f16(i10);
+ const float16x8_t vk10x01234567 = vld1q_f16(w + 176);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+
+ const float16x8_t vi11x01234567 = vld1q_f16(i11);
+ const float16x8_t vk11x01234567 = vld1q_f16(w + 192);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi11x01234567, vk11x01234567);
+
+ const float16x8_t vi12x01234567 = vld1q_f16(i12);
+ const float16x8_t vk12x01234567 = vld1q_f16(w + 208);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+
+ const float16x8_t vi13x01234567 = vld1q_f16(i13);
+ const float16x8_t vk13x01234567 = vld1q_f16(w + 224);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi13x01234567, vk13x01234567);
+
+ const float16x8_t vi14x01234567 = vld1q_f16(i14);
+ const float16x8_t vk14x01234567 = vld1q_f16(w + 240);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+
+ const float16x8_t vi15x01234567 = vld1q_f16(i15);
+ const float16x8_t vk15x01234567 = vld1q_f16(w + 256);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi15x01234567, vk15x01234567);
+
+ const float16x8_t vi16x01234567 = vld1q_f16(i16);
+ const float16x8_t vk16x01234567 = vld1q_f16(w + 272);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+
+ const float16x8_t vi17x01234567 = vld1q_f16(i17);
+ const float16x8_t vk17x01234567 = vld1q_f16(w + 288);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi17x01234567, vk17x01234567);
+
+ const float16x8_t vi18x01234567 = vld1q_f16(i18);
+ const float16x8_t vk18x01234567 = vld1q_f16(w + 304);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+
+ const float16x8_t vi19x01234567 = vld1q_f16(i19);
+ const float16x8_t vk19x01234567 = vld1q_f16(w + 320);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi19x01234567, vk19x01234567);
+
+ const float16x8_t vi20x01234567 = vld1q_f16(i20);
+ const float16x8_t vk20x01234567 = vld1q_f16(w + 336);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+
+ const float16x8_t vi21x01234567 = vld1q_f16(i21);
+ const float16x8_t vk21x01234567 = vld1q_f16(w + 352);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi21x01234567, vk21x01234567);
+
+ const float16x8_t vi22x01234567 = vld1q_f16(i22);
+ const float16x8_t vk22x01234567 = vld1q_f16(w + 368);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+
+ const float16x8_t vi23x01234567 = vld1q_f16(i23);
+ const float16x8_t vk23x01234567 = vld1q_f16(w + 384);
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi23x01234567, vk23x01234567);
+
+ const float16x8_t vi24x01234567 = vld1q_f16(i24);
+ const float16x8_t vk24x01234567 = vld1q_f16(w + 400);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+
+ // Add up all accumulators to vacc01234567p0
+ vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+ if (c & 4) {
+ vst1_f16(output, vacc0123); output += 4;
+ vacc0123 = vget_high_f16(vacc01234567);
+ }
+ if (c & 2) {
+ vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+ vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+ }
+ if (c & 1) {
+ vst1_lane_f16(output, vacc0123, 0); output += 1;
+ }
+ }
+
+ output = (__fp16*) ((uintptr_t) output + output_increment);
+ } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up16x25-minmax-neonfp16arith.c b/src/f16-dwconv/gen/up16x25-minmax-neonfp16arith.c
new file mode 100644
index 0000000..83dd9d8
--- /dev/null
+++ b/src/f16-dwconv/gen/up16x25-minmax-neonfp16arith.c
@@ -0,0 +1,589 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-dwconv/up-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith(
+ size_t channels,
+ size_t output_width,
+ const void** input,
+ const void* weights,
+ void* output_ptr,
+ size_t input_stride,
+ size_t output_increment,
+ size_t input_offset,
+ const void* zero,
+ const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(channels != 0);
+ assert(output_width != 0);
+
+ __fp16* output = ( __fp16*) output_ptr;
+ const float16x8_t vmax = vld1q_dup_f16(¶ms->max);
+ const float16x8_t vmin = vld1q_dup_f16(¶ms->min);
+ do {
+ const __fp16* i0 = (const __fp16*) input[0];
+ assert(i0 != NULL);
+ if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+ i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+ }
+ const __fp16* i1 = (const __fp16*) input[1];
+ assert(i1 != NULL);
+ if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+ i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+ }
+ const __fp16* i2 = (const __fp16*) input[2];
+ assert(i2 != NULL);
+ if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+ i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+ }
+ const __fp16* i3 = (const __fp16*) input[3];
+ assert(i3 != NULL);
+ if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+ i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+ }
+ const __fp16* i4 = (const __fp16*) input[4];
+ assert(i4 != NULL);
+ if XNN_UNPREDICTABLE(i4 != (const __fp16*) zero) {
+ i4 = (const __fp16*) ((uintptr_t) i4 + input_offset);
+ }
+ const __fp16* i5 = (const __fp16*) input[5];
+ assert(i5 != NULL);
+ if XNN_UNPREDICTABLE(i5 != (const __fp16*) zero) {
+ i5 = (const __fp16*) ((uintptr_t) i5 + input_offset);
+ }
+ const __fp16* i6 = (const __fp16*) input[6];
+ assert(i6 != NULL);
+ if XNN_UNPREDICTABLE(i6 != (const __fp16*) zero) {
+ i6 = (const __fp16*) ((uintptr_t) i6 + input_offset);
+ }
+ const __fp16* i7 = (const __fp16*) input[7];
+ assert(i7 != NULL);
+ if XNN_UNPREDICTABLE(i7 != (const __fp16*) zero) {
+ i7 = (const __fp16*) ((uintptr_t) i7 + input_offset);
+ }
+ const __fp16* i8 = (const __fp16*) input[8];
+ assert(i8 != NULL);
+ if XNN_UNPREDICTABLE(i8 != (const __fp16*) zero) {
+ i8 = (const __fp16*) ((uintptr_t) i8 + input_offset);
+ }
+ const __fp16* i9 = (const __fp16*) input[9];
+ assert(i9 != NULL);
+ if XNN_UNPREDICTABLE(i9 != (const __fp16*) zero) {
+ i9 = (const __fp16*) ((uintptr_t) i9 + input_offset);
+ }
+ const __fp16* i10 = (const __fp16*) input[10];
+ assert(i10 != NULL);
+ if XNN_UNPREDICTABLE(i10 != (const __fp16*) zero) {
+ i10 = (const __fp16*) ((uintptr_t) i10 + input_offset);
+ }
+ const __fp16* i11 = (const __fp16*) input[11];
+ assert(i11 != NULL);
+ if XNN_UNPREDICTABLE(i11 != (const __fp16*) zero) {
+ i11 = (const __fp16*) ((uintptr_t) i11 + input_offset);
+ }
+ const __fp16* i12 = (const __fp16*) input[12];
+ assert(i12 != NULL);
+ if XNN_UNPREDICTABLE(i12 != (const __fp16*) zero) {
+ i12 = (const __fp16*) ((uintptr_t) i12 + input_offset);
+ }
+ const __fp16* i13 = (const __fp16*) input[13];
+ assert(i13 != NULL);
+ if XNN_UNPREDICTABLE(i13 != (const __fp16*) zero) {
+ i13 = (const __fp16*) ((uintptr_t) i13 + input_offset);
+ }
+ const __fp16* i14 = (const __fp16*) input[14];
+ assert(i14 != NULL);
+ if XNN_UNPREDICTABLE(i14 != (const __fp16*) zero) {
+ i14 = (const __fp16*) ((uintptr_t) i14 + input_offset);
+ }
+ const __fp16* i15 = (const __fp16*) input[15];
+ assert(i15 != NULL);
+ if XNN_UNPREDICTABLE(i15 != (const __fp16*) zero) {
+ i15 = (const __fp16*) ((uintptr_t) i15 + input_offset);
+ }
+ const __fp16* i16 = (const __fp16*) input[16];
+ assert(i16 != NULL);
+ if XNN_UNPREDICTABLE(i16 != (const __fp16*) zero) {
+ i16 = (const __fp16*) ((uintptr_t) i16 + input_offset);
+ }
+ const __fp16* i17 = (const __fp16*) input[17];
+ assert(i17 != NULL);
+ if XNN_UNPREDICTABLE(i17 != (const __fp16*) zero) {
+ i17 = (const __fp16*) ((uintptr_t) i17 + input_offset);
+ }
+ const __fp16* i18 = (const __fp16*) input[18];
+ assert(i18 != NULL);
+ if XNN_UNPREDICTABLE(i18 != (const __fp16*) zero) {
+ i18 = (const __fp16*) ((uintptr_t) i18 + input_offset);
+ }
+ const __fp16* i19 = (const __fp16*) input[19];
+ assert(i19 != NULL);
+ if XNN_UNPREDICTABLE(i19 != (const __fp16*) zero) {
+ i19 = (const __fp16*) ((uintptr_t) i19 + input_offset);
+ }
+ const __fp16* i20 = (const __fp16*) input[20];
+ assert(i20 != NULL);
+ if XNN_UNPREDICTABLE(i20 != (const __fp16*) zero) {
+ i20 = (const __fp16*) ((uintptr_t) i20 + input_offset);
+ }
+ const __fp16* i21 = (const __fp16*) input[21];
+ assert(i21 != NULL);
+ if XNN_UNPREDICTABLE(i21 != (const __fp16*) zero) {
+ i21 = (const __fp16*) ((uintptr_t) i21 + input_offset);
+ }
+ const __fp16* i22 = (const __fp16*) input[22];
+ assert(i22 != NULL);
+ if XNN_UNPREDICTABLE(i22 != (const __fp16*) zero) {
+ i22 = (const __fp16*) ((uintptr_t) i22 + input_offset);
+ }
+ const __fp16* i23 = (const __fp16*) input[23];
+ assert(i23 != NULL);
+ if XNN_UNPREDICTABLE(i23 != (const __fp16*) zero) {
+ i23 = (const __fp16*) ((uintptr_t) i23 + input_offset);
+ }
+ const __fp16* i24 = (const __fp16*) input[24];
+ assert(i24 != NULL);
+ if XNN_UNPREDICTABLE(i24 != (const __fp16*) zero) {
+ i24 = (const __fp16*) ((uintptr_t) i24 + input_offset);
+ }
+
+ input = (const void**) ((uintptr_t) input + input_stride);
+
+ size_t c = channels;
+ const __fp16* w = (const __fp16*) weights;
+ for (; c >= 16; c -= 16) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+ float16x8_t vacc89ABCDEFp0 = vld1q_f16(w); w += 8;
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk0x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi0x89ABCDEF, vk0x89ABCDEF);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk1x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi1x89ABCDEF, vk1x89ABCDEF);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk2x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi2x89ABCDEF, vk2x89ABCDEF);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+ const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk3x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi3x89ABCDEF, vk3x89ABCDEF);
+
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+ const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk4x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi4x89ABCDEF, vk4x89ABCDEF);
+
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+ const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk5x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi5x89ABCDEF, vk5x89ABCDEF);
+
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+ const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk6x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi6x89ABCDEF, vk6x89ABCDEF);
+
+ const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+ const float16x8_t vi7x89ABCDEF = vld1q_f16(i7); i7 += 8;
+ const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk7x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi7x89ABCDEF, vk7x89ABCDEF);
+
+ const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+ const float16x8_t vi8x89ABCDEF = vld1q_f16(i8); i8 += 8;
+ const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk8x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi8x89ABCDEF, vk8x89ABCDEF);
+
+ const float16x8_t vi9x01234567 = vld1q_f16(i9); i9 += 8;
+ const float16x8_t vi9x89ABCDEF = vld1q_f16(i9); i9 += 8;
+ const float16x8_t vk9x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk9x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi9x01234567, vk9x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi9x89ABCDEF, vk9x89ABCDEF);
+
+ const float16x8_t vi10x01234567 = vld1q_f16(i10); i10 += 8;
+ const float16x8_t vi10x89ABCDEF = vld1q_f16(i10); i10 += 8;
+ const float16x8_t vk10x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk10x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi10x89ABCDEF, vk10x89ABCDEF);
+
+ const float16x8_t vi11x01234567 = vld1q_f16(i11); i11 += 8;
+ const float16x8_t vi11x89ABCDEF = vld1q_f16(i11); i11 += 8;
+ const float16x8_t vk11x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk11x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi11x01234567, vk11x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi11x89ABCDEF, vk11x89ABCDEF);
+
+ const float16x8_t vi12x01234567 = vld1q_f16(i12); i12 += 8;
+ const float16x8_t vi12x89ABCDEF = vld1q_f16(i12); i12 += 8;
+ const float16x8_t vk12x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk12x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi12x89ABCDEF, vk12x89ABCDEF);
+
+ const float16x8_t vi13x01234567 = vld1q_f16(i13); i13 += 8;
+ const float16x8_t vi13x89ABCDEF = vld1q_f16(i13); i13 += 8;
+ const float16x8_t vk13x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk13x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi13x01234567, vk13x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi13x89ABCDEF, vk13x89ABCDEF);
+
+ const float16x8_t vi14x01234567 = vld1q_f16(i14); i14 += 8;
+ const float16x8_t vi14x89ABCDEF = vld1q_f16(i14); i14 += 8;
+ const float16x8_t vk14x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk14x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi14x89ABCDEF, vk14x89ABCDEF);
+
+ const float16x8_t vi15x01234567 = vld1q_f16(i15); i15 += 8;
+ const float16x8_t vi15x89ABCDEF = vld1q_f16(i15); i15 += 8;
+ const float16x8_t vk15x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk15x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi15x01234567, vk15x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi15x89ABCDEF, vk15x89ABCDEF);
+
+ const float16x8_t vi16x01234567 = vld1q_f16(i16); i16 += 8;
+ const float16x8_t vi16x89ABCDEF = vld1q_f16(i16); i16 += 8;
+ const float16x8_t vk16x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk16x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi16x89ABCDEF, vk16x89ABCDEF);
+
+ const float16x8_t vi17x01234567 = vld1q_f16(i17); i17 += 8;
+ const float16x8_t vi17x89ABCDEF = vld1q_f16(i17); i17 += 8;
+ const float16x8_t vk17x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk17x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi17x01234567, vk17x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi17x89ABCDEF, vk17x89ABCDEF);
+
+ const float16x8_t vi18x01234567 = vld1q_f16(i18); i18 += 8;
+ const float16x8_t vi18x89ABCDEF = vld1q_f16(i18); i18 += 8;
+ const float16x8_t vk18x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk18x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi18x89ABCDEF, vk18x89ABCDEF);
+
+ const float16x8_t vi19x01234567 = vld1q_f16(i19); i19 += 8;
+ const float16x8_t vi19x89ABCDEF = vld1q_f16(i19); i19 += 8;
+ const float16x8_t vk19x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk19x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi19x01234567, vk19x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi19x89ABCDEF, vk19x89ABCDEF);
+
+ const float16x8_t vi20x01234567 = vld1q_f16(i20); i20 += 8;
+ const float16x8_t vi20x89ABCDEF = vld1q_f16(i20); i20 += 8;
+ const float16x8_t vk20x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk20x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi20x89ABCDEF, vk20x89ABCDEF);
+
+ const float16x8_t vi21x01234567 = vld1q_f16(i21); i21 += 8;
+ const float16x8_t vi21x89ABCDEF = vld1q_f16(i21); i21 += 8;
+ const float16x8_t vk21x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk21x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi21x01234567, vk21x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi21x89ABCDEF, vk21x89ABCDEF);
+
+ const float16x8_t vi22x01234567 = vld1q_f16(i22); i22 += 8;
+ const float16x8_t vi22x89ABCDEF = vld1q_f16(i22); i22 += 8;
+ const float16x8_t vk22x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk22x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi22x89ABCDEF, vk22x89ABCDEF);
+
+ const float16x8_t vi23x01234567 = vld1q_f16(i23); i23 += 8;
+ const float16x8_t vi23x89ABCDEF = vld1q_f16(i23); i23 += 8;
+ const float16x8_t vk23x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk23x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi23x01234567, vk23x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi23x89ABCDEF, vk23x89ABCDEF);
+
+ const float16x8_t vi24x01234567 = vld1q_f16(i24); i24 += 8;
+ const float16x8_t vi24x89ABCDEF = vld1q_f16(i24); i24 += 8;
+ const float16x8_t vk24x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk24x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi24x89ABCDEF, vk24x89ABCDEF);
+
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ float16x8_t vacc89ABCDEF = vmaxq_f16(vacc89ABCDEFp0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+ vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+
+ vst1q_f16(output, vacc01234567); output += 8;
+ vst1q_f16(output, vacc89ABCDEF); output += 8;
+ }
+ for (; c >= 8; c -= 8) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vk0x01234567 = vld1q_f16(w + 8);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vk1x01234567 = vld1q_f16(w + 24);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vk2x01234567 = vld1q_f16(w + 40);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ const float16x8_t vk3x01234567 = vld1q_f16(w + 56);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ const float16x8_t vk4x01234567 = vld1q_f16(w + 72);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ const float16x8_t vk5x01234567 = vld1q_f16(w + 88);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ const float16x8_t vk6x01234567 = vld1q_f16(w + 104);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+ const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+ const float16x8_t vk7x01234567 = vld1q_f16(w + 120);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+
+ const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+ const float16x8_t vk8x01234567 = vld1q_f16(w + 136);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+ const float16x8_t vi9x01234567 = vld1q_f16(i9); i9 += 8;
+ const float16x8_t vk9x01234567 = vld1q_f16(w + 152);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi9x01234567, vk9x01234567);
+
+ const float16x8_t vi10x01234567 = vld1q_f16(i10); i10 += 8;
+ const float16x8_t vk10x01234567 = vld1q_f16(w + 168);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+
+ const float16x8_t vi11x01234567 = vld1q_f16(i11); i11 += 8;
+ const float16x8_t vk11x01234567 = vld1q_f16(w + 184);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi11x01234567, vk11x01234567);
+
+ const float16x8_t vi12x01234567 = vld1q_f16(i12); i12 += 8;
+ const float16x8_t vk12x01234567 = vld1q_f16(w + 200);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+
+ const float16x8_t vi13x01234567 = vld1q_f16(i13); i13 += 8;
+ const float16x8_t vk13x01234567 = vld1q_f16(w + 216);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi13x01234567, vk13x01234567);
+
+ const float16x8_t vi14x01234567 = vld1q_f16(i14); i14 += 8;
+ const float16x8_t vk14x01234567 = vld1q_f16(w + 232);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+
+ const float16x8_t vi15x01234567 = vld1q_f16(i15); i15 += 8;
+ const float16x8_t vk15x01234567 = vld1q_f16(w + 248);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi15x01234567, vk15x01234567);
+
+ const float16x8_t vi16x01234567 = vld1q_f16(i16); i16 += 8;
+ const float16x8_t vk16x01234567 = vld1q_f16(w + 264);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+
+ const float16x8_t vi17x01234567 = vld1q_f16(i17); i17 += 8;
+ const float16x8_t vk17x01234567 = vld1q_f16(w + 280);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi17x01234567, vk17x01234567);
+
+ const float16x8_t vi18x01234567 = vld1q_f16(i18); i18 += 8;
+ const float16x8_t vk18x01234567 = vld1q_f16(w + 296);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+
+ const float16x8_t vi19x01234567 = vld1q_f16(i19); i19 += 8;
+ const float16x8_t vk19x01234567 = vld1q_f16(w + 312);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi19x01234567, vk19x01234567);
+
+ const float16x8_t vi20x01234567 = vld1q_f16(i20); i20 += 8;
+ const float16x8_t vk20x01234567 = vld1q_f16(w + 328);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+
+ const float16x8_t vi21x01234567 = vld1q_f16(i21); i21 += 8;
+ const float16x8_t vk21x01234567 = vld1q_f16(w + 344);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi21x01234567, vk21x01234567);
+
+ const float16x8_t vi22x01234567 = vld1q_f16(i22); i22 += 8;
+ const float16x8_t vk22x01234567 = vld1q_f16(w + 360);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+
+ const float16x8_t vi23x01234567 = vld1q_f16(i23); i23 += 8;
+ const float16x8_t vk23x01234567 = vld1q_f16(w + 376);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi23x01234567, vk23x01234567);
+
+ const float16x8_t vi24x01234567 = vld1q_f16(i24); i24 += 8;
+ const float16x8_t vk24x01234567 = vld1q_f16(w + 392);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ vst1q_f16(output, vacc01234567); output += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w);
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0);
+ const float16x8_t vk0x01234567 = vld1q_f16(w + 16);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1);
+ const float16x8_t vk1x01234567 = vld1q_f16(w + 32);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2);
+ const float16x8_t vk2x01234567 = vld1q_f16(w + 48);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3);
+ const float16x8_t vk3x01234567 = vld1q_f16(w + 64);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+ const float16x8_t vi4x01234567 = vld1q_f16(i4);
+ const float16x8_t vk4x01234567 = vld1q_f16(w + 80);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+ const float16x8_t vi5x01234567 = vld1q_f16(i5);
+ const float16x8_t vk5x01234567 = vld1q_f16(w + 96);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+
+ const float16x8_t vi6x01234567 = vld1q_f16(i6);
+ const float16x8_t vk6x01234567 = vld1q_f16(w + 112);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+ const float16x8_t vi7x01234567 = vld1q_f16(i7);
+ const float16x8_t vk7x01234567 = vld1q_f16(w + 128);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+
+ const float16x8_t vi8x01234567 = vld1q_f16(i8);
+ const float16x8_t vk8x01234567 = vld1q_f16(w + 144);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+ const float16x8_t vi9x01234567 = vld1q_f16(i9);
+ const float16x8_t vk9x01234567 = vld1q_f16(w + 160);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi9x01234567, vk9x01234567);
+
+ const float16x8_t vi10x01234567 = vld1q_f16(i10);
+ const float16x8_t vk10x01234567 = vld1q_f16(w + 176);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+
+ const float16x8_t vi11x01234567 = vld1q_f16(i11);
+ const float16x8_t vk11x01234567 = vld1q_f16(w + 192);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi11x01234567, vk11x01234567);
+
+ const float16x8_t vi12x01234567 = vld1q_f16(i12);
+ const float16x8_t vk12x01234567 = vld1q_f16(w + 208);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+
+ const float16x8_t vi13x01234567 = vld1q_f16(i13);
+ const float16x8_t vk13x01234567 = vld1q_f16(w + 224);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi13x01234567, vk13x01234567);
+
+ const float16x8_t vi14x01234567 = vld1q_f16(i14);
+ const float16x8_t vk14x01234567 = vld1q_f16(w + 240);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+
+ const float16x8_t vi15x01234567 = vld1q_f16(i15);
+ const float16x8_t vk15x01234567 = vld1q_f16(w + 256);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi15x01234567, vk15x01234567);
+
+ const float16x8_t vi16x01234567 = vld1q_f16(i16);
+ const float16x8_t vk16x01234567 = vld1q_f16(w + 272);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+
+ const float16x8_t vi17x01234567 = vld1q_f16(i17);
+ const float16x8_t vk17x01234567 = vld1q_f16(w + 288);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi17x01234567, vk17x01234567);
+
+ const float16x8_t vi18x01234567 = vld1q_f16(i18);
+ const float16x8_t vk18x01234567 = vld1q_f16(w + 304);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+
+ const float16x8_t vi19x01234567 = vld1q_f16(i19);
+ const float16x8_t vk19x01234567 = vld1q_f16(w + 320);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi19x01234567, vk19x01234567);
+
+ const float16x8_t vi20x01234567 = vld1q_f16(i20);
+ const float16x8_t vk20x01234567 = vld1q_f16(w + 336);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+
+ const float16x8_t vi21x01234567 = vld1q_f16(i21);
+ const float16x8_t vk21x01234567 = vld1q_f16(w + 352);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi21x01234567, vk21x01234567);
+
+ const float16x8_t vi22x01234567 = vld1q_f16(i22);
+ const float16x8_t vk22x01234567 = vld1q_f16(w + 368);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+
+ const float16x8_t vi23x01234567 = vld1q_f16(i23);
+ const float16x8_t vk23x01234567 = vld1q_f16(w + 384);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi23x01234567, vk23x01234567);
+
+ const float16x8_t vi24x01234567 = vld1q_f16(i24);
+ const float16x8_t vk24x01234567 = vld1q_f16(w + 400);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+ if (c & 4) {
+ vst1_f16(output, vacc0123); output += 4;
+ vacc0123 = vget_high_f16(vacc01234567);
+ }
+ if (c & 2) {
+ vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+ vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+ }
+ if (c & 1) {
+ vst1_lane_f16(output, vacc0123, 0); output += 1;
+ }
+ }
+
+ output = (__fp16*) ((uintptr_t) output + output_increment);
+ } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up16x4-minmax-neonfp16arith-acc2.c b/src/f16-dwconv/gen/up16x4-minmax-neonfp16arith-acc2.c
new file mode 100644
index 0000000..016daba
--- /dev/null
+++ b/src/f16-dwconv/gen/up16x4-minmax-neonfp16arith-acc2.c
@@ -0,0 +1,176 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-dwconv/up-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output_ptr,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  __fp16* output = ( __fp16*) output_ptr;
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);  // broadcast clamp bounds to all 8 lanes
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);
+  do {
+    const __fp16* i0 = (const __fp16*) input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {  // the shared zero buffer is never offset
+      i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+    }
+    const __fp16* i1 = (const __fp16*) input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+      i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+    }
+    const __fp16* i2 = (const __fp16*) input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+      i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+    }
+    const __fp16* i3 = (const __fp16*) input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+      i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+    }
+
+    input = (const void**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const __fp16* w = (const __fp16*) weights;
+    for (; c >= 16; c -= 16) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;  // initialize accumulators with bias
+      float16x8_t vacc89ABCDEFp0 = vld1q_f16(w); w += 8;
+
+      // 4 taps x 16 channels, alternating between 2 accumulators to shorten the FMA chain
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk0x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi0x89ABCDEF, vk0x89ABCDEF);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk1x89ABCDEF = vld1q_f16(w); w += 8;
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+      float16x8_t vacc89ABCDEFp1 = vmulq_f16(vi1x89ABCDEF, vk1x89ABCDEF);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk2x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi2x89ABCDEF, vk2x89ABCDEF);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk3x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi3x89ABCDEF, vk3x89ABCDEF);
+
+      // Add up all accumulators to vacc01234567p0 and vacc89ABCDEFp0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+      vacc89ABCDEFp0 = vaddq_f16(vacc89ABCDEFp0, vacc89ABCDEFp1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      float16x8_t vacc89ABCDEF = vmaxq_f16(vacc89ABCDEFp0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+      vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+      vst1q_f16(output, vacc89ABCDEF); output += 8;
+    }
+    for (; c >= 8; c -= 8) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;  // low half of the 16-wide bias
+
+      // weights keep the 16-channel layout: tap k sits at w + 8 + 16*k
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w + 8);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w + 24);
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w + 40);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w + 56);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+      // Add up all accumulators to vacc01234567p0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w);
+
+      // NOTE(review): full 8-lane loads with c < 8 — assumes padded buffers, confirm
+      const float16x8_t vi0x01234567 = vld1q_f16(i0);
+      const float16x8_t vk0x01234567 = vld1q_f16(w + 16);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1);
+      const float16x8_t vk1x01234567 = vld1q_f16(w + 32);
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2);
+      const float16x8_t vk2x01234567 = vld1q_f16(w + 48);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3);
+      const float16x8_t vk3x01234567 = vld1q_f16(w + 64);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+      // Add up all accumulators to vacc01234567p0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+      if (c & 4) {
+        vst1_f16(output, vacc0123); output += 4;
+        vacc0123 = vget_high_f16(vacc01234567);
+      }
+      if (c & 2) {
+        vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+        vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+      }
+      if (c & 1) {
+        vst1_lane_f16(output, vacc0123, 0); output += 1;
+      }
+    }
+
+    output = (__fp16*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c b/src/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c
new file mode 100644
index 0000000..a72d2e9
--- /dev/null
+++ b/src/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c
@@ -0,0 +1,169 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-dwconv/up-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output_ptr,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  __fp16* output = ( __fp16*) output_ptr;
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);  // broadcast clamp bounds to all 8 lanes
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);
+  do {
+    const __fp16* i0 = (const __fp16*) input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {  // the shared zero buffer is never offset
+      i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+    }
+    const __fp16* i1 = (const __fp16*) input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+      i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+    }
+    const __fp16* i2 = (const __fp16*) input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+      i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+    }
+    const __fp16* i3 = (const __fp16*) input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+      i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+    }
+
+    input = (const void**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const __fp16* w = (const __fp16*) weights;
+    for (; c >= 16; c -= 16) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;  // initialize accumulators with bias
+      float16x8_t vacc89ABCDEFp0 = vld1q_f16(w); w += 8;
+
+      // 4 taps x 16 channels; weights are consumed sequentially
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk0x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi0x89ABCDEF, vk0x89ABCDEF);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk1x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi1x89ABCDEF, vk1x89ABCDEF);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk2x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi2x89ABCDEF, vk2x89ABCDEF);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk3x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi3x89ABCDEF, vk3x89ABCDEF);
+
+      // clamp the accumulated sums to [min, max]
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      float16x8_t vacc89ABCDEF = vmaxq_f16(vacc89ABCDEFp0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+      vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+      vst1q_f16(output, vacc89ABCDEF); output += 8;
+    }
+    for (; c >= 8; c -= 8) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;  // low half of the 16-wide bias
+
+      // weights keep the 16-channel layout: tap k sits at w + 8 + 16*k
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w + 8);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w + 24);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w + 40);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w + 56);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w);
+
+      // NOTE(review): full 8-lane loads with c < 8 — assumes padded buffers, confirm
+      const float16x8_t vi0x01234567 = vld1q_f16(i0);
+      const float16x8_t vk0x01234567 = vld1q_f16(w + 16);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1);
+      const float16x8_t vk1x01234567 = vld1q_f16(w + 32);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2);
+      const float16x8_t vk2x01234567 = vld1q_f16(w + 48);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3);
+      const float16x8_t vk3x01234567 = vld1q_f16(w + 64);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+      if (c & 4) {
+        vst1_f16(output, vacc0123); output += 4;
+        vacc0123 = vget_high_f16(vacc01234567);
+      }
+      if (c & 2) {
+        vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+        vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+      }
+      if (c & 1) {
+        vst1_lane_f16(output, vacc0123, 0); output += 1;
+      }
+    }
+
+    output = (__fp16*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up16x9-minmax-neonfp16arith-acc2.c b/src/f16-dwconv/gen/up16x9-minmax-neonfp16arith-acc2.c
new file mode 100644
index 0000000..68687d5
--- /dev/null
+++ b/src/f16-dwconv/gen/up16x9-minmax-neonfp16arith-acc2.c
@@ -0,0 +1,276 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-dwconv/up-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2(
+    size_t channels,
+    size_t output_width,
+    const void** input,
+    const void* weights,
+    void* output_ptr,
+    size_t input_stride,
+    size_t output_increment,
+    size_t input_offset,
+    const void* zero,
+    const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  __fp16* output = ( __fp16*) output_ptr;
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);  // broadcast clamp bounds to all 8 lanes
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);
+  do {
+    const __fp16* i0 = (const __fp16*) input[0];
+    assert(i0 != NULL);
+    if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {  // the shared zero buffer is never offset
+      i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+    }
+    const __fp16* i1 = (const __fp16*) input[1];
+    assert(i1 != NULL);
+    if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+      i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+    }
+    const __fp16* i2 = (const __fp16*) input[2];
+    assert(i2 != NULL);
+    if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+      i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+    }
+    const __fp16* i3 = (const __fp16*) input[3];
+    assert(i3 != NULL);
+    if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+      i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+    }
+    const __fp16* i4 = (const __fp16*) input[4];
+    assert(i4 != NULL);
+    if XNN_UNPREDICTABLE(i4 != (const __fp16*) zero) {
+      i4 = (const __fp16*) ((uintptr_t) i4 + input_offset);
+    }
+    const __fp16* i5 = (const __fp16*) input[5];
+    assert(i5 != NULL);
+    if XNN_UNPREDICTABLE(i5 != (const __fp16*) zero) {
+      i5 = (const __fp16*) ((uintptr_t) i5 + input_offset);
+    }
+    const __fp16* i6 = (const __fp16*) input[6];
+    assert(i6 != NULL);
+    if XNN_UNPREDICTABLE(i6 != (const __fp16*) zero) {
+      i6 = (const __fp16*) ((uintptr_t) i6 + input_offset);
+    }
+    const __fp16* i7 = (const __fp16*) input[7];
+    assert(i7 != NULL);
+    if XNN_UNPREDICTABLE(i7 != (const __fp16*) zero) {
+      i7 = (const __fp16*) ((uintptr_t) i7 + input_offset);
+    }
+    const __fp16* i8 = (const __fp16*) input[8];
+    assert(i8 != NULL);
+    if XNN_UNPREDICTABLE(i8 != (const __fp16*) zero) {
+      i8 = (const __fp16*) ((uintptr_t) i8 + input_offset);
+    }
+
+    input = (const void**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const __fp16* w = (const __fp16*) weights;
+    for (; c >= 16; c -= 16) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;  // initialize accumulators with bias
+      float16x8_t vacc89ABCDEFp0 = vld1q_f16(w); w += 8;
+
+      // 9 taps x 16 channels, alternating between 2 accumulators to shorten the FMA chain
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk0x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi0x89ABCDEF, vk0x89ABCDEF);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk1x89ABCDEF = vld1q_f16(w); w += 8;
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+      float16x8_t vacc89ABCDEFp1 = vmulq_f16(vi1x89ABCDEF, vk1x89ABCDEF);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk2x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi2x89ABCDEF, vk2x89ABCDEF);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk3x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi3x89ABCDEF, vk3x89ABCDEF);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+      const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk4x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi4x89ABCDEF, vk4x89ABCDEF);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+      const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk5x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi5x89ABCDEF, vk5x89ABCDEF);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+      const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk6x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi6x89ABCDEF, vk6x89ABCDEF);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+      const float16x8_t vi7x89ABCDEF = vld1q_f16(i7); i7 += 8;
+      const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk7x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+      vacc89ABCDEFp1 = vfmaq_f16(vacc89ABCDEFp1, vi7x89ABCDEF, vk7x89ABCDEF);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+      const float16x8_t vi8x89ABCDEF = vld1q_f16(i8); i8 += 8;
+      const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+      const float16x8_t vk8x89ABCDEF = vld1q_f16(w); w += 8;
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+      vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi8x89ABCDEF, vk8x89ABCDEF);
+
+      // Add up all accumulators to vacc01234567p0 and vacc89ABCDEFp0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+      vacc89ABCDEFp0 = vaddq_f16(vacc89ABCDEFp0, vacc89ABCDEFp1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      float16x8_t vacc89ABCDEF = vmaxq_f16(vacc89ABCDEFp0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+      vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+      vst1q_f16(output, vacc89ABCDEF); output += 8;
+    }
+    for (; c >= 8; c -= 8) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;  // low half of the 16-wide bias
+
+      // weights keep the 16-channel layout: tap k sits at w + 8 + 16*k
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vk0x01234567 = vld1q_f16(w + 8);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vk1x01234567 = vld1q_f16(w + 24);
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      const float16x8_t vk2x01234567 = vld1q_f16(w + 40);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      const float16x8_t vk3x01234567 = vld1q_f16(w + 56);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      const float16x8_t vk4x01234567 = vld1q_f16(w + 72);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      const float16x8_t vk5x01234567 = vld1q_f16(w + 88);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      const float16x8_t vk6x01234567 = vld1q_f16(w + 104);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+      const float16x8_t vk7x01234567 = vld1q_f16(w + 120);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+      const float16x8_t vk8x01234567 = vld1q_f16(w + 136);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+      // Add up all accumulators to vacc01234567p0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      vst1q_f16(output, vacc01234567); output += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      float16x8_t vacc01234567p0 = vld1q_f16(w);
+
+      // NOTE(review): full 8-lane loads with c < 8 — assumes padded buffers, confirm
+      const float16x8_t vi0x01234567 = vld1q_f16(i0);
+      const float16x8_t vk0x01234567 = vld1q_f16(w + 16);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1);
+      const float16x8_t vk1x01234567 = vld1q_f16(w + 32);
+      float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2);
+      const float16x8_t vk2x01234567 = vld1q_f16(w + 48);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3);
+      const float16x8_t vk3x01234567 = vld1q_f16(w + 64);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+      const float16x8_t vi4x01234567 = vld1q_f16(i4);
+      const float16x8_t vk4x01234567 = vld1q_f16(w + 80);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+      const float16x8_t vi5x01234567 = vld1q_f16(i5);
+      const float16x8_t vk5x01234567 = vld1q_f16(w + 96);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+
+      const float16x8_t vi6x01234567 = vld1q_f16(i6);
+      const float16x8_t vk6x01234567 = vld1q_f16(w + 112);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+      const float16x8_t vi7x01234567 = vld1q_f16(i7);
+      const float16x8_t vk7x01234567 = vld1q_f16(w + 128);
+      vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+
+      const float16x8_t vi8x01234567 = vld1q_f16(i8);
+      const float16x8_t vk8x01234567 = vld1q_f16(w + 144);
+      vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+      // Add up all accumulators to vacc01234567p0
+      vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+      float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+      if (c & 4) {
+        vst1_f16(output, vacc0123); output += 4;
+        vacc0123 = vget_high_f16(vacc01234567);
+      }
+      if (c & 2) {
+        vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+        vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+      }
+      if (c & 1) {
+        vst1_lane_f16(output, vacc0123, 0); output += 1;
+      }
+    }
+
+    output = (__fp16*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c b/src/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c
new file mode 100644
index 0000000..e322eda
--- /dev/null
+++ b/src/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c
@@ -0,0 +1,269 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-dwconv/up-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith(
+ size_t channels,
+ size_t output_width,
+ const void** input,
+ const void* weights,
+ void* output_ptr,
+ size_t input_stride,
+ size_t output_increment,
+ size_t input_offset,
+ const void* zero,
+ const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(channels != 0);
+ assert(output_width != 0);
+
+ __fp16* output = ( __fp16*) output_ptr;
+ const float16x8_t vmax = vld1q_dup_f16(&params->max);
+ const float16x8_t vmin = vld1q_dup_f16(&params->min);
+ do {
+ const __fp16* i0 = (const __fp16*) input[0];
+ assert(i0 != NULL);
+ if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+ i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+ }
+ const __fp16* i1 = (const __fp16*) input[1];
+ assert(i1 != NULL);
+ if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+ i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+ }
+ const __fp16* i2 = (const __fp16*) input[2];
+ assert(i2 != NULL);
+ if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+ i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+ }
+ const __fp16* i3 = (const __fp16*) input[3];
+ assert(i3 != NULL);
+ if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+ i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+ }
+ const __fp16* i4 = (const __fp16*) input[4];
+ assert(i4 != NULL);
+ if XNN_UNPREDICTABLE(i4 != (const __fp16*) zero) {
+ i4 = (const __fp16*) ((uintptr_t) i4 + input_offset);
+ }
+ const __fp16* i5 = (const __fp16*) input[5];
+ assert(i5 != NULL);
+ if XNN_UNPREDICTABLE(i5 != (const __fp16*) zero) {
+ i5 = (const __fp16*) ((uintptr_t) i5 + input_offset);
+ }
+ const __fp16* i6 = (const __fp16*) input[6];
+ assert(i6 != NULL);
+ if XNN_UNPREDICTABLE(i6 != (const __fp16*) zero) {
+ i6 = (const __fp16*) ((uintptr_t) i6 + input_offset);
+ }
+ const __fp16* i7 = (const __fp16*) input[7];
+ assert(i7 != NULL);
+ if XNN_UNPREDICTABLE(i7 != (const __fp16*) zero) {
+ i7 = (const __fp16*) ((uintptr_t) i7 + input_offset);
+ }
+ const __fp16* i8 = (const __fp16*) input[8];
+ assert(i8 != NULL);
+ if XNN_UNPREDICTABLE(i8 != (const __fp16*) zero) {
+ i8 = (const __fp16*) ((uintptr_t) i8 + input_offset);
+ }
+
+ input = (const void**) ((uintptr_t) input + input_stride);
+
+ size_t c = channels;
+ const __fp16* w = (const __fp16*) weights;
+ for (; c >= 16; c -= 16) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+ float16x8_t vacc89ABCDEFp0 = vld1q_f16(w); w += 8;
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk0x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi0x89ABCDEF, vk0x89ABCDEF);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk1x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi1x89ABCDEF, vk1x89ABCDEF);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk2x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi2x89ABCDEF, vk2x89ABCDEF);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+ const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk3x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi3x89ABCDEF, vk3x89ABCDEF);
+
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+ const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk4x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi4x89ABCDEF, vk4x89ABCDEF);
+
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+ const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk5x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi5x89ABCDEF, vk5x89ABCDEF);
+
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+ const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk6x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi6x89ABCDEF, vk6x89ABCDEF);
+
+ const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+ const float16x8_t vi7x89ABCDEF = vld1q_f16(i7); i7 += 8;
+ const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk7x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi7x89ABCDEF, vk7x89ABCDEF);
+
+ const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+ const float16x8_t vi8x89ABCDEF = vld1q_f16(i8); i8 += 8;
+ const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+ const float16x8_t vk8x89ABCDEF = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+ vacc89ABCDEFp0 = vfmaq_f16(vacc89ABCDEFp0, vi8x89ABCDEF, vk8x89ABCDEF);
+
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ float16x8_t vacc89ABCDEF = vmaxq_f16(vacc89ABCDEFp0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+ vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+
+ vst1q_f16(output, vacc01234567); output += 8;
+ vst1q_f16(output, vacc89ABCDEF); output += 8;
+ }
+ for (; c >= 8; c -= 8) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vk0x01234567 = vld1q_f16(w + 8);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vk1x01234567 = vld1q_f16(w + 24);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vk2x01234567 = vld1q_f16(w + 40);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ const float16x8_t vk3x01234567 = vld1q_f16(w + 56);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ const float16x8_t vk4x01234567 = vld1q_f16(w + 72);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ const float16x8_t vk5x01234567 = vld1q_f16(w + 88);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ const float16x8_t vk6x01234567 = vld1q_f16(w + 104);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+ const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+ const float16x8_t vk7x01234567 = vld1q_f16(w + 120);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+
+ const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+ const float16x8_t vk8x01234567 = vld1q_f16(w + 136);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ vst1q_f16(output, vacc01234567); output += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w);
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0);
+ const float16x8_t vk0x01234567 = vld1q_f16(w + 16);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1);
+ const float16x8_t vk1x01234567 = vld1q_f16(w + 32);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2);
+ const float16x8_t vk2x01234567 = vld1q_f16(w + 48);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3);
+ const float16x8_t vk3x01234567 = vld1q_f16(w + 64);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+ const float16x8_t vi4x01234567 = vld1q_f16(i4);
+ const float16x8_t vk4x01234567 = vld1q_f16(w + 80);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+ const float16x8_t vi5x01234567 = vld1q_f16(i5);
+ const float16x8_t vk5x01234567 = vld1q_f16(w + 96);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+
+ const float16x8_t vi6x01234567 = vld1q_f16(i6);
+ const float16x8_t vk6x01234567 = vld1q_f16(w + 112);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+ const float16x8_t vi7x01234567 = vld1q_f16(i7);
+ const float16x8_t vk7x01234567 = vld1q_f16(w + 128);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+
+ const float16x8_t vi8x01234567 = vld1q_f16(i8);
+ const float16x8_t vk8x01234567 = vld1q_f16(w + 144);
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+ if (c & 4) {
+ vst1_f16(output, vacc0123); output += 4;
+ vacc0123 = vget_high_f16(vacc01234567);
+ }
+ if (c & 2) {
+ vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+ vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+ }
+ if (c & 1) {
+ vst1_lane_f16(output, vacc0123, 0); output += 1;
+ }
+ }
+
+ output = (__fp16*) ((uintptr_t) output + output_increment);
+ } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c b/src/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c
new file mode 100644
index 0000000..a67bb8d
--- /dev/null
+++ b/src/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c
@@ -0,0 +1,404 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-dwconv/up-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2(
+ size_t channels,
+ size_t output_width,
+ const void** input,
+ const void* weights,
+ void* output_ptr,
+ size_t input_stride,
+ size_t output_increment,
+ size_t input_offset,
+ const void* zero,
+ const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(channels != 0);
+ assert(output_width != 0);
+
+ __fp16* output = ( __fp16*) output_ptr;
+ const float16x8_t vmax = vld1q_dup_f16(&params->max);
+ const float16x8_t vmin = vld1q_dup_f16(&params->min);
+ do {
+ const __fp16* i0 = (const __fp16*) input[0];
+ assert(i0 != NULL);
+ if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+ i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+ }
+ const __fp16* i1 = (const __fp16*) input[1];
+ assert(i1 != NULL);
+ if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+ i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+ }
+ const __fp16* i2 = (const __fp16*) input[2];
+ assert(i2 != NULL);
+ if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+ i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+ }
+ const __fp16* i3 = (const __fp16*) input[3];
+ assert(i3 != NULL);
+ if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+ i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+ }
+ const __fp16* i4 = (const __fp16*) input[4];
+ assert(i4 != NULL);
+ if XNN_UNPREDICTABLE(i4 != (const __fp16*) zero) {
+ i4 = (const __fp16*) ((uintptr_t) i4 + input_offset);
+ }
+ const __fp16* i5 = (const __fp16*) input[5];
+ assert(i5 != NULL);
+ if XNN_UNPREDICTABLE(i5 != (const __fp16*) zero) {
+ i5 = (const __fp16*) ((uintptr_t) i5 + input_offset);
+ }
+ const __fp16* i6 = (const __fp16*) input[6];
+ assert(i6 != NULL);
+ if XNN_UNPREDICTABLE(i6 != (const __fp16*) zero) {
+ i6 = (const __fp16*) ((uintptr_t) i6 + input_offset);
+ }
+ const __fp16* i7 = (const __fp16*) input[7];
+ assert(i7 != NULL);
+ if XNN_UNPREDICTABLE(i7 != (const __fp16*) zero) {
+ i7 = (const __fp16*) ((uintptr_t) i7 + input_offset);
+ }
+ const __fp16* i8 = (const __fp16*) input[8];
+ assert(i8 != NULL);
+ if XNN_UNPREDICTABLE(i8 != (const __fp16*) zero) {
+ i8 = (const __fp16*) ((uintptr_t) i8 + input_offset);
+ }
+ const __fp16* i9 = (const __fp16*) input[9];
+ assert(i9 != NULL);
+ if XNN_UNPREDICTABLE(i9 != (const __fp16*) zero) {
+ i9 = (const __fp16*) ((uintptr_t) i9 + input_offset);
+ }
+ const __fp16* i10 = (const __fp16*) input[10];
+ assert(i10 != NULL);
+ if XNN_UNPREDICTABLE(i10 != (const __fp16*) zero) {
+ i10 = (const __fp16*) ((uintptr_t) i10 + input_offset);
+ }
+ const __fp16* i11 = (const __fp16*) input[11];
+ assert(i11 != NULL);
+ if XNN_UNPREDICTABLE(i11 != (const __fp16*) zero) {
+ i11 = (const __fp16*) ((uintptr_t) i11 + input_offset);
+ }
+ const __fp16* i12 = (const __fp16*) input[12];
+ assert(i12 != NULL);
+ if XNN_UNPREDICTABLE(i12 != (const __fp16*) zero) {
+ i12 = (const __fp16*) ((uintptr_t) i12 + input_offset);
+ }
+ const __fp16* i13 = (const __fp16*) input[13];
+ assert(i13 != NULL);
+ if XNN_UNPREDICTABLE(i13 != (const __fp16*) zero) {
+ i13 = (const __fp16*) ((uintptr_t) i13 + input_offset);
+ }
+ const __fp16* i14 = (const __fp16*) input[14];
+ assert(i14 != NULL);
+ if XNN_UNPREDICTABLE(i14 != (const __fp16*) zero) {
+ i14 = (const __fp16*) ((uintptr_t) i14 + input_offset);
+ }
+ const __fp16* i15 = (const __fp16*) input[15];
+ assert(i15 != NULL);
+ if XNN_UNPREDICTABLE(i15 != (const __fp16*) zero) {
+ i15 = (const __fp16*) ((uintptr_t) i15 + input_offset);
+ }
+ const __fp16* i16 = (const __fp16*) input[16];
+ assert(i16 != NULL);
+ if XNN_UNPREDICTABLE(i16 != (const __fp16*) zero) {
+ i16 = (const __fp16*) ((uintptr_t) i16 + input_offset);
+ }
+ const __fp16* i17 = (const __fp16*) input[17];
+ assert(i17 != NULL);
+ if XNN_UNPREDICTABLE(i17 != (const __fp16*) zero) {
+ i17 = (const __fp16*) ((uintptr_t) i17 + input_offset);
+ }
+ const __fp16* i18 = (const __fp16*) input[18];
+ assert(i18 != NULL);
+ if XNN_UNPREDICTABLE(i18 != (const __fp16*) zero) {
+ i18 = (const __fp16*) ((uintptr_t) i18 + input_offset);
+ }
+ const __fp16* i19 = (const __fp16*) input[19];
+ assert(i19 != NULL);
+ if XNN_UNPREDICTABLE(i19 != (const __fp16*) zero) {
+ i19 = (const __fp16*) ((uintptr_t) i19 + input_offset);
+ }
+ const __fp16* i20 = (const __fp16*) input[20];
+ assert(i20 != NULL);
+ if XNN_UNPREDICTABLE(i20 != (const __fp16*) zero) {
+ i20 = (const __fp16*) ((uintptr_t) i20 + input_offset);
+ }
+ const __fp16* i21 = (const __fp16*) input[21];
+ assert(i21 != NULL);
+ if XNN_UNPREDICTABLE(i21 != (const __fp16*) zero) {
+ i21 = (const __fp16*) ((uintptr_t) i21 + input_offset);
+ }
+ const __fp16* i22 = (const __fp16*) input[22];
+ assert(i22 != NULL);
+ if XNN_UNPREDICTABLE(i22 != (const __fp16*) zero) {
+ i22 = (const __fp16*) ((uintptr_t) i22 + input_offset);
+ }
+ const __fp16* i23 = (const __fp16*) input[23];
+ assert(i23 != NULL);
+ if XNN_UNPREDICTABLE(i23 != (const __fp16*) zero) {
+ i23 = (const __fp16*) ((uintptr_t) i23 + input_offset);
+ }
+ const __fp16* i24 = (const __fp16*) input[24];
+ assert(i24 != NULL);
+ if XNN_UNPREDICTABLE(i24 != (const __fp16*) zero) {
+ i24 = (const __fp16*) ((uintptr_t) i24 + input_offset);
+ }
+
+ input = (const void**) ((uintptr_t) input + input_stride);
+
+ size_t c = channels;
+ const __fp16* w = (const __fp16*) weights;
+ for (; c >= 8; c -= 8) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+ float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+ const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+ const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+
+ const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+ const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+ const float16x8_t vi9x01234567 = vld1q_f16(i9); i9 += 8;
+ const float16x8_t vk9x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi9x01234567, vk9x01234567);
+
+ const float16x8_t vi10x01234567 = vld1q_f16(i10); i10 += 8;
+ const float16x8_t vk10x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+
+ const float16x8_t vi11x01234567 = vld1q_f16(i11); i11 += 8;
+ const float16x8_t vk11x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi11x01234567, vk11x01234567);
+
+ const float16x8_t vi12x01234567 = vld1q_f16(i12); i12 += 8;
+ const float16x8_t vk12x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+
+ const float16x8_t vi13x01234567 = vld1q_f16(i13); i13 += 8;
+ const float16x8_t vk13x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi13x01234567, vk13x01234567);
+
+ const float16x8_t vi14x01234567 = vld1q_f16(i14); i14 += 8;
+ const float16x8_t vk14x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+
+ const float16x8_t vi15x01234567 = vld1q_f16(i15); i15 += 8;
+ const float16x8_t vk15x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi15x01234567, vk15x01234567);
+
+ const float16x8_t vi16x01234567 = vld1q_f16(i16); i16 += 8;
+ const float16x8_t vk16x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+
+ const float16x8_t vi17x01234567 = vld1q_f16(i17); i17 += 8;
+ const float16x8_t vk17x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi17x01234567, vk17x01234567);
+
+ const float16x8_t vi18x01234567 = vld1q_f16(i18); i18 += 8;
+ const float16x8_t vk18x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+
+ const float16x8_t vi19x01234567 = vld1q_f16(i19); i19 += 8;
+ const float16x8_t vk19x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi19x01234567, vk19x01234567);
+
+ const float16x8_t vi20x01234567 = vld1q_f16(i20); i20 += 8;
+ const float16x8_t vk20x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+
+ const float16x8_t vi21x01234567 = vld1q_f16(i21); i21 += 8;
+ const float16x8_t vk21x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi21x01234567, vk21x01234567);
+
+ const float16x8_t vi22x01234567 = vld1q_f16(i22); i22 += 8;
+ const float16x8_t vk22x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+
+ const float16x8_t vi23x01234567 = vld1q_f16(i23); i23 += 8;
+ const float16x8_t vk23x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi23x01234567, vk23x01234567);
+
+ const float16x8_t vi24x01234567 = vld1q_f16(i24); i24 += 8;
+ const float16x8_t vk24x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+
+ // Add up all accumulators to vacc01234567p0
+ vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ vst1q_f16(output, vacc01234567); output += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0);
+ const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1);
+ const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+ float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2);
+ const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3);
+ const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+ const float16x8_t vi4x01234567 = vld1q_f16(i4);
+ const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+ const float16x8_t vi5x01234567 = vld1q_f16(i5);
+ const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+
+ const float16x8_t vi6x01234567 = vld1q_f16(i6);
+ const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+ const float16x8_t vi7x01234567 = vld1q_f16(i7);
+ const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+
+ const float16x8_t vi8x01234567 = vld1q_f16(i8);
+ const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+ const float16x8_t vi9x01234567 = vld1q_f16(i9);
+ const float16x8_t vk9x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi9x01234567, vk9x01234567);
+
+ const float16x8_t vi10x01234567 = vld1q_f16(i10);
+ const float16x8_t vk10x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+
+ const float16x8_t vi11x01234567 = vld1q_f16(i11);
+ const float16x8_t vk11x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi11x01234567, vk11x01234567);
+
+ const float16x8_t vi12x01234567 = vld1q_f16(i12);
+ const float16x8_t vk12x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+
+ const float16x8_t vi13x01234567 = vld1q_f16(i13);
+ const float16x8_t vk13x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi13x01234567, vk13x01234567);
+
+ const float16x8_t vi14x01234567 = vld1q_f16(i14);
+ const float16x8_t vk14x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+
+ const float16x8_t vi15x01234567 = vld1q_f16(i15);
+ const float16x8_t vk15x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi15x01234567, vk15x01234567);
+
+ const float16x8_t vi16x01234567 = vld1q_f16(i16);
+ const float16x8_t vk16x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+
+ const float16x8_t vi17x01234567 = vld1q_f16(i17);
+ const float16x8_t vk17x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi17x01234567, vk17x01234567);
+
+ const float16x8_t vi18x01234567 = vld1q_f16(i18);
+ const float16x8_t vk18x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+
+ const float16x8_t vi19x01234567 = vld1q_f16(i19);
+ const float16x8_t vk19x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi19x01234567, vk19x01234567);
+
+ const float16x8_t vi20x01234567 = vld1q_f16(i20);
+ const float16x8_t vk20x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+
+ const float16x8_t vi21x01234567 = vld1q_f16(i21);
+ const float16x8_t vk21x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi21x01234567, vk21x01234567);
+
+ const float16x8_t vi22x01234567 = vld1q_f16(i22);
+ const float16x8_t vk22x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+
+ const float16x8_t vi23x01234567 = vld1q_f16(i23);
+ const float16x8_t vk23x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi23x01234567, vk23x01234567);
+
+ const float16x8_t vi24x01234567 = vld1q_f16(i24);
+ const float16x8_t vk24x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+
+ // Add up all accumulators to vacc01234567p0
+ vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+ if (c & 4) {
+ vst1_f16(output, vacc0123); output += 4;
+ vacc0123 = vget_high_f16(vacc01234567);
+ }
+ if (c & 2) {
+ vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+ vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+ }
+ if (c & 1) {
+ vst1_lane_f16(output, vacc0123, 0); output += 1;
+ }
+ }
+
+ output = (__fp16*) ((uintptr_t) output + output_increment);
+ } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up8x25-minmax-neonfp16arith.c b/src/f16-dwconv/gen/up8x25-minmax-neonfp16arith.c
new file mode 100644
index 0000000..08939dc
--- /dev/null
+++ b/src/f16-dwconv/gen/up8x25-minmax-neonfp16arith.c
@@ -0,0 +1,400 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-dwconv/up-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith(
+ size_t channels,
+ size_t output_width,
+ const void** input,
+ const void* weights,
+ void* output_ptr,
+ size_t input_stride,
+ size_t output_increment,
+ size_t input_offset,
+ const void* zero,
+ const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(channels != 0);
+ assert(output_width != 0);
+
+ __fp16* output = ( __fp16*) output_ptr;
+ const float16x8_t vmax = vld1q_dup_f16(&params->max);
+ const float16x8_t vmin = vld1q_dup_f16(&params->min);
+ do {
+ const __fp16* i0 = (const __fp16*) input[0];
+ assert(i0 != NULL);
+ if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+ i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+ }
+ const __fp16* i1 = (const __fp16*) input[1];
+ assert(i1 != NULL);
+ if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+ i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+ }
+ const __fp16* i2 = (const __fp16*) input[2];
+ assert(i2 != NULL);
+ if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+ i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+ }
+ const __fp16* i3 = (const __fp16*) input[3];
+ assert(i3 != NULL);
+ if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+ i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+ }
+ const __fp16* i4 = (const __fp16*) input[4];
+ assert(i4 != NULL);
+ if XNN_UNPREDICTABLE(i4 != (const __fp16*) zero) {
+ i4 = (const __fp16*) ((uintptr_t) i4 + input_offset);
+ }
+ const __fp16* i5 = (const __fp16*) input[5];
+ assert(i5 != NULL);
+ if XNN_UNPREDICTABLE(i5 != (const __fp16*) zero) {
+ i5 = (const __fp16*) ((uintptr_t) i5 + input_offset);
+ }
+ const __fp16* i6 = (const __fp16*) input[6];
+ assert(i6 != NULL);
+ if XNN_UNPREDICTABLE(i6 != (const __fp16*) zero) {
+ i6 = (const __fp16*) ((uintptr_t) i6 + input_offset);
+ }
+ const __fp16* i7 = (const __fp16*) input[7];
+ assert(i7 != NULL);
+ if XNN_UNPREDICTABLE(i7 != (const __fp16*) zero) {
+ i7 = (const __fp16*) ((uintptr_t) i7 + input_offset);
+ }
+ const __fp16* i8 = (const __fp16*) input[8];
+ assert(i8 != NULL);
+ if XNN_UNPREDICTABLE(i8 != (const __fp16*) zero) {
+ i8 = (const __fp16*) ((uintptr_t) i8 + input_offset);
+ }
+ const __fp16* i9 = (const __fp16*) input[9];
+ assert(i9 != NULL);
+ if XNN_UNPREDICTABLE(i9 != (const __fp16*) zero) {
+ i9 = (const __fp16*) ((uintptr_t) i9 + input_offset);
+ }
+ const __fp16* i10 = (const __fp16*) input[10];
+ assert(i10 != NULL);
+ if XNN_UNPREDICTABLE(i10 != (const __fp16*) zero) {
+ i10 = (const __fp16*) ((uintptr_t) i10 + input_offset);
+ }
+ const __fp16* i11 = (const __fp16*) input[11];
+ assert(i11 != NULL);
+ if XNN_UNPREDICTABLE(i11 != (const __fp16*) zero) {
+ i11 = (const __fp16*) ((uintptr_t) i11 + input_offset);
+ }
+ const __fp16* i12 = (const __fp16*) input[12];
+ assert(i12 != NULL);
+ if XNN_UNPREDICTABLE(i12 != (const __fp16*) zero) {
+ i12 = (const __fp16*) ((uintptr_t) i12 + input_offset);
+ }
+ const __fp16* i13 = (const __fp16*) input[13];
+ assert(i13 != NULL);
+ if XNN_UNPREDICTABLE(i13 != (const __fp16*) zero) {
+ i13 = (const __fp16*) ((uintptr_t) i13 + input_offset);
+ }
+ const __fp16* i14 = (const __fp16*) input[14];
+ assert(i14 != NULL);
+ if XNN_UNPREDICTABLE(i14 != (const __fp16*) zero) {
+ i14 = (const __fp16*) ((uintptr_t) i14 + input_offset);
+ }
+ const __fp16* i15 = (const __fp16*) input[15];
+ assert(i15 != NULL);
+ if XNN_UNPREDICTABLE(i15 != (const __fp16*) zero) {
+ i15 = (const __fp16*) ((uintptr_t) i15 + input_offset);
+ }
+ const __fp16* i16 = (const __fp16*) input[16];
+ assert(i16 != NULL);
+ if XNN_UNPREDICTABLE(i16 != (const __fp16*) zero) {
+ i16 = (const __fp16*) ((uintptr_t) i16 + input_offset);
+ }
+ const __fp16* i17 = (const __fp16*) input[17];
+ assert(i17 != NULL);
+ if XNN_UNPREDICTABLE(i17 != (const __fp16*) zero) {
+ i17 = (const __fp16*) ((uintptr_t) i17 + input_offset);
+ }
+ const __fp16* i18 = (const __fp16*) input[18];
+ assert(i18 != NULL);
+ if XNN_UNPREDICTABLE(i18 != (const __fp16*) zero) {
+ i18 = (const __fp16*) ((uintptr_t) i18 + input_offset);
+ }
+ const __fp16* i19 = (const __fp16*) input[19];
+ assert(i19 != NULL);
+ if XNN_UNPREDICTABLE(i19 != (const __fp16*) zero) {
+ i19 = (const __fp16*) ((uintptr_t) i19 + input_offset);
+ }
+ const __fp16* i20 = (const __fp16*) input[20];
+ assert(i20 != NULL);
+ if XNN_UNPREDICTABLE(i20 != (const __fp16*) zero) {
+ i20 = (const __fp16*) ((uintptr_t) i20 + input_offset);
+ }
+ const __fp16* i21 = (const __fp16*) input[21];
+ assert(i21 != NULL);
+ if XNN_UNPREDICTABLE(i21 != (const __fp16*) zero) {
+ i21 = (const __fp16*) ((uintptr_t) i21 + input_offset);
+ }
+ const __fp16* i22 = (const __fp16*) input[22];
+ assert(i22 != NULL);
+ if XNN_UNPREDICTABLE(i22 != (const __fp16*) zero) {
+ i22 = (const __fp16*) ((uintptr_t) i22 + input_offset);
+ }
+ const __fp16* i23 = (const __fp16*) input[23];
+ assert(i23 != NULL);
+ if XNN_UNPREDICTABLE(i23 != (const __fp16*) zero) {
+ i23 = (const __fp16*) ((uintptr_t) i23 + input_offset);
+ }
+ const __fp16* i24 = (const __fp16*) input[24];
+ assert(i24 != NULL);
+ if XNN_UNPREDICTABLE(i24 != (const __fp16*) zero) {
+ i24 = (const __fp16*) ((uintptr_t) i24 + input_offset);
+ }
+
+ input = (const void**) ((uintptr_t) input + input_stride);
+
+ size_t c = channels;
+ const __fp16* w = (const __fp16*) weights;
+ for (; c >= 8; c -= 8) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+ const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+ const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+
+ const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+ const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+ const float16x8_t vi9x01234567 = vld1q_f16(i9); i9 += 8;
+ const float16x8_t vk9x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi9x01234567, vk9x01234567);
+
+ const float16x8_t vi10x01234567 = vld1q_f16(i10); i10 += 8;
+ const float16x8_t vk10x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+
+ const float16x8_t vi11x01234567 = vld1q_f16(i11); i11 += 8;
+ const float16x8_t vk11x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi11x01234567, vk11x01234567);
+
+ const float16x8_t vi12x01234567 = vld1q_f16(i12); i12 += 8;
+ const float16x8_t vk12x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+
+ const float16x8_t vi13x01234567 = vld1q_f16(i13); i13 += 8;
+ const float16x8_t vk13x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi13x01234567, vk13x01234567);
+
+ const float16x8_t vi14x01234567 = vld1q_f16(i14); i14 += 8;
+ const float16x8_t vk14x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+
+ const float16x8_t vi15x01234567 = vld1q_f16(i15); i15 += 8;
+ const float16x8_t vk15x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi15x01234567, vk15x01234567);
+
+ const float16x8_t vi16x01234567 = vld1q_f16(i16); i16 += 8;
+ const float16x8_t vk16x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+
+ const float16x8_t vi17x01234567 = vld1q_f16(i17); i17 += 8;
+ const float16x8_t vk17x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi17x01234567, vk17x01234567);
+
+ const float16x8_t vi18x01234567 = vld1q_f16(i18); i18 += 8;
+ const float16x8_t vk18x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+
+ const float16x8_t vi19x01234567 = vld1q_f16(i19); i19 += 8;
+ const float16x8_t vk19x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi19x01234567, vk19x01234567);
+
+ const float16x8_t vi20x01234567 = vld1q_f16(i20); i20 += 8;
+ const float16x8_t vk20x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+
+ const float16x8_t vi21x01234567 = vld1q_f16(i21); i21 += 8;
+ const float16x8_t vk21x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi21x01234567, vk21x01234567);
+
+ const float16x8_t vi22x01234567 = vld1q_f16(i22); i22 += 8;
+ const float16x8_t vk22x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+
+ const float16x8_t vi23x01234567 = vld1q_f16(i23); i23 += 8;
+ const float16x8_t vk23x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi23x01234567, vk23x01234567);
+
+ const float16x8_t vi24x01234567 = vld1q_f16(i24); i24 += 8;
+ const float16x8_t vk24x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ vst1q_f16(output, vacc01234567); output += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0);
+ const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1);
+ const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2);
+ const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3);
+ const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+ const float16x8_t vi4x01234567 = vld1q_f16(i4);
+ const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+ const float16x8_t vi5x01234567 = vld1q_f16(i5);
+ const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+
+ const float16x8_t vi6x01234567 = vld1q_f16(i6);
+ const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+ const float16x8_t vi7x01234567 = vld1q_f16(i7);
+ const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+
+ const float16x8_t vi8x01234567 = vld1q_f16(i8);
+ const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+ const float16x8_t vi9x01234567 = vld1q_f16(i9);
+ const float16x8_t vk9x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi9x01234567, vk9x01234567);
+
+ const float16x8_t vi10x01234567 = vld1q_f16(i10);
+ const float16x8_t vk10x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567);
+
+ const float16x8_t vi11x01234567 = vld1q_f16(i11);
+ const float16x8_t vk11x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi11x01234567, vk11x01234567);
+
+ const float16x8_t vi12x01234567 = vld1q_f16(i12);
+ const float16x8_t vk12x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567);
+
+ const float16x8_t vi13x01234567 = vld1q_f16(i13);
+ const float16x8_t vk13x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi13x01234567, vk13x01234567);
+
+ const float16x8_t vi14x01234567 = vld1q_f16(i14);
+ const float16x8_t vk14x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567);
+
+ const float16x8_t vi15x01234567 = vld1q_f16(i15);
+ const float16x8_t vk15x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi15x01234567, vk15x01234567);
+
+ const float16x8_t vi16x01234567 = vld1q_f16(i16);
+ const float16x8_t vk16x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567);
+
+ const float16x8_t vi17x01234567 = vld1q_f16(i17);
+ const float16x8_t vk17x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi17x01234567, vk17x01234567);
+
+ const float16x8_t vi18x01234567 = vld1q_f16(i18);
+ const float16x8_t vk18x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi18x01234567, vk18x01234567);
+
+ const float16x8_t vi19x01234567 = vld1q_f16(i19);
+ const float16x8_t vk19x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi19x01234567, vk19x01234567);
+
+ const float16x8_t vi20x01234567 = vld1q_f16(i20);
+ const float16x8_t vk20x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi20x01234567, vk20x01234567);
+
+ const float16x8_t vi21x01234567 = vld1q_f16(i21);
+ const float16x8_t vk21x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi21x01234567, vk21x01234567);
+
+ const float16x8_t vi22x01234567 = vld1q_f16(i22);
+ const float16x8_t vk22x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi22x01234567, vk22x01234567);
+
+ const float16x8_t vi23x01234567 = vld1q_f16(i23);
+ const float16x8_t vk23x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi23x01234567, vk23x01234567);
+
+ const float16x8_t vi24x01234567 = vld1q_f16(i24);
+ const float16x8_t vk24x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi24x01234567, vk24x01234567);
+
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+ if (c & 4) {
+ vst1_f16(output, vacc0123); output += 4;
+ vacc0123 = vget_high_f16(vacc01234567);
+ }
+ if (c & 2) {
+ vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+ vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+ }
+ if (c & 1) {
+ vst1_lane_f16(output, vacc0123, 0); output += 1;
+ }
+ }
+
+ output = (__fp16*) ((uintptr_t) output + output_increment);
+ } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up8x4-minmax-neonfp16arith-acc2.c b/src/f16-dwconv/gen/up8x4-minmax-neonfp16arith-acc2.c
new file mode 100644
index 0000000..6b6c040
--- /dev/null
+++ b/src/f16-dwconv/gen/up8x4-minmax-neonfp16arith-acc2.c
@@ -0,0 +1,131 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-dwconv/up-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2(
+ size_t channels,
+ size_t output_width,
+ const void** input,
+ const void* weights,
+ void* output_ptr,
+ size_t input_stride,
+ size_t output_increment,
+ size_t input_offset,
+ const void* zero,
+ const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(channels != 0);
+ assert(output_width != 0);
+
+ __fp16* output = ( __fp16*) output_ptr;
+ const float16x8_t vmax = vld1q_dup_f16(&params->max);
+ const float16x8_t vmin = vld1q_dup_f16(&params->min);
+ do {
+ const __fp16* i0 = (const __fp16*) input[0];
+ assert(i0 != NULL);
+ if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+ i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+ }
+ const __fp16* i1 = (const __fp16*) input[1];
+ assert(i1 != NULL);
+ if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+ i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+ }
+ const __fp16* i2 = (const __fp16*) input[2];
+ assert(i2 != NULL);
+ if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+ i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+ }
+ const __fp16* i3 = (const __fp16*) input[3];
+ assert(i3 != NULL);
+ if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+ i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+ }
+
+ input = (const void**) ((uintptr_t) input + input_stride);
+
+ size_t c = channels;
+ const __fp16* w = (const __fp16*) weights;
+ for (; c >= 8; c -= 8) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+ float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+ // Add up all accumulators to vacc01234567p0
+ vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ vst1q_f16(output, vacc01234567); output += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0);
+ const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1);
+ const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+ float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2);
+ const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3);
+ const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+ // Add up all accumulators to vacc01234567p0
+ vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+ if (c & 4) {
+ vst1_f16(output, vacc0123); output += 4;
+ vacc0123 = vget_high_f16(vacc01234567);
+ }
+ if (c & 2) {
+ vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+ vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+ }
+ if (c & 1) {
+ vst1_lane_f16(output, vacc0123, 0); output += 1;
+ }
+ }
+
+ output = (__fp16*) ((uintptr_t) output + output_increment);
+ } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up8x4-minmax-neonfp16arith.c b/src/f16-dwconv/gen/up8x4-minmax-neonfp16arith.c
new file mode 100644
index 0000000..ae40230
--- /dev/null
+++ b/src/f16-dwconv/gen/up8x4-minmax-neonfp16arith.c
@@ -0,0 +1,127 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-dwconv/up-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith(
+ size_t channels,
+ size_t output_width,
+ const void** input,
+ const void* weights,
+ void* output_ptr,
+ size_t input_stride,
+ size_t output_increment,
+ size_t input_offset,
+ const void* zero,
+ const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(channels != 0);
+ assert(output_width != 0);
+
+ __fp16* output = ( __fp16*) output_ptr;
+ const float16x8_t vmax = vld1q_dup_f16(&params->max);
+ const float16x8_t vmin = vld1q_dup_f16(&params->min);
+ do {
+ const __fp16* i0 = (const __fp16*) input[0];
+ assert(i0 != NULL);
+ if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+ i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+ }
+ const __fp16* i1 = (const __fp16*) input[1];
+ assert(i1 != NULL);
+ if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+ i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+ }
+ const __fp16* i2 = (const __fp16*) input[2];
+ assert(i2 != NULL);
+ if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+ i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+ }
+ const __fp16* i3 = (const __fp16*) input[3];
+ assert(i3 != NULL);
+ if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+ i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+ }
+
+ input = (const void**) ((uintptr_t) input + input_stride);
+
+ size_t c = channels;
+ const __fp16* w = (const __fp16*) weights;
+ for (; c >= 8; c -= 8) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ vst1q_f16(output, vacc01234567); output += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0);
+ const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1);
+ const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2);
+ const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3);
+ const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+ if (c & 4) {
+ vst1_f16(output, vacc0123); output += 4;
+ vacc0123 = vget_high_f16(vacc01234567);
+ }
+ if (c & 2) {
+ vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+ vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+ }
+ if (c & 1) {
+ vst1_lane_f16(output, vacc0123, 0); output += 1;
+ }
+ }
+
+ output = (__fp16*) ((uintptr_t) output + output_increment);
+ } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up8x9-minmax-neonfp16arith-acc2.c b/src/f16-dwconv/gen/up8x9-minmax-neonfp16arith-acc2.c
new file mode 100644
index 0000000..da9efa2
--- /dev/null
+++ b/src/f16-dwconv/gen/up8x9-minmax-neonfp16arith-acc2.c
@@ -0,0 +1,196 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-dwconv/up-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2(
+ size_t channels,
+ size_t output_width,
+ const void** input,
+ const void* weights,
+ void* output_ptr,
+ size_t input_stride,
+ size_t output_increment,
+ size_t input_offset,
+ const void* zero,
+ const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(channels != 0);
+ assert(output_width != 0);
+
+ __fp16* output = ( __fp16*) output_ptr;
+ const float16x8_t vmax = vld1q_dup_f16(&params->max);
+ const float16x8_t vmin = vld1q_dup_f16(&params->min);
+ do {
+ const __fp16* i0 = (const __fp16*) input[0];
+ assert(i0 != NULL);
+ if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+ i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+ }
+ const __fp16* i1 = (const __fp16*) input[1];
+ assert(i1 != NULL);
+ if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+ i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+ }
+ const __fp16* i2 = (const __fp16*) input[2];
+ assert(i2 != NULL);
+ if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+ i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+ }
+ const __fp16* i3 = (const __fp16*) input[3];
+ assert(i3 != NULL);
+ if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+ i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+ }
+ const __fp16* i4 = (const __fp16*) input[4];
+ assert(i4 != NULL);
+ if XNN_UNPREDICTABLE(i4 != (const __fp16*) zero) {
+ i4 = (const __fp16*) ((uintptr_t) i4 + input_offset);
+ }
+ const __fp16* i5 = (const __fp16*) input[5];
+ assert(i5 != NULL);
+ if XNN_UNPREDICTABLE(i5 != (const __fp16*) zero) {
+ i5 = (const __fp16*) ((uintptr_t) i5 + input_offset);
+ }
+ const __fp16* i6 = (const __fp16*) input[6];
+ assert(i6 != NULL);
+ if XNN_UNPREDICTABLE(i6 != (const __fp16*) zero) {
+ i6 = (const __fp16*) ((uintptr_t) i6 + input_offset);
+ }
+ const __fp16* i7 = (const __fp16*) input[7];
+ assert(i7 != NULL);
+ if XNN_UNPREDICTABLE(i7 != (const __fp16*) zero) {
+ i7 = (const __fp16*) ((uintptr_t) i7 + input_offset);
+ }
+ const __fp16* i8 = (const __fp16*) input[8];
+ assert(i8 != NULL);
+ if XNN_UNPREDICTABLE(i8 != (const __fp16*) zero) {
+ i8 = (const __fp16*) ((uintptr_t) i8 + input_offset);
+ }
+
+ input = (const void**) ((uintptr_t) input + input_stride);
+
+ size_t c = channels;
+ const __fp16* w = (const __fp16*) weights;
+ for (; c >= 8; c -= 8) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+ float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+ const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+ const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+
+ const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+ const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+ // Add up all accumulators to vacc01234567p0
+ vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ vst1q_f16(output, vacc01234567); output += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0);
+ const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1);
+ const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+ float16x8_t vacc01234567p1 = vmulq_f16(vi1x01234567, vk1x01234567);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2);
+ const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3);
+ const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi3x01234567, vk3x01234567);
+
+ const float16x8_t vi4x01234567 = vld1q_f16(i4);
+ const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+ const float16x8_t vi5x01234567 = vld1q_f16(i5);
+ const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi5x01234567, vk5x01234567);
+
+ const float16x8_t vi6x01234567 = vld1q_f16(i6);
+ const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+ const float16x8_t vi7x01234567 = vld1q_f16(i7);
+ const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p1 = vfmaq_f16(vacc01234567p1, vi7x01234567, vk7x01234567);
+
+ const float16x8_t vi8x01234567 = vld1q_f16(i8);
+ const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+ // Add up all accumulators to vacc01234567p0
+ vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1);
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+ if (c & 4) {
+ vst1_f16(output, vacc0123); output += 4;
+ vacc0123 = vget_high_f16(vacc01234567);
+ }
+ if (c & 2) {
+ vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+ vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+ }
+ if (c & 1) {
+ vst1_lane_f16(output, vacc0123, 0); output += 1;
+ }
+ }
+
+ output = (__fp16*) ((uintptr_t) output + output_increment);
+ } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/gen/up8x9-minmax-neonfp16arith.c b/src/f16-dwconv/gen/up8x9-minmax-neonfp16arith.c
new file mode 100644
index 0000000..e9f60a2
--- /dev/null
+++ b/src/f16-dwconv/gen/up8x9-minmax-neonfp16arith.c
@@ -0,0 +1,192 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-dwconv/up-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith(
+ size_t channels,
+ size_t output_width,
+ const void** input,
+ const void* weights,
+ void* output_ptr,
+ size_t input_stride,
+ size_t output_increment,
+ size_t input_offset,
+ const void* zero,
+ const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(channels != 0);
+ assert(output_width != 0);
+
+ __fp16* output = ( __fp16*) output_ptr;
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);
+ do {
+ const __fp16* i0 = (const __fp16*) input[0];
+ assert(i0 != NULL);
+ if XNN_UNPREDICTABLE(i0 != (const __fp16*) zero) {
+ i0 = (const __fp16*) ((uintptr_t) i0 + input_offset);
+ }
+ const __fp16* i1 = (const __fp16*) input[1];
+ assert(i1 != NULL);
+ if XNN_UNPREDICTABLE(i1 != (const __fp16*) zero) {
+ i1 = (const __fp16*) ((uintptr_t) i1 + input_offset);
+ }
+ const __fp16* i2 = (const __fp16*) input[2];
+ assert(i2 != NULL);
+ if XNN_UNPREDICTABLE(i2 != (const __fp16*) zero) {
+ i2 = (const __fp16*) ((uintptr_t) i2 + input_offset);
+ }
+ const __fp16* i3 = (const __fp16*) input[3];
+ assert(i3 != NULL);
+ if XNN_UNPREDICTABLE(i3 != (const __fp16*) zero) {
+ i3 = (const __fp16*) ((uintptr_t) i3 + input_offset);
+ }
+ const __fp16* i4 = (const __fp16*) input[4];
+ assert(i4 != NULL);
+ if XNN_UNPREDICTABLE(i4 != (const __fp16*) zero) {
+ i4 = (const __fp16*) ((uintptr_t) i4 + input_offset);
+ }
+ const __fp16* i5 = (const __fp16*) input[5];
+ assert(i5 != NULL);
+ if XNN_UNPREDICTABLE(i5 != (const __fp16*) zero) {
+ i5 = (const __fp16*) ((uintptr_t) i5 + input_offset);
+ }
+ const __fp16* i6 = (const __fp16*) input[6];
+ assert(i6 != NULL);
+ if XNN_UNPREDICTABLE(i6 != (const __fp16*) zero) {
+ i6 = (const __fp16*) ((uintptr_t) i6 + input_offset);
+ }
+ const __fp16* i7 = (const __fp16*) input[7];
+ assert(i7 != NULL);
+ if XNN_UNPREDICTABLE(i7 != (const __fp16*) zero) {
+ i7 = (const __fp16*) ((uintptr_t) i7 + input_offset);
+ }
+ const __fp16* i8 = (const __fp16*) input[8];
+ assert(i8 != NULL);
+ if XNN_UNPREDICTABLE(i8 != (const __fp16*) zero) {
+ i8 = (const __fp16*) ((uintptr_t) i8 + input_offset);
+ }
+
+ input = (const void**) ((uintptr_t) input + input_stride);
+
+ size_t c = channels;
+ const __fp16* w = (const __fp16*) weights;
+ for (; c >= 8; c -= 8) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+ const float16x8_t vi7x01234567 = vld1q_f16(i7); i7 += 8;
+ const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+
+ const float16x8_t vi8x01234567 = vld1q_f16(i8); i8 += 8;
+ const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ vst1q_f16(output, vacc01234567); output += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0);
+ const float16x8_t vk0x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567);
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1);
+ const float16x8_t vk1x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567);
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2);
+ const float16x8_t vk2x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3);
+ const float16x8_t vk3x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567);
+
+ const float16x8_t vi4x01234567 = vld1q_f16(i4);
+ const float16x8_t vk4x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567);
+
+ const float16x8_t vi5x01234567 = vld1q_f16(i5);
+ const float16x8_t vk5x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567);
+
+ const float16x8_t vi6x01234567 = vld1q_f16(i6);
+ const float16x8_t vk6x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567);
+
+ const float16x8_t vi7x01234567 = vld1q_f16(i7);
+ const float16x8_t vk7x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567);
+
+ const float16x8_t vi8x01234567 = vld1q_f16(i8);
+ const float16x8_t vk8x01234567 = vld1q_f16(w); w += 8;
+ vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567);
+
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+ if (c & 4) {
+ vst1_f16(output, vacc0123); output += 4;
+ vacc0123 = vget_high_f16(vacc01234567);
+ }
+ if (c & 2) {
+ vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+ vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+ }
+ if (c & 1) {
+ vst1_lane_f16(output, vacc0123, 0); output += 1;
+ }
+ }
+
+ output = (__fp16*) ((uintptr_t) output + output_increment);
+ } while (--output_width != 0);
+}
diff --git a/src/f16-dwconv/up-neonfp16arith.c.in b/src/f16-dwconv/up-neonfp16arith.c.in
new file mode 100644
index 0000000..d3d613e
--- /dev/null
+++ b/src/f16-dwconv/up-neonfp16arith.c.in
@@ -0,0 +1,154 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert CHANNEL_TILE % 8 == 0
+$assert KERNEL_TILE >= 2
+$assert ACCUMULATORS >= 1
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f16_dwconv_minmax_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__neonfp16arith${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
+ size_t channels,
+ size_t output_width,
+ const void** input,
+ const void* weights,
+ void* output_ptr,
+ size_t input_stride,
+ size_t output_increment,
+ size_t input_offset,
+ const void* zero,
+ const struct xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(channels != 0);
+ assert(output_width != 0);
+
+ __fp16* output = ( __fp16*) output_ptr;
+  const float16x8_t vmax = vld1q_dup_f16(&params->max);
+  const float16x8_t vmin = vld1q_dup_f16(&params->min);
+ do {
+ $for K in range(KERNEL_TILE):
+ const __fp16* i${K} = (const __fp16*) input[${K}];
+ assert(i${K} != NULL);
+ if XNN_UNPREDICTABLE(i${K} != (const __fp16*) zero) {
+ i${K} = (const __fp16*) ((uintptr_t) i${K} + input_offset);
+ }
+
+ input = (const void**) ((uintptr_t) input + input_stride);
+
+ size_t c = channels;
+ const __fp16* w = (const __fp16*) weights;
+ for (; c >= ${CHANNEL_TILE}; c -= ${CHANNEL_TILE}) {
+ $for C in range(0, CHANNEL_TILE, 8):
+ float16x8_t vacc${ABC[C:C+8]}p0 = vld1q_f16(w); w += 8;
+
+ $for K in range(KERNEL_TILE):
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ const float16x8_t vi${K}x${ABC[C:C+8]} = vld1q_f16(i${K}); i${K} += 8;
+ $for C in range(0, CHANNEL_TILE, 8):
+ const float16x8_t vk${K}x${ABC[C:C+8]} = vld1q_f16(w); w += 8;
+ $for C in range(0, CHANNEL_TILE, 8):
+ $if 1 <= K < ACCUMULATORS:
+ float16x8_t vacc${ABC[C:C+8]}p${K} = vmulq_f16(vi${K}x${ABC[C:C+8]}, vk${K}x${ABC[C:C+8]});
+ $else:
+ vacc${ABC[C:C+8]}p${K % ACCUMULATORS} = vfmaq_f16(vacc${ABC[C:C+8]}p${K % ACCUMULATORS}, vi${K}x${ABC[C:C+8]}, vk${K}x${ABC[C:C+8]});
+
+ $if ACCUMULATORS > 1:
+ // Add up all accumulators to vacc${ABC[0:CHANNEL_TILE]}p0
+ $ACC_STEP = 1
+ $while ACC_STEP < ACCUMULATORS:
+ $for A in range(0, ACCUMULATORS, ACC_STEP * 2):
+ $if A + ACC_STEP < ACCUMULATORS:
+ $for C in range(0, CHANNEL_TILE, 8):
+ vacc${ABC[C:C+8]}p${A} = vaddq_f16(vacc${ABC[C:C+8]}p${A}, vacc${ABC[C:C+8]}p${A + ACC_STEP});
+ $ACC_STEP *= 2
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ float16x8_t vacc${ABC[C:C+8]} = vmaxq_f16(vacc${ABC[C:C+8]}p0, vmin);
+ $for C in range(0, CHANNEL_TILE, 8):
+ vacc${ABC[C:C+8]} = vminq_f16(vacc${ABC[C:C+8]}, vmax);
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ vst1q_f16(output, vacc${ABC[C:C+8]}); output += 8;
+ }
+ $if CHANNEL_TILE > 8:
+ for (; c >= 8; c -= 8) {
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+
+ $for K in range(KERNEL_TILE):
+
+ const float16x8_t vi${K}x01234567 = vld1q_f16(i${K}); i${K} += 8;
+ const float16x8_t vk${K}x01234567 = vld1q_f16(w + ${(K + 1) * CHANNEL_TILE - 8});
+ $if 1 <= K < ACCUMULATORS:
+ float16x8_t vacc01234567p${K} = vmulq_f16(vi${K}x01234567, vk${K}x01234567);
+ $else:
+ vacc01234567p${K % ACCUMULATORS} = vfmaq_f16(vacc01234567p${K % ACCUMULATORS}, vi${K}x01234567, vk${K}x01234567);
+
+ $if ACCUMULATORS > 1:
+ // Add up all accumulators to vacc01234567p0
+ $ACC_STEP = 1
+ $while ACC_STEP < ACCUMULATORS:
+ $for A in range(0, ACCUMULATORS, ACC_STEP * 2):
+ $if A + ACC_STEP < ACCUMULATORS:
+ vacc01234567p${A} = vaddq_f16(vacc01234567p${A}, vacc01234567p${A + ACC_STEP});
+ $ACC_STEP *= 2
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ vst1q_f16(output, vacc01234567); output += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ $if CHANNEL_TILE == 8:
+ float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8;
+ $else:
+ float16x8_t vacc01234567p0 = vld1q_f16(w);
+
+ $for K in range(KERNEL_TILE):
+
+ const float16x8_t vi${K}x01234567 = vld1q_f16(i${K});
+ $if CHANNEL_TILE == 8:
+ const float16x8_t vk${K}x01234567 = vld1q_f16(w); w += 8;
+ $else:
+ const float16x8_t vk${K}x01234567 = vld1q_f16(w + ${(K + 1) * CHANNEL_TILE});
+ $if 1 <= K < ACCUMULATORS:
+ float16x8_t vacc01234567p${K} = vmulq_f16(vi${K}x01234567, vk${K}x01234567);
+ $else:
+ vacc01234567p${K % ACCUMULATORS} = vfmaq_f16(vacc01234567p${K % ACCUMULATORS}, vi${K}x01234567, vk${K}x01234567);
+
+ $if ACCUMULATORS > 1:
+ // Add up all accumulators to vacc01234567p0
+ $ACC_STEP = 1
+ $while ACC_STEP < ACCUMULATORS:
+ $for A in range(0, ACCUMULATORS, ACC_STEP * 2):
+ $if A + ACC_STEP < ACCUMULATORS:
+ vacc01234567p${A} = vaddq_f16(vacc01234567p${A}, vacc01234567p${A + ACC_STEP});
+ $ACC_STEP *= 2
+
+ float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+ if (c & 4) {
+ vst1_f16(output, vacc0123); output += 4;
+ vacc0123 = vget_high_f16(vacc01234567);
+ }
+ if (c & 2) {
+ vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_f16(vacc0123), 0); output += 2;
+ vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+ }
+ if (c & 1) {
+ vst1_lane_f16(output, vacc0123, 0); output += 1;
+ }
+ }
+
+ output = (__fp16*) ((uintptr_t) output + output_increment);
+ } while (--output_width != 0);
+}
diff --git a/src/xnnpack/dwconv.h b/src/xnnpack/dwconv.h
index ed11f29..d316a24 100644
--- a/src/xnnpack/dwconv.h
+++ b/src/xnnpack/dwconv.h
@@ -193,6 +193,44 @@
DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_up2x25__scalar)
DECLARE_F32_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_minmax_ukernel_up2x25__scalar_acc2)
+#define DECLARE_F16_DWCONV_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t channels, \
+ size_t output_width, \
+ const void** input, \
+ const void* weights, \
+ void* output, \
+ size_t input_stride, \
+ size_t output_increment, \
+ size_t input_offset, \
+ const void* zero, \
+ const struct xnn_f16_default_params* params);
+
+#define DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t channels, \
+ size_t output_width, \
+ const void** input, \
+ const void* weights, \
+ void* output, \
+ size_t input_stride, \
+ size_t output_increment, \
+ size_t input_offset, \
+ const void* zero, \
+ const struct xnn_f16_minmax_params* params);
+
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith)
+DECLARE_F16_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2)
#define DECLARE_Q8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(fn_name) \
XNN_INTERNAL void fn_name( \
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 952e2cc..2b4e877 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -837,6 +837,18 @@
const float* zero,
const union xnn_f32_minmax_params* params);
+typedef void (*xnn_f16_dwconv_minmax_unipass_ukernel_function)(
+ size_t channels,
+ size_t output_width,
+ const void** input,
+ const void* weights,
+ void* output,
+ size_t input_stride,
+ size_t output_increment,
+ size_t input_offset,
+ const void* zero,
+ const struct xnn_f16_minmax_params* params);
+
typedef void (*xnn_q8_dwconv_minmax_unipass_ukernel_function)(
size_t channels,
size_t output_width,
diff --git a/test/dwconv-microkernel-tester.h b/test/dwconv-microkernel-tester.h
index bbfddff..f81c229 100644
--- a/test/dwconv-microkernel-tester.h
+++ b/test/dwconv-microkernel-tester.h
@@ -20,6 +20,8 @@
#include <random>
#include <vector>
+#include <fp16.h>
+
#include <xnnpack.h>
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/pack.h>
@@ -283,6 +285,99 @@
}
}
+ void Test(xnn_f16_dwconv_minmax_unipass_ukernel_function dwconv_minmax, Variant variant = Variant::Native) const {
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), rng);
+ auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
+
+ std::vector<const uint16_t*> indirection((width() - 1) * step() + kr());
+ std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + indirection.size() * channels());
+ std::vector<uint16_t> kernel(channels() * kr());
+ std::vector<uint16_t> bias(channels());
+ std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> packed_weights((kr() + 1) * packed_channels());
+ std::vector<uint16_t> zero(channels() + XNN_EXTRA_BYTES / sizeof(uint16_t));
+ std::vector<uint16_t> output((width() - 1) * output_stride() + channels());
+ std::vector<float> output_ref(width() * channels());
+
+ for (size_t iteration = 0; iteration < iterations(); iteration++) {
+ std::generate(input.begin(), input.end(), std::ref(f16rng));
+ std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
+ std::generate(bias.begin(), bias.end(), std::ref(f16rng));
+ std::fill(zero.begin(), zero.end(), 0);
+ std::fill(output_ref.begin(), output_ref.end(), 0.0f);
+ std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
+
+ std::fill(packed_weights.begin(), packed_weights.end(), 0);
+ xnn_pack_f16_dwconv_ghw_w(
+ kr(), 1, channels(), cr(),
+ kernel.data(), bias.data(), packed_weights.data());
+ for (size_t i = 0; i < indirection.size(); i++) {
+ indirection[i] = input.data() + i * channels() - input_offset();
+ }
+ std::shuffle(indirection.begin(), indirection.end(), rng);
+ if (zero_index() != SIZE_MAX) {
+ for (size_t i = 0; i < indirection.size(); i += kr()) {
+ indirection[i + zero_index()] = zero.data();
+ }
+ }
+
+ // Compute reference results, without clamping.
+ for (size_t x = 0; x < width(); x++) {
+ for (size_t c = 0; c < channels(); c++) {
+ float acc = fp16_ieee_to_fp32_value(bias[c]);
+ for (size_t k = 0; k < kr(); k++) {
+ if (indirection[x * step() + k] != zero.data()) {
+ acc += fp16_ieee_to_fp32_value(indirection[x * step() + k][c + input_offset()]) * fp16_ieee_to_fp32_value(kernel[c * kr() + k]);
+ }
+ }
+ output_ref[x * channels() + c] = acc;
+ }
+ }
+
+ // Compute clamping parameters.
+ const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
+ const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
+ const float accumulated_range = accumulated_max - accumulated_min;
+ const float output_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin())));
+ const float output_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax())));
+
+ // Prepare parameters.
+ xnn_f16_minmax_params params = xnn_init_f16_minmax_params(
+ fp16_ieee_from_fp32_value(output_min),
+ fp16_ieee_from_fp32_value(output_max));
+
+ // Clamp reference results.
+ for (float& output_val : output_ref) {
+ output_val = std::max(std::min(output_val, output_max), output_min);
+ }
+
+ // Call optimized micro-kernel.
+ dwconv_minmax(
+ channels(), width(),
+ reinterpret_cast<const void**>(indirection.data()), packed_weights.data(), output.data(),
+ step() * sizeof(void*),
+ (output_stride() - channels()) * sizeof(uint16_t),
+ input_offset() * sizeof(uint16_t), zero.data(),
+      &params);
+
+ // Verify results.
+ for (size_t x = 0; x < width(); x++) {
+ for (size_t c = 0; c < channels(); c++) {
+ ASSERT_GE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_min)
+ << "x = " << x << ", channel = " << c;
+ ASSERT_LE(fp16_ieee_to_fp32_value(output[x * output_stride() + c]), output_max)
+ << "x = " << x << ", channel = " << c;
+ ASSERT_NEAR(
+ output_ref[x * channels() + c],
+ fp16_ieee_to_fp32_value(output[x * output_stride() + c]),
+ std::abs(output_ref[x * channels() + c]) * 1.0e-2)
+ << "x = " << x << ", channel = " << c;
+ }
+ }
+ }
+ }
+
void Test(xnn_f32_dwconv_unipass_ukernel_function dwconv) const {
std::random_device random_device;
auto rng = std::mt19937(random_device());
diff --git a/test/f16-dwconv-minmax.cc b/test/f16-dwconv-minmax.cc
new file mode 100644
index 0000000..06c5f63
--- /dev/null
+++ b/test/f16-dwconv-minmax.cc
@@ -0,0 +1,2252 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Specification: test/f16-dwconv-minmax.yaml
+// Generator: tools/generate-dwconv-test.py
+
+
+#include <gtest/gtest.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
+#include <xnnpack/dwconv.h>
+#include "dwconv-microkernel-tester.h"
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, c_eq_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(8)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, c_div_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, c_div_8_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, c_div_8_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, c_lt_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 1; channels < 8; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, c_gt_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 9; channels < 16; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, c_gt_8_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 9; channels < 16; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, c_gt_8_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 9; channels < 16; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, multipixel) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .width(3)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, multipixel_with_step) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ for (size_t step = 2; step <= 25; step++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .width(3)
+ .step(step)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+ }
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, multipixel_with_output_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(8)
+ .width(5)
+ .output_stride(43)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, multipixel_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .width(3)
+ .qmin(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, multipixel_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .width(3)
+ .qmax(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, input_offset) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .input_offset(176)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH, zero) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t mz = 0; mz < 25; mz++) {
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .input_offset(176)
+ .zero_index(mz)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith);
+ }
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, c_eq_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(8)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, c_div_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, c_div_8_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, c_div_8_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, c_lt_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 1; channels < 8; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, c_gt_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 9; channels < 16; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, c_gt_8_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 9; channels < 16; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, c_gt_8_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 9; channels < 16; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, multipixel) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .width(3)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, multipixel_with_step) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ for (size_t step = 2; step <= 25; step++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .width(3)
+ .step(step)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+ }
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, multipixel_with_output_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(8)
+ .width(5)
+ .output_stride(43)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, multipixel_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .width(3)
+ .qmin(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, multipixel_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .width(3)
+ .qmax(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, input_offset) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .input_offset(176)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X25__NEONFP16ARITH_ACC2, zero) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t mz = 0; mz < 25; mz++) {
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(25)
+ .channels(channels)
+ .input_offset(176)
+ .zero_index(mz)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2);
+ }
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
#if XNN_ARCH_ARM64
  // Test matrix for the f16 minmax up16x25 NEONFP16ARITH dwconv microkernel:
  // channel counts equal to / below / above / divisible by the channel tile
  // (16), multi-pixel rows with step and output stride, qmin/qmax clamping,
  // input offsets, and every zero_index value.
  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, c_eq_16) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    DWConvMicrokernelTester()
      .cr(16)
      .kr(25)
      .channels(16)
      .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, c_div_16) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 32; channels < 256; channels += 48) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, c_div_16_with_qmin) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 32; channels < 256; channels += 48) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .qmin(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, c_div_16_with_qmax) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 32; channels < 256; channels += 48) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .qmax(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, c_lt_16) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 1; channels < 16; channels++) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, c_gt_16) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 17; channels < 32; channels++) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, c_gt_16_with_qmin) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 17; channels < 32; channels++) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .qmin(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, c_gt_16_with_qmax) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 17; channels < 32; channels++) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .qmax(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, multipixel) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .width(3)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, multipixel_with_step) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      for (size_t step = 2; step <= 25; step++) {
        DWConvMicrokernelTester()
          .cr(16)
          .kr(25)
          .channels(channels)
          .width(3)
          .step(step)
          .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
      }
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, multipixel_with_output_stride) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        // Fixed: was .channels(16), which ignored the loop variable and re-ran
        // the identical case each iteration. output_stride(83) >= max channels
        // (80), so all channel counts remain valid.
        .channels(channels)
        .width(5)
        .output_stride(83)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, multipixel_with_qmin) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .width(3)
        .qmin(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, multipixel_with_qmax) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .width(3)
        .qmax(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, input_offset) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 32; channels < 256; channels += 48) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .input_offset(304)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH, zero) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t mz = 0; mz < 25; mz++) {
      for (uint32_t channels = 32; channels < 256; channels += 48) {
        DWConvMicrokernelTester()
          .cr(16)
          .kr(25)
          .channels(channels)
          .input_offset(304)
          .zero_index(mz)
          .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith);
      }
    }
  }
#endif  // XNN_ARCH_ARM64
+
+
#if XNN_ARCH_ARM64
  // Same test matrix as the up16x25 kernel above, for the 2-accumulator
  // (_acc2) variant.
  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, c_eq_16) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    DWConvMicrokernelTester()
      .cr(16)
      .kr(25)
      .channels(16)
      .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, c_div_16) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 32; channels < 256; channels += 48) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, c_div_16_with_qmin) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 32; channels < 256; channels += 48) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .qmin(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, c_div_16_with_qmax) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 32; channels < 256; channels += 48) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .qmax(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, c_lt_16) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 1; channels < 16; channels++) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, c_gt_16) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 17; channels < 32; channels++) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, c_gt_16_with_qmin) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 17; channels < 32; channels++) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .qmin(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, c_gt_16_with_qmax) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 17; channels < 32; channels++) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .qmax(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, multipixel) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .width(3)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, multipixel_with_step) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      for (size_t step = 2; step <= 25; step++) {
        DWConvMicrokernelTester()
          .cr(16)
          .kr(25)
          .channels(channels)
          .width(3)
          .step(step)
          .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
      }
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, multipixel_with_output_stride) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        // Fixed: was .channels(16), which ignored the loop variable and re-ran
        // the identical case each iteration. output_stride(83) >= max channels
        // (80), so all channel counts remain valid.
        .channels(channels)
        .width(5)
        .output_stride(83)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, multipixel_with_qmin) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .width(3)
        .qmin(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, multipixel_with_qmax) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .width(3)
        .qmax(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, input_offset) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 32; channels < 256; channels += 48) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(25)
        .channels(channels)
        .input_offset(304)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X25__NEONFP16ARITH_ACC2, zero) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t mz = 0; mz < 25; mz++) {
      for (uint32_t channels = 32; channels < 256; channels += 48) {
        DWConvMicrokernelTester()
          .cr(16)
          .kr(25)
          .channels(channels)
          .input_offset(304)
          .zero_index(mz)
          .Test(xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2);
      }
    }
  }
#endif  // XNN_ARCH_ARM64
+
+
#if XNN_ARCH_ARM64
  // Test matrix for the f16 minmax up8x9 NEONFP16ARITH dwconv microkernel
  // (channel tile 8, 9-tap kernel).
  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, c_eq_8) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    DWConvMicrokernelTester()
      .cr(8)
      .kr(9)
      .channels(8)
      .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, c_div_8) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 16; channels < 128; channels += 24) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, c_div_8_with_qmin) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 16; channels < 128; channels += 24) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .qmin(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, c_div_8_with_qmax) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 16; channels < 128; channels += 24) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .qmax(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, c_lt_8) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 1; channels < 8; channels++) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, c_gt_8) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 9; channels < 16; channels++) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, c_gt_8_with_qmin) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 9; channels < 16; channels++) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .qmin(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, c_gt_8_with_qmax) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 9; channels < 16; channels++) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .qmax(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, multipixel) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 40; channels += 7) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .width(3)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, multipixel_with_step) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 40; channels += 7) {
      for (size_t step = 2; step <= 9; step++) {
        DWConvMicrokernelTester()
          .cr(8)
          .kr(9)
          .channels(channels)
          .width(3)
          .step(step)
          .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
      }
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, multipixel_with_output_stride) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 40; channels += 7) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        // Fixed: was .channels(8), which ignored the loop variable and re-ran
        // the identical case each iteration. output_stride(43) >= max channels
        // (40), so all channel counts remain valid.
        .channels(channels)
        .width(5)
        .output_stride(43)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, multipixel_with_qmin) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 40; channels += 7) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .width(3)
        .qmin(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, multipixel_with_qmax) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 40; channels += 7) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .width(3)
        .qmax(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, input_offset) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 16; channels < 128; channels += 24) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .input_offset(176)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH, zero) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t mz = 0; mz < 9; mz++) {
      for (uint32_t channels = 16; channels < 128; channels += 24) {
        DWConvMicrokernelTester()
          .cr(8)
          .kr(9)
          .channels(channels)
          .input_offset(176)
          .zero_index(mz)
          .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith);
      }
    }
  }
#endif  // XNN_ARCH_ARM64
+
+
#if XNN_ARCH_ARM64
  // Same test matrix as the up8x9 kernel above, for the 2-accumulator (_acc2)
  // variant.
  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, c_eq_8) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    DWConvMicrokernelTester()
      .cr(8)
      .kr(9)
      .channels(8)
      .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, c_div_8) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 16; channels < 128; channels += 24) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, c_div_8_with_qmin) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 16; channels < 128; channels += 24) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .qmin(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, c_div_8_with_qmax) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 16; channels < 128; channels += 24) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .qmax(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, c_lt_8) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 1; channels < 8; channels++) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, c_gt_8) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 9; channels < 16; channels++) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, c_gt_8_with_qmin) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 9; channels < 16; channels++) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .qmin(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, c_gt_8_with_qmax) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 9; channels < 16; channels++) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .qmax(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, multipixel) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 40; channels += 7) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .width(3)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, multipixel_with_step) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 40; channels += 7) {
      for (size_t step = 2; step <= 9; step++) {
        DWConvMicrokernelTester()
          .cr(8)
          .kr(9)
          .channels(channels)
          .width(3)
          .step(step)
          .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
      }
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, multipixel_with_output_stride) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 40; channels += 7) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        // Fixed: was .channels(8), which ignored the loop variable and re-ran
        // the identical case each iteration. output_stride(43) >= max channels
        // (40), so all channel counts remain valid.
        .channels(channels)
        .width(5)
        .output_stride(43)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, multipixel_with_qmin) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 40; channels += 7) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .width(3)
        .qmin(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, multipixel_with_qmax) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 40; channels += 7) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .width(3)
        .qmax(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, input_offset) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 16; channels < 128; channels += 24) {
      DWConvMicrokernelTester()
        .cr(8)
        .kr(9)
        .channels(channels)
        .input_offset(176)
        .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP8X9__NEONFP16ARITH_ACC2, zero) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t mz = 0; mz < 9; mz++) {
      for (uint32_t channels = 16; channels < 128; channels += 24) {
        DWConvMicrokernelTester()
          .cr(8)
          .kr(9)
          .channels(channels)
          .input_offset(176)
          .zero_index(mz)
          .Test(xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2);
      }
    }
  }
#endif  // XNN_ARCH_ARM64
+
+
#if XNN_ARCH_ARM64
  // Test matrix for the f16 minmax up16x9 NEONFP16ARITH dwconv microkernel
  // (channel tile 16, 9-tap kernel).
  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, c_eq_16) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    DWConvMicrokernelTester()
      .cr(16)
      .kr(9)
      .channels(16)
      .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, c_div_16) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 32; channels < 256; channels += 48) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, c_div_16_with_qmin) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 32; channels < 256; channels += 48) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .qmin(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, c_div_16_with_qmax) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 32; channels < 256; channels += 48) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .qmax(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, c_lt_16) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 1; channels < 16; channels++) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, c_gt_16) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 17; channels < 32; channels++) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, c_gt_16_with_qmin) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 17; channels < 32; channels++) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .qmin(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, c_gt_16_with_qmax) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 17; channels < 32; channels++) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .qmax(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, multipixel) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .width(3)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, multipixel_with_step) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      for (size_t step = 2; step <= 9; step++) {
        DWConvMicrokernelTester()
          .cr(16)
          .kr(9)
          .channels(channels)
          .width(3)
          .step(step)
          .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
      }
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, multipixel_with_output_stride) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        // Fixed: was .channels(16), which ignored the loop variable and re-ran
        // the identical case each iteration. output_stride(83) >= max channels
        // (80), so all channel counts remain valid.
        .channels(channels)
        .width(5)
        .output_stride(83)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, multipixel_with_qmin) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .width(3)
        .qmin(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, multipixel_with_qmax) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .width(3)
        .qmax(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, input_offset) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 32; channels < 256; channels += 48) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .input_offset(304)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH, zero) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t mz = 0; mz < 9; mz++) {
      for (uint32_t channels = 32; channels < 256; channels += 48) {
        DWConvMicrokernelTester()
          .cr(16)
          .kr(9)
          .channels(channels)
          .input_offset(304)
          .zero_index(mz)
          .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith);
      }
    }
  }
#endif  // XNN_ARCH_ARM64
+
+
#if XNN_ARCH_ARM64
  // Same test matrix as the up16x9 kernel above, for the 2-accumulator (_acc2)
  // variant.
  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, c_eq_16) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    DWConvMicrokernelTester()
      .cr(16)
      .kr(9)
      .channels(16)
      .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, c_div_16) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 32; channels < 256; channels += 48) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, c_div_16_with_qmin) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 32; channels < 256; channels += 48) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .qmin(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, c_div_16_with_qmax) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 32; channels < 256; channels += 48) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .qmax(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, c_lt_16) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 1; channels < 16; channels++) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, c_gt_16) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 17; channels < 32; channels++) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, c_gt_16_with_qmin) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 17; channels < 32; channels++) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .qmin(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, c_gt_16_with_qmax) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 17; channels < 32; channels++) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .qmax(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, multipixel) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .width(3)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, multipixel_with_step) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      for (size_t step = 2; step <= 9; step++) {
        DWConvMicrokernelTester()
          .cr(16)
          .kr(9)
          .channels(channels)
          .width(3)
          .step(step)
          .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
      }
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, multipixel_with_output_stride) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        // Fixed: was .channels(16), which ignored the loop variable and re-ran
        // the identical case each iteration. output_stride(83) >= max channels
        // (80), so all channel counts remain valid.
        .channels(channels)
        .width(5)
        .output_stride(83)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, multipixel_with_qmin) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .width(3)
        .qmin(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, multipixel_with_qmax) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (size_t channels = 1; channels <= 80; channels += 15) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .width(3)
        .qmax(128)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, input_offset) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t channels = 32; channels < 256; channels += 48) {
      DWConvMicrokernelTester()
        .cr(16)
        .kr(9)
        .channels(channels)
        .input_offset(304)
        .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
    }
  }

  TEST(F16_DWCONV_MINMAX_UP16X9__NEONFP16ARITH_ACC2, zero) {
    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
    for (uint32_t mz = 0; mz < 9; mz++) {
      for (uint32_t channels = 32; channels < 256; channels += 48) {
        DWConvMicrokernelTester()
          .cr(16)
          .kr(9)
          .channels(channels)
          .input_offset(304)
          .zero_index(mz)
          .Test(xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2);
      }
    }
  }
#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, c_eq_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(8)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, c_div_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, c_div_8_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, c_div_8_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, c_lt_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 1; channels < 8; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, c_gt_8) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 9; channels < 16; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, c_gt_8_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 9; channels < 16; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, c_gt_8_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 9; channels < 16; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, multipixel) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .width(3)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, multipixel_with_step) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ for (size_t step = 2; step <= 4; step++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .width(3)
+ .step(step)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+ }
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, multipixel_with_output_stride) {  // output rows spaced output_stride apart
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)  // was hard-coded to 8, leaving the loop variable unused and every iteration identical
+ .width(5)
+ .output_stride(43)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, multipixel_with_qmin) {  // multi-pixel run, output clamped below at qmin
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .width(3)
+ .qmin(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, multipixel_with_qmax) {  // multi-pixel run, output clamped above at qmax
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .width(3)
+ .qmax(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, input_offset) {  // nonzero input_offset applied to indirect input pointers
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .input_offset(176)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH, zero) {  // cycles zero_index over all 4 kernel taps (kr == 4)
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .input_offset(176)
+ .zero_index(mz)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith);
+ }
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, c_eq_8) {  // acc2 variant: exactly one 8-channel tile
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(8)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, c_div_8) {  // whole multiples of the 8-channel tile
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, c_div_8_with_qmin) {  // tile multiples, output clamped below at qmin
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, c_div_8_with_qmax) {  // tile multiples, output clamped above at qmax
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, c_lt_8) {  // partial single tile (remainder path)
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 1; channels < 8; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, c_gt_8) {  // one full tile plus a partial tile
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 9; channels < 16; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, c_gt_8_with_qmin) {  // full+partial tile, output clamped below at qmin
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 9; channels < 16; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, c_gt_8_with_qmax) {  // full+partial tile, output clamped above at qmax
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 9; channels < 16; channels++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, multipixel) {  // width 3: several output pixels per kernel call
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .width(3)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, multipixel_with_step) {  // varies the per-pixel indirection step
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ for (size_t step = 2; step <= 4; step++) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .width(3)
+ .step(step)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+ }
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, multipixel_with_output_stride) {  // output rows spaced output_stride apart
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)  // was hard-coded to 8, leaving the loop variable unused and every iteration identical
+ .width(5)
+ .output_stride(43)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, multipixel_with_qmin) {  // multi-pixel run, output clamped below at qmin
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .width(3)
+ .qmin(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, multipixel_with_qmax) {  // multi-pixel run, output clamped above at qmax
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 40; channels += 7) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .width(3)
+ .qmax(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, input_offset) {  // nonzero input_offset applied to indirect input pointers
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .input_offset(176)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP8X4__NEONFP16ARITH_ACC2, zero) {  // cycles zero_index over all 4 kernel taps (kr == 4)
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (uint32_t channels = 16; channels < 128; channels += 24) {
+ DWConvMicrokernelTester()
+ .cr(8)
+ .kr(4)
+ .channels(channels)
+ .input_offset(176)
+ .zero_index(mz)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2);
+ }
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, c_eq_16) {  // exactly one 16-channel tile
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(16)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, c_div_16) {  // whole multiples of the 16-channel tile
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 32; channels < 256; channels += 48) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, c_div_16_with_qmin) {  // tile multiples, output clamped below at qmin
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 32; channels < 256; channels += 48) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, c_div_16_with_qmax) {  // tile multiples, output clamped above at qmax
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 32; channels < 256; channels += 48) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, c_lt_16) {  // partial single tile (remainder path)
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 1; channels < 16; channels++) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, c_gt_16) {  // one full tile plus a partial tile
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 17; channels < 32; channels++) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, c_gt_16_with_qmin) {  // full+partial tile, output clamped below at qmin
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 17; channels < 32; channels++) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, c_gt_16_with_qmax) {  // full+partial tile, output clamped above at qmax
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 17; channels < 32; channels++) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, multipixel) {  // width 3: several output pixels per kernel call
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .width(3)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, multipixel_with_step) {  // varies the per-pixel indirection step
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ for (size_t step = 2; step <= 4; step++) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .width(3)
+ .step(step)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+ }
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, multipixel_with_output_stride) {  // output rows spaced output_stride apart
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)  // was hard-coded to 16, leaving the loop variable unused and every iteration identical
+ .width(5)
+ .output_stride(83)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, multipixel_with_qmin) {  // multi-pixel run, output clamped below at qmin
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .width(3)
+ .qmin(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, multipixel_with_qmax) {  // multi-pixel run, output clamped above at qmax
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .width(3)
+ .qmax(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, input_offset) {  // nonzero input_offset applied to indirect input pointers
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 32; channels < 256; channels += 48) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .input_offset(304)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH, zero) {  // cycles zero_index over all 4 kernel taps (kr == 4)
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (uint32_t channels = 32; channels < 256; channels += 48) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .input_offset(304)
+ .zero_index(mz)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith);
+ }
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, c_eq_16) {  // acc2 variant: exactly one 16-channel tile
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(16)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, c_div_16) {  // whole multiples of the 16-channel tile
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 32; channels < 256; channels += 48) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, c_div_16_with_qmin) {  // tile multiples, output clamped below at qmin
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 32; channels < 256; channels += 48) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, c_div_16_with_qmax) {  // tile multiples, output clamped above at qmax
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 32; channels < 256; channels += 48) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, c_lt_16) {  // partial single tile (remainder path)
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 1; channels < 16; channels++) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, c_gt_16) {  // one full tile plus a partial tile
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 17; channels < 32; channels++) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, c_gt_16_with_qmin) {  // full+partial tile, output clamped below at qmin
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 17; channels < 32; channels++) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, c_gt_16_with_qmax) {  // full+partial tile, output clamped above at qmax
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 17; channels < 32; channels++) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, multipixel) {  // width 3: several output pixels per kernel call
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .width(3)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, multipixel_with_step) {  // varies the per-pixel indirection step
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ for (size_t step = 2; step <= 4; step++) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .width(3)
+ .step(step)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+ }
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, multipixel_with_output_stride) {  // output rows spaced output_stride apart
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)  // was hard-coded to 16, leaving the loop variable unused and every iteration identical
+ .width(5)
+ .output_stride(83)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, multipixel_with_qmin) {  // multi-pixel run, output clamped below at qmin
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .width(3)
+ .qmin(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, multipixel_with_qmax) {  // multi-pixel run, output clamped above at qmax
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .width(3)
+ .qmax(128)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, input_offset) {  // nonzero input_offset applied to indirect input pointers
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t channels = 32; channels < 256; channels += 48) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .input_offset(304)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+ }
+ }
+
+ TEST(F16_DWCONV_MINMAX_UP16X4__NEONFP16ARITH_ACC2, zero) {  // cycles zero_index over all 4 kernel taps (kr == 4)
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (uint32_t channels = 32; channels < 256; channels += 48) {
+ DWConvMicrokernelTester()
+ .cr(16)
+ .kr(4)
+ .channels(channels)
+ .input_offset(304)
+ .zero_index(mz)
+ .Test(xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2);
+ }
+ }
+ }
+#endif // XNN_ARCH_ARM64
diff --git a/test/f16-dwconv-minmax.yaml b/test/f16-dwconv-minmax.yaml
new file mode 100644
index 0000000..114c736
--- /dev/null
+++ b/test/f16-dwconv-minmax.yaml
@@ -0,0 +1,40 @@
+# Copyright 2020 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+- name: xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith
+ arch:
+ - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2
+ arch:
+ - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith
+ arch:
+ - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2
+ arch:
+ - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith
+ arch:
+ - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2
+ arch:
+ - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith
+ arch:
+ - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2
+ arch:
+ - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith
+ arch:
+ - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2
+ arch:
+ - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith
+ arch:
+ - aarch64
+- name: xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2
+ arch:
+ - aarch64