Generate F16 GAVGPOOL NEONFP16ARITH microkernels from template
PiperOrigin-RevId: 422699809
diff --git a/BUILD.bazel b/BUILD.bazel
index b35381b..f86d2f2 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -4385,8 +4385,8 @@
"src/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c",
"src/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c",
"src/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c",
- "src/f16-gavgpool/7p7x-minmax-neonfp16arith-c8.c",
- "src/f16-gavgpool/7x-minmax-neonfp16arith-c8.c",
+ "src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c",
+ "src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c",
"src/f16-gemm/gen/1x16-minmax-neonfp16arith-ld64.c",
"src/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c",
"src/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c",
@@ -4418,8 +4418,14 @@
"src/f16-dwconv/gen/up32x9-minmax-neonfp16arith.c",
"src/f16-dwconv/gen/up32x25-minmax-neonfp16arith-acc2.c",
"src/f16-dwconv/gen/up32x25-minmax-neonfp16arith.c",
- "src/f16-gavgpool/7p7x-minmax-neonfp16arith-c8.c",
- "src/f16-gavgpool/7x-minmax-neonfp16arith-c8.c",
+ "src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c",
+ "src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c16.c",
+ "src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c24.c",
+ "src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c32.c",
+ "src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c",
+ "src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c16.c",
+ "src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c24.c",
+ "src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c32.c",
"src/f16-gemm/gen-inc/1x8inc-minmax-neonfp16arith-ld64.c",
"src/f16-gemm/gen-inc/1x16inc-minmax-neonfp16arith-ld64.c",
"src/f16-gemm/gen-inc/4x8inc-minmax-neonfp16arith-ld64.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7dd0272..bab93ff 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3149,8 +3149,8 @@
src/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c
src/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c
src/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c
- src/f16-gavgpool/7p7x-minmax-neonfp16arith-c8.c
- src/f16-gavgpool/7x-minmax-neonfp16arith-c8.c
+ src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c
+ src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c
src/f16-gemm/gen/1x16-minmax-neonfp16arith-ld64.c
src/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c
src/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c
@@ -3181,8 +3181,14 @@
src/f16-dwconv/gen/up32x9-minmax-neonfp16arith.c
src/f16-dwconv/gen/up32x25-minmax-neonfp16arith-acc2.c
src/f16-dwconv/gen/up32x25-minmax-neonfp16arith.c
- src/f16-gavgpool/7p7x-minmax-neonfp16arith-c8.c
- src/f16-gavgpool/7x-minmax-neonfp16arith-c8.c
+ src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c
+ src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c16.c
+ src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c24.c
+ src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c32.c
+ src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c
+ src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c16.c
+ src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c24.c
+ src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c32.c
src/f16-gemm/gen-inc/1x8inc-minmax-neonfp16arith-ld64.c
src/f16-gemm/gen-inc/1x16inc-minmax-neonfp16arith-ld64.c
src/f16-gemm/gen-inc/4x8inc-minmax-neonfp16arith-ld64.c
diff --git a/scripts/generate-f16-gavgpool.sh b/scripts/generate-f16-gavgpool.sh
new file mode 100755
index 0000000..94a2e2a
--- /dev/null
+++ b/scripts/generate-f16-gavgpool.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+# Copyright 2022 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+################################## ARM NEON ###################################
+tools/xngen src/f16-gavgpool/unipass-neonfp16arith.c.in -D ROW_TILE=7 -D CHANNEL_TILE=8 -o src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c &
+tools/xngen src/f16-gavgpool/unipass-neonfp16arith.c.in -D ROW_TILE=7 -D CHANNEL_TILE=16 -o src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c16.c &
+tools/xngen src/f16-gavgpool/unipass-neonfp16arith.c.in -D ROW_TILE=7 -D CHANNEL_TILE=24 -o src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c24.c &
+tools/xngen src/f16-gavgpool/unipass-neonfp16arith.c.in -D ROW_TILE=7 -D CHANNEL_TILE=32 -o src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c32.c &
+
+tools/xngen src/f16-gavgpool/multipass-neonfp16arith.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=8 -o src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c &
+tools/xngen src/f16-gavgpool/multipass-neonfp16arith.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=16 -o src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c16.c &
+tools/xngen src/f16-gavgpool/multipass-neonfp16arith.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=24 -o src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c24.c &
+tools/xngen src/f16-gavgpool/multipass-neonfp16arith.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=32 -o src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c32.c &
+
+################################## Unit tests #################################
+tools/generate-gavgpool-test.py --spec test/f16-gavgpool-minmax.yaml --output test/f16-gavgpool-minmax.cc &
+
+wait
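
For reference, every ROW_TILE/CHANNEL_TILE combination above generates a kernel for the same per-channel math; the tile sizes only control loop unrolling and vectorization. A minimal scalar sketch (plain C float in place of __fp16; illustrative names, not XNNPACK API):

#include <stddef.h>

// Average each channel over `rows` inputs via a precomputed scale
// (scale == 1.0f / rows), then clamp to [output_min, output_max].
static void gavgpool_minmax_reference(
    size_t rows, size_t channels,
    const float* input, size_t input_stride,  // stride in elements
    float scale, float output_min, float output_max,
    float* output)
{
  for (size_t c = 0; c < channels; c++) {
    float vsum = 0.0f;
    for (size_t r = 0; r < rows; r++) {
      vsum += input[r * input_stride + c];
    }
    float vout = vsum * scale;
    vout = vout < output_min ? output_min : vout;
    vout = vout > output_max ? output_max : vout;
    output[c] = vout;
  }
}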
diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh
index e047bce..2b89c0b 100755
--- a/scripts/generate-tests.sh
+++ b/scripts/generate-tests.sh
@@ -21,10 +21,7 @@
tools/generate-avgpool-test.py --spec test/f32-avgpool-minmax.yaml --output test/f32-avgpool-minmax.cc &

### Tests for GAvgPool micro-kernels
-tools/generate-gavgpool-test.py --spec test/f16-gavgpool-minmax.yaml --output test/f16-gavgpool-minmax.cc &
tools/generate-gavgpool-test.py --spec test/f32-gavgpool-minmax.yaml --output test/f32-gavgpool-minmax.cc &
-tools/generate-gavgpool-test.py --spec test/qu8-gavgpool-minmax-fp32.yaml --output test/qu8-gavgpool-minmax-fp32.cc &
-tools/generate-gavgpool-test.py --spec test/qu8-gavgpool-minmax-rndnu.yaml --output test/qu8-gavgpool-minmax-rndnu.cc &

### Tests for PAvgPool micro-kernels
tools/generate-avgpool-test.py --spec test/f32-pavgpool-minmax.yaml --output test/f32-pavgpool-minmax.cc &
diff --git a/src/amalgam/sse2.c b/src/amalgam/sse2.c
index 434c051..1fa7236 100644
--- a/src/amalgam/sse2.c
+++ b/src/amalgam/sse2.c
@@ -5663,7 +5663,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);

const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
int32_t* b = buffer;
@@ -9178,7 +9178,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);

const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
const __m128i vzero = _mm_setzero_si128();
diff --git a/src/amalgam/sse41.c b/src/amalgam/sse41.c
index fc61f9a..52ba9cb 100644
--- a/src/amalgam/sse41.c
+++ b/src/amalgam/sse41.c
@@ -3896,7 +3896,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);

const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
int32_t* b = buffer;
@@ -6617,7 +6617,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);

const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
int32_t* b = buffer;
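
Both amalgam hunks above are the same one-line fix: input_increment rewinds the seven row pointers across the padded channel run, and that rewind must be in bytes. For int8_t and uint8_t the sizeof() factor is 1, so runtime behavior is unchanged; the multiply keeps the expression dimensionally consistent with the __fp16 kernels below, where each element is 2 bytes. A sketch, with round_up_po2 written out (equivalent to the helper in <xnnpack/math.h>; q must be a power of two):

#include <stddef.h>

// Round n up to the next multiple of the power-of-two q.
static inline size_t round_up_po2(size_t n, size_t q) {
  return (n + q - 1) & ~(q - 1);
}

// Per 7-row block, each row pointer has advanced by the padded channel
// run (round_up_po2(channels, 8) elements); rewinding by that run in
// bytes and stepping 7 row strides lands on the next block of rows.
static size_t block_increment(size_t input_stride, size_t channels, size_t element_size) {
  return 7 * input_stride - round_up_po2(channels, 8) * element_size;
}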
diff --git a/src/f16-gavgpool/7p7x-minmax-neonfp16arith-c8.c b/src/f16-gavgpool/7p7x-minmax-neonfp16arith-c8.c
deleted file mode 100644
index 58b2561..0000000
--- a/src/f16-gavgpool/7p7x-minmax-neonfp16arith-c8.c
+++ /dev/null
@@ -1,189 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gavgpool.h>
-#include <xnnpack/math.h>
-
-
-void xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8(
- size_t rows,
- size_t channels,
- const void* input,
- size_t input_stride,
- const void* zero,
- void* buffer,
- void* output_ptr,
- const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
- assert(rows > 7);
- assert(channels != 0);
-
- __fp16* output = (__fp16*) output_ptr;
- const __fp16* i0 = (const __fp16*) input;
- const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
- const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
- const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
- const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
- const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
- const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
- const size_t packed_channels = round_up_po2(channels, 8);
- const size_t input_increment = 7 * input_stride - packed_channels * sizeof(__fp16);
-
- __fp16* b = (__fp16*) buffer;
- for (size_t c = 0; c < channels; c += 8) {
- const float16x8_t vi0 = vld1q_f16(i0); i0 += 8;
- const float16x8_t vi1 = vld1q_f16(i1); i1 += 8;
- const float16x8_t vi2 = vld1q_f16(i2); i2 += 8;
- const float16x8_t vi3 = vld1q_f16(i3); i3 += 8;
- const float16x8_t vi4 = vld1q_f16(i4); i4 += 8;
- const float16x8_t vi5 = vld1q_f16(i5); i5 += 8;
- const float16x8_t vi6 = vld1q_f16(i6); i6 += 8;
-
- const float16x8_t vsum01 = vaddq_f16(vi0, vi1);
- const float16x8_t vsum23 = vaddq_f16(vi2, vi3);
- const float16x8_t vsum45 = vaddq_f16(vi4, vi5);
-
- const float16x8_t vsum016 = vaddq_f16(vsum01, vi6);
- const float16x8_t vsum2345 = vaddq_f16(vsum23, vsum45);
-
- const float16x8_t vsum = vaddq_f16(vsum016, vsum2345);
-
- vst1q_f16(b, vsum); b += 8;
- }
- for (rows -= 7; rows > 7; rows -= 7) {
- b = (__fp16*) buffer;
-
- i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
- i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
- i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
- i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
- i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
- i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
- i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
-
- for (size_t c = 0; c < channels; c += 8) {
- const float16x8_t vi0 = vld1q_f16(i0); i0 += 8;
- const float16x8_t vi1 = vld1q_f16(i1); i1 += 8;
- const float16x8_t vi2 = vld1q_f16(i2); i2 += 8;
- const float16x8_t vi3 = vld1q_f16(i3); i3 += 8;
- const float16x8_t vi4 = vld1q_f16(i4); i4 += 8;
- const float16x8_t vi5 = vld1q_f16(i5); i5 += 8;
- const float16x8_t vi6 = vld1q_f16(i6); i6 += 8;
- const float16x8_t vacc = vld1q_f16(b);
-
- const float16x8_t vsum01 = vaddq_f16(vi0, vi1);
- const float16x8_t vsum23 = vaddq_f16(vi2, vi3);
- const float16x8_t vsum45 = vaddq_f16(vi4, vi5);
- const float16x8_t vsum6a = vaddq_f16(vi6, vacc);
-
- const float16x8_t vsum0123 = vaddq_f16(vsum01, vsum23);
- const float16x8_t vsum456a = vaddq_f16(vsum45, vsum6a);
-
- const float16x8_t vsum = vaddq_f16(vsum0123, vsum456a);
-
- vst1q_f16(b, vsum); b += 8;
- }
- }
-
- i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
- i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
- if (rows < 2) {
- i1 = (const __fp16*) zero;
- }
- i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
- if (rows <= 2) {
- i2 = (const __fp16*) zero;
- }
- i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
- if (rows < 4) {
- i3 = (const __fp16*) zero;
- }
- i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
- if (rows <= 4) {
- i4 = (const __fp16*) zero;
- }
- i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
- if (rows < 6) {
- i5 = (const __fp16*) zero;
- }
- i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
- if (rows <= 6) {
- i6 = (const __fp16*) zero;
- }
- const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
- const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
- const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
-
- b = (__fp16*) buffer;
- while (channels >= 8) {
- const float16x8_t vi0 = vld1q_f16(i0); i0 += 8;
- const float16x8_t vi1 = vld1q_f16(i1); i1 += 8;
- const float16x8_t vi2 = vld1q_f16(i2); i2 += 8;
- const float16x8_t vi3 = vld1q_f16(i3); i3 += 8;
- const float16x8_t vi4 = vld1q_f16(i4); i4 += 8;
- const float16x8_t vi5 = vld1q_f16(i5); i5 += 8;
- const float16x8_t vi6 = vld1q_f16(i6); i6 += 8;
- const float16x8_t vacc = vld1q_f16(b); b += 8;
-
- const float16x8_t vsum01 = vaddq_f16(vi0, vi1);
- const float16x8_t vsum23 = vaddq_f16(vi2, vi3);
- const float16x8_t vsum45 = vaddq_f16(vi4, vi5);
- const float16x8_t vsum6a = vaddq_f16(vi6, vacc);
-
- const float16x8_t vsum0123 = vaddq_f16(vsum01, vsum23);
- const float16x8_t vsum456a = vaddq_f16(vsum45, vsum6a);
-
- const float16x8_t vsum = vaddq_f16(vsum0123, vsum456a);
-
- float16x8_t vout = vmulq_f16(vsum, vscale);
- vout = vmaxq_f16(vout, vmin);
- vout = vminq_f16(vout, vmax);
-
- vst1q_f16(output, vout); output += 8;
-
- channels -= 8;
- }
- if (channels != 0) {
- const float16x8_t vi0 = vld1q_f16(i0);
- const float16x8_t vi1 = vld1q_f16(i1);
- const float16x8_t vi2 = vld1q_f16(i2);
- const float16x8_t vi3 = vld1q_f16(i3);
- const float16x8_t vi4 = vld1q_f16(i4);
- const float16x8_t vi5 = vld1q_f16(i5);
- const float16x8_t vi6 = vld1q_f16(i6);
- const float16x8_t vacc = vld1q_f16(b);
-
- const float16x8_t vsum01 = vaddq_f16(vi0, vi1);
- const float16x8_t vsum23 = vaddq_f16(vi2, vi3);
- const float16x8_t vsum45 = vaddq_f16(vi4, vi5);
- const float16x8_t vsum6a = vaddq_f16(vi6, vacc);
-
- const float16x8_t vsum0123 = vaddq_f16(vsum01, vsum23);
- const float16x8_t vsum456a = vaddq_f16(vsum45, vsum6a);
-
- const float16x8_t vsum = vaddq_f16(vsum0123, vsum456a);
-
- float16x8_t vout = vmulq_f16(vsum, vscale);
- vout = vmaxq_f16(vout, vmin);
- vout = vminq_f16(vout, vmax);
-
- float16x4_t vout_lo = vget_low_f16(vout);
- if (channels & 4) {
- vst1_f16(output, vout_lo); output += 4;
- vout_lo = vget_high_f16(vout);
- }
- if (channels & 2) {
- vst1_lane_u32((void*) output, vreinterpret_u32_f16(vout_lo), 0); output += 2;
- vout_lo = vext_f16(vout_lo, vout_lo, 2);
- }
- if (channels & 1) {
- vst1_lane_f16(output, vout_lo, 0);
- }
- }
-}
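
The file deleted above is the hand-written multipass kernel, now generated from src/f16-gavgpool/multipass-neonfp16arith.c.in. Its per-channel control flow: the first pass sums rows 0..6 into `buffer`, each middle pass folds seven more rows into `buffer`, and the final pass adds the remaining 1..7 rows (pointers past the end are redirected to `zero`), then scales and clamps. A scalar sketch of that schedule (illustrative names, not XNNPACK API):

#include <stddef.h>

// 7p7x schedule for one channel; x[r] is row r's value for this channel.
// Requires rows > 7, matching the kernel's assert.
static float gavgpool_7p7x_channel(const float* x, size_t rows,
                                   float scale, float vmin, float vmax)
{
  float acc = 0.0f;                   // lives in `buffer` in the kernel
  size_t r = 0;
  for (; r < 7; r++) {                // first pass: exactly 7 rows
    acc += x[r];
  }
  for (size_t left = rows - 7; left > 7; left -= 7) {
    for (size_t k = 0; k < 7; k++) {  // middle passes: 7 rows each
      acc += x[r++];
    }
  }
  while (r < rows) {                  // final pass: the last 1..7 rows
    acc += x[r++];
  }
  float out = acc * scale;            // scale == 1.0f / rows
  out = out < vmin ? vmin : out;
  out = out > vmax ? vmax : out;
  return out;
}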
diff --git a/src/f16-gavgpool/7x-minmax-neonfp16arith-c8.c b/src/f16-gavgpool/7x-minmax-neonfp16arith-c8.c
deleted file mode 100644
index deec5ea..0000000
--- a/src/f16-gavgpool/7x-minmax-neonfp16arith-c8.c
+++ /dev/null
@@ -1,117 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gavgpool.h>
-
-
-void xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8(
- size_t rows,
- size_t channels,
- const void* input,
- size_t input_stride,
- const void* zero,
- void* output_ptr,
- const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
- assert(rows != 0);
- assert(rows <= 7);
- assert(channels != 0);
-
- __fp16* output = (__fp16*) output_ptr;
- const __fp16* i0 = (const __fp16*) input;
- const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
- if (rows < 2) {
- i1 = (const __fp16*) zero;
- }
- const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
- if (rows <= 2) {
- i2 = (const __fp16*) zero;
- }
- const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
- if (rows < 4) {
- i3 = (const __fp16*) zero;
- }
- const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
- if (rows <= 4) {
- i4 = (const __fp16*) zero;
- }
- const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
- if (rows < 6) {
- i5 = (const __fp16*) zero;
- }
- const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
- if (rows <= 6) {
- i6 = (const __fp16*) zero;
- }
- const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
- const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
- const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
-
- while (channels >= 8) {
- const float16x8_t vi0 = vld1q_f16(i0); i0 += 8;
- const float16x8_t vi1 = vld1q_f16(i1); i1 += 8;
- const float16x8_t vi2 = vld1q_f16(i2); i2 += 8;
- const float16x8_t vi3 = vld1q_f16(i3); i3 += 8;
- const float16x8_t vi4 = vld1q_f16(i4); i4 += 8;
- const float16x8_t vi5 = vld1q_f16(i5); i5 += 8;
- const float16x8_t vi6 = vld1q_f16(i6); i6 += 8;
-
- const float16x8_t vsum01 = vaddq_f16(vi0, vi1);
- const float16x8_t vsum23 = vaddq_f16(vi2, vi3);
- const float16x8_t vsum45 = vaddq_f16(vi4, vi5);
-
- const float16x8_t vsum016 = vaddq_f16(vsum01, vi6);
- const float16x8_t vsum2345 = vaddq_f16(vsum23, vsum45);
-
- const float16x8_t vsum = vaddq_f16(vsum016, vsum2345);
-
- float16x8_t vout = vmulq_f16(vsum, vscale);
- vout = vmaxq_f16(vout, vmin);
- vout = vminq_f16(vout, vmax);
-
- vst1q_f16(output, vout); output += 8;
-
- channels -= 8;
- }
- if (channels != 0) {
- const float16x8_t vi0 = vld1q_f16(i0);
- const float16x8_t vi1 = vld1q_f16(i1);
- const float16x8_t vi2 = vld1q_f16(i2);
- const float16x8_t vi3 = vld1q_f16(i3);
- const float16x8_t vi4 = vld1q_f16(i4);
- const float16x8_t vi5 = vld1q_f16(i5);
- const float16x8_t vi6 = vld1q_f16(i6);
-
- const float16x8_t vsum01 = vaddq_f16(vi0, vi1);
- const float16x8_t vsum23 = vaddq_f16(vi2, vi3);
- const float16x8_t vsum45 = vaddq_f16(vi4, vi5);
-
- const float16x8_t vsum016 = vaddq_f16(vsum01, vi6);
- const float16x8_t vsum2345 = vaddq_f16(vsum23, vsum45);
-
- const float16x8_t vsum = vaddq_f16(vsum016, vsum2345);
-
- float16x8_t vout = vmulq_f16(vsum, vscale);
- vout = vmaxq_f16(vout, vmin);
- vout = vminq_f16(vout, vmax);
-
- float16x4_t vout_lo = vget_low_f16(vout);
- if (channels & 4) {
- vst1_f16(output, vout_lo); output += 4;
- vout_lo = vget_high_f16(vout);
- }
- if (channels & 2) {
- vst1_lane_u32((void*) output, vreinterpret_u32_f16(vout_lo), 0); output += 2;
- vout_lo = vext_f16(vout_lo, vout_lo, 2);
- }
- if (channels & 1) {
- vst1_lane_f16(output, vout_lo, 0);
- }
- }
-}
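
The unipass kernel deleted above (now generated from src/f16-gavgpool/unipass-neonfp16arith.c.in) handles rows <= 7 in a single pass: it always reads seven row pointers and redirects any pointer past `rows` to the shared `zero` buffer, so absent rows contribute zeros to the sum. The alternating `rows < n` / `rows <= n` guards are all instances of `!(rows > k)` for pointer k; a compact sketch (hypothetical helper, not XNNPACK API):

#include <stddef.h>
#include <stdint.h>

// Build the 7 row pointers for the unipass kernel: pointer k is real
// only when rows > k; otherwise it aliases the zero buffer.
static void setup_row_pointers(const void* input, size_t input_stride,
                               size_t rows, const void* zero,
                               const void* in[7])
{
  in[0] = input;
  for (size_t k = 1; k < 7; k++) {
    const void* next = (const void*) ((uintptr_t) in[k - 1] + input_stride);
    in[k] = rows > k ? next : zero;
  }
}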
diff --git a/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c16.c b/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c16.c
new file mode 100644
index 0000000..85f9ded
--- /dev/null
+++ b/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c16.c
@@ -0,0 +1,290 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-gavgpool/multipass-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16(
+ size_t rows,
+ size_t channels,
+ const void* input,
+ size_t input_stride,
+ const void* zero,
+ void* buffer,
+ void* output,
+ const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const __fp16* i0 = input;
+ const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
+ const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
+ const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
+ const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
+ const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
+ const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(__fp16);
+
+ __fp16* b = buffer;
+ size_t c = channels;
+ for (; c >= 16; c -= 16) {
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+ const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+
+ vst1q_f16(b, vacc01234567); b += 8;
+ vst1q_f16(b, vacc89ABCDEF); b += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+ vst1q_f16(b, vacc01234567); b += 8;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
+ i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
+ i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
+ i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
+ i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
+ i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
+ i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
+
+ __fp16* b = buffer;
+ size_t c = channels;
+ for (; c >= 16; c -= 16) {
+ float16x8_t vacc01234567 = vld1q_f16(b);
+ float16x8_t vacc89ABCDEF = vld1q_f16(b + 8);
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+ const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF);
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+ const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF);
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+
+ vst1q_f16(b, vacc01234567); b += 8;
+ vst1q_f16(b, vacc89ABCDEF); b += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ float16x8_t vacc01234567 = vld1q_f16(b);
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+ vst1q_f16(b, vacc01234567); b += 8;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+ }
+
+ i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
+ i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = (const __fp16*) zero;
+ }
+ i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = (const __fp16*) zero;
+ }
+ i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = (const __fp16*) zero;
+ }
+ i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = (const __fp16*) zero;
+ }
+ i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = (const __fp16*) zero;
+ }
+ i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = (const __fp16*) zero;
+ }
+
+ const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+ const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+ const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
+ for (; channels >= 16; channels -= 16) {
+ float16x8_t vacc01234567 = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+ float16x8_t vacc89ABCDEF = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+ const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF);
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+ const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF);
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+
+ vacc01234567 = vmulq_f16(vacc01234567, vscale);
+ vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale);
+
+ vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+ vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin);
+
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+ vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+
+ vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+ vst1q_f16(output, vacc89ABCDEF); output = (__fp16*) output + 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ float16x8_t vacc01234567 = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+ vacc01234567 = vmulq_f16(vacc01234567, vscale);
+ vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ if XNN_LIKELY(channels >= 8) {
+ vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+ channels -= 8;
+ } else {
+ float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+ if (channels & 4) {
+ vst1_f16(output, vacc0123); output = (__fp16*) output + 4;
+ vacc0123 = vget_high_f16(vacc01234567);
+ }
+ if (channels & 2) {
+ vst1_lane_u32(output, vreinterpret_u32_f16(vacc0123), 0); output = (__fp16*) output + 2;
+ vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_f16(output, vacc0123, 0); output = (__fp16*) output + 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
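
In the generated file above, the remainder loops step the channel count with doz() and lean on the kernel's XNN_OOB_READS annotation: a trailing group of 1..7 channels still performs full 8-wide loads, but only the live lanes are stored via the 4/2/1-lane tail. doz is the saturating subtraction from <xnnpack/math.h>, equivalent to:

#include <stddef.h>

// "Difference or zero": a - b clamped at zero, so `c = doz(c, 8)` takes
// a final 1..8-channel group to exactly 0 and ends the do/while loop.
static inline size_t doz(size_t a, size_t b) {
  return a >= b ? a - b : 0;
}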
diff --git a/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c24.c b/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c24.c
new file mode 100644
index 0000000..49fb9b6
--- /dev/null
+++ b/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c24.c
@@ -0,0 +1,339 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-gavgpool/multipass-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24(
+ size_t rows,
+ size_t channels,
+ const void* input,
+ size_t input_stride,
+ const void* zero,
+ void* buffer,
+ void* output,
+ const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const __fp16* i0 = input;
+ const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
+ const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
+ const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
+ const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
+ const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
+ const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(__fp16);
+
+ __fp16* b = buffer;
+ size_t c = channels;
+ for (; c >= 24; c -= 24) {
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+ const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF);
+ const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
+ float16x8_t vaccGHIJKLMN = vaddq_f16(vi0xGHIJKLMN, vi1xGHIJKLMN);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+ const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+ const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+ const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+ const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);
+
+ vst1q_f16(b, vacc01234567); b += 8;
+ vst1q_f16(b, vacc89ABCDEF); b += 8;
+ vst1q_f16(b, vaccGHIJKLMN); b += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+ vst1q_f16(b, vacc01234567); b += 8;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
+ i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
+ i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
+ i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
+ i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
+ i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
+ i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
+
+ __fp16* b = buffer;
+ size_t c = channels;
+ for (; c >= 24; c -= 24) {
+ float16x8_t vacc01234567 = vld1q_f16(b);
+ float16x8_t vacc89ABCDEF = vld1q_f16(b + 8);
+ float16x8_t vaccGHIJKLMN = vld1q_f16(b + 16);
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+ const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF);
+ const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi0xGHIJKLMN);
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+ const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF);
+ const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi1xGHIJKLMN);
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+ const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+ const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+ const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+ const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);
+
+ vst1q_f16(b, vacc01234567); b += 8;
+ vst1q_f16(b, vacc89ABCDEF); b += 8;
+ vst1q_f16(b, vaccGHIJKLMN); b += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ float16x8_t vacc01234567 = vld1q_f16(b);
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+ vst1q_f16(b, vacc01234567); b += 8;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+ }
+
+ i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
+ i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = (const __fp16*) zero;
+ }
+ i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = (const __fp16*) zero;
+ }
+ i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = (const __fp16*) zero;
+ }
+ i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = (const __fp16*) zero;
+ }
+ i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = (const __fp16*) zero;
+ }
+ i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = (const __fp16*) zero;
+ }
+
+ const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+ const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+ const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
+ for (; channels >= 24; channels -= 24) {
+ float16x8_t vacc01234567 = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+ float16x8_t vacc89ABCDEF = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+ float16x8_t vaccGHIJKLMN = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+ const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF);
+ const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi0xGHIJKLMN);
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+ const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF);
+ const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi1xGHIJKLMN);
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+ const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+ const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+ const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+ const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);
+
+ vacc01234567 = vmulq_f16(vacc01234567, vscale);
+ vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale);
+ vaccGHIJKLMN = vmulq_f16(vaccGHIJKLMN, vscale);
+
+ vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+ vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin);
+ vaccGHIJKLMN = vmaxq_f16(vaccGHIJKLMN, vmin);
+
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+ vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+ vaccGHIJKLMN = vminq_f16(vaccGHIJKLMN, vmax);
+
+ vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+ vst1q_f16(output, vacc89ABCDEF); output = (__fp16*) output + 8;
+ vst1q_f16(output, vaccGHIJKLMN); output = (__fp16*) output + 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ float16x8_t vacc01234567 = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+ vacc01234567 = vmulq_f16(vacc01234567, vscale);
+ vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ if XNN_LIKELY(channels >= 8) {
+ vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+ channels -= 8;
+ } else {
+ float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+ if (channels & 4) {
+ vst1_f16(output, vacc0123); output = (__fp16*) output + 4;
+ vacc0123 = vget_high_f16(vacc01234567);
+ }
+ if (channels & 2) {
+ vst1_lane_u32(output, vreinterpret_u32_f16(vacc0123), 0); output = (__fp16*) output + 2;
+ vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_f16(output, vacc0123, 0); output = (__fp16*) output + 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
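
The c16, c24, and c32 variants (c32 follows) share one structure: a main loop over CHANNEL_TILE channels, then the 8-wide doz() remainder loop for whatever is left, so every tile size covers every channel count in ceil(channels / 8) padded groups. A self-contained check of that invariant (illustrative, not XNNPACK code):

#include <assert.h>
#include <stddef.h>

static inline size_t doz(size_t a, size_t b) { return a >= b ? a - b : 0; }

// Mirror the kernels' loop structure: count the 8-wide groups touched
// by a main loop over `tile` channels plus the doz() remainder loop.
static size_t groups_processed(size_t channels, size_t tile) {
  size_t groups = 0;
  size_t c = channels;
  for (; c >= tile; c -= tile) {
    groups += tile / 8;
  }
  while (c != 0) {  // mirrors `if XNN_UNLIKELY(c != 0) do { ... } while`
    groups += 1;
    c = doz(c, 8);
  }
  return groups;
}

int main(void) {
  for (size_t tile = 8; tile <= 32; tile += 8) {
    for (size_t channels = 1; channels <= 100; channels++) {
      assert(groups_processed(channels, tile) == (channels + 7) / 8);
    }
  }
  return 0;
}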
diff --git a/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c32.c b/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c32.c
new file mode 100644
index 0000000..7be3ed3
--- /dev/null
+++ b/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c32.c
@@ -0,0 +1,388 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-gavgpool/multipass-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32(
+ size_t rows,
+ size_t channels,
+ const void* input,
+ size_t input_stride,
+ const void* zero,
+ void* buffer,
+ void* output,
+ const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const __fp16* i0 = input;
+ const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
+ const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
+ const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
+ const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
+ const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
+ const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(__fp16);
+
+ __fp16* b = buffer;
+ size_t c = channels;
+ for (; c >= 32; c -= 32) {
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0xOPQRSTUV = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vi1xOPQRSTUV = vld1q_f16(i1); i1 += 8;
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+ const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF);
+ const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
+ float16x8_t vaccGHIJKLMN = vaddq_f16(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ const float16x8_t vi2xOPQRSTUV = vld1q_f16(i2); i2 += 8;
+ float16x8_t vaccOPQRSTUV = vaddq_f16(vi0xOPQRSTUV, vi1xOPQRSTUV);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+ const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
+ const float16x8_t vi3xOPQRSTUV = vld1q_f16(i3); i3 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi2xOPQRSTUV);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+ const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
+ const float16x8_t vi4xOPQRSTUV = vld1q_f16(i4); i4 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi3xOPQRSTUV);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+ const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
+ const float16x8_t vi5xOPQRSTUV = vld1q_f16(i5); i5 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi4xOPQRSTUV);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+ const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
+ const float16x8_t vi6xOPQRSTUV = vld1q_f16(i6); i6 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi5xOPQRSTUV);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi6xOPQRSTUV);
+
+ vst1q_f16(b, vacc01234567); b += 8;
+ vst1q_f16(b, vacc89ABCDEF); b += 8;
+ vst1q_f16(b, vaccGHIJKLMN); b += 8;
+ vst1q_f16(b, vaccOPQRSTUV); b += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+ vst1q_f16(b, vacc01234567); b += 8;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
+ i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
+ i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
+ i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
+ i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
+ i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
+ i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
+
+ __fp16* b = buffer;
+ size_t c = channels;
+ for (; c >= 32; c -= 32) {
+ float16x8_t vacc01234567 = vld1q_f16(b);
+ float16x8_t vacc89ABCDEF = vld1q_f16(b + 8);
+ float16x8_t vaccGHIJKLMN = vld1q_f16(b + 16);
+ float16x8_t vaccOPQRSTUV = vld1q_f16(b + 24);
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0xOPQRSTUV = vld1q_f16(i0); i0 += 8;
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+ const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF);
+ const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi0xGHIJKLMN);
+ const float16x8_t vi1xOPQRSTUV = vld1q_f16(i1); i1 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi0xOPQRSTUV);
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+ const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF);
+ const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi1xGHIJKLMN);
+ const float16x8_t vi2xOPQRSTUV = vld1q_f16(i2); i2 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi1xOPQRSTUV);
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+ const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
+ const float16x8_t vi3xOPQRSTUV = vld1q_f16(i3); i3 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi2xOPQRSTUV);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+ const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
+ const float16x8_t vi4xOPQRSTUV = vld1q_f16(i4); i4 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi3xOPQRSTUV);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+ const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
+ const float16x8_t vi5xOPQRSTUV = vld1q_f16(i5); i5 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi4xOPQRSTUV);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+ const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
+ const float16x8_t vi6xOPQRSTUV = vld1q_f16(i6); i6 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi5xOPQRSTUV);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi6xOPQRSTUV);
+
+ vst1q_f16(b, vacc01234567); b += 8;
+ vst1q_f16(b, vacc89ABCDEF); b += 8;
+ vst1q_f16(b, vaccGHIJKLMN); b += 8;
+ vst1q_f16(b, vaccOPQRSTUV); b += 8;
+ }
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ float16x8_t vacc01234567 = vld1q_f16(b);
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+ vst1q_f16(b, vacc01234567); b += 8;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+ }
+
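+  // Final pass: 1 to 7 rows remain; pointers for rows past the last one are redirected to the zero buffer so they contribute nothing to the sums.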
+ i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
+ i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = (const __fp16*) zero;
+ }
+ i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = (const __fp16*) zero;
+ }
+ i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = (const __fp16*) zero;
+ }
+ i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = (const __fp16*) zero;
+ }
+ i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = (const __fp16*) zero;
+ }
+ i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = (const __fp16*) zero;
+ }
+
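+  // The scale (set by the caller, typically the reciprocal of the pooled element count) and the min/max clamps are stored as raw fp16 bit patterns, hence the u16 dup-load plus reinterpret.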
+  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
+ for (; channels >= 32; channels -= 32) {
+ float16x8_t vacc01234567 = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+ float16x8_t vacc89ABCDEF = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+ float16x8_t vaccGHIJKLMN = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+ float16x8_t vaccOPQRSTUV = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0xOPQRSTUV = vld1q_f16(i0); i0 += 8;
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+ const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF);
+ const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi0xGHIJKLMN);
+ const float16x8_t vi1xOPQRSTUV = vld1q_f16(i1); i1 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi0xOPQRSTUV);
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+ const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF);
+ const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi1xGHIJKLMN);
+ const float16x8_t vi2xOPQRSTUV = vld1q_f16(i2); i2 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi1xOPQRSTUV);
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+ const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
+ const float16x8_t vi3xOPQRSTUV = vld1q_f16(i3); i3 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi2xOPQRSTUV);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+ const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
+ const float16x8_t vi4xOPQRSTUV = vld1q_f16(i4); i4 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi3xOPQRSTUV);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+ const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
+ const float16x8_t vi5xOPQRSTUV = vld1q_f16(i5); i5 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi4xOPQRSTUV);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+ const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
+ const float16x8_t vi6xOPQRSTUV = vld1q_f16(i6); i6 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi5xOPQRSTUV);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi6xOPQRSTUV);
+
+ vacc01234567 = vmulq_f16(vacc01234567, vscale);
+ vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale);
+ vaccGHIJKLMN = vmulq_f16(vaccGHIJKLMN, vscale);
+ vaccOPQRSTUV = vmulq_f16(vaccOPQRSTUV, vscale);
+
+ vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+ vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin);
+ vaccGHIJKLMN = vmaxq_f16(vaccGHIJKLMN, vmin);
+ vaccOPQRSTUV = vmaxq_f16(vaccOPQRSTUV, vmin);
+
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+ vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+ vaccGHIJKLMN = vminq_f16(vaccGHIJKLMN, vmax);
+ vaccOPQRSTUV = vminq_f16(vaccOPQRSTUV, vmax);
+
+ vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+ vst1q_f16(output, vacc89ABCDEF); output = (__fp16*) output + 8;
+ vst1q_f16(output, vaccGHIJKLMN); output = (__fp16*) output + 8;
+ vst1q_f16(output, vaccOPQRSTUV); output = (__fp16*) output + 8;
+ }
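+  // Remainder: up to 31 channels left; handle them one 8-wide group at a time.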
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ float16x8_t vacc01234567 = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+ vacc01234567 = vmulq_f16(vacc01234567, vscale);
+ vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
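+      // Tail store: a full 8-channel group is stored whole; a partial group is written with 4-, 2-, and 1-lane stores.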
+ if XNN_LIKELY(channels >= 8) {
+ vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+ channels -= 8;
+ } else {
+ float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+ if (channels & 4) {
+ vst1_f16(output, vacc0123); output = (__fp16*) output + 4;
+ vacc0123 = vget_high_f16(vacc01234567);
+ }
+ if (channels & 2) {
+ vst1_lane_u32(output, vreinterpret_u32_f16(vacc0123), 0); output = (__fp16*) output + 2;
+ vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_f16(output, vacc0123, 0); output = (__fp16*) output + 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c b/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c
new file mode 100644
index 0000000..2911e2b
--- /dev/null
+++ b/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c
@@ -0,0 +1,189 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-gavgpool/multipass-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8(
+ size_t rows,
+ size_t channels,
+ const void* input,
+ size_t input_stride,
+ const void* zero,
+ void* buffer,
+ void* output,
+ const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > 7);
+ assert(channels != 0);
+
+ const __fp16* i0 = input;
+ const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
+ const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
+ const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
+ const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
+ const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
+ const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(__fp16);
+
+ __fp16* b = buffer;
+ size_t c = channels;
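+  // First pass: sum rows 0-6 of each 8-channel group into the scratch buffer.
+  // doz() is a saturating (difference-or-zero) subtract, so a 1-7 channel tail still runs one full 8-wide iteration; XNN_OOB_READS permits the overread.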
+ for (; c != 0; c = doz(c, 8)) {
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+ vst1q_f16(b, vacc01234567); b += 8;
+ }
+
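+  // Middle passes: fold seven more rows into the buffered sums per pass until at most seven rows remain.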
+ for (rows -= 7; rows > 7; rows -= 7) {
+ i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
+ i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
+ i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
+ i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
+ i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
+ i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
+ i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
+
+ __fp16* b = buffer;
+ size_t c = channels;
+ for (; c != 0; c = doz(c, 8)) {
+ float16x8_t vacc01234567 = vld1q_f16(b);
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+ vst1q_f16(b, vacc01234567); b += 8;
+ }
+ }
+
+ i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
+ i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = (const __fp16*) zero;
+ }
+ i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = (const __fp16*) zero;
+ }
+ i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = (const __fp16*) zero;
+ }
+ i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = (const __fp16*) zero;
+ }
+ i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = (const __fp16*) zero;
+ }
+ i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = (const __fp16*) zero;
+ }
+
+  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
+ for (; channels >= 8; channels -= 8) {
+ float16x8_t vacc01234567 = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+ vacc01234567 = vmulq_f16(vacc01234567, vscale);
+
+ vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ {
+ float16x8_t vacc01234567 = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+ vacc01234567 = vmulq_f16(vacc01234567, vscale);
+ vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+ if (channels & 4) {
+ vst1_f16(output, vacc0123); output = (__fp16*) output + 4;
+ vacc0123 = vget_high_f16(vacc01234567);
+ }
+ if (channels & 2) {
+ vst1_lane_u32(output, vreinterpret_u32_f16(vacc0123), 0); output = (__fp16*) output + 2;
+ vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_f16(output, vacc0123, 0); output = (__fp16*) output + 1;
+ }
+ }
+ }
+}
diff --git a/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c16.c b/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c16.c
new file mode 100644
index 0000000..7e1754f
--- /dev/null
+++ b/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c16.c
@@ -0,0 +1,143 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-gavgpool/unipass-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16(
+ size_t rows,
+ size_t channels,
+ const void* input,
+ size_t input_stride,
+ const void* zero,
+ void* output,
+ const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const __fp16* i0 = input;
+ const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = (const __fp16*) zero;
+ }
+ const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = (const __fp16*) zero;
+ }
+ const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = (const __fp16*) zero;
+ }
+ const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = (const __fp16*) zero;
+ }
+ const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = (const __fp16*) zero;
+ }
+ const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = (const __fp16*) zero;
+ }
+
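+  // Single pass: with at most 7 rows there is no scratch buffer; sums are scaled and clamped straight to the output.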
+  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
+ for (; channels >= 16; channels -= 16) {
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+ const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+
+ vacc01234567 = vmulq_f16(vacc01234567, vscale);
+ vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale);
+
+ vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+ vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin);
+
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+ vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+
+ vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+ vst1q_f16(output, vacc89ABCDEF); output = (__fp16*) output + 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+ vacc01234567 = vmulq_f16(vacc01234567, vscale);
+ vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ if XNN_LIKELY(channels >= 8) {
+ vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+ channels -= 8;
+ } else {
+ float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+ if (channels & 4) {
+ vst1_f16(output, vacc0123); output = (__fp16*) output + 4;
+ vacc0123 = vget_high_f16(vacc01234567);
+ }
+ if (channels & 2) {
+ vst1_lane_u32(output, vreinterpret_u32_f16(vacc0123), 0); output = (__fp16*) output + 2;
+ vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_f16(output, vacc0123, 0); output = (__fp16*) output + 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c24.c b/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c24.c
new file mode 100644
index 0000000..65c4aba
--- /dev/null
+++ b/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c24.c
@@ -0,0 +1,160 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-gavgpool/unipass-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24(
+ size_t rows,
+ size_t channels,
+ const void* input,
+ size_t input_stride,
+ const void* zero,
+ void* output,
+ const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const __fp16* i0 = input;
+ const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = (const __fp16*) zero;
+ }
+ const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = (const __fp16*) zero;
+ }
+ const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = (const __fp16*) zero;
+ }
+ const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = (const __fp16*) zero;
+ }
+ const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = (const __fp16*) zero;
+ }
+ const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = (const __fp16*) zero;
+ }
+
+  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
+ for (; channels >= 24; channels -= 24) {
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+ const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF);
+ const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
+ float16x8_t vaccGHIJKLMN = vaddq_f16(vi0xGHIJKLMN, vi1xGHIJKLMN);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+ const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+ const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+ const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+ const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);
+
+ vacc01234567 = vmulq_f16(vacc01234567, vscale);
+ vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale);
+ vaccGHIJKLMN = vmulq_f16(vaccGHIJKLMN, vscale);
+
+ vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+ vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin);
+ vaccGHIJKLMN = vmaxq_f16(vaccGHIJKLMN, vmin);
+
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+ vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+ vaccGHIJKLMN = vminq_f16(vaccGHIJKLMN, vmax);
+
+ vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+ vst1q_f16(output, vacc89ABCDEF); output = (__fp16*) output + 8;
+ vst1q_f16(output, vaccGHIJKLMN); output = (__fp16*) output + 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+ vacc01234567 = vmulq_f16(vacc01234567, vscale);
+ vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ if XNN_LIKELY(channels >= 8) {
+ vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+ channels -= 8;
+ } else {
+ float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+ if (channels & 4) {
+ vst1_f16(output, vacc0123); output = (__fp16*) output + 4;
+ vacc0123 = vget_high_f16(vacc01234567);
+ }
+ if (channels & 2) {
+ vst1_lane_u32(output, vreinterpret_u32_f16(vacc0123), 0); output = (__fp16*) output + 2;
+ vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_f16(output, vacc0123, 0); output = (__fp16*) output + 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c32.c b/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c32.c
new file mode 100644
index 0000000..c54c164
--- /dev/null
+++ b/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c32.c
@@ -0,0 +1,177 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-gavgpool/unipass-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32(
+ size_t rows,
+ size_t channels,
+ const void* input,
+ size_t input_stride,
+ const void* zero,
+ void* output,
+ const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const __fp16* i0 = input;
+ const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = (const __fp16*) zero;
+ }
+ const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = (const __fp16*) zero;
+ }
+ const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = (const __fp16*) zero;
+ }
+ const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = (const __fp16*) zero;
+ }
+ const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = (const __fp16*) zero;
+ }
+ const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = (const __fp16*) zero;
+ }
+
+  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
+ for (; channels >= 32; channels -= 32) {
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi0xOPQRSTUV = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;
+ const float16x8_t vi1xOPQRSTUV = vld1q_f16(i1); i1 += 8;
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+ const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF);
+ const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
+ float16x8_t vaccGHIJKLMN = vaddq_f16(vi0xGHIJKLMN, vi1xGHIJKLMN);
+ const float16x8_t vi2xOPQRSTUV = vld1q_f16(i2); i2 += 8;
+ float16x8_t vaccOPQRSTUV = vaddq_f16(vi0xOPQRSTUV, vi1xOPQRSTUV);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+ const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
+ const float16x8_t vi3xOPQRSTUV = vld1q_f16(i3); i3 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi2xOPQRSTUV);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+ const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
+ const float16x8_t vi4xOPQRSTUV = vld1q_f16(i4); i4 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi3xOPQRSTUV);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+ const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
+ const float16x8_t vi5xOPQRSTUV = vld1q_f16(i5); i5 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi4xOPQRSTUV);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+ const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
+ const float16x8_t vi6xOPQRSTUV = vld1q_f16(i6); i6 += 8;
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi5xOPQRSTUV);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+ vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+ vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);
+ vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi6xOPQRSTUV);
+
+ vacc01234567 = vmulq_f16(vacc01234567, vscale);
+ vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale);
+ vaccGHIJKLMN = vmulq_f16(vaccGHIJKLMN, vscale);
+ vaccOPQRSTUV = vmulq_f16(vaccOPQRSTUV, vscale);
+
+ vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+ vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin);
+ vaccGHIJKLMN = vmaxq_f16(vaccGHIJKLMN, vmin);
+ vaccOPQRSTUV = vmaxq_f16(vaccOPQRSTUV, vmin);
+
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+ vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+ vaccGHIJKLMN = vminq_f16(vaccGHIJKLMN, vmax);
+ vaccOPQRSTUV = vminq_f16(vaccOPQRSTUV, vmax);
+
+ vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+ vst1q_f16(output, vacc89ABCDEF); output = (__fp16*) output + 8;
+ vst1q_f16(output, vaccGHIJKLMN); output = (__fp16*) output + 8;
+ vst1q_f16(output, vaccOPQRSTUV); output = (__fp16*) output + 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ do {
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+ vacc01234567 = vmulq_f16(vacc01234567, vscale);
+ vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ if XNN_LIKELY(channels >= 8) {
+ vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+ channels -= 8;
+ } else {
+ float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+ if (channels & 4) {
+ vst1_f16(output, vacc0123); output = (__fp16*) output + 4;
+ vacc0123 = vget_high_f16(vacc01234567);
+ }
+ if (channels & 2) {
+ vst1_lane_u32(output, vreinterpret_u32_f16(vacc0123), 0); output = (__fp16*) output + 2;
+ vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_f16(output, vacc0123, 0); output = (__fp16*) output + 1;
+ }
+ channels = 0;
+ }
+ } while (channels != 0);
+ }
+}
diff --git a/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c b/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c
new file mode 100644
index 0000000..cd95194
--- /dev/null
+++ b/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c
@@ -0,0 +1,120 @@
+// Auto-generated file. Do not edit!
+// Template: src/f16-gavgpool/unipass-neonfp16arith.c.in
+// Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8(
+ size_t rows,
+ size_t channels,
+ const void* input,
+ size_t input_stride,
+ const void* zero,
+ void* output,
+ const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= 7);
+ assert(channels != 0);
+
+ const __fp16* i0 = input;
+ const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 2) {
+ i1 = (const __fp16*) zero;
+ }
+ const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 2) {
+ i2 = (const __fp16*) zero;
+ }
+ const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 4) {
+ i3 = (const __fp16*) zero;
+ }
+ const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 4) {
+ i4 = (const __fp16*) zero;
+ }
+ const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
+ if XNN_UNPREDICTABLE(rows < 6) {
+ i5 = (const __fp16*) zero;
+ }
+ const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
+ if XNN_UNPREDICTABLE(rows <= 6) {
+ i6 = (const __fp16*) zero;
+ }
+
+  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
+ for (; channels >= 8; channels -= 8) {
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+ vacc01234567 = vmulq_f16(vacc01234567, vscale);
+
+ vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ {
+ const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+ const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+
+ const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+
+ const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+ const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+ const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+ const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+ vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+ vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+ vacc01234567 = vmulq_f16(vacc01234567, vscale);
+ vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+ vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+ float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+ if (channels & 4) {
+ vst1_f16(output, vacc0123); output = (__fp16*) output + 4;
+ vacc0123 = vget_high_f16(vacc01234567);
+ }
+ if (channels & 2) {
+ vst1_lane_u32(output, vreinterpret_u32_f16(vacc0123), 0); output = (__fp16*) output + 2;
+ vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_f16(output, vacc0123, 0); output = (__fp16*) output + 1;
+ }
+ }
+ }
+}
diff --git a/src/f16-gavgpool/multipass-neonfp16arith.c.in b/src/f16-gavgpool/multipass-neonfp16arith.c.in
new file mode 100644
index 0000000..3333fc7
--- /dev/null
+++ b/src/f16-gavgpool/multipass-neonfp16arith.c.in
@@ -0,0 +1,205 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert CHANNEL_TILE % 8 == 0
+$assert CHANNEL_TILE >= 8
+$assert ROW_TILE >= 3
+$assert ROW_SUBTILE >= 3
+$assert ROW_SUBTILE <= ROW_TILE
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
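+// Note: ROW_TILE rows are accumulated on the first pass and ROW_SUBTILE rows on each later pass;
+// CHANNEL_TILE (a multiple of 8) sets how many fp16 lanes the main channel loop processes per iteration.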
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f16_gavgpool_minmax_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__neonfp16arith_c${CHANNEL_TILE}(
+ size_t rows,
+ size_t channels,
+ const void* input,
+ size_t input_stride,
+ const void* zero,
+ void* buffer,
+ void* output,
+ const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows > ${ROW_TILE});
+ assert(channels != 0);
+
+ const __fp16* i0 = input;
+ $for M in range(1, ROW_TILE):
+ const __fp16* i${M} = (const __fp16*) ((uintptr_t) i${M-1} + input_stride);
+ const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(__fp16);
+
+ __fp16* b = buffer;
+ size_t c = channels;
+ for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 8 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 8 else "c = doz(c, %d)") % CHANNEL_TILE}) {
+ $for M in range(2):
+ $for C in range(0, CHANNEL_TILE, 8):
+ const float16x8_t vi${M}x${ABC[C:C+8]} = vld1q_f16(i${M}); i${M} += 8;
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ const float16x8_t vi2x${ABC[C:C+8]} = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc${ABC[C:C+8]} = vaddq_f16(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]});
+
+ $for M in range(2, ROW_TILE):
+ $for C in range(0, CHANNEL_TILE, 8):
+ $if M + 1 != ROW_TILE:
+ const float16x8_t vi${M+1}x${ABC[C:C+8]} = vld1q_f16(i${M+1}); i${M+1} += 8;
+ vacc${ABC[C:C+8]} = vaddq_f16(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ vst1q_f16(b, vacc${ABC[C:C+8]}); b += 8;
+ }
+ $if CHANNEL_TILE > 8:
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ $for M in range(3):
+ const float16x8_t vi${M}x${ABC[0:8]} = vld1q_f16(i${M}); i${M} += 8;
+        float16x8_t vacc${ABC[0:8]} = vaddq_f16(vi0x${ABC[0:8]}, vi1x${ABC[0:8]});
+
+ $for M in range(2, ROW_TILE):
+ $if M + 1 != ROW_TILE:
+ const float16x8_t vi${M+1}x${ABC[0:8]} = vld1q_f16(i${M+1}); i${M+1} += 8;
+ vacc${ABC[0:8]} = vaddq_f16(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]});
+
+ vst1q_f16(b, vacc${ABC[0:8]}); b += 8;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+
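+  // Middle passes: advance the row pointers past the rows already consumed and fold ROW_SUBTILE more rows into the buffer per pass.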
+ for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) {
+ $for M in range(ROW_SUBTILE):
+ i${M} = (const __fp16*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
+
+ __fp16* b = buffer;
+ size_t c = channels;
+ for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 8 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 8 else "c = doz(c, %d)") % CHANNEL_TILE}) {
+ float16x8_t vacc${ABC[0:8]} = vld1q_f16(b);
+ $for C in range(8, CHANNEL_TILE, 8):
+ float16x8_t vacc${ABC[C:C+8]} = vld1q_f16(b + ${C});
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ const float16x8_t vi0x${ABC[C:C+8]} = vld1q_f16(i0); i0 += 8;
+
+ $for M in range(ROW_TILE):
+ $for C in range(0, CHANNEL_TILE, 8):
+ $if M + 1 != ROW_TILE:
+ const float16x8_t vi${M+1}x${ABC[C:C+8]} = vld1q_f16(i${M+1}); i${M+1} += 8;
+ vacc${ABC[C:C+8]} = vaddq_f16(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ vst1q_f16(b, vacc${ABC[C:C+8]}); b += 8;
+ }
+ $if CHANNEL_TILE > 8:
+ if XNN_UNLIKELY(c != 0) {
+ do {
+ float16x8_t vacc${ABC[0:8]} = vld1q_f16(b);
+ const float16x8_t vi0x${ABC[0:8]} = vld1q_f16(i0); i0 += 8;
+
+ $for M in range(ROW_TILE):
+ $if M + 1 != ROW_TILE:
+ const float16x8_t vi${M+1}x${ABC[0:8]} = vld1q_f16(i${M+1}); i${M+1} += 8;
+ vacc${ABC[0:8]} = vaddq_f16(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]});
+
+ vst1q_f16(b, vacc${ABC[0:8]}); b += 8;
+
+ c = doz(c, 8);
+ } while (c != 0);
+ }
+ }
+
+ i0 = (const __fp16*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment);
+ $for M in range(1, ROW_SUBTILE):
+ i${M} = (const __fp16*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
+ $if M % 2 == 1:
+ if XNN_UNPREDICTABLE(rows < ${M+1}) {
+ i${M} = (const __fp16*) zero;
+ }
+ $else:
+ if XNN_UNPREDICTABLE(rows <= ${M}) {
+ i${M} = (const __fp16*) zero;
+ }
+
+  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
+ for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) {
+ $for C in range(0, CHANNEL_TILE, 8):
+ float16x8_t vacc${ABC[C:C+8]} = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ const float16x8_t vi0x${ABC[C:C+8]} = vld1q_f16(i0); i0 += 8;
+
+ $for M in range(ROW_TILE):
+ $for C in range(0, CHANNEL_TILE, 8):
+ $if M + 1 != ROW_TILE:
+ const float16x8_t vi${M+1}x${ABC[C:C+8]} = vld1q_f16(i${M+1}); i${M+1} += 8;
+ vacc${ABC[C:C+8]} = vaddq_f16(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ vacc${ABC[C:C+8]} = vmulq_f16(vacc${ABC[C:C+8]}, vscale);
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ vacc${ABC[C:C+8]} = vmaxq_f16(vacc${ABC[C:C+8]}, vmin);
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ vacc${ABC[C:C+8]} = vminq_f16(vacc${ABC[C:C+8]}, vmax);
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ vst1q_f16(output, vacc${ABC[C:C+8]}); output = (__fp16*) output + 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ ${"do " if CHANNEL_TILE > 8 else ""}{
+ float16x8_t vacc${ABC[0:8]} = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+ const float16x8_t vi0x${ABC[0:8]} = vld1q_f16(i0); i0 += 8;
+ $for M in range(ROW_TILE):
+ $if M + 1 != ROW_TILE:
+ const float16x8_t vi${M+1}x${ABC[0:8]} = vld1q_f16(i${M+1}); i${M+1} += 8;
+ vacc${ABC[0:8]} = vaddq_f16(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]});
+
+ vacc${ABC[0:8]} = vmulq_f16(vacc${ABC[0:8]}, vscale);
+ vacc${ABC[0:8]} = vmaxq_f16(vacc${ABC[0:8]}, vmin);
+ vacc${ABC[0:8]} = vminq_f16(vacc${ABC[0:8]}, vmax);
+
+ $if CHANNEL_TILE > 8:
+ if XNN_LIKELY(channels >= 8) {
+ vst1q_f16(output, vacc${ABC[0:8]}); output = (__fp16*) output + 8;
+ channels -= 8;
+ } else {
+ float16x4_t vacc${ABC[0:4]} = vget_low_f16(vacc${ABC[0:8]});
+ if (channels & 4) {
+ vst1_f16(output, vacc${ABC[0:4]}); output = (__fp16*) output + 4;
+ vacc${ABC[0:4]} = vget_high_f16(vacc${ABC[0:8]});
+ }
+ if (channels & 2) {
+ vst1_lane_u32(output, vreinterpret_u32_f16(vacc${ABC[0:4]}), 0); output = (__fp16*) output + 2;
+ vacc${ABC[0:4]} = vext_f16(vacc${ABC[0:4]}, vacc${ABC[0:4]}, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_f16(output, vacc${ABC[0:4]}, 0); output = (__fp16*) output + 1;
+ }
+ channels = 0;
+ }
+ $else:
+ float16x4_t vacc${ABC[0:4]} = vget_low_f16(vacc${ABC[0:8]});
+ if (channels & 4) {
+ vst1_f16(output, vacc${ABC[0:4]}); output = (__fp16*) output + 4;
+ vacc${ABC[0:4]} = vget_high_f16(vacc${ABC[0:8]});
+ }
+ if (channels & 2) {
+ vst1_lane_u32(output, vreinterpret_u32_f16(vacc${ABC[0:4]}), 0); output = (__fp16*) output + 2;
+ vacc${ABC[0:4]} = vext_f16(vacc${ABC[0:4]}, vacc${ABC[0:4]}, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_f16(output, vacc${ABC[0:4]}, 0); output = (__fp16*) output + 1;
+ }
+ }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""}
+ }
+}
diff --git a/src/f16-gavgpool/unipass-neonfp16arith.c.in b/src/f16-gavgpool/unipass-neonfp16arith.c.in
new file mode 100644
index 0000000..7ea356e
--- /dev/null
+++ b/src/f16-gavgpool/unipass-neonfp16arith.c.in
@@ -0,0 +1,123 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert CHANNEL_TILE % 8 == 0
+$assert CHANNEL_TILE >= 8
+$assert ROW_TILE >= 3
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
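+// Note: unipass variant; at most ROW_TILE rows are summed in a single pass, so no scratch buffer is needed.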
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_f16_gavgpool_minmax_ukernel_${ROW_TILE}x__neonfp16arith_c${CHANNEL_TILE}(
+ size_t rows,
+ size_t channels,
+ const void* input,
+ size_t input_stride,
+ const void* zero,
+ void* output,
+ const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+ assert(rows != 0);
+ assert(rows <= ${ROW_TILE});
+ assert(channels != 0);
+
+ const __fp16* i0 = input;
+ $for M in range(1, ROW_TILE):
+ const __fp16* i${M} = (const __fp16*) ((uintptr_t) i${M-1} + input_stride);
+ $if M % 2 == 1:
+ if XNN_UNPREDICTABLE(rows < ${M+1}) {
+ i${M} = (const __fp16*) zero;
+ }
+ $else:
+ if XNN_UNPREDICTABLE(rows <= ${M}) {
+ i${M} = (const __fp16*) zero;
+ }
+
+  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
+ for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) {
+ $for M in range(2):
+ $for C in range(0, CHANNEL_TILE, 8):
+ const float16x8_t vi${M}x${ABC[C:C+8]} = vld1q_f16(i${M}); i${M} += 8;
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ const float16x8_t vi2x${ABC[C:C+8]} = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc${ABC[C:C+8]} = vaddq_f16(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]});
+
+ $for M in range(2, ROW_TILE):
+ $for C in range(0, CHANNEL_TILE, 8):
+ $if M + 1 != ROW_TILE:
+ const float16x8_t vi${M+1}x${ABC[C:C+8]} = vld1q_f16(i${M+1}); i${M+1} += 8;
+ vacc${ABC[C:C+8]} = vaddq_f16(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ vacc${ABC[C:C+8]} = vmulq_f16(vacc${ABC[C:C+8]}, vscale);
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ vacc${ABC[C:C+8]} = vmaxq_f16(vacc${ABC[C:C+8]}, vmin);
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ vacc${ABC[C:C+8]} = vminq_f16(vacc${ABC[C:C+8]}, vmax);
+
+ $for C in range(0, CHANNEL_TILE, 8):
+ vst1q_f16(output, vacc${ABC[C:C+8]}); output = (__fp16*) output + 8;
+ }
+ if XNN_UNLIKELY(channels != 0) {
+ ${"do " if CHANNEL_TILE > 8 else ""}{
+ $for M in range(2):
+ const float16x8_t vi${M}x${ABC[0:8]} = vld1q_f16(i${M}); i${M} += 8;
+
+ const float16x8_t vi2x${ABC[0:8]} = vld1q_f16(i2); i2 += 8;
+ float16x8_t vacc${ABC[0:8]} = vaddq_f16(vi0x${ABC[0:8]}, vi1x${ABC[0:8]});
+
+ $for M in range(2, ROW_TILE):
+ $if M + 1 != ROW_TILE:
+ const float16x8_t vi${M+1}x${ABC[0:8]} = vld1q_f16(i${M+1}); i${M+1} += 8;
+ vacc${ABC[0:8]} = vaddq_f16(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]});
+
+ vacc${ABC[0:8]} = vmulq_f16(vacc${ABC[0:8]}, vscale);
+ vacc${ABC[0:8]} = vmaxq_f16(vacc${ABC[0:8]}, vmin);
+ vacc${ABC[0:8]} = vminq_f16(vacc${ABC[0:8]}, vmax);
+
+ $if CHANNEL_TILE > 8:
+ if XNN_LIKELY(channels >= 8) {
+ vst1q_f16(output, vacc${ABC[0:8]}); output = (__fp16*) output + 8;
+ channels -= 8;
+ } else {
+ float16x4_t vacc${ABC[0:4]} = vget_low_f16(vacc${ABC[0:8]});
+ if (channels & 4) {
+ vst1_f16(output, vacc${ABC[0:4]}); output = (__fp16*) output + 4;
+ vacc${ABC[0:4]} = vget_high_f16(vacc${ABC[0:8]});
+ }
+ if (channels & 2) {
+ vst1_lane_u32(output, vreinterpret_u32_f16(vacc${ABC[0:4]}), 0); output = (__fp16*) output + 2;
+ vacc${ABC[0:4]} = vext_f16(vacc${ABC[0:4]}, vacc${ABC[0:4]}, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_f16(output, vacc${ABC[0:4]}, 0); output = (__fp16*) output + 1;
+ }
+ channels = 0;
+ }
+ $else:
+ float16x4_t vacc${ABC[0:4]} = vget_low_f16(vacc${ABC[0:8]});
+ if (channels & 4) {
+ vst1_f16(output, vacc${ABC[0:4]}); output = (__fp16*) output + 4;
+ vacc${ABC[0:4]} = vget_high_f16(vacc${ABC[0:8]});
+ }
+ if (channels & 2) {
+ vst1_lane_u32(output, vreinterpret_u32_f16(vacc${ABC[0:4]}), 0); output = (__fp16*) output + 2;
+ vacc${ABC[0:4]} = vext_f16(vacc${ABC[0:4]}, vacc${ABC[0:4]}, 2);
+ }
+ if (channels & 1) {
+ vst1_lane_f16(output, vacc${ABC[0:4]}, 0); output = (__fp16*) output + 1;
+ }
+ }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""}
+ }
+}
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c
index d77caf3..970e99c 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
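+  // sizeof(int8_t) == 1, so the computed increment is unchanged; the explicit element-size multiply keeps the byte-count intent consistent with the f16 kernels above.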
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c
index 3b3a21f..e65d7e2 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c
index 97c9ad8..2843191 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c
index fb14ee1..2b5053c 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c
index 3d382c7..5c3992f 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c
@@ -36,7 +36,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c
index fda1318..e460ba5 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c
@@ -36,7 +36,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c
index 2f6dc8a..7ce7b0f 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c
@@ -36,7 +36,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c
index 3618ffd..54ddbe3 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c
@@ -36,7 +36,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c
index cdd0162..c230b48 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(int8_t);
const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c
index c0160de..bf878be 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(int8_t);
const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c
index 0b7516e..6683e0b 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(int8_t);
const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c
index 4eed122..4259059 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(int8_t);
const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c
index 2eea1f9..5e885f8 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(int8_t);
const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c
index 30ee417..e8f2342 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(int8_t);
const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c
index c8d2a6e..18367b6 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c
@@ -34,7 +34,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(int8_t);
const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c
index 003b08d..4846fe0 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c
@@ -34,7 +34,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(int8_t);
const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c
index a3fba3c..9c03aa4 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c
@@ -34,7 +34,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(int8_t);
const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c
index 854275f..52b0ef5 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t);
const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c
index c2000cd..078ebc1 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c
index 81c4eaa..0a1de3b 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c
index 25fb554..a0dc98c 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t);
const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c
index 048c8c8..f4141b1 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c
index ad41d96..a7997e6 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c
index d86b070..cec1a35 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t);
const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c
index 379b363..15cbc8a 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c
index 152aaf3..4e81051 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c
index 57680e7..6a010d7 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c16.c b/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c16.c
index f6881f3..47fe5c8 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c16.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c16.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c24.c b/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c24.c
index 43e60ca..3f1a5b9 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c24.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c24.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c32.c b/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c32.c
index ad7a0c3..f4f0b6b 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c32.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c32.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c b/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c
index 61b0b28..aeb7a55 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c
@@ -35,7 +35,7 @@
const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
int32_t* b = buffer;
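Note on the hunks above: every qs8 change so far is the same regenerated line. input_increment previously subtracted a channel count (elements) from 7 * input_stride (bytes); the regenerated code scales the rounded-up channel count by the element size first. A minimal sketch of the arithmetic, assuming round_up_po2 mirrors XNNPACK's usual round-up helper:

#include <stddef.h>
#include <stdint.h>

// Sketch only, not part of the diff: rounds n up to a multiple of the
// power-of-two q (assumed to match XNNPACK's round_up_po2 helper).
static inline size_t round_up_po2(size_t n, size_t q) {
  return (n + q - 1) & ~(q - 1);
}

// After one pass over a 7-row tile, each row pointer i0..i6 has advanced by
// round_up_po2(channels, 8) elements. Multiplying by the element size turns
// that count into bytes before subtracting it from the 7-row byte stride.
// For int8_t/uint8_t the factor is 1, so the generated code is behaviorally
// unchanged; the point is that the expression is now correct for any
// element width the shared templates may be instantiated with.
static size_t input_increment_bytes(size_t input_stride, size_t channels) {
  return 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
}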
diff --git a/src/qs8-gavgpool/multipass-neon.c.in b/src/qs8-gavgpool/multipass-neon.c.in
index 66d5212..3178d90 100644
--- a/src/qs8-gavgpool/multipass-neon.c.in
+++ b/src/qs8-gavgpool/multipass-neon.c.in
@@ -63,9 +63,9 @@
$for M in range(1, ROW_TILE):
const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
$if CHANNEL_TILE <= 16:
- const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE});
+ const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T});
$else:
- const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(${XINT8_T});
const int32x4_t vinit_bias = vld1q_dup_s32(&params->${PARAMS_STRUCT}.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/multipass-scalar.c.in b/src/qs8-gavgpool/multipass-scalar.c.in
index aa7c335..da74dd4 100644
--- a/src/qs8-gavgpool/multipass-scalar.c.in
+++ b/src/qs8-gavgpool/multipass-scalar.c.in
@@ -41,10 +41,7 @@
const ${XINT8_T}* i0 = input;
$for M in range(1, ROW_TILE):
const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
- $if CHANNEL_TILE <= 16:
- const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE});
- $else:
- const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T});
const int32_t vinit_bias = params->${PARAMS_STRUCT}.init_bias;
int32_t* b = buffer;
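Two notes on the template hunks above. In multipass-neon.c.in the CHANNEL_TILE <= 16 branch is kept because the NEON kernels wider than 16 channels process the channel remainder in groups of 8, so each row consumes only a multiple of 8 elements (visible in the c24/c32 files above, which round up to 8 rather than to the full tile). In multipass-scalar.c.in the branch collapses to a single line because the scalar generator only instantiates tiles of 1, 2, and 4, leaving the else arm dead. What the NEON template emits for the two cases, substituting ROW_TILE=7 and XINT8_T=int8_t as in the generated files:

const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t);  // CHANNEL_TILE = 16
const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);   // CHANNEL_TILE = 24 or 32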
diff --git a/src/qs8-gavgpool/multipass-sse2.c.in b/src/qs8-gavgpool/multipass-sse2.c.in
index d554063..3d319f2 100644
--- a/src/qs8-gavgpool/multipass-sse2.c.in
+++ b/src/qs8-gavgpool/multipass-sse2.c.in
@@ -38,9 +38,9 @@
$for M in range(1, ROW_TILE):
const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
$if CHANNEL_TILE <= 16:
- const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE});
+ const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T});
$else:
- const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(${XINT8_T});
const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
$if DATATYPE == "QU8":
diff --git a/src/qs8-gavgpool/multipass-sse4.c.in b/src/qs8-gavgpool/multipass-sse4.c.in
index 7d0d5ff..a71058b 100644
--- a/src/qs8-gavgpool/multipass-sse4.c.in
+++ b/src/qs8-gavgpool/multipass-sse4.c.in
@@ -41,9 +41,9 @@
$for M in range(1, ROW_TILE):
const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
$if CHANNEL_TILE <= 16:
- const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE});
+ const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T});
$else:
- const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(${XINT8_T});
const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/multipass-wasmsimd.c.in b/src/qs8-gavgpool/multipass-wasmsimd.c.in
index 0a2f3aa..c69100a 100644
--- a/src/qs8-gavgpool/multipass-wasmsimd.c.in
+++ b/src/qs8-gavgpool/multipass-wasmsimd.c.in
@@ -42,9 +42,9 @@
$for M in range(1, ROW_TILE):
const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
$if CHANNEL_TILE <= 16:
- const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE});
+ const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T});
$else:
- const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(${XINT8_T});
const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
int32_t* b = buffer;
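The SSE2, SSE4, and WAsm SIMD templates receive the same two-arm edit as the NEON one, and regeneration propagates the identical one-liner into every qu8 kernel below, with uint8_t as the element type. For context, a sketch of where input_increment is consumed; the pointer-advance shape is assumed from the generated sources, which step each of the seven row pointers once per channel pass:

#include <stddef.h>
#include <stdint.h>

// Assumed shape of the per-pass pointer advance in the generated kernels:
// each row pointer moves forward input_increment bytes after the channel
// loop finishes, landing at the start of its next 7-row group.
static void advance_row_pointers(const uint8_t* i[7], size_t input_increment) {
  for (size_t m = 0; m < 7; m++) {
    i[m] = (const uint8_t*) ((uintptr_t) i[m] + input_increment);
  }
}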
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c
index db34fc5..8d178b8 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c
index c9ec0a7..45ce07c 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c
index c4f4c28..2b1ed99 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c
index 78b89eb..eebd024 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c
index d9989f1..bf10e2a 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c
@@ -36,7 +36,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c
index d3a230f..b5d7882 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c
@@ -36,7 +36,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c
index 5b2031d..ab74d07 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c
@@ -36,7 +36,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c
index d9d5c65..c6136af 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c
@@ -36,7 +36,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c
index 8b0b708..bd3c5c2 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(uint8_t);
const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c
index 28b98a0..1d0e215 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(uint8_t);
const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c
index a063456..f0e9929 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(uint8_t);
const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c
index ca19e5b..793492c 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(uint8_t);
const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c
index 1fba58a..87af55d 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(uint8_t);
const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c
index 4b44df1..1d79de9 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(uint8_t);
const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c
index 8b8717a..dcf293e 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c
@@ -34,7 +34,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(uint8_t);
const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c
index 867ccad..b292217 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c
@@ -34,7 +34,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(uint8_t);
const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c
index a87d252..9e81485 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c
@@ -34,7 +34,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(uint8_t);
const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c
index d86d99b..69a52c5 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t);
const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
const __m128i vzero = _mm_setzero_si128();
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c
index 9d4dfb8..3d8a2bb 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
const __m128i vzero = _mm_setzero_si128();
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c
index 6986c1d..5331476 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
const __m128i vzero = _mm_setzero_si128();
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c
index 8f91fdd..a078432 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t);
const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c
index 8d5a9fe..e342669 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c
index f7fae60..8c67640 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c
index 0825b54..715c0e3 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t);
const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c
index 4d444ed..ff39ee4 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c
index 74363c4..4474f15 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c
index a8aa3d9..b31f576 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c16.c b/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c16.c
index 7f72be9..941c613 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c16.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c16.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c24.c b/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c24.c
index 782659a..d214f99 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c24.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c24.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c32.c b/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c32.c
index 3c2d0b4..c5406f0 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c32.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c32.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c b/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c
index cf78a44..4ce1a6e 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c
@@ -35,7 +35,7 @@
const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
- const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+ const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
int32_t* b = buffer;
diff --git a/src/xnnpack/gavgpool.h b/src/xnnpack/gavgpool.h
index f45e9d2..a242a28 100644
--- a/src/xnnpack/gavgpool.h
+++ b/src/xnnpack/gavgpool.h
@@ -68,6 +68,9 @@
const union xnn_f16_scaleminmax_params* params);
DECLARE_F16_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8)
+DECLARE_F16_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16)
+DECLARE_F16_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24)
+DECLARE_F16_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32)
#define DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(fn_name) \
@@ -81,6 +84,9 @@
const union xnn_f16_scaleminmax_params* params);
DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8)
+DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16)
+DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24)
+DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32)
#define DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(fn_name) \
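The header hunk above registers the three new tile widths for both the multipass (7p7x) and unipass (7x) F16 entry points. Assuming the declaration macros expand the same way as for the existing c8 kernels (the params type is confirmed by the context lines above; the rest of the signature is an assumption, with f16 tensors passed as void pointers), each added name declares a function shaped like:

#include <stddef.h>
#include <xnnpack/params.h>  // assumed header defining xnn_f16_scaleminmax_params

// Assumed expansion of DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION
// for one of the new names. The multipass variant is assumed to add a
// scratch-buffer pointer before output, analogous to the int32_t* buffer
// visible in the qs8 hunks above.
void xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16(
    size_t rows,
    size_t channels,
    const void* input,
    size_t input_stride,
    const void* zero,
    void* output,
    const union xnn_f16_scaleminmax_params* params);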
diff --git a/test/f16-gavgpool-minmax.cc b/test/f16-gavgpool-minmax.cc
index d3a6962..0a80ded 100644
--- a/test/f16-gavgpool-minmax.cc
+++ b/test/f16-gavgpool-minmax.cc
@@ -179,6 +179,480 @@
#if XNN_ARCH_ARM64
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_eq_16_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(16)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t rows = 1; rows < 7; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(16)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_eq_16_fulltile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(16)
+ .input_stride(19)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_eq_16_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(16)
+ .qmax(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_eq_16_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(16)
+ .qmin(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_div_16_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ for (size_t rows = 1; rows < 7; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_lt_16_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 16; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 16; channels++) {
+ for (size_t rows = 1; rows < 7; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_lt_16_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 16; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_lt_16_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 16; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_gt_16_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 17; channels < 32; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 17; channels < 32; channels++) {
+ for (size_t rows = 1; rows < 7; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_gt_16_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 17; channels < 32; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_gt_16_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 17; channels < 32; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_eq_24_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(24)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_eq_24_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t rows = 1; rows < 7; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(24)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_eq_24_fulltile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(24)
+ .input_stride(29)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_eq_24_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(24)
+ .qmax(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_eq_24_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(24)
+ .qmin(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_div_24_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 48; channels < 192; channels += 24) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_div_24_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 48; channels < 192; channels += 24) {
+ for (size_t rows = 1; rows < 7; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_lt_24_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 24; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_lt_24_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 24; channels++) {
+ for (size_t rows = 1; rows < 7; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_lt_24_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 24; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_lt_24_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 24; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_gt_24_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 25; channels < 48; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_gt_24_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 25; channels < 48; channels++) {
+ for (size_t rows = 1; rows < 7; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_gt_24_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 25; channels < 48; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_gt_24_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 25; channels < 48; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_eq_32_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(32)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_eq_32_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t rows = 1; rows < 7; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(32)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_eq_32_fulltile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(32)
+ .input_stride(37)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_eq_32_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(32)
+ .qmax(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_eq_32_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(32)
+ .qmin(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_div_32_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 64; channels < 256; channels += 32) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_div_32_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 64; channels < 256; channels += 32) {
+ for (size_t rows = 1; rows < 7; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_lt_32_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 32; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_lt_32_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 32; channels++) {
+ for (size_t rows = 1; rows < 7; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_lt_32_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 32; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_lt_32_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 32; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_gt_32_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 33; channels < 64; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_gt_32_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 33; channels < 64; channels++) {
+ for (size_t rows = 1; rows < 7; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_gt_32_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 33; channels < 64; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_gt_32_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 33; channels < 64; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(7)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+#endif // XNN_ARCH_ARM64
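
[Editorial sketch] The single-pass (7x) kernels exercised above come in channel tiles of 8, 16, 24, and 32. Conceptually, each one loads up to seven input rows, sums them lane-wise, multiplies by a precomputed 1/rows scale, and clamps to [qmin, qmax]. A rough hand-written sketch of the c8 inner loop follows; the function name and the plain scalar parameters are illustrative assumptions, not the real kernel signature or the layout of xnn_f16_scaleminmax_neon_params, and the generated kernels additionally handle channel remainders and fewer than seven rows.

  #include <arm_neon.h>
  #include <stddef.h>

  /* Sketch of a 7-row, 8-channels-per-iteration fp16 average with clamping.
     Requires an AArch64 compiler with fp16 arithmetic support
     (e.g. -march=armv8.2-a+fp16). Illustrative only. */
  static void gavgpool_7x_c8_sketch(
      const __fp16* row[7], size_t channels, __fp16 scale,
      __fp16 output_min, __fp16 output_max, __fp16* output)
  {
    const float16x8_t vscale = vdupq_n_f16(scale);
    const float16x8_t vmin = vdupq_n_f16(output_min);
    const float16x8_t vmax = vdupq_n_f16(output_max);
    for (; channels >= 8; channels -= 8) {
      float16x8_t vsum = vld1q_f16(row[0]);
      row[0] += 8;
      for (size_t r = 1; r < 7; r++) {
        vsum = vaddq_f16(vsum, vld1q_f16(row[r]));  /* lane-wise row sum */
        row[r] += 8;
      }
      float16x8_t vout = vmulq_f16(vsum, vscale);   /* scale = 1/rows */
      vout = vmaxq_f16(vout, vmin);                 /* qmin clamp */
      vout = vminq_f16(vout, vmax);                 /* qmax clamp */
      vst1q_f16(output, vout);
      output += 8;
    }
  }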
+
+
+#if XNN_ARCH_ARM64
TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_eq_8_2pass_fulltile) {
TEST_REQUIRES_ARM_NEON_FP16_ARITH;
GAvgPoolMicrokernelTester()
@@ -441,3 +915,798 @@
}
}
#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_2pass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(16)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_2pass_fulltile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(16)
+ .input_stride(19)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_2pass_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(16)
+ .qmax(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_2pass_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(16)
+ .qmin(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_2pass_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t rows = 8; rows < 14; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(16)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_2pass_subtile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t rows = 8; rows < 14; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(16)
+ .input_stride(19)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_multipass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t rows = 14; rows <= 35; rows += 7) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(16)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_multipass_fulltile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t rows = 14; rows <= 35; rows += 7) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(16)
+ .input_stride(19)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_div_16_2pass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_div_16_2pass_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ for (size_t rows = 8; rows < 14; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_div_16_multipass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ for (size_t rows = 14; rows <= 35; rows += 7) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_div_16_multipass_fulltile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ for (size_t rows = 14; rows <= 35; rows += 7) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .input_stride(263)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_lt_16_2pass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 16; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_lt_16_2pass_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 16; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_lt_16_2pass_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 16; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_lt_16_2pass_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 16; channels++) {
+ for (size_t rows = 8; rows < 14; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_lt_16_multipass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 16; channels++) {
+ for (size_t rows = 14; rows <= 35; rows += 7) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_lt_16_multipass_fulltile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 16; channels++) {
+ for (size_t rows = 14; rows <= 35; rows += 7) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .input_stride(19)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_gt_16_2pass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 17; channels < 32; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_gt_16_2pass_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 17; channels < 32; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_gt_16_2pass_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 17; channels < 32; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_gt_16_2pass_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 17; channels < 32; channels++) {
+ for (size_t rows = 8; rows < 14; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_gt_16_multipass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 17; channels < 32; channels++) {
+ for (size_t rows = 14; rows < 35; rows += 14) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_gt_16_multipass_fulltile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 17; channels < 32; channels++) {
+ for (size_t rows = 14; rows < 35; rows += 14) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .input_stride(47)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_ARM64
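
[Editorial sketch] The multipass (7p7x) suites encode the pass structure in their row counts: rows(14) is exactly two full passes of seven, rows 8 through 13 are a full first pass plus a partial second, and the multipass loops (14, 21, 28, 35) add one or more middle passes. A scalar sketch of that structure follows; the names and the fp16 scratch buffer are illustrative assumptions — the real kernels vectorize this per channel tile and take scale/min/max via xnn_f16_scaleminmax_neon_params.

  #include <stddef.h>

  /* Scalar model of the 7p7x pass structure: first pass sums rows 0..6 into
     a buffer, each middle pass adds 7 more rows, and the final pass adds the
     remaining 1..7 rows, scales by 1/rows, clamps, and stores. */
  static void gavgpool_7p7x_sketch(
      size_t rows, size_t channels,
      const __fp16* input, size_t input_stride, /* row stride, in elements */
      __fp16* buffer,                           /* scratch, channels elements */
      __fp16 scale, __fp16 output_min, __fp16 output_max,
      __fp16* output)
  {
    /* First pass: sum the first 7 rows. */
    for (size_t c = 0; c < channels; c++) {
      __fp16 acc = 0;
      for (size_t r = 0; r < 7; r++) {
        acc = acc + input[r * input_stride + c];
      }
      buffer[c] = acc;
    }
    input += 7 * input_stride;
    rows -= 7;
    /* Middle passes: accumulate 7 rows at a time while more than 7 remain. */
    while (rows > 7) {
      for (size_t c = 0; c < channels; c++) {
        __fp16 acc = buffer[c];
        for (size_t r = 0; r < 7; r++) {
          acc = acc + input[r * input_stride + c];
        }
        buffer[c] = acc;
      }
      input += 7 * input_stride;
      rows -= 7;
    }
    /* Final pass: add the remaining 1..7 rows, then scale and clamp. */
    for (size_t c = 0; c < channels; c++) {
      __fp16 acc = buffer[c];
      for (size_t r = 0; r < rows; r++) {
        acc = acc + input[r * input_stride + c];
      }
      __fp16 out = acc * scale;
      out = out < output_min ? output_min : out;
      out = out > output_max ? output_max : out;
      output[c] = out;
    }
  }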
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_2pass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(24)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_2pass_fulltile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(24)
+ .input_stride(29)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_2pass_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(24)
+ .qmax(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_2pass_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(24)
+ .qmin(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_2pass_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t rows = 8; rows < 14; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(24)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_2pass_subtile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t rows = 8; rows < 14; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(24)
+ .input_stride(29)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_multipass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t rows = 14; rows <= 35; rows += 7) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(24)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_multipass_fulltile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t rows = 14; rows <= 35; rows += 7) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(24)
+ .input_stride(29)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_div_24_2pass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 48; channels < 192; channels += 24) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_div_24_2pass_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 48; channels < 192; channels += 24) {
+ for (size_t rows = 8; rows < 14; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_div_24_multipass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 48; channels < 192; channels += 24) {
+ for (size_t rows = 14; rows <= 35; rows += 7) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_div_24_multipass_fulltile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 48; channels < 192; channels += 24) {
+ for (size_t rows = 14; rows <= 35; rows += 7) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .input_stride(389)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_lt_24_2pass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 24; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_lt_24_2pass_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 24; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_lt_24_2pass_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 24; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_lt_24_2pass_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 24; channels++) {
+ for (size_t rows = 8; rows < 14; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_lt_24_multipass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 24; channels++) {
+ for (size_t rows = 14; rows <= 35; rows += 7) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_lt_24_multipass_fulltile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 24; channels++) {
+ for (size_t rows = 14; rows <= 35; rows += 7) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .input_stride(29)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_gt_24_2pass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 25; channels < 48; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_gt_24_2pass_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 25; channels < 48; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_gt_24_2pass_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 25; channels < 48; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_gt_24_2pass_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 25; channels < 48; channels++) {
+ for (size_t rows = 8; rows < 14; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_gt_24_multipass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 25; channels < 48; channels++) {
+ for (size_t rows = 14; rows < 35; rows += 14) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_gt_24_multipass_fulltile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 25; channels < 48; channels++) {
+ for (size_t rows = 14; rows < 35; rows += 14) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .input_stride(61)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_2pass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(32)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_2pass_fulltile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(32)
+ .input_stride(37)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_2pass_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(32)
+ .qmax(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_2pass_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(32)
+ .qmin(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_2pass_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t rows = 8; rows < 14; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(32)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_2pass_subtile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t rows = 8; rows < 14; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(32)
+ .input_stride(37)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_multipass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t rows = 14; rows <= 35; rows += 7) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(32)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_multipass_fulltile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t rows = 14; rows <= 35; rows += 7) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(32)
+ .input_stride(37)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_div_32_2pass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 64; channels < 256; channels += 32) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_div_32_2pass_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 64; channels < 256; channels += 32) {
+ for (size_t rows = 8; rows < 14; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_div_32_multipass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 64; channels < 256; channels += 32) {
+ for (size_t rows = 14; rows <= 35; rows += 7) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_div_32_multipass_fulltile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 64; channels < 256; channels += 32) {
+ for (size_t rows = 14; rows <= 35; rows += 7) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .input_stride(521)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_lt_32_2pass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 32; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_lt_32_2pass_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 32; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_lt_32_2pass_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 32; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_lt_32_2pass_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 32; channels++) {
+ for (size_t rows = 8; rows < 14; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_lt_32_multipass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 32; channels++) {
+ for (size_t rows = 14; rows <= 35; rows += 7) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_lt_32_multipass_fulltile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 1; channels < 32; channels++) {
+ for (size_t rows = 14; rows <= 35; rows += 7) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .input_stride(37)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_gt_32_2pass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 33; channels < 64; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_gt_32_2pass_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 33; channels < 64; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .qmax(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_gt_32_2pass_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 33; channels < 64; channels++) {
+ GAvgPoolMicrokernelTester()
+ .rows(14)
+ .channels(channels)
+ .qmin(128)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_gt_32_2pass_subtile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 33; channels < 64; channels++) {
+ for (size_t rows = 8; rows < 14; rows++) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_gt_32_multipass_fulltile) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 33; channels < 64; channels++) {
+ for (size_t rows = 14; rows < 35; rows += 14) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+
+ TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_gt_32_multipass_fulltile_with_input_stride) {
+ TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+ for (size_t channels = 33; channels < 64; channels++) {
+ for (size_t rows = 14; rows < 35; rows += 14) {
+ GAvgPoolMicrokernelTester()
+ .rows(rows)
+ .channels(channels)
+ .input_stride(79)
+ .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+ }
+ }
+ }
+#endif // XNN_ARCH_ARM64
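
[Editorial sketch] Every Test() call above follows the same pattern: the tester fills rows x channels of random fp16 inputs (optionally with an input_stride wider than channels, to verify the kernel skips the padding), runs the microkernel, and compares against a reference average computed at higher precision with the same qmin/qmax clamping applied. A simplified reference check in that spirit follows — this is not GAvgPoolMicrokernelTester itself, and the tolerance is an illustrative fp16-scale guess, not the tester's actual bound.

  #include <assert.h>
  #include <math.h>
  #include <stddef.h>

  /* Reference model of what each test verifies: average rows values per
     channel in float, clamp, and require the kernel's fp16 result to match
     within an fp16-sized tolerance. */
  static void check_gavgpool_reference(
      size_t rows, size_t channels,
      const __fp16* input, size_t input_stride,
      __fp16 output_min, __fp16 output_max,
      const __fp16* kernel_output)
  {
    for (size_t c = 0; c < channels; c++) {
      float sum = 0.0f;
      for (size_t r = 0; r < rows; r++) {
        sum += (float) input[r * input_stride + c];
      }
      float ref = sum / (float) rows;
      ref = fmaxf(ref, (float) output_min);  /* qmin */
      ref = fminf(ref, (float) output_max);  /* qmax */
      /* fp16 carries ~3 decimal digits; tolerance chosen on that order. */
      assert(fabsf((float) kernel_output[c] - ref)
             <= 1.0e-2f * fabsf(ref) + 1.0e-3f);
    }
  }

The YAML diff below registers the new c16/c24/c32 unipass and multipass kernels so the gavgpool test generator emits one suite like the above for each entry.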
diff --git a/test/f16-gavgpool-minmax.yaml b/test/f16-gavgpool-minmax.yaml
index 14ffc00..224fd1b 100644
--- a/test/f16-gavgpool-minmax.yaml
+++ b/test/f16-gavgpool-minmax.yaml
@@ -6,7 +6,31 @@
init: xnn_init_f16_scaleminmax_neon_params
arch:
- aarch64
+- name: xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16
+ init: xnn_init_f16_scaleminmax_neon_params
+ arch:
+ - aarch64
+- name: xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24
+ init: xnn_init_f16_scaleminmax_neon_params
+ arch:
+ - aarch64
+- name: xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32
+ init: xnn_init_f16_scaleminmax_neon_params
+ arch:
+ - aarch64
- name: xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8
init: xnn_init_f16_scaleminmax_neon_params
arch:
- aarch64
+- name: xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16
+ init: xnn_init_f16_scaleminmax_neon_params
+ arch:
+ - aarch64
+- name: xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24
+ init: xnn_init_f16_scaleminmax_neon_params
+ arch:
+ - aarch64
+- name: xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32
+ init: xnn_init_f16_scaleminmax_neon_params
+ arch:
+ - aarch64