Generate F16 GAVGPOOL NEONFP16ARITH microkernels from template

PiperOrigin-RevId: 422699809
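
This change replaces the handwritten F16 GAVGPOOL NEONFP16ARITH microkernels
with xngen-generated ones: the existing unipass (7x) and multipass (7p7x) c8
kernels move under src/f16-gavgpool/gen/, and new c16/c24/c32 channel-tile
variants are added. Generation of the f16-gavgpool unit tests moves from
scripts/generate-tests.sh into the new per-kernel script (the qu8-gavgpool
invocations are dropped from generate-tests.sh in the same hunk). The SSE2
and SSE4.1 amalgams also gain an explicit element-size factor in their
input_increment computation; see the note after those hunks below.

To regenerate the kernels and tests, run the new script from the repository
root (where the tools/xngen and src/... paths it uses resolve):

  scripts/generate-f16-gavgpool.sh
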
diff --git a/BUILD.bazel b/BUILD.bazel
index b35381b..f86d2f2 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -4385,8 +4385,8 @@
     "src/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c",
     "src/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c",
     "src/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c",
-    "src/f16-gavgpool/7p7x-minmax-neonfp16arith-c8.c",
-    "src/f16-gavgpool/7x-minmax-neonfp16arith-c8.c",
+    "src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c",
+    "src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c",
     "src/f16-gemm/gen/1x16-minmax-neonfp16arith-ld64.c",
     "src/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c",
     "src/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c",
@@ -4418,8 +4418,14 @@
     "src/f16-dwconv/gen/up32x9-minmax-neonfp16arith.c",
     "src/f16-dwconv/gen/up32x25-minmax-neonfp16arith-acc2.c",
     "src/f16-dwconv/gen/up32x25-minmax-neonfp16arith.c",
-    "src/f16-gavgpool/7p7x-minmax-neonfp16arith-c8.c",
-    "src/f16-gavgpool/7x-minmax-neonfp16arith-c8.c",
+    "src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c",
+    "src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c16.c",
+    "src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c24.c",
+    "src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c32.c",
+    "src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c",
+    "src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c16.c",
+    "src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c24.c",
+    "src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c32.c",
     "src/f16-gemm/gen-inc/1x8inc-minmax-neonfp16arith-ld64.c",
     "src/f16-gemm/gen-inc/1x16inc-minmax-neonfp16arith-ld64.c",
     "src/f16-gemm/gen-inc/4x8inc-minmax-neonfp16arith-ld64.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7dd0272..bab93ff 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3149,8 +3149,8 @@
   src/f16-dwconv/gen/up8x25-minmax-neonfp16arith-acc2.c
   src/f16-dwconv/gen/up16x4-minmax-neonfp16arith.c
   src/f16-dwconv/gen/up16x9-minmax-neonfp16arith.c
-  src/f16-gavgpool/7p7x-minmax-neonfp16arith-c8.c
-  src/f16-gavgpool/7x-minmax-neonfp16arith-c8.c
+  src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c
+  src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c
   src/f16-gemm/gen/1x16-minmax-neonfp16arith-ld64.c
   src/f16-gemm/gen/6x16-minmax-neonfp16arith-ld64.c
   src/f16-igemm/gen/1x16-minmax-neonfp16arith-ld64.c
@@ -3181,8 +3181,14 @@
   src/f16-dwconv/gen/up32x9-minmax-neonfp16arith.c
   src/f16-dwconv/gen/up32x25-minmax-neonfp16arith-acc2.c
   src/f16-dwconv/gen/up32x25-minmax-neonfp16arith.c
-  src/f16-gavgpool/7p7x-minmax-neonfp16arith-c8.c
-  src/f16-gavgpool/7x-minmax-neonfp16arith-c8.c
+  src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c
+  src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c16.c
+  src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c24.c
+  src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c32.c
+  src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c
+  src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c16.c
+  src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c24.c
+  src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c32.c
   src/f16-gemm/gen-inc/1x8inc-minmax-neonfp16arith-ld64.c
   src/f16-gemm/gen-inc/1x16inc-minmax-neonfp16arith-ld64.c
   src/f16-gemm/gen-inc/4x8inc-minmax-neonfp16arith-ld64.c
diff --git a/scripts/generate-f16-gavgpool.sh b/scripts/generate-f16-gavgpool.sh
new file mode 100755
index 0000000..94a2e2a
--- /dev/null
+++ b/scripts/generate-f16-gavgpool.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+# Copyright 2022 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+################################## ARM NEON ###################################
+tools/xngen src/f16-gavgpool/unipass-neonfp16arith.c.in -D ROW_TILE=7 -D CHANNEL_TILE=8  -o src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c &
+tools/xngen src/f16-gavgpool/unipass-neonfp16arith.c.in -D ROW_TILE=7 -D CHANNEL_TILE=16 -o src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c16.c &
+tools/xngen src/f16-gavgpool/unipass-neonfp16arith.c.in -D ROW_TILE=7 -D CHANNEL_TILE=24 -o src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c24.c &
+tools/xngen src/f16-gavgpool/unipass-neonfp16arith.c.in -D ROW_TILE=7 -D CHANNEL_TILE=32 -o src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c32.c &
+
+tools/xngen src/f16-gavgpool/multipass-neonfp16arith.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=8  -o src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c &
+tools/xngen src/f16-gavgpool/multipass-neonfp16arith.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=16 -o src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c16.c &
+tools/xngen src/f16-gavgpool/multipass-neonfp16arith.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=24 -o src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c24.c &
+tools/xngen src/f16-gavgpool/multipass-neonfp16arith.c.in -D ROW_TILE=7 -D ROW_SUBTILE=7 -D CHANNEL_TILE=32 -o src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c32.c &
+
+################################## Unit tests #################################
+tools/generate-gavgpool-test.py --spec test/f16-gavgpool-minmax.yaml --output test/f16-gavgpool-minmax.cc &
+
+wait
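
Note: in the script above, ROW_TILE/ROW_SUBTILE fix the 7-row accumulation
window and CHANNEL_TILE sets how many channels each main-loop iteration
processes; the trailing & on each xngen invocation plus the final wait runs
the generators in parallel. As a hedged sketch (no such variant exists in
this change), a wider tile would follow the same pattern:

  tools/xngen src/f16-gavgpool/unipass-neonfp16arith.c.in -D ROW_TILE=7 -D CHANNEL_TILE=40 -o src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c40.c
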
diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh
index e047bce..2b89c0b 100755
--- a/scripts/generate-tests.sh
+++ b/scripts/generate-tests.sh
@@ -21,10 +21,7 @@
 tools/generate-avgpool-test.py --spec test/f32-avgpool-minmax.yaml --output test/f32-avgpool-minmax.cc &
 
 ### Tests for GAvgPool micro-kernels
-tools/generate-gavgpool-test.py --spec test/f16-gavgpool-minmax.yaml --output test/f16-gavgpool-minmax.cc &
 tools/generate-gavgpool-test.py --spec test/f32-gavgpool-minmax.yaml --output test/f32-gavgpool-minmax.cc &
-tools/generate-gavgpool-test.py --spec test/qu8-gavgpool-minmax-fp32.yaml --output test/qu8-gavgpool-minmax-fp32.cc &
-tools/generate-gavgpool-test.py --spec test/qu8-gavgpool-minmax-rndnu.yaml --output test/qu8-gavgpool-minmax-rndnu.cc &
 
 ### Tests for PAvgPool micro-kernels
 tools/generate-avgpool-test.py --spec test/f32-pavgpool-minmax.yaml --output test/f32-pavgpool-minmax.cc &
diff --git a/src/amalgam/sse2.c b/src/amalgam/sse2.c
index 434c051..1fa7236 100644
--- a/src/amalgam/sse2.c
+++ b/src/amalgam/sse2.c
@@ -5663,7 +5663,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
 
   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
   int32_t* b = buffer;
@@ -9178,7 +9178,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
 
   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
   const __m128i vzero = _mm_setzero_si128();
diff --git a/src/amalgam/sse41.c b/src/amalgam/sse41.c
index fc61f9a..52ba9cb 100644
--- a/src/amalgam/sse41.c
+++ b/src/amalgam/sse41.c
@@ -3896,7 +3896,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
 
   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
   int32_t* b = buffer;
@@ -6617,7 +6617,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
 
   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
   int32_t* b = buffer;
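
Note: the amalgam fix makes the byte arithmetic explicit. round_up_po2()
counts elements, so the pointer increment must scale by the element size;
for the int8/uint8 kernels sizeof() is 1 and nothing changes numerically,
but the same template expression must also be correct for 2-byte __fp16
data (see input_increment in the generated files below). A minimal
self-contained sketch, with a round_up_po2 assumed equivalent to the one in
xnnpack/math.h:

  #include <stddef.h>

  // Round n up to the next multiple of q, where q is a power of 2
  // (illustrative stand-in for XNNPACK's round_up_po2).
  static size_t round_up_po2(size_t n, size_t q) {
    return (n + q - 1) & ~(q - 1);
  }

  // Byte offset that advances the row pointers by 7 rows while rewinding
  // the channels already consumed; elem_size is sizeof(int8_t),
  // sizeof(__fp16), etc.
  static size_t input_increment(size_t input_stride /* bytes */,
                                size_t channels, size_t elem_size) {
    return 7 * input_stride - round_up_po2(channels, 8) * elem_size;
  }
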
diff --git a/src/f16-gavgpool/7p7x-minmax-neonfp16arith-c8.c b/src/f16-gavgpool/7p7x-minmax-neonfp16arith-c8.c
deleted file mode 100644
index 58b2561..0000000
--- a/src/f16-gavgpool/7p7x-minmax-neonfp16arith-c8.c
+++ /dev/null
@@ -1,189 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gavgpool.h>
-#include <xnnpack/math.h>
-
-
-void xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8(
-    size_t rows,
-    size_t channels,
-    const void* input,
-    size_t input_stride,
-    const void* zero,
-    void* buffer,
-    void* output_ptr,
-    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows > 7);
-  assert(channels != 0);
-
-   __fp16* output = (__fp16*) output_ptr;
-  const __fp16* i0 = (const __fp16*) input;
-  const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
-  const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
-  const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
-  const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
-  const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
-  const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
-  const size_t packed_channels = round_up_po2(channels, 8);
-  const size_t input_increment = 7 * input_stride - packed_channels * sizeof(__fp16);
-
-  __fp16* b = (__fp16*) buffer;
-  for (size_t c = 0; c < channels; c += 8) {
-    const float16x8_t vi0 = vld1q_f16(i0); i0 += 8;
-    const float16x8_t vi1 = vld1q_f16(i1); i1 += 8;
-    const float16x8_t vi2 = vld1q_f16(i2); i2 += 8;
-    const float16x8_t vi3 = vld1q_f16(i3); i3 += 8;
-    const float16x8_t vi4 = vld1q_f16(i4); i4 += 8;
-    const float16x8_t vi5 = vld1q_f16(i5); i5 += 8;
-    const float16x8_t vi6 = vld1q_f16(i6); i6 += 8;
-
-    const float16x8_t vsum01 = vaddq_f16(vi0, vi1);
-    const float16x8_t vsum23 = vaddq_f16(vi2, vi3);
-    const float16x8_t vsum45 = vaddq_f16(vi4, vi5);
-
-    const float16x8_t vsum016 = vaddq_f16(vsum01, vi6);
-    const float16x8_t vsum2345 = vaddq_f16(vsum23, vsum45);
-
-    const float16x8_t vsum = vaddq_f16(vsum016, vsum2345);
-
-    vst1q_f16(b, vsum); b += 8;
-  }
-  for (rows -= 7; rows > 7; rows -= 7) {
-    b = (__fp16*) buffer;
-
-    i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
-    i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
-    i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
-    i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
-    i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
-    i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
-    i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
-
-    for (size_t c = 0; c < channels; c += 8) {
-      const float16x8_t vi0 = vld1q_f16(i0); i0 += 8;
-      const float16x8_t vi1 = vld1q_f16(i1); i1 += 8;
-      const float16x8_t vi2 = vld1q_f16(i2); i2 += 8;
-      const float16x8_t vi3 = vld1q_f16(i3); i3 += 8;
-      const float16x8_t vi4 = vld1q_f16(i4); i4 += 8;
-      const float16x8_t vi5 = vld1q_f16(i5); i5 += 8;
-      const float16x8_t vi6 = vld1q_f16(i6); i6 += 8;
-      const float16x8_t vacc = vld1q_f16(b);
-
-      const float16x8_t vsum01 = vaddq_f16(vi0, vi1);
-      const float16x8_t vsum23 = vaddq_f16(vi2, vi3);
-      const float16x8_t vsum45 = vaddq_f16(vi4, vi5);
-      const float16x8_t vsum6a = vaddq_f16(vi6, vacc);
-
-      const float16x8_t vsum0123 = vaddq_f16(vsum01, vsum23);
-      const float16x8_t vsum456a = vaddq_f16(vsum45, vsum6a);
-
-      const float16x8_t vsum = vaddq_f16(vsum0123, vsum456a);
-
-      vst1q_f16(b, vsum); b += 8;
-    }
-  }
-
-  i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
-  i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
-  if (rows < 2) {
-    i1 = (const __fp16*) zero;
-  }
-  i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
-  if (rows <= 2) {
-    i2 = (const __fp16*) zero;
-  }
-  i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
-  if (rows < 4) {
-    i3 = (const __fp16*) zero;
-  }
-  i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
-  if (rows <= 4) {
-    i4 = (const __fp16*) zero;
-  }
-  i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
-  if (rows < 6) {
-    i5 = (const __fp16*) zero;
-  }
-  i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
-  if (rows <= 6) {
-    i6 = (const __fp16*) zero;
-  }
-  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
-  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
-  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
-
-  b = (__fp16*) buffer;
-  while (channels >= 8) {
-    const float16x8_t vi0 = vld1q_f16(i0); i0 += 8;
-    const float16x8_t vi1 = vld1q_f16(i1); i1 += 8;
-    const float16x8_t vi2 = vld1q_f16(i2); i2 += 8;
-    const float16x8_t vi3 = vld1q_f16(i3); i3 += 8;
-    const float16x8_t vi4 = vld1q_f16(i4); i4 += 8;
-    const float16x8_t vi5 = vld1q_f16(i5); i5 += 8;
-    const float16x8_t vi6 = vld1q_f16(i6); i6 += 8;
-    const float16x8_t vacc = vld1q_f16(b); b += 8;
-
-    const float16x8_t vsum01 = vaddq_f16(vi0, vi1);
-    const float16x8_t vsum23 = vaddq_f16(vi2, vi3);
-    const float16x8_t vsum45 = vaddq_f16(vi4, vi5);
-    const float16x8_t vsum6a = vaddq_f16(vi6, vacc);
-
-    const float16x8_t vsum0123 = vaddq_f16(vsum01, vsum23);
-    const float16x8_t vsum456a = vaddq_f16(vsum45, vsum6a);
-
-    const float16x8_t vsum = vaddq_f16(vsum0123, vsum456a);
-
-    float16x8_t vout = vmulq_f16(vsum, vscale);
-    vout = vmaxq_f16(vout, vmin);
-    vout = vminq_f16(vout, vmax);
-
-    vst1q_f16(output, vout); output += 8;
-
-    channels -= 8;
-  }
-  if (channels != 0) {
-    const float16x8_t vi0 = vld1q_f16(i0);
-    const float16x8_t vi1 = vld1q_f16(i1);
-    const float16x8_t vi2 = vld1q_f16(i2);
-    const float16x8_t vi3 = vld1q_f16(i3);
-    const float16x8_t vi4 = vld1q_f16(i4);
-    const float16x8_t vi5 = vld1q_f16(i5);
-    const float16x8_t vi6 = vld1q_f16(i6);
-    const float16x8_t vacc = vld1q_f16(b);
-
-    const float16x8_t vsum01 = vaddq_f16(vi0, vi1);
-    const float16x8_t vsum23 = vaddq_f16(vi2, vi3);
-    const float16x8_t vsum45 = vaddq_f16(vi4, vi5);
-    const float16x8_t vsum6a = vaddq_f16(vi6, vacc);
-
-    const float16x8_t vsum0123 = vaddq_f16(vsum01, vsum23);
-    const float16x8_t vsum456a = vaddq_f16(vsum45, vsum6a);
-
-    const float16x8_t vsum = vaddq_f16(vsum0123, vsum456a);
-
-    float16x8_t vout = vmulq_f16(vsum, vscale);
-    vout = vmaxq_f16(vout, vmin);
-    vout = vminq_f16(vout, vmax);
-
-    float16x4_t vout_lo = vget_low_f16(vout);
-    if (channels & 4) {
-      vst1_f16(output, vout_lo); output += 4;
-      vout_lo = vget_high_f16(vout);
-    }
-    if (channels & 2) {
-      vst1_lane_u32((void*) output, vreinterpret_u32_f16(vout_lo), 0); output += 2;
-      vout_lo = vext_f16(vout_lo, vout_lo, 2);
-    }
-    if (channels & 1) {
-      vst1_lane_f16(output, vout_lo, 0);
-    }
-  }
-}
diff --git a/src/f16-gavgpool/7x-minmax-neonfp16arith-c8.c b/src/f16-gavgpool/7x-minmax-neonfp16arith-c8.c
deleted file mode 100644
index deec5ea..0000000
--- a/src/f16-gavgpool/7x-minmax-neonfp16arith-c8.c
+++ /dev/null
@@ -1,117 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gavgpool.h>
-
-
-void xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8(
-    size_t rows,
-    size_t channels,
-    const void* input,
-    size_t input_stride,
-    const void* zero,
-    void* output_ptr,
-    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
-{
-  assert(rows != 0);
-  assert(rows <= 7);
-  assert(channels != 0);
-
-   __fp16* output = (__fp16*) output_ptr;
-  const __fp16* i0 = (const __fp16*) input;
-  const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
-  if (rows < 2) {
-    i1 = (const __fp16*) zero;
-  }
-  const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
-  if (rows <= 2) {
-    i2 = (const __fp16*) zero;
-  }
-  const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
-  if (rows < 4) {
-    i3 = (const __fp16*) zero;
-  }
-  const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
-  if (rows <= 4) {
-    i4 = (const __fp16*) zero;
-  }
-  const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
-  if (rows < 6) {
-    i5 = (const __fp16*) zero;
-  }
-  const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
-  if (rows <= 6) {
-    i6 = (const __fp16*) zero;
-  }
-  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
-  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
-  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
-
-  while (channels >= 8) {
-    const float16x8_t vi0 = vld1q_f16(i0); i0 += 8;
-    const float16x8_t vi1 = vld1q_f16(i1); i1 += 8;
-    const float16x8_t vi2 = vld1q_f16(i2); i2 += 8;
-    const float16x8_t vi3 = vld1q_f16(i3); i3 += 8;
-    const float16x8_t vi4 = vld1q_f16(i4); i4 += 8;
-    const float16x8_t vi5 = vld1q_f16(i5); i5 += 8;
-    const float16x8_t vi6 = vld1q_f16(i6); i6 += 8;
-
-    const float16x8_t vsum01 = vaddq_f16(vi0, vi1);
-    const float16x8_t vsum23 = vaddq_f16(vi2, vi3);
-    const float16x8_t vsum45 = vaddq_f16(vi4, vi5);
-
-    const float16x8_t vsum016 = vaddq_f16(vsum01, vi6);
-    const float16x8_t vsum2345 = vaddq_f16(vsum23, vsum45);
-
-    const float16x8_t vsum = vaddq_f16(vsum016, vsum2345);
-
-    float16x8_t vout = vmulq_f16(vsum, vscale);
-    vout = vmaxq_f16(vout, vmin);
-    vout = vminq_f16(vout, vmax);
-
-    vst1q_f16(output, vout); output += 8;
-
-    channels -= 8;
-  }
-  if (channels != 0) {
-    const float16x8_t vi0 = vld1q_f16(i0);
-    const float16x8_t vi1 = vld1q_f16(i1);
-    const float16x8_t vi2 = vld1q_f16(i2);
-    const float16x8_t vi3 = vld1q_f16(i3);
-    const float16x8_t vi4 = vld1q_f16(i4);
-    const float16x8_t vi5 = vld1q_f16(i5);
-    const float16x8_t vi6 = vld1q_f16(i6);
-
-    const float16x8_t vsum01 = vaddq_f16(vi0, vi1);
-    const float16x8_t vsum23 = vaddq_f16(vi2, vi3);
-    const float16x8_t vsum45 = vaddq_f16(vi4, vi5);
-
-    const float16x8_t vsum016 = vaddq_f16(vsum01, vi6);
-    const float16x8_t vsum2345 = vaddq_f16(vsum23, vsum45);
-
-    const float16x8_t vsum = vaddq_f16(vsum016, vsum2345);
-
-    float16x8_t vout = vmulq_f16(vsum, vscale);
-    vout = vmaxq_f16(vout, vmin);
-    vout = vminq_f16(vout, vmax);
-
-    float16x4_t vout_lo = vget_low_f16(vout);
-    if (channels & 4) {
-      vst1_f16(output, vout_lo); output += 4;
-      vout_lo = vget_high_f16(vout);
-    }
-    if (channels & 2) {
-      vst1_lane_u32((void*) output, vreinterpret_u32_f16(vout_lo), 0); output += 2;
-      vout_lo = vext_f16(vout_lo, vout_lo, 2);
-    }
-    if (channels & 1) {
-      vst1_lane_f16(output, vout_lo, 0);
-    }
-  }
-}
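
Note: of the two deleted handwritten kernels, 7x is the unipass variant
(rows <= 7, no scratch buffer) and 7p7x is the multipass variant (rows > 7,
partial sums spilled to buffer). The generated replacements below keep that
split and add the wider channel tiles.
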
diff --git a/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c16.c b/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c16.c
new file mode 100644
index 0000000..85f9ded
--- /dev/null
+++ b/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c16.c
@@ -0,0 +1,290 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-gavgpool/multipass-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16(
+    size_t rows,
+    size_t channels,
+    const void* input,
+    size_t input_stride,
+    const void* zero,
+    void* buffer,
+    void* output,
+    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(rows > 7);
+  assert(channels != 0);
+
+  const __fp16* i0 = input;
+  const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
+  const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
+  const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
+  const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
+  const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
+  const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(__fp16);
+
+  __fp16* b = buffer;
+  size_t c = channels;
+  for (; c >= 16; c -= 16) {
+    const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+    const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+
+    const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+    float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+    const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+    float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF);
+
+    const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+    const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+    const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+    const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+    const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+    const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+    const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+    const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+    vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+
+    vst1q_f16(b, vacc01234567); b += 8;
+    vst1q_f16(b, vacc89ABCDEF); b += 8;
+  }
+  if XNN_UNLIKELY(c != 0) {
+    do {
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+      vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+      vst1q_f16(b, vacc01234567); b += 8;
+
+      c = doz(c, 8);
+    } while (c != 0);
+  }
+
+  for (rows -= 7; rows > 7; rows -= 7) {
+    i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
+    i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
+    i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
+    i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
+    i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
+    i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
+    i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
+
+    __fp16* b = buffer;
+    size_t c = channels;
+    for (; c >= 16; c -= 16) {
+      float16x8_t vacc01234567 = vld1q_f16(b);
+      float16x8_t vacc89ABCDEF = vld1q_f16(b + 8);
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+      const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF);
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+      const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF);
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+      const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+      const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+      const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+      const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+      vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+
+      vst1q_f16(b, vacc01234567); b += 8;
+      vst1q_f16(b, vacc89ABCDEF); b += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      do {
+        float16x8_t vacc01234567 = vld1q_f16(b);
+        const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+
+        const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+        vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+        const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+        vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+        const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+        vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+        const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+        vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+        const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+        vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+        const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+        vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+        vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+        vst1q_f16(b, vacc01234567); b += 8;
+
+        c = doz(c, 8);
+      } while (c != 0);
+    }
+  }
+
+  i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
+  i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
+  if XNN_UNPREDICTABLE(rows < 2) {
+    i1 = (const __fp16*) zero;
+  }
+  i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
+  if XNN_UNPREDICTABLE(rows <= 2) {
+    i2 = (const __fp16*) zero;
+  }
+  i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
+  if XNN_UNPREDICTABLE(rows < 4) {
+    i3 = (const __fp16*) zero;
+  }
+  i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
+  if XNN_UNPREDICTABLE(rows <= 4) {
+    i4 = (const __fp16*) zero;
+  }
+  i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
+  if XNN_UNPREDICTABLE(rows < 6) {
+    i5 = (const __fp16*) zero;
+  }
+  i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
+  if XNN_UNPREDICTABLE(rows <= 6) {
+    i6 = (const __fp16*) zero;
+  }
+
+  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
+  for (; channels >= 16; channels -= 16) {
+    float16x8_t vacc01234567 = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+    float16x8_t vacc89ABCDEF = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+    const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+
+    const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+    const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF);
+    const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+    const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF);
+    const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+    const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+    const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+    const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+    const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+    const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+    const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+    const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+    vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+
+    vacc01234567 = vmulq_f16(vacc01234567, vscale);
+    vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale);
+
+    vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+    vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin);
+
+    vacc01234567 = vminq_f16(vacc01234567, vmax);
+    vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+
+    vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+    vst1q_f16(output, vacc89ABCDEF); output = (__fp16*) output + 8;
+  }
+  if XNN_UNLIKELY(channels != 0) {
+    do {
+      float16x8_t vacc01234567 = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+      vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+      vacc01234567 = vmulq_f16(vacc01234567, vscale);
+      vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      if XNN_LIKELY(channels >= 8) {
+        vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+        channels -= 8;
+      } else {
+        float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+        if (channels & 4) {
+          vst1_f16(output, vacc0123); output = (__fp16*) output + 4;
+          vacc0123 = vget_high_f16(vacc01234567);
+        }
+        if (channels & 2) {
+          vst1_lane_u32(output, vreinterpret_u32_f16(vacc0123), 0); output = (__fp16*) output + 2;
+          vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+        }
+        if (channels & 1) {
+          vst1_lane_f16(output, vacc0123, 0); output = (__fp16*) output + 1;
+        }
+        channels = 0;
+      }
+    } while (channels != 0);
+  }
+}
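
Note: the generated multipass kernels all share this shape: a first pass
sums 7 rows per channel into the scratch buffer, middle passes accumulate 7
more rows on top of it, and the final pass adds the last 1-7 rows (with
exhausted row pointers redirected to the zero buffer), applies
scale/min/max, and writes the channel tail with 4-, 2-, and 1-lane stores.
A hypothetical scalar reference of the same per-channel semantics (float
for clarity; scale is 1/rows and is supplied by the caller via params):

  #include <stddef.h>

  // Reference mean-pool with clamping, matching what the vector kernels
  // compute per channel (illustration only, not an XNNPACK API).
  void gavgpool_ref(size_t rows, size_t channels, const float* input,
                    size_t input_stride /* elements */, float scale,
                    float output_min, float output_max, float* output) {
    for (size_t c = 0; c < channels; c++) {
      float acc = 0.0f;
      for (size_t r = 0; r < rows; r++) {
        acc += input[r * input_stride + c];
      }
      float out = acc * scale;
      if (out < output_min) { out = output_min; }
      if (out > output_max) { out = output_max; }
      output[c] = out;
    }
  }
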
diff --git a/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c24.c b/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c24.c
new file mode 100644
index 0000000..49fb9b6
--- /dev/null
+++ b/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c24.c
@@ -0,0 +1,339 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-gavgpool/multipass-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24(
+    size_t rows,
+    size_t channels,
+    const void* input,
+    size_t input_stride,
+    const void* zero,
+    void* buffer,
+    void* output,
+    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(rows > 7);
+  assert(channels != 0);
+
+  const __fp16* i0 = input;
+  const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
+  const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
+  const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
+  const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
+  const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
+  const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(__fp16);
+
+  __fp16* b = buffer;
+  size_t c = channels;
+  for (; c >= 24; c -= 24) {
+    const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+    const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+    const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;
+
+    const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+    float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+    const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+    float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF);
+    const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
+    float16x8_t vaccGHIJKLMN = vaddq_f16(vi0xGHIJKLMN, vi1xGHIJKLMN);
+
+    const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+    const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+    const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
+    const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+    const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+    const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
+    const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+    const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+    const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
+    const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+    const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+    const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
+    vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);
+
+    vst1q_f16(b, vacc01234567); b += 8;
+    vst1q_f16(b, vacc89ABCDEF); b += 8;
+    vst1q_f16(b, vaccGHIJKLMN); b += 8;
+  }
+  if XNN_UNLIKELY(c != 0) {
+    do {
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+      vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+      vst1q_f16(b, vacc01234567); b += 8;
+
+      c = doz(c, 8);
+    } while (c != 0);
+  }
+
+  for (rows -= 7; rows > 7; rows -= 7) {
+    i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
+    i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
+    i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
+    i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
+    i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
+    i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
+    i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
+
+    __fp16* b = buffer;
+    size_t c = channels;
+    for (; c >= 24; c -= 24) {
+      float16x8_t vacc01234567 = vld1q_f16(b);
+      float16x8_t vacc89ABCDEF = vld1q_f16(b + 8);
+      float16x8_t vaccGHIJKLMN = vld1q_f16(b + 16);
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+      const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF);
+      const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;
+      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi0xGHIJKLMN);
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+      const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF);
+      const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
+      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi1xGHIJKLMN);
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+      const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+      const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
+      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+      const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+      const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
+      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+      const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+      const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
+      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+      const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+      const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
+      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
+      vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);
+
+      vst1q_f16(b, vacc01234567); b += 8;
+      vst1q_f16(b, vacc89ABCDEF); b += 8;
+      vst1q_f16(b, vaccGHIJKLMN); b += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      do {
+        float16x8_t vacc01234567 = vld1q_f16(b);
+        const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+
+        const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+        vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+        const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+        vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+        const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+        vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+        const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+        vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+        const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+        vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+        const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+        vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+        vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+        vst1q_f16(b, vacc01234567); b += 8;
+
+        c = doz(c, 8);
+      } while (c != 0);
+    }
+  }
+
+  i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
+  i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
+  if XNN_UNPREDICTABLE(rows < 2) {
+    i1 = (const __fp16*) zero;
+  }
+  i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
+  if XNN_UNPREDICTABLE(rows <= 2) {
+    i2 = (const __fp16*) zero;
+  }
+  i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
+  if XNN_UNPREDICTABLE(rows < 4) {
+    i3 = (const __fp16*) zero;
+  }
+  i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
+  if XNN_UNPREDICTABLE(rows <= 4) {
+    i4 = (const __fp16*) zero;
+  }
+  i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
+  if XNN_UNPREDICTABLE(rows < 6) {
+    i5 = (const __fp16*) zero;
+  }
+  i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
+  if XNN_UNPREDICTABLE(rows <= 6) {
+    i6 = (const __fp16*) zero;
+  }
+
+  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
+  for (; channels >= 24; channels -= 24) {
+    float16x8_t vacc01234567 = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+    float16x8_t vacc89ABCDEF = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+    float16x8_t vaccGHIJKLMN = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+    const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;
+
+    const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+    const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF);
+    const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi0xGHIJKLMN);
+    const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+    const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF);
+    const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi1xGHIJKLMN);
+    const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+    const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+    const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
+    const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+    const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+    const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
+    const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+    const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+    const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
+    const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+    const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+    const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
+    vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);
+
+    vacc01234567 = vmulq_f16(vacc01234567, vscale);
+    vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale);
+    vaccGHIJKLMN = vmulq_f16(vaccGHIJKLMN, vscale);
+
+    vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+    vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin);
+    vaccGHIJKLMN = vmaxq_f16(vaccGHIJKLMN, vmin);
+
+    vacc01234567 = vminq_f16(vacc01234567, vmax);
+    vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+    vaccGHIJKLMN = vminq_f16(vaccGHIJKLMN, vmax);
+
+    vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+    vst1q_f16(output, vacc89ABCDEF); output = (__fp16*) output + 8;
+    vst1q_f16(output, vaccGHIJKLMN); output = (__fp16*) output + 8;
+  }
+  if XNN_UNLIKELY(channels != 0) {
+    do {
+      float16x8_t vacc01234567 = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+      vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+      vacc01234567 = vmulq_f16(vacc01234567, vscale);
+      vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      if XNN_LIKELY(channels >= 8) {
+        vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+        channels -= 8;
+      } else {
+        float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+        if (channels & 4) {
+          vst1_f16(output, vacc0123); output = (__fp16*) output + 4;
+          vacc0123 = vget_high_f16(vacc01234567);
+        }
+        if (channels & 2) {
+          vst1_lane_u32(output, vreinterpret_u32_f16(vacc0123), 0); output = (__fp16*) output + 2;
+          vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+        }
+        if (channels & 1) {
+          vst1_lane_f16(output, vacc0123, 0); output = (__fp16*) output + 1;
+        }
+        channels = 0;
+      }
+    } while (channels != 0);
+  }
+}
diff --git a/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c32.c b/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c32.c
new file mode 100644
index 0000000..7be3ed3
--- /dev/null
+++ b/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c32.c
@@ -0,0 +1,388 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-gavgpool/multipass-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32(
+    size_t rows,
+    size_t channels,
+    const void* input,
+    size_t input_stride,
+    const void* zero,
+    void* buffer,
+    void* output,
+    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(rows > 7);
+  assert(channels != 0);
+
+  const __fp16* i0 = input;
+  const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
+  const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
+  const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
+  const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
+  const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
+  const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(__fp16);
+
+  __fp16* b = buffer;
+  size_t c = channels;
+  for (; c >= 32; c -= 32) {
+    const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi0xOPQRSTUV = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+    const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+    const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;
+    const float16x8_t vi1xOPQRSTUV = vld1q_f16(i1); i1 += 8;
+
+    const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+    float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+    const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+    float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF);
+    const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
+    float16x8_t vaccGHIJKLMN = vaddq_f16(vi0xGHIJKLMN, vi1xGHIJKLMN);
+    const float16x8_t vi2xOPQRSTUV = vld1q_f16(i2); i2 += 8;
+    float16x8_t vaccOPQRSTUV = vaddq_f16(vi0xOPQRSTUV, vi1xOPQRSTUV);
+
+    const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+    const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+    const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
+    const float16x8_t vi3xOPQRSTUV = vld1q_f16(i3); i3 += 8;
+    vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi2xOPQRSTUV);
+    const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+    const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+    const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
+    const float16x8_t vi4xOPQRSTUV = vld1q_f16(i4); i4 += 8;
+    vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi3xOPQRSTUV);
+    const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+    const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+    const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
+    const float16x8_t vi5xOPQRSTUV = vld1q_f16(i5); i5 += 8;
+    vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi4xOPQRSTUV);
+    const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+    const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+    const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
+    const float16x8_t vi6xOPQRSTUV = vld1q_f16(i6); i6 += 8;
+    vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi5xOPQRSTUV);
+    vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);
+    vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi6xOPQRSTUV);
+
+    vst1q_f16(b, vacc01234567); b += 8;
+    vst1q_f16(b, vacc89ABCDEF); b += 8;
+    vst1q_f16(b, vaccGHIJKLMN); b += 8;
+    vst1q_f16(b, vaccOPQRSTUV); b += 8;
+  }
+  if XNN_UNLIKELY(c != 0) {
+    do {
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+      vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+      vst1q_f16(b, vacc01234567); b += 8;
+
+      c = doz(c, 8);
+    } while (c != 0);
+  }
+
+  for (rows -= 7; rows > 7; rows -= 7) {
+    i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
+    i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
+    i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
+    i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
+    i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
+    i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
+    i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
+
+    __fp16* b = buffer;
+    size_t c = channels;
+    for (; c >= 32; c -= 32) {
+      float16x8_t vacc01234567 = vld1q_f16(b);
+      float16x8_t vacc89ABCDEF = vld1q_f16(b + 8);
+      float16x8_t vaccGHIJKLMN = vld1q_f16(b + 16);
+      float16x8_t vaccOPQRSTUV = vld1q_f16(b + 24);
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi0xOPQRSTUV = vld1q_f16(i0); i0 += 8;
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+      const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF);
+      const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;
+      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi0xGHIJKLMN);
+      const float16x8_t vi1xOPQRSTUV = vld1q_f16(i1); i1 += 8;
+      vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi0xOPQRSTUV);
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+      const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF);
+      const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
+      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi1xGHIJKLMN);
+      const float16x8_t vi2xOPQRSTUV = vld1q_f16(i2); i2 += 8;
+      vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi1xOPQRSTUV);
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+      const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+      const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
+      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
+      const float16x8_t vi3xOPQRSTUV = vld1q_f16(i3); i3 += 8;
+      vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi2xOPQRSTUV);
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+      const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+      const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
+      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
+      const float16x8_t vi4xOPQRSTUV = vld1q_f16(i4); i4 += 8;
+      vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi3xOPQRSTUV);
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+      const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+      const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
+      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
+      const float16x8_t vi5xOPQRSTUV = vld1q_f16(i5); i5 += 8;
+      vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi4xOPQRSTUV);
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+      const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+      const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
+      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
+      const float16x8_t vi6xOPQRSTUV = vld1q_f16(i6); i6 += 8;
+      vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi5xOPQRSTUV);
+      vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+      vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+      vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);
+      vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi6xOPQRSTUV);
+
+      vst1q_f16(b, vacc01234567); b += 8;
+      vst1q_f16(b, vacc89ABCDEF); b += 8;
+      vst1q_f16(b, vaccGHIJKLMN); b += 8;
+      vst1q_f16(b, vaccOPQRSTUV); b += 8;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      do {
+        float16x8_t vacc01234567 = vld1q_f16(b);
+        const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+
+        const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+        vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+        const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+        vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+        const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+        vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+        const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+        vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+        const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+        vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+        const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+        vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+        vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+        vst1q_f16(b, vacc01234567); b += 8;
+
+        c = doz(c, 8);
+      } while (c != 0);
+    }
+  }
+
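+  // Final pass: 1-7 rows remain. Row pointers past the remaining rows are
+  // redirected to the zero buffer so they add nothing to the sums.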
+  i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
+  i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
+  if XNN_UNPREDICTABLE(rows < 2) {
+    i1 = (const __fp16*) zero;
+  }
+  i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
+  if XNN_UNPREDICTABLE(rows <= 2) {
+    i2 = (const __fp16*) zero;
+  }
+  i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
+  if XNN_UNPREDICTABLE(rows < 4) {
+    i3 = (const __fp16*) zero;
+  }
+  i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
+  if XNN_UNPREDICTABLE(rows <= 4) {
+    i4 = (const __fp16*) zero;
+  }
+  i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
+  if XNN_UNPREDICTABLE(rows < 6) {
+    i5 = (const __fp16*) zero;
+  }
+  i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
+  if XNN_UNPREDICTABLE(rows <= 6) {
+    i6 = (const __fp16*) zero;
+  }
+
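+  // The averaging scale and the clamping bounds are stored in params as
+  // uint16_t bit patterns of fp16 values, hence the reinterpreting loads.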
+  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
+  for (; channels >= 32; channels -= 32) {
+    float16x8_t vacc01234567 = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+    float16x8_t vacc89ABCDEF = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+    float16x8_t vaccGHIJKLMN = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+    float16x8_t vaccOPQRSTUV = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+    const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi0xOPQRSTUV = vld1q_f16(i0); i0 += 8;
+
+    const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+    const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi0x89ABCDEF);
+    const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi0xGHIJKLMN);
+    const float16x8_t vi1xOPQRSTUV = vld1q_f16(i1); i1 += 8;
+    vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi0xOPQRSTUV);
+    const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+    const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi1x89ABCDEF);
+    const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi1xGHIJKLMN);
+    const float16x8_t vi2xOPQRSTUV = vld1q_f16(i2); i2 += 8;
+    vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi1xOPQRSTUV);
+    const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+    const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+    const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
+    const float16x8_t vi3xOPQRSTUV = vld1q_f16(i3); i3 += 8;
+    vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi2xOPQRSTUV);
+    const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+    const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+    const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
+    const float16x8_t vi4xOPQRSTUV = vld1q_f16(i4); i4 += 8;
+    vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi3xOPQRSTUV);
+    const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+    const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+    const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
+    const float16x8_t vi5xOPQRSTUV = vld1q_f16(i5); i5 += 8;
+    vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi4xOPQRSTUV);
+    const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+    const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+    const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
+    const float16x8_t vi6xOPQRSTUV = vld1q_f16(i6); i6 += 8;
+    vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi5xOPQRSTUV);
+    vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);
+    vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi6xOPQRSTUV);
+
+    vacc01234567 = vmulq_f16(vacc01234567, vscale);
+    vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale);
+    vaccGHIJKLMN = vmulq_f16(vaccGHIJKLMN, vscale);
+    vaccOPQRSTUV = vmulq_f16(vaccOPQRSTUV, vscale);
+
+    vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+    vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin);
+    vaccGHIJKLMN = vmaxq_f16(vaccGHIJKLMN, vmin);
+    vaccOPQRSTUV = vmaxq_f16(vaccOPQRSTUV, vmin);
+
+    vacc01234567 = vminq_f16(vacc01234567, vmax);
+    vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+    vaccGHIJKLMN = vminq_f16(vaccGHIJKLMN, vmax);
+    vaccOPQRSTUV = vminq_f16(vaccOPQRSTUV, vmax);
+
+    vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+    vst1q_f16(output, vacc89ABCDEF); output = (__fp16*) output + 8;
+    vst1q_f16(output, vaccGHIJKLMN); output = (__fp16*) output + 8;
+    vst1q_f16(output, vaccOPQRSTUV); output = (__fp16*) output + 8;
+  }
+  if XNN_UNLIKELY(channels != 0) {
+    do {
+      float16x8_t vacc01234567 = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+      vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+      vacc01234567 = vmulq_f16(vacc01234567, vscale);
+      vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      if XNN_LIKELY(channels >= 8) {
+        vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+        channels -= 8;
+      } else {
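+        // Store the final 1-7 channels in 4-, 2-, and 1-lane pieces.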
+        float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+        if (channels & 4) {
+          vst1_f16(output, vacc0123); output = (__fp16*) output + 4;
+          vacc0123 = vget_high_f16(vacc01234567);
+        }
+        if (channels & 2) {
+          vst1_lane_u32(output, vreinterpret_u32_f16(vacc0123), 0); output = (__fp16*) output + 2;
+          vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+        }
+        if (channels & 1) {
+          vst1_lane_f16(output, vacc0123, 0); output = (__fp16*) output + 1;
+        }
+        channels = 0;
+      }
+    } while (channels != 0);
+  }
+}
diff --git a/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c b/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c
new file mode 100644
index 0000000..2911e2b
--- /dev/null
+++ b/src/f16-gavgpool/gen/7p7x-minmax-neonfp16arith-c8.c
@@ -0,0 +1,189 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-gavgpool/multipass-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8(
+    size_t rows,
+    size_t channels,
+    const void* input,
+    size_t input_stride,
+    const void* zero,
+    void* buffer,
+    void* output,
+    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(rows > 7);
+  assert(channels != 0);
+
+  const __fp16* i0 = input;
+  const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
+  const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
+  const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
+  const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
+  const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
+  const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(__fp16);
+
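+  // First pass: sum rows 0-6 into the scratch buffer, 8 channels at a time.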
+  __fp16* b = buffer;
+  size_t c = channels;
+  for (; c != 0; c = doz(c, 8)) {
+    const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+
+    const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+    float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+
+    const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+    const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+    const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+    const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+    vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+    vst1q_f16(b, vacc01234567); b += 8;
+  }
+
+  for (rows -= 7; rows > 7; rows -= 7) {
+    i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
+    i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
+    i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
+    i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
+    i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
+    i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
+    i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
+
+    __fp16* b = buffer;
+    size_t c = channels;
+    for (; c != 0; c = doz(c, 8)) {
+      float16x8_t vacc01234567 = vld1q_f16(b);
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+      vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+      vst1q_f16(b, vacc01234567); b += 8;
+    }
+  }
+
+  i0 = (const __fp16*) ((uintptr_t) i0 + input_increment);
+  i1 = (const __fp16*) ((uintptr_t) i1 + input_increment);
+  if XNN_UNPREDICTABLE(rows < 2) {
+    i1 = (const __fp16*) zero;
+  }
+  i2 = (const __fp16*) ((uintptr_t) i2 + input_increment);
+  if XNN_UNPREDICTABLE(rows <= 2) {
+    i2 = (const __fp16*) zero;
+  }
+  i3 = (const __fp16*) ((uintptr_t) i3 + input_increment);
+  if XNN_UNPREDICTABLE(rows < 4) {
+    i3 = (const __fp16*) zero;
+  }
+  i4 = (const __fp16*) ((uintptr_t) i4 + input_increment);
+  if XNN_UNPREDICTABLE(rows <= 4) {
+    i4 = (const __fp16*) zero;
+  }
+  i5 = (const __fp16*) ((uintptr_t) i5 + input_increment);
+  if XNN_UNPREDICTABLE(rows < 6) {
+    i5 = (const __fp16*) zero;
+  }
+  i6 = (const __fp16*) ((uintptr_t) i6 + input_increment);
+  if XNN_UNPREDICTABLE(rows <= 6) {
+    i6 = (const __fp16*) zero;
+  }
+
+  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
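+  // Final pass: add the remaining 1-7 rows, scale to an average, and clamp.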
+  for (; channels >= 8; channels -= 8) {
+    float16x8_t vacc01234567 = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+    const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+
+    const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+    const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+    const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+    const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+    const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+    const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+    vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+    vacc01234567 = vmulq_f16(vacc01234567, vscale);
+
+    vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+
+    vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+    vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+  }
+  if XNN_UNLIKELY(channels != 0) {
+    {
+      float16x8_t vacc01234567 = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi0x01234567);
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi1x01234567);
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+      vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+      vacc01234567 = vmulq_f16(vacc01234567, vscale);
+      vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
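+      // Store the final 1-7 channels in 4-, 2-, and 1-lane pieces.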
+      float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+      if (channels & 4) {
+        vst1_f16(output, vacc0123); output = (__fp16*) output + 4;
+        vacc0123 = vget_high_f16(vacc01234567);
+      }
+      if (channels & 2) {
+        vst1_lane_u32(output, vreinterpret_u32_f16(vacc0123), 0); output = (__fp16*) output + 2;
+        vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+      }
+      if (channels & 1) {
+        vst1_lane_f16(output, vacc0123, 0); output = (__fp16*) output + 1;
+      }
+    }
+  }
+}
diff --git a/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c16.c b/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c16.c
new file mode 100644
index 0000000..7e1754f
--- /dev/null
+++ b/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c16.c
@@ -0,0 +1,143 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-gavgpool/unipass-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16(
+    size_t rows,
+    size_t channels,
+    const void* input,
+    size_t input_stride,
+    const void* zero,
+    void* output,
+    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(rows != 0);
+  assert(rows <= 7);
+  assert(channels != 0);
+
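+  // Set up the 7 row pointers; rows beyond `rows` alias the zero buffer and
+  // contribute nothing to the sums.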
+  const __fp16* i0 = input;
+  const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
+  if XNN_UNPREDICTABLE(rows < 2) {
+    i1 = (const __fp16*) zero;
+  }
+  const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
+  if XNN_UNPREDICTABLE(rows <= 2) {
+    i2 = (const __fp16*) zero;
+  }
+  const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
+  if XNN_UNPREDICTABLE(rows < 4) {
+    i3 = (const __fp16*) zero;
+  }
+  const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
+  if XNN_UNPREDICTABLE(rows <= 4) {
+    i4 = (const __fp16*) zero;
+  }
+  const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
+  if XNN_UNPREDICTABLE(rows < 6) {
+    i5 = (const __fp16*) zero;
+  }
+  const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
+  if XNN_UNPREDICTABLE(rows <= 6) {
+    i6 = (const __fp16*) zero;
+  }
+
+  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
+  for (; channels >= 16; channels -= 16) {
+    const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+    const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+
+    const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+    float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+    const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+    float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF);
+
+    const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+    const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+    const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+    const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+    const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+    const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+    const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+    const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+    vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+
+    vacc01234567 = vmulq_f16(vacc01234567, vscale);
+    vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale);
+
+    vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+    vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin);
+
+    vacc01234567 = vminq_f16(vacc01234567, vmax);
+    vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+
+    vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+    vst1q_f16(output, vacc89ABCDEF); output = (__fp16*) output + 8;
+  }
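+  // Channel remainder: 1-15 channels left. Reuse the 8-channel path, storing
+  // the last partial group with 4-, 2-, and 1-lane stores.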
+  if XNN_UNLIKELY(channels != 0) {
+    do {
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+      vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+      vacc01234567 = vmulq_f16(vacc01234567, vscale);
+      vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      if XNN_LIKELY(channels >= 8) {
+        vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+        channels -= 8;
+      } else {
+        float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+        if (channels & 4) {
+          vst1_f16(output, vacc0123); output = (__fp16*) output + 4;
+          vacc0123 = vget_high_f16(vacc01234567);
+        }
+        if (channels & 2) {
+          vst1_lane_u32(output, vreinterpret_u32_f16(vacc0123), 0); output = (__fp16*) output + 2;
+          vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+        }
+        if (channels & 1) {
+          vst1_lane_f16(output, vacc0123, 0); output = (__fp16*) output + 1;
+        }
+        channels = 0;
+      }
+    } while (channels != 0);
+  }
+}
diff --git a/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c24.c b/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c24.c
new file mode 100644
index 0000000..65c4aba
--- /dev/null
+++ b/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c24.c
@@ -0,0 +1,160 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-gavgpool/unipass-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24(
+    size_t rows,
+    size_t channels,
+    const void* input,
+    size_t input_stride,
+    const void* zero,
+    void* output,
+    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(rows != 0);
+  assert(rows <= 7);
+  assert(channels != 0);
+
+  const __fp16* i0 = input;
+  const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
+  if XNN_UNPREDICTABLE(rows < 2) {
+    i1 = (const __fp16*) zero;
+  }
+  const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
+  if XNN_UNPREDICTABLE(rows <= 2) {
+    i2 = (const __fp16*) zero;
+  }
+  const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
+  if XNN_UNPREDICTABLE(rows < 4) {
+    i3 = (const __fp16*) zero;
+  }
+  const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
+  if XNN_UNPREDICTABLE(rows <= 4) {
+    i4 = (const __fp16*) zero;
+  }
+  const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
+  if XNN_UNPREDICTABLE(rows < 6) {
+    i5 = (const __fp16*) zero;
+  }
+  const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
+  if XNN_UNPREDICTABLE(rows <= 6) {
+    i6 = (const __fp16*) zero;
+  }
+
+  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
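+  // Main loop: accumulate the 7 row pointers over 24 channels, scale, clamp.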
+  for (; channels >= 24; channels -= 24) {
+    const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+    const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+    const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;
+
+    const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+    float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+    const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+    float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF);
+    const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
+    float16x8_t vaccGHIJKLMN = vaddq_f16(vi0xGHIJKLMN, vi1xGHIJKLMN);
+
+    const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+    const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+    const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
+    const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+    const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+    const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
+    const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+    const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+    const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
+    const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+    const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+    const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
+    vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);
+
+    vacc01234567 = vmulq_f16(vacc01234567, vscale);
+    vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale);
+    vaccGHIJKLMN = vmulq_f16(vaccGHIJKLMN, vscale);
+
+    vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+    vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin);
+    vaccGHIJKLMN = vmaxq_f16(vaccGHIJKLMN, vmin);
+
+    vacc01234567 = vminq_f16(vacc01234567, vmax);
+    vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+    vaccGHIJKLMN = vminq_f16(vaccGHIJKLMN, vmax);
+
+    vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+    vst1q_f16(output, vacc89ABCDEF); output = (__fp16*) output + 8;
+    vst1q_f16(output, vaccGHIJKLMN); output = (__fp16*) output + 8;
+  }
+  if XNN_UNLIKELY(channels != 0) {
+    do {
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+      vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+      vacc01234567 = vmulq_f16(vacc01234567, vscale);
+      vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      if XNN_LIKELY(channels >= 8) {
+        vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+        channels -= 8;
+      } else {
+        float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+        if (channels & 4) {
+          vst1_f16(output, vacc0123); output = (__fp16*) output + 4;
+          vacc0123 = vget_high_f16(vacc01234567);
+        }
+        if (channels & 2) {
+          vst1_lane_u32(output, vreinterpret_u32_f16(vacc0123), 0); output = (__fp16*) output + 2;
+          vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+        }
+        if (channels & 1) {
+          vst1_lane_f16(output, vacc0123, 0); output = (__fp16*) output + 1;
+        }
+        channels = 0;
+      }
+    } while (channels != 0);
+  }
+}
diff --git a/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c32.c b/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c32.c
new file mode 100644
index 0000000..c54c164
--- /dev/null
+++ b/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c32.c
@@ -0,0 +1,177 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-gavgpool/unipass-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32(
+    size_t rows,
+    size_t channels,
+    const void* input,
+    size_t input_stride,
+    const void* zero,
+    void* output,
+    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(rows != 0);
+  assert(rows <= 7);
+  assert(channels != 0);
+
+  const __fp16* i0 = input;
+  const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
+  if XNN_UNPREDICTABLE(rows < 2) {
+    i1 = (const __fp16*) zero;
+  }
+  const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
+  if XNN_UNPREDICTABLE(rows <= 2) {
+    i2 = (const __fp16*) zero;
+  }
+  const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
+  if XNN_UNPREDICTABLE(rows < 4) {
+    i3 = (const __fp16*) zero;
+  }
+  const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
+  if XNN_UNPREDICTABLE(rows <= 4) {
+    i4 = (const __fp16*) zero;
+  }
+  const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
+  if XNN_UNPREDICTABLE(rows < 6) {
+    i5 = (const __fp16*) zero;
+  }
+  const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
+  if XNN_UNPREDICTABLE(rows <= 6) {
+    i6 = (const __fp16*) zero;
+  }
+
+  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
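+  // Main loop: accumulate the 7 row pointers over 32 channels (four q
+  // registers), then scale and clamp.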
+  for (; channels >= 32; channels -= 32) {
+    const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi0x89ABCDEF = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi0xGHIJKLMN = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi0xOPQRSTUV = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+    const float16x8_t vi1x89ABCDEF = vld1q_f16(i1); i1 += 8;
+    const float16x8_t vi1xGHIJKLMN = vld1q_f16(i1); i1 += 8;
+    const float16x8_t vi1xOPQRSTUV = vld1q_f16(i1); i1 += 8;
+
+    const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+    float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+    const float16x8_t vi2x89ABCDEF = vld1q_f16(i2); i2 += 8;
+    float16x8_t vacc89ABCDEF = vaddq_f16(vi0x89ABCDEF, vi1x89ABCDEF);
+    const float16x8_t vi2xGHIJKLMN = vld1q_f16(i2); i2 += 8;
+    float16x8_t vaccGHIJKLMN = vaddq_f16(vi0xGHIJKLMN, vi1xGHIJKLMN);
+    const float16x8_t vi2xOPQRSTUV = vld1q_f16(i2); i2 += 8;
+    float16x8_t vaccOPQRSTUV = vaddq_f16(vi0xOPQRSTUV, vi1xOPQRSTUV);
+
+    const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+    const float16x8_t vi3x89ABCDEF = vld1q_f16(i3); i3 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi2x89ABCDEF);
+    const float16x8_t vi3xGHIJKLMN = vld1q_f16(i3); i3 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi2xGHIJKLMN);
+    const float16x8_t vi3xOPQRSTUV = vld1q_f16(i3); i3 += 8;
+    vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi2xOPQRSTUV);
+    const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+    const float16x8_t vi4x89ABCDEF = vld1q_f16(i4); i4 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi3x89ABCDEF);
+    const float16x8_t vi4xGHIJKLMN = vld1q_f16(i4); i4 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi3xGHIJKLMN);
+    const float16x8_t vi4xOPQRSTUV = vld1q_f16(i4); i4 += 8;
+    vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi3xOPQRSTUV);
+    const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+    const float16x8_t vi5x89ABCDEF = vld1q_f16(i5); i5 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi4x89ABCDEF);
+    const float16x8_t vi5xGHIJKLMN = vld1q_f16(i5); i5 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi4xGHIJKLMN);
+    const float16x8_t vi5xOPQRSTUV = vld1q_f16(i5); i5 += 8;
+    vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi4xOPQRSTUV);
+    const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+    const float16x8_t vi6x89ABCDEF = vld1q_f16(i6); i6 += 8;
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi5x89ABCDEF);
+    const float16x8_t vi6xGHIJKLMN = vld1q_f16(i6); i6 += 8;
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi5xGHIJKLMN);
+    const float16x8_t vi6xOPQRSTUV = vld1q_f16(i6); i6 += 8;
+    vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi5xOPQRSTUV);
+    vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+    vacc89ABCDEF = vaddq_f16(vacc89ABCDEF, vi6x89ABCDEF);
+    vaccGHIJKLMN = vaddq_f16(vaccGHIJKLMN, vi6xGHIJKLMN);
+    vaccOPQRSTUV = vaddq_f16(vaccOPQRSTUV, vi6xOPQRSTUV);
+
+    vacc01234567 = vmulq_f16(vacc01234567, vscale);
+    vacc89ABCDEF = vmulq_f16(vacc89ABCDEF, vscale);
+    vaccGHIJKLMN = vmulq_f16(vaccGHIJKLMN, vscale);
+    vaccOPQRSTUV = vmulq_f16(vaccOPQRSTUV, vscale);
+
+    vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+    vacc89ABCDEF = vmaxq_f16(vacc89ABCDEF, vmin);
+    vaccGHIJKLMN = vmaxq_f16(vaccGHIJKLMN, vmin);
+    vaccOPQRSTUV = vmaxq_f16(vaccOPQRSTUV, vmin);
+
+    vacc01234567 = vminq_f16(vacc01234567, vmax);
+    vacc89ABCDEF = vminq_f16(vacc89ABCDEF, vmax);
+    vaccGHIJKLMN = vminq_f16(vaccGHIJKLMN, vmax);
+    vaccOPQRSTUV = vminq_f16(vaccOPQRSTUV, vmax);
+
+    vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+    vst1q_f16(output, vacc89ABCDEF); output = (__fp16*) output + 8;
+    vst1q_f16(output, vaccGHIJKLMN); output = (__fp16*) output + 8;
+    vst1q_f16(output, vaccOPQRSTUV); output = (__fp16*) output + 8;
+  }
+  if XNN_UNLIKELY(channels != 0) {
+    do {
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+      vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+      vacc01234567 = vmulq_f16(vacc01234567, vscale);
+      vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      if XNN_LIKELY(channels >= 8) {
+        vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+        channels -= 8;
+      } else {
+        float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+        if (channels & 4) {
+          vst1_f16(output, vacc0123); output = (__fp16*) output + 4;
+          vacc0123 = vget_high_f16(vacc01234567);
+        }
+        if (channels & 2) {
+          vst1_lane_u32(output, vreinterpret_u32_f16(vacc0123), 0); output = (__fp16*) output + 2;
+          vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+        }
+        if (channels & 1) {
+          vst1_lane_f16(output, vacc0123, 0); output = (__fp16*) output + 1;
+        }
+        channels = 0;
+      }
+    } while (channels != 0);
+  }
+}
diff --git a/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c b/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c
new file mode 100644
index 0000000..cd95194
--- /dev/null
+++ b/src/f16-gavgpool/gen/7x-minmax-neonfp16arith-c8.c
@@ -0,0 +1,120 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f16-gavgpool/unipass-neonfp16arith.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+
+
+void xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8(
+    size_t rows,
+    size_t channels,
+    const void* input,
+    size_t input_stride,
+    const void* zero,
+    void* output,
+    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(rows != 0);
+  assert(rows <= 7);
+  assert(channels != 0);
+
+  const __fp16* i0 = input;
+  const __fp16* i1 = (const __fp16*) ((uintptr_t) i0 + input_stride);
+  if XNN_UNPREDICTABLE(rows < 2) {
+    i1 = (const __fp16*) zero;
+  }
+  const __fp16* i2 = (const __fp16*) ((uintptr_t) i1 + input_stride);
+  if XNN_UNPREDICTABLE(rows <= 2) {
+    i2 = (const __fp16*) zero;
+  }
+  const __fp16* i3 = (const __fp16*) ((uintptr_t) i2 + input_stride);
+  if XNN_UNPREDICTABLE(rows < 4) {
+    i3 = (const __fp16*) zero;
+  }
+  const __fp16* i4 = (const __fp16*) ((uintptr_t) i3 + input_stride);
+  if XNN_UNPREDICTABLE(rows <= 4) {
+    i4 = (const __fp16*) zero;
+  }
+  const __fp16* i5 = (const __fp16*) ((uintptr_t) i4 + input_stride);
+  if XNN_UNPREDICTABLE(rows < 6) {
+    i5 = (const __fp16*) zero;
+  }
+  const __fp16* i6 = (const __fp16*) ((uintptr_t) i5 + input_stride);
+  if XNN_UNPREDICTABLE(rows <= 6) {
+    i6 = (const __fp16*) zero;
+  }
+
+  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
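+  // Main loop: accumulate the 7 row pointers over 8 channels, scale, clamp.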
+  for (; channels >= 8; channels -= 8) {
+    const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+    const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+
+    const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+    float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+
+    const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+    const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+    const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+    const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+    vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+    vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+    vacc01234567 = vmulq_f16(vacc01234567, vscale);
+
+    vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+
+    vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+    vst1q_f16(output, vacc01234567); output = (__fp16*) output + 8;
+  }
+  if XNN_UNLIKELY(channels != 0) {
+    {
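+      // Tail: 1-7 channels. Compute a full 8-channel result (overreads are
+      // declared via XNN_OOB_READS) and store only the valid lanes.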
+      const float16x8_t vi0x01234567 = vld1q_f16(i0); i0 += 8;
+      const float16x8_t vi1x01234567 = vld1q_f16(i1); i1 += 8;
+
+      const float16x8_t vi2x01234567 = vld1q_f16(i2); i2 += 8;
+      float16x8_t vacc01234567 = vaddq_f16(vi0x01234567, vi1x01234567);
+
+      const float16x8_t vi3x01234567 = vld1q_f16(i3); i3 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi2x01234567);
+      const float16x8_t vi4x01234567 = vld1q_f16(i4); i4 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi3x01234567);
+      const float16x8_t vi5x01234567 = vld1q_f16(i5); i5 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi4x01234567);
+      const float16x8_t vi6x01234567 = vld1q_f16(i6); i6 += 8;
+      vacc01234567 = vaddq_f16(vacc01234567, vi5x01234567);
+      vacc01234567 = vaddq_f16(vacc01234567, vi6x01234567);
+
+      vacc01234567 = vmulq_f16(vacc01234567, vscale);
+      vacc01234567 = vmaxq_f16(vacc01234567, vmin);
+      vacc01234567 = vminq_f16(vacc01234567, vmax);
+
+      float16x4_t vacc0123 = vget_low_f16(vacc01234567);
+      if (channels & 4) {
+        vst1_f16(output, vacc0123); output = (__fp16*) output + 4;
+        vacc0123 = vget_high_f16(vacc01234567);
+      }
+      if (channels & 2) {
+        vst1_lane_u32(output, vreinterpret_u32_f16(vacc0123), 0); output = (__fp16*) output + 2;
+        vacc0123 = vext_f16(vacc0123, vacc0123, 2);
+      }
+      if (channels & 1) {
+        vst1_lane_f16(output, vacc0123, 0); output = (__fp16*) output + 1;
+      }
+    }
+  }
+}
diff --git a/src/f16-gavgpool/multipass-neonfp16arith.c.in b/src/f16-gavgpool/multipass-neonfp16arith.c.in
new file mode 100644
index 0000000..3333fc7
--- /dev/null
+++ b/src/f16-gavgpool/multipass-neonfp16arith.c.in
@@ -0,0 +1,205 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert CHANNEL_TILE % 8 == 0
+$assert CHANNEL_TILE >= 8
+$assert ROW_TILE >= 3
+$assert ROW_SUBTILE >= 3
+$assert ROW_SUBTILE <= ROW_TILE
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
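+// Multipass f16 global average pooling: the first pass sums the first ROW_TILE
+// rows into `buffer`, intermediate passes accumulate further rows on top, and
+// the final pass adds the last rows, applies the averaging scale from params,
+// and clamps the result to [min, max].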
+void xnn_f16_gavgpool_minmax_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__neonfp16arith_c${CHANNEL_TILE}(
+    size_t rows,
+    size_t channels,
+    const void* input,
+    size_t input_stride,
+    const void* zero,
+    void* buffer,
+    void* output,
+    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(rows > ${ROW_TILE});
+  assert(channels != 0);
+
+  const __fp16* i0 = input;
+  $for M in range(1, ROW_TILE):
+    const __fp16* i${M} = (const __fp16*) ((uintptr_t) i${M-1} + input_stride);
+  const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(__fp16);
+
+  __fp16* b = buffer;
+  size_t c = channels;
+  for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 8 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 8 else "c = doz(c, %d)") % CHANNEL_TILE}) {
+    $for M in range(2):
+      $for C in range(0, CHANNEL_TILE, 8):
+        const float16x8_t vi${M}x${ABC[C:C+8]} = vld1q_f16(i${M}); i${M} += 8;
+
+    $for C in range(0, CHANNEL_TILE, 8):
+      const float16x8_t vi2x${ABC[C:C+8]} = vld1q_f16(i2); i2 += 8;
+      float16x8_t vacc${ABC[C:C+8]} = vaddq_f16(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]});
+
+    $for M in range(2, ROW_TILE):
+      $for C in range(0, CHANNEL_TILE, 8):
+        $if M + 1 != ROW_TILE:
+          const float16x8_t vi${M+1}x${ABC[C:C+8]} = vld1q_f16(i${M+1}); i${M+1} += 8;
+        vacc${ABC[C:C+8]} = vaddq_f16(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
+
+    $for C in range(0, CHANNEL_TILE, 8):
+      vst1q_f16(b, vacc${ABC[C:C+8]}); b += 8;
+  }
+  $if CHANNEL_TILE > 8:
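+    // First-pass channel remainder, 8 channels at a time (doz() saturates at 0).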
+    if XNN_UNLIKELY(c != 0) {
+      do {
+        $for M in range(3):
+          const float16x8_t vi${M}x${ABC[0:8]} = vld1q_f16(i${M}); i${M} += 8;
+        float16x8_t vacc${ABC[0:8]} = vaddq_f16(vi0x${ABC[0:8]}, vi1x${ABC[0:8]});
+
+        $for M in range(2, ROW_TILE):
+          $if M + 1 != ROW_TILE:
+            const float16x8_t vi${M+1}x${ABC[0:8]} = vld1q_f16(i${M+1}); i${M+1} += 8;
+          vacc${ABC[0:8]} = vaddq_f16(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]});
+
+        vst1q_f16(b, vacc${ABC[0:8]}); b += 8;
+
+        c = doz(c, 8);
+      } while (c != 0);
+    }
+
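+  // Intermediate passes: accumulate the next rows into the buffer; the loop
+  // runs until at most ROW_SUBTILE rows remain.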
+  for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) {
+    $for M in range(ROW_SUBTILE):
+      i${M} = (const __fp16*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
+
+    __fp16* b = buffer;
+    size_t c = channels;
+    for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 8 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 8 else "c = doz(c, %d)") % CHANNEL_TILE}) {
+      float16x8_t vacc${ABC[0:8]} = vld1q_f16(b);
+      $for C in range(8, CHANNEL_TILE, 8):
+        float16x8_t vacc${ABC[C:C+8]} = vld1q_f16(b + ${C});
+
+      $for C in range(0, CHANNEL_TILE, 8):
+        const float16x8_t vi0x${ABC[C:C+8]} = vld1q_f16(i0); i0 += 8;
+
+      $for M in range(ROW_TILE):
+        $for C in range(0, CHANNEL_TILE, 8):
+          $if M + 1 != ROW_TILE:
+            const float16x8_t vi${M+1}x${ABC[C:C+8]} = vld1q_f16(i${M+1}); i${M+1} += 8;
+          vacc${ABC[C:C+8]} = vaddq_f16(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
+
+      $for C in range(0, CHANNEL_TILE, 8):
+        vst1q_f16(b, vacc${ABC[C:C+8]}); b += 8;
+    }
+    $if CHANNEL_TILE > 8:
+      if XNN_UNLIKELY(c != 0) {
+        do {
+          float16x8_t vacc${ABC[0:8]} = vld1q_f16(b);
+          const float16x8_t vi0x${ABC[0:8]} = vld1q_f16(i0); i0 += 8;
+
+          $for M in range(ROW_TILE):
+            $if M + 1 != ROW_TILE:
+              const float16x8_t vi${M+1}x${ABC[0:8]} = vld1q_f16(i${M+1}); i${M+1} += 8;
+            vacc${ABC[0:8]} = vaddq_f16(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]});
+
+          vst1q_f16(b, vacc${ABC[0:8]}); b += 8;
+
+          c = doz(c, 8);
+        } while (c != 0);
+      }
+  }
+
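+  // Final pass: advance the row pointers, redirecting any that fall past the
+  // remaining rows to the zero buffer.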
+  i0 = (const __fp16*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment);
+  $for M in range(1, ROW_SUBTILE):
+    i${M} = (const __fp16*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
+    $if M % 2 == 1:
+      if XNN_UNPREDICTABLE(rows < ${M+1}) {
+        i${M} = (const __fp16*) zero;
+      }
+    $else:
+      if XNN_UNPREDICTABLE(rows <= ${M}) {
+        i${M} = (const __fp16*) zero;
+      }
+
+  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
+  for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) {
+    $for C in range(0, CHANNEL_TILE, 8):
+      float16x8_t vacc${ABC[C:C+8]} = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+    $for C in range(0, CHANNEL_TILE, 8):
+      const float16x8_t vi0x${ABC[C:C+8]} = vld1q_f16(i0); i0 += 8;
+
+    $for M in range(ROW_TILE):
+      $for C in range(0, CHANNEL_TILE, 8):
+        $if M + 1 != ROW_TILE:
+          const float16x8_t vi${M+1}x${ABC[C:C+8]} = vld1q_f16(i${M+1}); i${M+1} += 8;
+        vacc${ABC[C:C+8]} = vaddq_f16(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
+
+    $for C in range(0, CHANNEL_TILE, 8):
+      vacc${ABC[C:C+8]} = vmulq_f16(vacc${ABC[C:C+8]}, vscale);
+
+    $for C in range(0, CHANNEL_TILE, 8):
+      vacc${ABC[C:C+8]} = vmaxq_f16(vacc${ABC[C:C+8]}, vmin);
+
+    $for C in range(0, CHANNEL_TILE, 8):
+      vacc${ABC[C:C+8]} = vminq_f16(vacc${ABC[C:C+8]}, vmax);
+
+    $for C in range(0, CHANNEL_TILE, 8):
+      vst1q_f16(output, vacc${ABC[C:C+8]}); output = (__fp16*) output + 8;
+  }
+  if XNN_UNLIKELY(channels != 0) {
+    ${"do " if CHANNEL_TILE > 8 else ""}{
+      float16x8_t vacc${ABC[0:8]} = vld1q_f16(buffer); buffer = (__fp16*) buffer + 8;
+
+      const float16x8_t vi0x${ABC[0:8]} = vld1q_f16(i0); i0 += 8;
+      $for M in range(ROW_TILE):
+        $if M + 1 != ROW_TILE:
+          const float16x8_t vi${M+1}x${ABC[0:8]} = vld1q_f16(i${M+1}); i${M+1} += 8;
+        vacc${ABC[0:8]} = vaddq_f16(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]});
+
+      vacc${ABC[0:8]} = vmulq_f16(vacc${ABC[0:8]}, vscale);
+      vacc${ABC[0:8]} = vmaxq_f16(vacc${ABC[0:8]}, vmin);
+      vacc${ABC[0:8]} = vminq_f16(vacc${ABC[0:8]}, vmax);
+
+      $if CHANNEL_TILE > 8:
+        if XNN_LIKELY(channels >= 8) {
+          vst1q_f16(output, vacc${ABC[0:8]}); output = (__fp16*) output + 8;
+          channels -= 8;
+        } else {
+          float16x4_t vacc${ABC[0:4]} = vget_low_f16(vacc${ABC[0:8]});
+          if (channels & 4) {
+            vst1_f16(output, vacc${ABC[0:4]}); output = (__fp16*) output + 4;
+            vacc${ABC[0:4]} = vget_high_f16(vacc${ABC[0:8]});
+          }
+          if (channels & 2) {
+            vst1_lane_u32(output, vreinterpret_u32_f16(vacc${ABC[0:4]}), 0); output = (__fp16*) output + 2;
+            vacc${ABC[0:4]} = vext_f16(vacc${ABC[0:4]}, vacc${ABC[0:4]}, 2);
+          }
+          if (channels & 1) {
+            vst1_lane_f16(output, vacc${ABC[0:4]}, 0); output = (__fp16*) output + 1;
+          }
+          channels = 0;
+        }
+      $else:
+        float16x4_t vacc${ABC[0:4]} = vget_low_f16(vacc${ABC[0:8]});
+        if (channels & 4) {
+          vst1_f16(output, vacc${ABC[0:4]}); output = (__fp16*) output + 4;
+          vacc${ABC[0:4]} = vget_high_f16(vacc${ABC[0:8]});
+        }
+        if (channels & 2) {
+          vst1_lane_u32(output, vreinterpret_u32_f16(vacc${ABC[0:4]}), 0); output = (__fp16*) output + 2;
+          vacc${ABC[0:4]} = vext_f16(vacc${ABC[0:4]}, vacc${ABC[0:4]}, 2);
+        }
+        if (channels & 1) {
+          vst1_lane_f16(output, vacc${ABC[0:4]}, 0); output = (__fp16*) output + 1;
+        }
+    }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""}
+  }
+}
diff --git a/src/f16-gavgpool/unipass-neonfp16arith.c.in b/src/f16-gavgpool/unipass-neonfp16arith.c.in
new file mode 100644
index 0000000..7ea356e
--- /dev/null
+++ b/src/f16-gavgpool/unipass-neonfp16arith.c.in
@@ -0,0 +1,123 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert CHANNEL_TILE % 8 == 0
+$assert CHANNEL_TILE >= 8
+$assert ROW_TILE >= 3
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gavgpool.h>
+
+
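+// Unipass f16 global average pooling: up to ROW_TILE rows are summed in a
+// single pass (absent rows read from the zero buffer), scaled by the averaging
+// scale from params, and clamped to [min, max].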
+void xnn_f16_gavgpool_minmax_ukernel_${ROW_TILE}x__neonfp16arith_c${CHANNEL_TILE}(
+    size_t rows,
+    size_t channels,
+    const void* input,
+    size_t input_stride,
+    const void* zero,
+    void* output,
+    const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
+{
+  assert(rows != 0);
+  assert(rows <= ${ROW_TILE});
+  assert(channels != 0);
+
+  const __fp16* i0 = input;
+  $for M in range(1, ROW_TILE):
+    const __fp16* i${M} = (const __fp16*) ((uintptr_t) i${M-1} + input_stride);
+    $if M % 2 == 1:
+      if XNN_UNPREDICTABLE(rows < ${M+1}) {
+        i${M} = (const __fp16*) zero;
+      }
+    $else:
+      if XNN_UNPREDICTABLE(rows <= ${M}) {
+        i${M} = (const __fp16*) zero;
+      }
+
+  const float16x8_t vscale = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.scale));
+  const float16x8_t vmin = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.min));
+  const float16x8_t vmax = vreinterpretq_f16_u16(vld1q_dup_u16(&params->neon.max));
+  for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) {
+    $for M in range(2):
+      $for C in range(0, CHANNEL_TILE, 8):
+        const float16x8_t vi${M}x${ABC[C:C+8]} = vld1q_f16(i${M}); i${M} += 8;
+
+    $for C in range(0, CHANNEL_TILE, 8):
+      const float16x8_t vi2x${ABC[C:C+8]} = vld1q_f16(i2); i2 += 8;
+      float16x8_t vacc${ABC[C:C+8]} = vaddq_f16(vi0x${ABC[C:C+8]}, vi1x${ABC[C:C+8]});
+
+    $for M in range(2, ROW_TILE):
+      $for C in range(0, CHANNEL_TILE, 8):
+        $if M + 1 != ROW_TILE:
+          const float16x8_t vi${M+1}x${ABC[C:C+8]} = vld1q_f16(i${M+1}); i${M+1} += 8;
+        vacc${ABC[C:C+8]} = vaddq_f16(vacc${ABC[C:C+8]}, vi${M}x${ABC[C:C+8]});
+
+    $for C in range(0, CHANNEL_TILE, 8):
+      vacc${ABC[C:C+8]} = vmulq_f16(vacc${ABC[C:C+8]}, vscale);
+
+    $for C in range(0, CHANNEL_TILE, 8):
+      vacc${ABC[C:C+8]} = vmaxq_f16(vacc${ABC[C:C+8]}, vmin);
+
+    $for C in range(0, CHANNEL_TILE, 8):
+      vacc${ABC[C:C+8]} = vminq_f16(vacc${ABC[C:C+8]}, vmax);
+
+    $for C in range(0, CHANNEL_TILE, 8):
+      vst1q_f16(output, vacc${ABC[C:C+8]}); output = (__fp16*) output + 8;
+  }
+  if XNN_UNLIKELY(channels != 0) {
+    ${"do " if CHANNEL_TILE > 8 else ""}{
+      $for M in range(2):
+        const float16x8_t vi${M}x${ABC[0:8]} = vld1q_f16(i${M}); i${M} += 8;
+
+      const float16x8_t vi2x${ABC[0:8]} = vld1q_f16(i2); i2 += 8;
+      float16x8_t vacc${ABC[0:8]} = vaddq_f16(vi0x${ABC[0:8]}, vi1x${ABC[0:8]});
+
+      $for M in range(2, ROW_TILE):
+        $if M + 1 != ROW_TILE:
+          const float16x8_t vi${M+1}x${ABC[0:8]} = vld1q_f16(i${M+1}); i${M+1} += 8;
+        vacc${ABC[0:8]} = vaddq_f16(vacc${ABC[0:8]}, vi${M}x${ABC[0:8]});
+
+      vacc${ABC[0:8]} = vmulq_f16(vacc${ABC[0:8]}, vscale);
+      vacc${ABC[0:8]} = vmaxq_f16(vacc${ABC[0:8]}, vmin);
+      vacc${ABC[0:8]} = vminq_f16(vacc${ABC[0:8]}, vmax);
+
+      $if CHANNEL_TILE > 8:
+        if XNN_LIKELY(channels >= 8) {
+          vst1q_f16(output, vacc${ABC[0:8]}); output = (__fp16*) output + 8;
+          channels -= 8;
+        } else {
+          float16x4_t vacc${ABC[0:4]} = vget_low_f16(vacc${ABC[0:8]});
+          if (channels & 4) {
+            vst1_f16(output, vacc${ABC[0:4]}); output = (__fp16*) output + 4;
+            vacc${ABC[0:4]} = vget_high_f16(vacc${ABC[0:8]});
+          }
+          if (channels & 2) {
+            vst1_lane_u32(output, vreinterpret_u32_f16(vacc${ABC[0:4]}), 0); output = (__fp16*) output + 2;
+            vacc${ABC[0:4]} = vext_f16(vacc${ABC[0:4]}, vacc${ABC[0:4]}, 2);
+          }
+          if (channels & 1) {
+            vst1_lane_f16(output, vacc${ABC[0:4]}, 0); output = (__fp16*) output + 1;
+          }
+          channels = 0;
+        }
+      $else:
+        float16x4_t vacc${ABC[0:4]} = vget_low_f16(vacc${ABC[0:8]});
+        if (channels & 4) {
+          vst1_f16(output, vacc${ABC[0:4]}); output = (__fp16*) output + 4;
+          vacc${ABC[0:4]} = vget_high_f16(vacc${ABC[0:8]});
+        }
+        if (channels & 2) {
+          vst1_lane_u32(output, vreinterpret_u32_f16(vacc${ABC[0:4]}), 0); output = (__fp16*) output + 2;
+          vacc${ABC[0:4]} = vext_f16(vacc${ABC[0:4]}, vacc${ABC[0:4]}, 2);
+        }
+        if (channels & 1) {
+          vst1_lane_f16(output, vacc${ABC[0:4]}, 0); output = (__fp16*) output + 1;
+        }
+    }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""}
+  }
+}
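Taken together, the unipass template's contract is simple: rows is between 1 and ROW_TILE, row pointers past rows are redirected to the zero buffer (note that rows < M+1 and rows <= M are the same predicate; the alternation keeps each pair of guards comparing against the same even constant, likely so the compiler materializes each immediate once), every row contributes its channels to an fp16 accumulator, and the result is scaled and clamped. A scalar float reference of that contract, for orientation only: the real kernel accumulates in fp16, so outputs can differ by fp16 rounding, and scale is expected to be 1.0/rows, precomputed by the caller.

    #include <stddef.h>

    static void gavgpool_unipass_ref(
        size_t rows, size_t channels,
        const float* input, size_t input_stride,  // stride in elements here; the kernel takes bytes
        float* output, float scale, float min, float max) {
      for (size_t c = 0; c < channels; c++) {
        float vacc = 0.0f;
        for (size_t r = 0; r < rows; r++) {
          vacc += input[r * input_stride + c];
        }
        vacc *= scale;
        if (vacc < min) vacc = min;
        if (vacc > max) vacc = max;
        output[c] = vacc;
      }
    }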
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c
index d77caf3..970e99c 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
   int32_t* b = buffer;
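The long sweep of generated-file hunks that follows is one mechanical fix repeated across every qs8/qu8 multipass kernel: input_increment previously subtracted an element count, round_up_po2(channels, N), from a byte quantity, 7 * input_stride. For the 1-byte int8/uint8 element types the sizeof factor is numerically a no-op, but it makes the units explicit and keeps the shared templates correct if they are instantiated for wider element types (the new f16 kernels, for instance, move 2 bytes per channel). Conceptually, with round_up_po2 written as it is commonly defined for a power-of-two q:

    #include <stddef.h>
    #include <stdint.h>

    // Round n up to a multiple of the power-of-two q.
    static inline size_t round_up_po2(size_t n, size_t q) {
      return (n + q - 1) & ~(q - 1);
    }

    // After one 7-row pass, each row pointer sits round_up_po2(channels, N)
    // *elements* into its row; input_increment must therefore be a pure byte
    // count for the rewind to land on the next group of 7 rows:
    //   i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);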
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c
index 3b3a21f..e65d7e2 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c
index 97c9ad8..2843191 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c
index fb14ee1..2b5053c 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c
index 3d382c7..5c3992f 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c
@@ -36,7 +36,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c
index fda1318..e460ba5 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c
@@ -36,7 +36,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c
index 2f6dc8a..7ce7b0f 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c
@@ -36,7 +36,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c
index 3618ffd..54ddbe3 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c
@@ -36,7 +36,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c
index cdd0162..c230b48 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(int8_t);
 
   const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c
index c0160de..bf878be 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(int8_t);
 
   const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c
index 0b7516e..6683e0b 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(int8_t);
 
   const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c
index 4eed122..4259059 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(int8_t);
 
   const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c
index 2eea1f9..5e885f8 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(int8_t);
 
   const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c
index 30ee417..e8f2342 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(int8_t);
 
   const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c
index c8d2a6e..18367b6 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c
@@ -34,7 +34,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(int8_t);
 
   const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c
index 003b08d..4846fe0 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c
@@ -34,7 +34,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(int8_t);
 
   const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c
index a3fba3c..9c03aa4 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c
@@ -34,7 +34,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(int8_t);
 
   const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c
index 854275f..52b0ef5 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t);
 
   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c
index c2000cd..078ebc1 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
 
   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c
index 81c4eaa..0a1de3b 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
 
   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c
index 25fb554..a0dc98c 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t);
 
   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c
index 048c8c8..f4141b1 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
 
   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c
index ad41d96..a7997e6 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
 
   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c
index d86b070..cec1a35 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t);
 
   const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c
index 379b363..15cbc8a 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
 
   const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c
index 152aaf3..4e81051 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
 
   const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c
index 57680e7..6a010d7 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
 
   const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c16.c b/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c16.c
index f6881f3..47fe5c8 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c16.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c16.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(int8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c24.c b/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c24.c
index 43e60ca..3f1a5b9 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c24.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c24.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c32.c b/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c32.c
index ad7a0c3..f4f0b6b 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c32.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c32.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c b/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c
index 61b0b28..aeb7a55 100644
--- a/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c
+++ b/src/qs8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c
@@ -35,7 +35,7 @@
   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(int8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/multipass-neon.c.in b/src/qs8-gavgpool/multipass-neon.c.in
index 66d5212..3178d90 100644
--- a/src/qs8-gavgpool/multipass-neon.c.in
+++ b/src/qs8-gavgpool/multipass-neon.c.in
@@ -63,9 +63,9 @@
   $for M in range(1, ROW_TILE):
     const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
   $if CHANNEL_TILE <= 16:
-    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE});
+    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T});
   $else:
-    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8);
+    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(${XINT8_T});
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->${PARAMS_STRUCT}.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/multipass-scalar.c.in b/src/qs8-gavgpool/multipass-scalar.c.in
index aa7c335..da74dd4 100644
--- a/src/qs8-gavgpool/multipass-scalar.c.in
+++ b/src/qs8-gavgpool/multipass-scalar.c.in
@@ -41,10 +41,7 @@
   const ${XINT8_T}* i0 = input;
   $for M in range(1, ROW_TILE):
     const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
-  $if CHANNEL_TILE <= 16:
-    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE});
-  $else:
-    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T});
 
   const int32_t vinit_bias = params->${PARAMS_STRUCT}.init_bias;
   int32_t* b = buffer;
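Unlike the SIMD templates, the scalar template can drop the $if CHANNEL_TILE <= 16 branch outright: the scalar variants are only instantiated with CHANNEL_TILE of 1, 2, or 4 (see the generated c1/c2/c4 files earlier in this diff), so the branch was always taken. For CHANNEL_TILE=4 the single remaining template line expands to exactly the line visible in the generated fmagic-c4 kernel above:

    const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(int8_t);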
diff --git a/src/qs8-gavgpool/multipass-sse2.c.in b/src/qs8-gavgpool/multipass-sse2.c.in
index d554063..3d319f2 100644
--- a/src/qs8-gavgpool/multipass-sse2.c.in
+++ b/src/qs8-gavgpool/multipass-sse2.c.in
@@ -38,9 +38,9 @@
   $for M in range(1, ROW_TILE):
     const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
   $if CHANNEL_TILE <= 16:
-    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE});
+    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T});
   $else:
-    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8);
+    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(${XINT8_T});
 
   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
   $if DATATYPE == "QU8":
diff --git a/src/qs8-gavgpool/multipass-sse4.c.in b/src/qs8-gavgpool/multipass-sse4.c.in
index 7d0d5ff..a71058b 100644
--- a/src/qs8-gavgpool/multipass-sse4.c.in
+++ b/src/qs8-gavgpool/multipass-sse4.c.in
@@ -41,9 +41,9 @@
   $for M in range(1, ROW_TILE):
     const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
   $if CHANNEL_TILE <= 16:
-    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE});
+    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T});
   $else:
-    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8);
+    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(${XINT8_T});
 
   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
   int32_t* b = buffer;
diff --git a/src/qs8-gavgpool/multipass-wasmsimd.c.in b/src/qs8-gavgpool/multipass-wasmsimd.c.in
index 0a2f3aa..c69100a 100644
--- a/src/qs8-gavgpool/multipass-wasmsimd.c.in
+++ b/src/qs8-gavgpool/multipass-wasmsimd.c.in
@@ -42,9 +42,9 @@
   $for M in range(1, ROW_TILE):
     const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride);
   $if CHANNEL_TILE <= 16:
-    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE});
+    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T});
   $else:
-    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8);
+    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(${XINT8_T});
 
   const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c
index db34fc5..8d178b8 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c16.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c
index c9ec0a7..45ce07c 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c24.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c
index c4f4c28..2b1ed99 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c32.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c
index 78b89eb..eebd024 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neon-c8.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neon.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c
index d9989f1..bf10e2a 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c16.c
@@ -36,7 +36,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c
index d3a230f..b5d7882 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c24.c
@@ -36,7 +36,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c
index 5b2031d..ab74d07 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c32.c
@@ -36,7 +36,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c
index d9d5c65..c6136af 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-neonv8-c8.c
@@ -36,7 +36,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->fp32_neonv8.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c
index 8b0b708..bd3c5c2 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c1.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(uint8_t);
 
   const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c
index 28b98a0..1d0e215 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c2.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(uint8_t);
 
   const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c
index a063456..f0e9929 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-fmagic-c4.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(uint8_t);
 
   const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c
index ca19e5b..793492c 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c1.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(uint8_t);
 
   const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c
index 1fba58a..87af55d 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c2.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(uint8_t);
 
   const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c
index 4b44df1..1d79de9 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-imagic-c4.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(uint8_t);
 
   const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c
index 8b8717a..dcf293e 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c1.c
@@ -34,7 +34,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 1) * sizeof(uint8_t);
 
   const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c
index 867ccad..b292217 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c2.c
@@ -34,7 +34,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(uint8_t);
 
   const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c
index a87d252..9e81485 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-scalar-lrintf-c4.c
@@ -34,7 +34,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(uint8_t);
 
   const int32_t vinit_bias = params->fp32_scalar_lrintf.init_bias;
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c
index d86d99b..69a52c5 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c16.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t);
 
   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
   const __m128i vzero = _mm_setzero_si128();
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c
index 9d4dfb8..3d8a2bb 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c24.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
 
   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
   const __m128i vzero = _mm_setzero_si128();
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c
index 6986c1d..5331476 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse2-c8.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
 
   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse2.init_bias);
   const __m128i vzero = _mm_setzero_si128();
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c
index 8f91fdd..a078432 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c16.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t);
 
   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c
index 8d5a9fe..e342669 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c24.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
 
   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c
index f7fae60..8c67640 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-sse41-c8.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
 
   const __m128i vinit_bias = _mm_load_si128((const __m128i*) params->fp32_sse4.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c
index 0825b54..715c0e3 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c16.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t);
 
   const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c
index 4d444ed..ff39ee4 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c24.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
 
   const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c
index 74363c4..4474f15 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c32.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
 
   const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c
index a8aa3d9..b31f576 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-fp32-wasmsimd-c8.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
 
   const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c16.c b/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c16.c
index 7f72be9..941c613 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c16.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c16.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 16) * sizeof(uint8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c24.c b/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c24.c
index 782659a..d214f99 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c24.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c24.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c32.c b/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c32.c
index 3c2d0b4..c5406f0 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c32.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c32.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
   int32_t* b = buffer;
diff --git a/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c b/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c
index cf78a44..4ce1a6e 100644
--- a/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c
+++ b/src/qu8-gavgpool/gen/7p7x-minmax-rndnu-neon-c8.c
@@ -35,7 +35,7 @@
   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
-  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);
+  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint8_t);
 
   const int32x4_t vinit_bias = vld1q_dup_s32(&params->rndnu_neon.init_bias);
   int32_t* b = buffer;
diff --git a/src/xnnpack/gavgpool.h b/src/xnnpack/gavgpool.h
index f45e9d2..a242a28 100644
--- a/src/xnnpack/gavgpool.h
+++ b/src/xnnpack/gavgpool.h
@@ -68,6 +68,9 @@
       const union xnn_f16_scaleminmax_params* params);
 
 DECLARE_F16_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8)
+DECLARE_F16_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16)
+DECLARE_F16_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24)
+DECLARE_F16_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32)
 
 
 #define DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(fn_name) \
@@ -81,6 +84,9 @@
       const union xnn_f16_scaleminmax_params* params);
 
 DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8)
+DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16)
+DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24)
+DECLARE_F16_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32)
 
 
 #define DECLARE_QS8_GAVGPOOL_MINMAX_MULTIPASS_UKERNEL_FUNCTION(fn_name) \
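
The gavgpool.h hunks only add invocations of the existing DECLARE_* macros for the new tile widths. For readers without the header at hand, the multipass declaration expands to roughly the prototype below; only the trailing params argument is visible in the hunk, so the earlier parameters follow the usual gavgpool convention and should be read as an assumption, not the literal macro body:

void xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16(
    size_t rows,           /* input rows to reduce; > 7 engages extra passes */
    size_t channels,       /* channels per row */
    const void* input,     /* fp16 input, rows input_stride bytes apart */
    size_t input_stride,   /* byte stride between consecutive rows */
    const void* zero,      /* shared zero row, pads the final partial pass */
    void* buffer,          /* fp16 scratch accumulator, one lane per channel */
    void* output,          /* fp16 output, one averaged value per channel */
    const union xnn_f16_scaleminmax_params* params);

The unipass (7x) variant has the same shape minus the buffer argument, since it averages at most seven rows in a single sweep.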
diff --git a/test/f16-gavgpool-minmax.cc b/test/f16-gavgpool-minmax.cc
index d3a6962..0a80ded 100644
--- a/test/f16-gavgpool-minmax.cc
+++ b/test/f16-gavgpool-minmax.cc
@@ -179,6 +179,480 @@
 
 
 #if XNN_ARCH_ARM64
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_eq_16_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(7)
+      .channels(16)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_eq_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t rows = 1; rows < 7; rows++) {
+      GAvgPoolMicrokernelTester()
+        .rows(rows)
+        .channels(16)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_eq_16_fulltile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(7)
+      .channels(16)
+      .input_stride(19)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_eq_16_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(7)
+      .channels(16)
+      .qmax(128)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_eq_16_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(7)
+      .channels(16)
+      .qmin(128)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_div_16_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      for (size_t rows = 1; rows < 7; rows++) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_lt_16_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 16; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_lt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 16; channels++) {
+      for (size_t rows = 1; rows < 7; rows++) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_lt_16_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 16; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_lt_16_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 16; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_gt_16_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 17; channels < 32; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 17; channels < 32; channels++) {
+      for (size_t rows = 1; rows < 7; rows++) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_gt_16_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 17; channels < 32; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C16, channels_gt_16_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 17; channels < 32; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_eq_24_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(7)
+      .channels(24)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_eq_24_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t rows = 1; rows < 7; rows++) {
+      GAvgPoolMicrokernelTester()
+        .rows(rows)
+        .channels(24)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_eq_24_fulltile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(7)
+      .channels(24)
+      .input_stride(29)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_eq_24_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(7)
+      .channels(24)
+      .qmax(128)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_eq_24_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(7)
+      .channels(24)
+      .qmin(128)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_div_24_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 48; channels < 192; channels += 24) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_div_24_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 48; channels < 192; channels += 24) {
+      for (size_t rows = 1; rows < 7; rows++) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_lt_24_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 24; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_lt_24_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 24; channels++) {
+      for (size_t rows = 1; rows < 7; rows++) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_lt_24_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 24; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_lt_24_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 24; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_gt_24_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 25; channels < 48; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_gt_24_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 25; channels < 48; channels++) {
+      for (size_t rows = 1; rows < 7; rows++) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_gt_24_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 25; channels < 48; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C24, channels_gt_24_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 25; channels < 48; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_eq_32_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(7)
+      .channels(32)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_eq_32_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t rows = 1; rows < 7; rows++) {
+      GAvgPoolMicrokernelTester()
+        .rows(rows)
+        .channels(32)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_eq_32_fulltile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(7)
+      .channels(32)
+      .input_stride(37)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_eq_32_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(7)
+      .channels(32)
+      .qmax(128)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_eq_32_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(7)
+      .channels(32)
+      .qmin(128)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_div_32_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 64; channels < 256; channels += 32) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_div_32_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 64; channels < 256; channels += 32) {
+      for (size_t rows = 1; rows < 7; rows++) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_lt_32_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 32; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_lt_32_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 32; channels++) {
+      for (size_t rows = 1; rows < 7; rows++) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_lt_32_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 32; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_lt_32_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 32; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_gt_32_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 33; channels < 64; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_gt_32_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 33; channels < 64; channels++) {
+      for (size_t rows = 1; rows < 7; rows++) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_gt_32_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 33; channels < 64; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7X__NEONFP16ARITH_C32, channels_gt_32_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 33; channels < 64; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(7)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+#endif  // XNN_ARCH_ARM64
+
+
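+
The 7x tests above cover the unipass kernels; the 7p7x tests that follow cover the multipass kernels, which is why their matrix gains a 2-pass/multipass rows axis. A sketch of the pass structure the row counts are probing, illustrative names only:

#include <stddef.h>

/* Pass structure only; the real kernels vectorize per-channel. */
void gavgpool_multipass_shape(size_t rows) {
  /* First pass: accumulate 7 rows into the scratch buffer. */
  size_t remaining = rows - 7;
  /* Middle passes: add 7 more rows to the buffer each time. */
  while (remaining > 7) {
    remaining -= 7;
  }
  /* Final pass: up to 7 rows (short tails read the shared zero row),
   * then scale each per-channel sum by 1/rows and clamp to [min, max]. */
}

In these terms, rows == 14 is the minimal 2-pass case, rows in 8..13 exercises a partial final pass, and rows from 14 to 35 in steps of 7 adds middle passes.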
+#if XNN_ARCH_ARM64
   TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C8, channels_eq_8_2pass_fulltile) {
     TEST_REQUIRES_ARM_NEON_FP16_ARITH;
     GAvgPoolMicrokernelTester()
@@ -441,3 +915,798 @@
     }
   }
 #endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_2pass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(14)
+      .channels(16)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_2pass_fulltile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(14)
+      .channels(16)
+      .input_stride(19)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_2pass_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(14)
+      .channels(16)
+      .qmax(128)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_2pass_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(14)
+      .channels(16)
+      .qmin(128)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_2pass_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t rows = 8; rows < 14; rows++) {
+      GAvgPoolMicrokernelTester()
+        .rows(rows)
+        .channels(16)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_2pass_subtile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t rows = 8; rows < 14; rows++) {
+      GAvgPoolMicrokernelTester()
+        .rows(rows)
+        .channels(16)
+        .input_stride(19)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_multipass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t rows = 14; rows <= 35; rows += 7) {
+      GAvgPoolMicrokernelTester()
+        .rows(rows)
+        .channels(16)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_eq_16_multipass_fulltile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t rows = 14; rows <= 35; rows += 7) {
+      GAvgPoolMicrokernelTester()
+        .rows(rows)
+        .channels(16)
+        .input_stride(19)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_div_16_2pass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_div_16_2pass_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      for (size_t rows = 8; rows < 14; rows++) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_div_16_multipass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      for (size_t rows = 14; rows <= 35; rows += 7) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_div_16_multipass_fulltile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      for (size_t rows = 14; rows <= 35; rows += 7) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .input_stride(263)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_lt_16_2pass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 16; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_lt_16_2pass_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 16; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_lt_16_2pass_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 16; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_lt_16_2pass_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 16; channels++) {
+      for (size_t rows = 8; rows < 14; rows++) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_lt_16_multipass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 16; channels++) {
+      for (size_t rows = 14; rows <= 35; rows += 7) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_lt_16_multipass_fulltile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 16; channels++) {
+      for (size_t rows = 14; rows <= 35; rows += 7) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .input_stride(19)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_gt_16_2pass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 17; channels < 32; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_gt_16_2pass_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 17; channels < 32; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_gt_16_2pass_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 17; channels < 32; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_gt_16_2pass_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 17; channels < 32; channels++) {
+      for (size_t rows = 8; rows < 14; rows++) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_gt_16_multipass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 17; channels < 32; channels++) {
+      for (size_t rows = 14; rows < 35; rows += 14) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C16, channels_gt_16_multipass_fulltile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 17; channels < 32; channels++) {
+      for (size_t rows = 14; rows < 35; rows += 14) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .input_stride(47)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_2pass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(14)
+      .channels(24)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_2pass_fulltile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(14)
+      .channels(24)
+      .input_stride(29)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_2pass_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(14)
+      .channels(24)
+      .qmax(128)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_2pass_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(14)
+      .channels(24)
+      .qmin(128)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_2pass_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t rows = 8; rows < 14; rows++) {
+      GAvgPoolMicrokernelTester()
+        .rows(rows)
+        .channels(24)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_2pass_subtile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t rows = 8; rows < 14; rows++) {
+      GAvgPoolMicrokernelTester()
+        .rows(rows)
+        .channels(24)
+        .input_stride(29)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_multipass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t rows = 14; rows <= 35; rows += 7) {
+      GAvgPoolMicrokernelTester()
+        .rows(rows)
+        .channels(24)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_eq_24_multipass_fulltile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t rows = 14; rows <= 35; rows += 7) {
+      GAvgPoolMicrokernelTester()
+        .rows(rows)
+        .channels(24)
+        .input_stride(29)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_div_24_2pass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 48; channels < 192; channels += 24) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_div_24_2pass_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 48; channels < 192; channels += 24) {
+      for (size_t rows = 8; rows < 14; rows++) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_div_24_multipass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 48; channels < 192; channels += 24) {
+      for (size_t rows = 14; rows <= 35; rows += 7) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_div_24_multipass_fulltile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 48; channels < 192; channels += 24) {
+      for (size_t rows = 14; rows <= 35; rows += 7) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .input_stride(389)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_lt_24_2pass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 24; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_lt_24_2pass_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 24; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_lt_24_2pass_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 24; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_lt_24_2pass_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 24; channels++) {
+      for (size_t rows = 8; rows < 14; rows++) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_lt_24_multipass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 24; channels++) {
+      for (size_t rows = 14; rows <= 35; rows += 7) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_lt_24_multipass_fulltile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 24; channels++) {
+      for (size_t rows = 14; rows <= 35; rows += 7) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .input_stride(29)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_gt_24_2pass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 25; channels < 48; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_gt_24_2pass_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 25; channels < 48; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_gt_24_2pass_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 25; channels < 48; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_gt_24_2pass_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 25; channels < 48; channels++) {
+      for (size_t rows = 8; rows < 14; rows++) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_gt_24_multipass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 25; channels < 48; channels++) {
+      for (size_t rows = 14; rows < 35; rows += 14) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C24, channels_gt_24_multipass_fulltile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 25; channels < 48; channels++) {
+      for (size_t rows = 14; rows < 35; rows += 14) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .input_stride(61)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_2pass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(14)
+      .channels(32)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_2pass_fulltile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(14)
+      .channels(32)
+      .input_stride(37)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_2pass_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(14)
+      .channels(32)
+      .qmax(128)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_2pass_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    GAvgPoolMicrokernelTester()
+      .rows(14)
+      .channels(32)
+      .qmin(128)
+      .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_2pass_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t rows = 8; rows < 14; rows++) {
+      GAvgPoolMicrokernelTester()
+        .rows(rows)
+        .channels(32)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_2pass_subtile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t rows = 8; rows < 14; rows++) {
+      GAvgPoolMicrokernelTester()
+        .rows(rows)
+        .channels(32)
+        .input_stride(37)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_multipass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t rows = 14; rows <= 35; rows += 7) {
+      GAvgPoolMicrokernelTester()
+        .rows(rows)
+        .channels(32)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_eq_32_multipass_fulltile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t rows = 14; rows <= 35; rows += 7) {
+      GAvgPoolMicrokernelTester()
+        .rows(rows)
+        .channels(32)
+        .input_stride(37)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_div_32_2pass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 64; channels < 256; channels += 32) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_div_32_2pass_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 64; channels < 256; channels += 32) {
+      for (size_t rows = 8; rows < 14; rows++) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_div_32_multipass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 64; channels < 256; channels += 32) {
+      for (size_t rows = 14; rows <= 35; rows += 7) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_div_32_multipass_fulltile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 64; channels < 256; channels += 32) {
+      for (size_t rows = 14; rows <= 35; rows += 7) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .input_stride(521)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_lt_32_2pass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 32; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_lt_32_2pass_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 32; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_lt_32_2pass_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 32; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_lt_32_2pass_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 32; channels++) {
+      for (size_t rows = 8; rows < 14; rows++) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_lt_32_multipass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 32; channels++) {
+      for (size_t rows = 14; rows <= 35; rows += 7) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_lt_32_multipass_fulltile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 1; channels < 32; channels++) {
+      for (size_t rows = 14; rows <= 35; rows += 7) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .input_stride(37)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_gt_32_2pass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 33; channels < 64; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_gt_32_2pass_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 33; channels < 64; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_gt_32_2pass_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 33; channels < 64; channels++) {
+      GAvgPoolMicrokernelTester()
+        .rows(14)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_gt_32_2pass_subtile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 33; channels < 64; channels++) {
+      for (size_t rows = 8; rows < 14; rows++) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_gt_32_multipass_fulltile) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 33; channels < 64; channels++) {
+      for (size_t rows = 14; rows < 35; rows += 14) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+
+  TEST(F16_GAVGPOOL_MINMAX_7P7X__NEONFP16ARITH_C32, channels_gt_32_multipass_fulltile_with_input_stride) {
+    TEST_REQUIRES_ARM_NEON_FP16_ARITH;
+    for (size_t channels = 33; channels < 64; channels++) {
+      for (size_t rows = 14; rows < 35; rows += 14) {
+        GAvgPoolMicrokernelTester()
+          .rows(rows)
+          .channels(channels)
+          .input_stride(79)
+          .Test(xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32, xnn_init_f16_scaleminmax_neon_params);
+      }
+    }
+  }
+#endif  // XNN_ARCH_ARM64
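
Each new tile width gets the same generated matrix: channels equal to, divisible by, below, and above the tile width C, crossed with full-tile and sub-tile rows (and the multipass axis above), plus qmin/qmax saturation and non-contiguous input_stride variants. The channel sweeps follow one pattern; a compact sketch matching the loop bounds used above for C = 16, 24, 32:

#include <stddef.h>
#include <stdio.h>

/* Prints the channel values the generated tests sweep for tile width c.
 * Purely illustrative; the generator emits one TEST per case instead. */
void sweep_channels(size_t c) {
  printf("eq:  %zu\n", c);                                  /* channels_eq  */
  for (size_t n = 2 * c; n < 8 * c; n += c) printf("div: %zu\n", n);
  for (size_t n = 1; n < c; n++)            printf("lt:  %zu\n", n);
  for (size_t n = c + 1; n < 2 * c; n++)    printf("gt:  %zu\n", n);
}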
diff --git a/test/f16-gavgpool-minmax.yaml b/test/f16-gavgpool-minmax.yaml
index 14ffc00..224fd1b 100644
--- a/test/f16-gavgpool-minmax.yaml
+++ b/test/f16-gavgpool-minmax.yaml
@@ -6,7 +6,31 @@
   init: xnn_init_f16_scaleminmax_neon_params
   arch:
     - aarch64
+- name: xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c16
+  init: xnn_init_f16_scaleminmax_neon_params
+  arch:
+    - aarch64
+- name: xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c24
+  init: xnn_init_f16_scaleminmax_neon_params
+  arch:
+    - aarch64
+- name: xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c32
+  init: xnn_init_f16_scaleminmax_neon_params
+  arch:
+    - aarch64
 - name: xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8
   init: xnn_init_f16_scaleminmax_neon_params
   arch:
     - aarch64
+- name: xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c16
+  init: xnn_init_f16_scaleminmax_neon_params
+  arch:
+    - aarch64
+- name: xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c24
+  init: xnn_init_f16_scaleminmax_neon_params
+  arch:
+    - aarch64
+- name: xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c32
+  init: xnn_init_f16_scaleminmax_neon_params
+  arch:
+    - aarch64