Refactor MaxPool and ArgMaxPool micro-kernels

- Support input_offset argument in MaxPool and ArgMaxPool micro-kernels
- Use input_offset to make indirection buffer independent of batch size
- Simplify and auto-generate unit tests
- Use more descriptive names for micro-kernel parameters

PiperOrigin-RevId: 281447682
diff --git a/BUILD.bazel b/BUILD.bazel
index 463b36a..31f569d 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -74,9 +74,9 @@
 ]
 
 SCALAR_UKERNELS = [
-    "src/f32-argmaxpool/mp9p8q-scalar.c",
-    "src/f32-argmaxpool/up4-scalar.c",
-    "src/f32-argmaxpool/up9-scalar.c",
+    "src/f32-argmaxpool/9p8x-scalar-c1.c",
+    "src/f32-argmaxpool/4x-scalar-c1.c",
+    "src/f32-argmaxpool/9x-scalar-c1.c",
     "src/f32-avgpool/mp9p8q-scalar.c",
     "src/f32-avgpool/up9-scalar.c",
     "src/f32-bilinear/scalar-c1.c",
@@ -136,7 +136,7 @@
     "src/f32-gemminc/2x4-scalar.c",
     "src/f32-gemminc/4x4-scalar.c",
     "src/f32-hswish/scalar.c",
-    "src/f32-maxpool/9p8q-scalar.c",
+    "src/f32-maxpool/9p8x-scalar-c1.c",
     "src/f32-pavgpool/mp9p8q-scalar.c",
     "src/f32-pavgpool/up9-scalar.c",
     "src/f32-ppmm/2x4-scalar.c",
@@ -169,7 +169,7 @@
     "src/q8-vadd/scalar.c",
     "src/u8-clamp/scalar.c",
     "src/u8-lut32norm/scalar.c",
-    "src/u8-maxpool/9p8q-scalar.c",
+    "src/u8-maxpool/9p8x-scalar-c1.c",
     "src/u8-rmax/scalar.c",
     "src/x32-packx/x2-scalar.c",
     "src/x32-packx/x3-scalar.c",
@@ -188,9 +188,9 @@
 ]
 
 PSIMD_UKERNELS = [
-    "src/f32-argmaxpool/mp9p8q-psimd.c",
-    "src/f32-argmaxpool/up4-psimd.c",
-    "src/f32-argmaxpool/up9-psimd.c",
+    "src/f32-argmaxpool/9p8x-psimd-c4.c",
+    "src/f32-argmaxpool/4x-psimd-c4.c",
+    "src/f32-argmaxpool/9x-psimd-c4.c",
     "src/f32-avgpool/mp9p8q-psimd.c",
     "src/f32-avgpool/up9-psimd.c",
     "src/f32-bilinear/psimd-c4.c",
@@ -253,7 +253,7 @@
     "src/f32-gemminc/6x8-psimd-splat.c",
     "src/f32-gemminc/6x8s4-psimd.c",
     "src/f32-hswish/psimd.c",
-    "src/f32-maxpool/9p8q-psimd.c",
+    "src/f32-maxpool/9p8x-psimd-c4.c",
     "src/f32-pavgpool/mp9p8q-psimd.c",
     "src/f32-pavgpool/up9-psimd.c",
     "src/f32-ppmm/4x8-psimd.c",
@@ -337,7 +337,7 @@
     "src/q8-igemm/8x8-neon.c",
     "src/q8-vadd/neon.c",
     "src/u8-clamp/neon.c",
-    "src/u8-maxpool/9p8q-neon.c",
+    "src/u8-maxpool/9p8x-neon-c16.c",
     "src/u8-rmax/neon.c",
     "src/x32-packx/x4-neon-st4.c",
     "src/x32-pad/x2-neon.c",
@@ -476,7 +476,7 @@
     "src/f32-gemminc/4x8-sse-load1.c",
     "src/f32-gemminc/4x8s4-sse.c",
     "src/f32-hswish/sse.c",
-    "src/f32-maxpool/9p8q-sse.c",
+    "src/f32-maxpool/9p8x-sse-c4.c",
     "src/f32-pavgpool/mp9p8q-sse.c",
     "src/f32-pavgpool/up9-sse.c",
     "src/f32-dwconv-spchw/3x3p1-sse.c",
@@ -491,9 +491,9 @@
 ]
 
 SSE2_UKERNELS = [
-    "src/f32-argmaxpool/mp9p8q-sse2.c",
-    "src/f32-argmaxpool/up4-sse2.c",
-    "src/f32-argmaxpool/up9-sse2.c",
+    "src/f32-argmaxpool/9p8x-sse2-c4.c",
+    "src/f32-argmaxpool/4x-sse2-c4.c",
+    "src/f32-argmaxpool/9x-sse2-c4.c",
     "src/f32-prelu/sse2-2x4.c",
     "src/f32-prelu/sse2-2x8.c",
     "src/f32-sigmoid/sse2-p5-div-x8.c",
@@ -508,7 +508,7 @@
     "src/q8-gemm/4x4c2-sse2.c",
     "src/q8-vadd/sse2.c",
     "src/u8-clamp/sse2.c",
-    "src/u8-maxpool/9p8q-sse2.c",
+    "src/u8-maxpool/9p8x-sse2-c16.c",
     "src/u8-rmax/sse2.c",
     "src/x32-pad/x2-sse2.c",
     "src/x32-zip/x2-sse2.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 62d732c..4cdae83 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -225,7 +225,7 @@
   src/f32-gemminc/2x4-scalar.c
   src/f32-gemminc/4x4-scalar.c
   src/f32-hswish/scalar.c
-  src/f32-maxpool/9p8q-scalar.c
+  src/f32-maxpool/9p8x-scalar-c1.c
   src/f32-pavgpool/mp9p8q-scalar.c
   src/f32-pavgpool/up9-scalar.c
   src/f32-ppmm/2x4-scalar.c
@@ -258,7 +258,7 @@
   src/q8-vadd/scalar.c
   src/u8-clamp/scalar.c
   src/u8-lut32norm/scalar.c
-  src/u8-maxpool/9p8q-scalar.c
+  src/u8-maxpool/9p8x-scalar-c1.c
   src/u8-rmax/scalar.c
   src/x32-packx/x2-scalar.c
   src/x32-packx/x3-scalar.c
@@ -341,7 +341,7 @@
   src/f32-gemminc/6x8-psimd-splat.c
   src/f32-gemminc/6x8s4-psimd.c
   src/f32-hswish/psimd.c
-  src/f32-maxpool/9p8q-psimd.c
+  src/f32-maxpool/9p8x-psimd-c4.c
   src/f32-pavgpool/mp9p8q-psimd.c
   src/f32-pavgpool/up9-psimd.c
   src/f32-ppmm/4x8-psimd.c
@@ -423,7 +423,7 @@
   src/q8-gemm/8x8-neon.c
   src/q8-vadd/neon.c
   src/u8-clamp/neon.c
-  src/u8-maxpool/9p8q-neon.c
+  src/u8-maxpool/9p8x-neon-c16.c
   src/u8-rmax/neon.c
   src/x32-packx/x4-neon-st4.c
   src/x32-pad/x2-neon.c
@@ -558,7 +558,7 @@
   src/f32-gemminc/4x8-sse-load1.c
   src/f32-gemminc/4x8s4-sse.c
   src/f32-hswish/sse.c
-  src/f32-maxpool/9p8q-sse.c
+  src/f32-maxpool/9p8x-sse-c4.c
   src/f32-pavgpool/mp9p8q-sse.c
   src/f32-pavgpool/up9-sse.c
   src/f32-dwconv-spchw/3x3p1-sse.c
@@ -589,7 +589,7 @@
   src/q8-gemm/4x4c2-sse2.c
   src/q8-vadd/sse2.c
   src/u8-clamp/sse2.c
-  src/u8-maxpool/9p8q-sse2.c
+  src/u8-maxpool/9p8x-sse2-c16.c
   src/u8-rmax/sse2.c
   src/x32-pad/x2-sse2.c
   src/x32-zip/x2-sse2.c
diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh
index 821f456..7c03298 100755
--- a/scripts/generate-tests.sh
+++ b/scripts/generate-tests.sh
@@ -12,3 +12,9 @@
 ### Tests for packing micro-kernels
 tools/generate-pack-test.py --spec test/x32-packx.yaml --output test/x32-packx.cc
 
+### Tests for MaxPool micro-kernels
+tools/generate-maxpool-test.py --spec test/u8-maxpool.yaml --output test/u8-maxpool.cc
+tools/generate-maxpool-test.py --spec test/f32-maxpool.yaml --output test/f32-maxpool.cc
+
+### Tests for ArgMaxPool micro-kernels
+tools/generate-argmaxpool-test.py --spec test/f32-argmaxpool.yaml --output test/f32-argmaxpool.cc
diff --git a/src/argmax-pooling-nhwc.c b/src/argmax-pooling-nhwc.c
index e4c359a..7c02073 100644
--- a/src/argmax-pooling-nhwc.c
+++ b/src/argmax-pooling-nhwc.c
@@ -208,20 +208,6 @@
   argmax_pooling_op->output_width = compute_output_dimension(
       argmax_pooling_op->padding_left + input_width + argmax_pooling_op->padding_right,
       argmax_pooling_op->kernel_width);
-  argmax_pooling_op->output = output;
-
-  size_t valid_batch_size = 0;
-  if (input == argmax_pooling_op->last_input &&
-      input_height == argmax_pooling_op->last_input_height &&
-      input_width == argmax_pooling_op->last_input_width)
-  {
-    valid_batch_size = argmax_pooling_op->valid_batch_size;
-    if (batch_size <= valid_batch_size) {
-      argmax_pooling_op->compute.range[0] = batch_size;
-      argmax_pooling_op->state = xnn_run_state_ready;
-      return xnn_status_success;
-    }
-  }
 
   const size_t pooling_height = argmax_pooling_op->kernel_height;
   const size_t pooling_width = argmax_pooling_op->kernel_width;
@@ -233,17 +219,26 @@
 
   const size_t step_width = pooling_width;
   const size_t step_height = pooling_size + (output_width * step_width - 1) * pooling_height;
-  // Micro-kernel may read up to (mr - 1) elements after the end of indirection buffer.
-  const size_t indirection_buffer_size = sizeof(void*) * ((mr - 1) + batch_size * output_height * step_height);
 
-  const void** indirection_buffer = (const void**) xnn_reallocate_memory(argmax_pooling_op->indirection_buffer, indirection_buffer_size);
-  if (indirection_buffer == NULL) {
-    xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
-    return xnn_status_out_of_memory;
+  if (input_height != argmax_pooling_op->last_input_height ||
+      input_width != argmax_pooling_op->last_input_width)
+  {
+    // Micro-kernel may read up to (mr - 1) elements after the end of indirection buffer.
+    const size_t indirection_buffer_size = sizeof(void*) * ((mr - 1) + output_height * step_height);
+
+    const void** indirection_buffer = (const void**) xnn_reallocate_memory(argmax_pooling_op->indirection_buffer, indirection_buffer_size);
+    if (indirection_buffer == NULL) {
+      xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
+      return xnn_status_out_of_memory;
+    }
+    argmax_pooling_op->indirection_buffer = indirection_buffer;
+
+    xnn_indirection_init_maxpool2d(argmax_pooling_op, step_height, step_width, 2 /* log2(sizeof(float)) */);
+
+    argmax_pooling_op->last_input = input;
+    argmax_pooling_op->last_input_height = input_height;
+    argmax_pooling_op->last_input_width = input_width;
   }
-  argmax_pooling_op->indirection_buffer = indirection_buffer;
-
-  xnn_indirection_init_maxpool2d(argmax_pooling_op, valid_batch_size, step_height, step_width, 2 /* log2(sizeof(float)) */);
 
   const size_t channels = argmax_pooling_op->channels;
 
@@ -255,21 +250,22 @@
   const uint32_t qr = ukernel->qr;
   const size_t multipass_adjustment = qr == 0 ? 0 : round_up(pooling_size - mr, qr) + mr - qr;
   argmax_pooling_op->context.argmax_pooling = (struct argmax_pooling_context) {
-      .indirect_input = indirection_buffer,
-      .indirect_input_batch_stride = output_height * indirect_input_height_stride,
-      .indirect_input_height_stride = indirect_input_height_stride,
-      .output = output,
-      .output_batch_stride = output_height * output_height_stride,
-      .output_height_stride = output_height_stride,
-      .output_width = output_width,
-      .index = index,
-      .index_batch_stride = output_height * index_height_stride,
-      .index_height_stride = index_height_stride,
-      .pooling_size = pooling_size,
-      .channels = channels,
-      .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
-      .output_increment = output_width_stride - channels * sizeof(float),
-      .params.f32 = argmax_pooling_op->f32_output_params,
+    .indirect_input = argmax_pooling_op->indirection_buffer,
+    .indirect_input_height_stride = indirect_input_height_stride,
+    .input_offset = (size_t) ((uintptr_t) input - (uintptr_t) argmax_pooling_op->last_input),
+    .input_batch_stride = input_height * input_width * argmax_pooling_op->input_pixel_stride * sizeof(float),
+    .output = output,
+    .output_batch_stride = output_height * output_height_stride,
+    .output_height_stride = output_height_stride,
+    .output_width = output_width,
+    .index = index,
+    .index_batch_stride = output_height * index_height_stride,
+    .index_height_stride = index_height_stride,
+    .pooling_size = pooling_size,
+    .channels = channels,
+    .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
+    .output_increment = output_width_stride - channels * sizeof(float),
+    .params.f32 = argmax_pooling_op->f32_output_params,
   };
   argmax_pooling_op->compute.type = xnn_parallelization_type_2d;
   argmax_pooling_op->compute.range[0] = batch_size;
@@ -284,10 +280,6 @@
   }
   argmax_pooling_op->state = xnn_run_state_ready;
 
-  argmax_pooling_op->last_input = input;
-  argmax_pooling_op->last_input_height = input_height;
-  argmax_pooling_op->last_input_width = input_width;
-  argmax_pooling_op->valid_batch_size = max(valid_batch_size, batch_size);
-
   return xnn_status_success;
 }
+
diff --git a/src/f32-argmaxpool/up4-psimd.c b/src/f32-argmaxpool/4x-psimd-c4.c
similarity index 69%
rename from src/f32-argmaxpool/up4-psimd.c
rename to src/f32-argmaxpool/4x-psimd-c4.c
index fe0f1fc..197ff72 100644
--- a/src/f32-argmaxpool/up4-psimd.c
+++ b/src/f32-argmaxpool/4x-psimd-c4.c
@@ -10,44 +10,46 @@
 #include <xnnpack/argmaxpool.h>
 
 
-void xnn_f32_argmaxpool_ukernel_up4__psimd(
-    size_t n,
-    size_t ks,
-    size_t kc,
+void xnn_f32_argmaxpool_ukernel_4x__psimd_c4(
+    size_t output_pixels,
+    size_t pooling_elements,
+    size_t channels,
     const float** input,
+    size_t input_offset,
     float* output,
     uint32_t* index,
     size_t input_increment,
     size_t output_increment,
     const union xnn_f32_output_params params[restrict static 1])
 {
-  assert(n != 0);
-  assert(ks != 0);
-  assert(ks <= 4);
-  assert(kc != 0);
+  assert(output_pixels != 0);
+  assert(pooling_elements != 0);
+  assert(pooling_elements <= 4);
+  assert(channels != 0);
 
   const psimd_f32 voutput_max = psimd_load_splat_f32(&params->scalar.max);
   const psimd_f32 voutput_min = psimd_load_splat_f32(&params->scalar.min);
   do {
-    float* o = output;
-    uint32_t* i = index;
-
     const float* i0 = input[0];
     const float* i1 = input[1];
     const float* i2 = input[2];
     const float* i3 = input[3];
-    if (ks < 2) {
+    i0 = (const float*) ((uintptr_t) i0 + input_offset);
+    i1 = (const float*) ((uintptr_t) i1 + input_offset);
+    i2 = (const float*) ((uintptr_t) i2 + input_offset);
+    i3 = (const float*) ((uintptr_t) i3 + input_offset);
+    if (pooling_elements < 2) {
       i1 = i0;
     }
-    if (ks <= 2) {
+    if (pooling_elements <= 2) {
       i2 = i0;
     }
-    if (ks != 4) {
+    if (pooling_elements != 4) {
       i3 = i0;
     }
 
-    size_t k = kc;
-    for (; k >= 4; k -= 4) {
+    size_t c = channels;
+    for (; c >= 4; c -= 4) {
       const psimd_f32 vi0 = psimd_load_f32(i0);
       i0 += 4;
       const psimd_f32 vi1 = psimd_load_f32(i1);
@@ -74,12 +76,12 @@
 
       const psimd_f32 vout = psimd_max_f32(psimd_min_f32(vmax, voutput_max), voutput_min);
 
-      psimd_store_f32(o, vout);
-      o += 4;
-      psimd_store_u32(i, vidx);
-      i += 4;
+      psimd_store_f32(output, vout);
+      output += 4;
+      psimd_store_u32(index, vidx);
+      index += 4;
     }
-    if (k != 0) {
+    if (c != 0) {
       const psimd_f32 vi0 = psimd_load_f32(i0);
       const psimd_f32 vi1 = psimd_load_f32(i1);
       const psimd_f32 vi2 = psimd_load_f32(i2);
@@ -102,23 +104,22 @@
 
       psimd_f32 vout = psimd_max_f32(psimd_min_f32(vmax, voutput_max), voutput_min);
 
-      if (k & 2) {
-        psimd_store2_f32(o, vout);
-        psimd_store2_u32(i, vidx);
+      if (c & 2) {
+        psimd_store2_f32(output, vout);
+        psimd_store2_u32(index, vidx);
         vout = psimd_concat_hi_f32(vout, vout);
         vidx = psimd_concat_hi_u32(vidx, vidx);
-        o += 2;
-        i += 2;
+        output += 2;
+        index += 2;
       }
-      if (k & 1) {
-        psimd_store1_f32(o, vout);
-        psimd_store1_u32(i, vidx);
-        o += 1;
-        i += 1;
+      if (c & 1) {
+        psimd_store1_f32(output, vout);
+        psimd_store1_u32(index, vidx);
+        output += 1;
+        index += 1;
       }
     }
     input = (const float**) ((uintptr_t) input + input_increment);
-    output = (float*) ((uintptr_t) o + output_increment);
-    index = (uint32_t*) i;
-  } while (--n != 0);
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_pixels != 0);
 }
diff --git a/src/f32-argmaxpool/up4-scalar.c b/src/f32-argmaxpool/4x-scalar-c1.c
similarity index 61%
rename from src/f32-argmaxpool/up4-scalar.c
rename to src/f32-argmaxpool/4x-scalar-c1.c
index 8b668b0..999ff5e 100644
--- a/src/f32-argmaxpool/up4-scalar.c
+++ b/src/f32-argmaxpool/4x-scalar-c1.c
@@ -9,43 +9,45 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_argmaxpool_ukernel_up4__scalar(
-    size_t n,
-    size_t ks,
-    size_t kc,
+void xnn_f32_argmaxpool_ukernel_4x__scalar_c1(
+    size_t output_pixels,
+    size_t pooling_elements,
+    size_t channels,
     const float** input,
+    size_t input_offset,
     float* output,
     uint32_t* index,
     size_t input_increment,
     size_t output_increment,
     const union xnn_f32_output_params params[restrict static 1])
 {
-  assert(n != 0);
-  assert(ks != 0);
-  assert(ks <= 4);
-  assert(kc != 0);
+  assert(output_pixels != 0);
+  assert(pooling_elements != 0);
+  assert(pooling_elements <= 4);
+  assert(channels != 0);
 
   const float voutput_max = params->scalar.max;
   const float voutput_min = params->scalar.min;
   do {
-    float* o = output;
-    uint32_t* i = index;
-
     const float* i0 = input[0];
     const float* i1 = input[1];
     const float* i2 = input[2];
     const float* i3 = input[3];
-    if (ks < 2) {
+    i0 = (const float*) ((uintptr_t) i0 + input_offset);
+    i1 = (const float*) ((uintptr_t) i1 + input_offset);
+    i2 = (const float*) ((uintptr_t) i2 + input_offset);
+    i3 = (const float*) ((uintptr_t) i3 + input_offset);
+    if (pooling_elements < 2) {
       i1 = i0;
     }
-    if (ks <= 2) {
+    if (pooling_elements <= 2) {
       i2 = i0;
     }
-    if (ks != 4) {
+    if (pooling_elements != 4) {
       i3 = i0;
     }
 
-    size_t k = kc;
+    size_t c = channels;
     do {
       const float vi0 = *i0++;
       const float vi1 = *i1++;
@@ -72,11 +74,10 @@
 
       const float vout = math_max_f32(math_min_f32(vmax, voutput_max), voutput_min);
 
-      *o++ = vout;
-      *i++ = vidx;
-    } while (--k != 0);
+      *output++ = vout;
+      *index++ = vidx;
+    } while (--c != 0);
     input = (const float**) ((uintptr_t) input + input_increment);
-    output = (float*) ((uintptr_t) o + output_increment);
-    index = (uint32_t*) i;
-  } while (--n != 0);
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_pixels != 0);
 }
diff --git a/src/f32-argmaxpool/up4-sse2.c b/src/f32-argmaxpool/4x-sse2-c4.c
similarity index 70%
rename from src/f32-argmaxpool/up4-sse2.c
rename to src/f32-argmaxpool/4x-sse2-c4.c
index 1f0e3cf..14051c8 100644
--- a/src/f32-argmaxpool/up4-sse2.c
+++ b/src/f32-argmaxpool/4x-sse2-c4.c
@@ -10,44 +10,46 @@
 #include <xnnpack/argmaxpool.h>
 
 
-void xnn_f32_argmaxpool_ukernel_up4__sse2(
-    size_t n,
-    size_t ks,
-    size_t kc,
+void xnn_f32_argmaxpool_ukernel_4x__sse2_c4(
+    size_t output_pixels,
+    size_t pooling_elements,
+    size_t channels,
     const float** input,
+    size_t input_offset,
     float* output,
     uint32_t* index,
     size_t input_increment,
     size_t output_increment,
     const union xnn_f32_output_params params[restrict static 1])
 {
-  assert(n != 0);
-  assert(ks != 0);
-  assert(ks <= 4);
-  assert(kc != 0);
+  assert(output_pixels != 0);
+  assert(pooling_elements != 0);
+  assert(pooling_elements <= 4);
+  assert(channels != 0);
 
   const __m128 voutput_max = _mm_load_ps(params->sse.max);
   const __m128 voutput_min = _mm_load_ps(params->sse.min);
   do {
-    float* o = output;
-    uint32_t* i = index;
-
     const float* i0 = input[0];
     const float* i1 = input[1];
     const float* i2 = input[2];
     const float* i3 = input[3];
-    if (ks < 2) {
+    i0 = (const float*) ((uintptr_t) i0 + input_offset);
+    i1 = (const float*) ((uintptr_t) i1 + input_offset);
+    i2 = (const float*) ((uintptr_t) i2 + input_offset);
+    i3 = (const float*) ((uintptr_t) i3 + input_offset);
+    if (pooling_elements < 2) {
       i1 = i0;
     }
-    if (ks <= 2) {
+    if (pooling_elements <= 2) {
       i2 = i0;
     }
-    if (ks != 4) {
+    if (pooling_elements != 4) {
       i3 = i0;
     }
 
-    size_t k = kc;
-    for (; k >= 4; k -= 4) {
+    size_t c = channels;
+    for (; c >= 4; c -= 4) {
       const __m128 vi0 = _mm_loadu_ps(i0);
       i0 += 4;
       const __m128 vi1 = _mm_loadu_ps(i1);
@@ -74,12 +76,12 @@
 
       const __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
 
-      _mm_storeu_ps(o, vout);
-      o += 4;
-      _mm_storeu_si128((__m128i*) i, vidx);
-      i += 4;
+      _mm_storeu_ps(output, vout);
+      output += 4;
+      _mm_storeu_si128((__m128i*) index, vidx);
+      index += 4;
     }
-    if (k != 0) {
+    if (c != 0) {
       const __m128 vi0 = _mm_loadu_ps(i0);
       const __m128 vi1 = _mm_loadu_ps(i1);
       const __m128 vi2 = _mm_loadu_ps(i2);
@@ -102,23 +104,22 @@
 
       __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
 
-      if (k & 2) {
-        _mm_store_sd((double*) o, _mm_castps_pd(vout));
-        _mm_storel_epi64((__m128i*) i, vidx);
+      if (c & 2) {
+        _mm_store_sd((double*) output, _mm_castps_pd(vout));
+        _mm_storel_epi64((__m128i*) index, vidx);
         vout = _mm_movehl_ps(vout, vout);
         vidx = _mm_unpackhi_epi64(vidx, vidx);
-        o += 2;
-        i += 2;
+        output += 2;
+        index += 2;
       }
-      if (k & 1) {
-        _mm_store_ss(o, vout);
-        *i = (uint32_t) _mm_cvtsi128_si32(vidx);
-        o += 1;
-        i += 1;
+      if (c & 1) {
+        _mm_store_ss(output, vout);
+        *index = (uint32_t) _mm_cvtsi128_si32(vidx);
+        output += 1;
+        index += 1;
       }
     }
     input = (const float**) ((uintptr_t) input + input_increment);
-    output = (float*) ((uintptr_t) o + output_increment);
-    index = (uint32_t*) i;
-  } while (--n != 0);
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_pixels != 0);
 }
diff --git a/src/f32-argmaxpool/mp9p8q-psimd.c b/src/f32-argmaxpool/9p8x-psimd-c4.c
similarity index 82%
rename from src/f32-argmaxpool/mp9p8q-psimd.c
rename to src/f32-argmaxpool/9p8x-psimd-c4.c
index 2e2564a8..74c6cd3 100644
--- a/src/f32-argmaxpool/mp9p8q-psimd.c
+++ b/src/f32-argmaxpool/9p8x-psimd-c4.c
@@ -10,12 +10,13 @@
 #include <xnnpack/argmaxpool.h>
 
 
-void xnn_f32_argmaxpool_ukernel_mp9p8q__psimd(
-    size_t n,
-    size_t ks,
-    size_t kc,
+void xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4(
+    size_t output_pixels,
+    size_t pooling_elements,
+    size_t channels,
     const float** input,
-    float* acc_buffer,
+    size_t input_offset,
+    float* accumulation_buffer,
     uint32_t* index_buffer,
     float* output,
     uint32_t* index,
@@ -23,16 +24,16 @@
     size_t output_increment,
     const union xnn_f32_output_params params[restrict static 1])
 {
-  assert(n != 0);
-  assert(ks != 0);
-  assert(ks > 9);
-  assert(kc != 0);
+  assert(output_pixels != 0);
+  assert(pooling_elements != 0);
+  assert(pooling_elements > 9);
+  assert(channels != 0);
 
   const psimd_f32 voutput_max = psimd_load_splat_f32(&params->scalar.max);
   const psimd_f32 voutput_min = psimd_load_splat_f32(&params->scalar.min);
   do {
     {
-      float* ab = acc_buffer;
+      float* ab = accumulation_buffer;
       uint32_t* ib = index_buffer;
 
       const float* i0 = *input++;
@@ -44,8 +45,17 @@
       const float* i6 = *input++;
       const float* i7 = *input++;
       const float* i8 = *input++;
+      i0 = (const float*) ((uintptr_t) i0 + input_offset);
+      i1 = (const float*) ((uintptr_t) i1 + input_offset);
+      i2 = (const float*) ((uintptr_t) i2 + input_offset);
+      i3 = (const float*) ((uintptr_t) i3 + input_offset);
+      i4 = (const float*) ((uintptr_t) i4 + input_offset);
+      i5 = (const float*) ((uintptr_t) i5 + input_offset);
+      i6 = (const float*) ((uintptr_t) i6 + input_offset);
+      i7 = (const float*) ((uintptr_t) i7 + input_offset);
+      i8 = (const float*) ((uintptr_t) i8 + input_offset);
 
-      for (size_t k = 0; k < kc; k += 4) {
+      for (size_t c = 0; c < channels; c += 4) {
         const psimd_f32 vi0 = psimd_load_f32(i0);
         i0 += 4;
         const psimd_f32 vi1 = psimd_load_f32(i1);
@@ -110,8 +120,8 @@
     const psimd_u32 v8 = psimd_splat_u32(8);
     psimd_u32 vidx0 = psimd_add_u32(v1, v8);
 
-    size_t m = ks;
-    for (m -= 9; m > 8; m -= 8) {
+    size_t k = pooling_elements;
+    for (k -= 9; k > 8; k -= 8) {
       const float* i0 = *input++;
       const float* i1 = *input++;
       const float* i2 = *input++;
@@ -120,11 +130,19 @@
       const float* i5 = *input++;
       const float* i6 = *input++;
       const float* i7 = *input++;
+      i0 = (const float*) ((uintptr_t) i0 + input_offset);
+      i1 = (const float*) ((uintptr_t) i1 + input_offset);
+      i2 = (const float*) ((uintptr_t) i2 + input_offset);
+      i3 = (const float*) ((uintptr_t) i3 + input_offset);
+      i4 = (const float*) ((uintptr_t) i4 + input_offset);
+      i5 = (const float*) ((uintptr_t) i5 + input_offset);
+      i6 = (const float*) ((uintptr_t) i6 + input_offset);
+      i7 = (const float*) ((uintptr_t) i7 + input_offset);
 
-      float* ab = acc_buffer;
+      float* ab = accumulation_buffer;
       uint32_t* ib = index_buffer;
 
-      for (size_t k = 0; k < kc; k += 4) {
+      for (size_t c = 0; c < channels; c += 4) {
         const psimd_f32 vi0 = psimd_load_f32(i0);
         i0 += 4;
         const psimd_f32 vi1 = psimd_load_f32(i1);
@@ -203,33 +221,41 @@
       const float* i5 = input[5];
       const float* i6 = input[6];
       const float* i7 = input[7];
+      i0 = (const float*) ((uintptr_t) i0 + input_offset);
+      i1 = (const float*) ((uintptr_t) i1 + input_offset);
+      i2 = (const float*) ((uintptr_t) i2 + input_offset);
+      i3 = (const float*) ((uintptr_t) i3 + input_offset);
+      i4 = (const float*) ((uintptr_t) i4 + input_offset);
+      i5 = (const float*) ((uintptr_t) i5 + input_offset);
+      i6 = (const float*) ((uintptr_t) i6 + input_offset);
+      i7 = (const float*) ((uintptr_t) i7 + input_offset);
       input = (const float**) ((uintptr_t) input + input_increment);
-      if (m < 2) {
+      if (k < 2) {
         i1 = i0;
       }
-      if (m <= 2) {
+      if (k <= 2) {
         i2 = i0;
       }
-      if (m < 4) {
+      if (k < 4) {
         i3 = i0;
       }
-      if (m <= 4) {
+      if (k <= 4) {
         i4 = i0;
       }
-      if (m < 6) {
+      if (k < 6) {
         i5 = i0;
       }
-      if (m <= 6) {
+      if (k <= 6) {
         i6 = i0;
       }
-      if (m != 8) {
+      if (k != 8) {
         i7 = i0;
       }
 
-      size_t k = kc;
-      float* ab = acc_buffer;
+      size_t c = channels;
+      float* ab = accumulation_buffer;
       uint32_t* ib = index_buffer;
-      for (; k >= 4; k -= 4) {
+      for (; c >= 4; c -= 4) {
         const psimd_f32 vi0 = psimd_load_f32(i0);
         i0 += 4;
         const psimd_f32 vi1 = psimd_load_f32(i1);
@@ -298,7 +324,7 @@
         psimd_store_u32(i, vidx);
         i += 4;
       }
-      if (k != 0) {
+      if (c != 0) {
         const psimd_f32 vi0 = psimd_load_f32(i0);
         const psimd_f32 vi1 = psimd_load_f32(i1);
         const psimd_f32 vi2 = psimd_load_f32(i2);
@@ -352,7 +378,7 @@
 
         psimd_f32 vout = psimd_max_f32(psimd_min_f32(vmax, voutput_max), voutput_min);
 
-        if (k & 2) {
+        if (c & 2) {
           psimd_store2_f32(o, vout);
           psimd_store2_u32(i, vidx);
           vout = psimd_concat_hi_f32(vout, vout);
@@ -360,7 +386,7 @@
           o += 2;
           i += 2;
         }
-        if (k & 1) {
+        if (c & 1) {
           psimd_store1_f32(o, vout);
           psimd_store1_u32(i, vidx);
           o += 1;
@@ -371,5 +397,5 @@
 
     output = (float*) ((uintptr_t) o + output_increment);
     index = (uint32_t*) i;
-  } while (--n != 0);
+  } while (--output_pixels != 0);
 }
diff --git a/src/f32-argmaxpool/mp9p8q-scalar.c b/src/f32-argmaxpool/9p8x-scalar-c1.c
similarity index 69%
rename from src/f32-argmaxpool/mp9p8q-scalar.c
rename to src/f32-argmaxpool/9p8x-scalar-c1.c
index 0f9f832..10bb965 100644
--- a/src/f32-argmaxpool/mp9p8q-scalar.c
+++ b/src/f32-argmaxpool/9p8x-scalar-c1.c
@@ -9,12 +9,13 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_argmaxpool_ukernel_mp9p8q__scalar(
-    size_t n,
-    size_t ks,
-    size_t kc,
+void xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1(
+    size_t output_pixels,
+    size_t pooling_elements,
+    size_t channels,
     const float** input,
-    float* acc_buffer,
+    size_t input_offset,
+    float* accumulation_buffer,
     uint32_t* index_buffer,
     float* output,
     uint32_t* index,
@@ -22,16 +23,16 @@
     size_t output_increment,
     const union xnn_f32_output_params params[restrict static 1])
 {
-  assert(n != 0);
-  assert(ks != 0);
-  assert(ks > 9);
-  assert(kc != 0);
+  assert(output_pixels != 0);
+  assert(pooling_elements != 0);
+  assert(pooling_elements > 9);
+  assert(channels != 0);
 
   const float voutput_max = params->scalar.max;
   const float voutput_min = params->scalar.min;
   do {
     {
-      float* ab = acc_buffer;
+      float* ab = accumulation_buffer;
       uint32_t* ib = index_buffer;
 
       const float* i0 = *input++;
@@ -43,8 +44,17 @@
       const float* i6 = *input++;
       const float* i7 = *input++;
       const float* i8 = *input++;
+      i0 = (const float*) ((uintptr_t) i0 + input_offset);
+      i1 = (const float*) ((uintptr_t) i1 + input_offset);
+      i2 = (const float*) ((uintptr_t) i2 + input_offset);
+      i3 = (const float*) ((uintptr_t) i3 + input_offset);
+      i4 = (const float*) ((uintptr_t) i4 + input_offset);
+      i5 = (const float*) ((uintptr_t) i5 + input_offset);
+      i6 = (const float*) ((uintptr_t) i6 + input_offset);
+      i7 = (const float*) ((uintptr_t) i7 + input_offset);
+      i8 = (const float*) ((uintptr_t) i8 + input_offset);
 
-      size_t k = kc;
+      size_t c = channels;
       do {
         const float vi0 = *i0++;
         const float vi1 = *i1++;
@@ -101,11 +111,11 @@
 
         *ab++ = vmax;
         *ib++ = vidx;
-      } while (--k != 0);
+      } while (--c != 0);
     }
     uint32_t vidx0 = 9;
-    size_t m = ks;
-    for (m -= 9; m > 8; m -= 8) {
+    size_t k = pooling_elements;
+    for (k -= 9; k > 8; k -= 8) {
       const float* i0 = *input++;
       const float* i1 = *input++;
       const float* i2 = *input++;
@@ -114,11 +124,19 @@
       const float* i5 = *input++;
       const float* i6 = *input++;
       const float* i7 = *input++;
+      i0 = (const float*) ((uintptr_t) i0 + input_offset);
+      i1 = (const float*) ((uintptr_t) i1 + input_offset);
+      i2 = (const float*) ((uintptr_t) i2 + input_offset);
+      i3 = (const float*) ((uintptr_t) i3 + input_offset);
+      i4 = (const float*) ((uintptr_t) i4 + input_offset);
+      i5 = (const float*) ((uintptr_t) i5 + input_offset);
+      i6 = (const float*) ((uintptr_t) i6 + input_offset);
+      i7 = (const float*) ((uintptr_t) i7 + input_offset);
 
-      float* ab = acc_buffer;
+      float* ab = accumulation_buffer;
       uint32_t* ib = index_buffer;
 
-      size_t k = kc;
+      size_t c = channels;
       do {
         const float vi0 = *i0++;
         const float vi1 = *i1++;
@@ -174,7 +192,7 @@
 
         *ab++ = vmax;
         *ib++ = vidx;
-      } while (--k != 0);
+      } while (--c != 0);
       vidx0 += 8;
     }
 
@@ -189,31 +207,39 @@
       const float* i5 = input[5];
       const float* i6 = input[6];
       const float* i7 = input[7];
+      i0 = (const float*) ((uintptr_t) i0 + input_offset);
+      i1 = (const float*) ((uintptr_t) i1 + input_offset);
+      i2 = (const float*) ((uintptr_t) i2 + input_offset);
+      i3 = (const float*) ((uintptr_t) i3 + input_offset);
+      i4 = (const float*) ((uintptr_t) i4 + input_offset);
+      i5 = (const float*) ((uintptr_t) i5 + input_offset);
+      i6 = (const float*) ((uintptr_t) i6 + input_offset);
+      i7 = (const float*) ((uintptr_t) i7 + input_offset);
       input = (const float**) ((uintptr_t) input + input_increment);
-      if (m < 2) {
+      if (k < 2) {
         i1 = i0;
       }
-      if (m <= 2) {
+      if (k <= 2) {
         i2 = i0;
       }
-      if (m < 4) {
+      if (k < 4) {
         i3 = i0;
       }
-      if (m <= 4) {
+      if (k <= 4) {
         i4 = i0;
       }
-      if (m < 6) {
+      if (k < 6) {
         i5 = i0;
       }
-      if (m <= 6) {
+      if (k <= 6) {
         i6 = i0;
       }
-      if (m != 8) {
+      if (k != 8) {
         i7 = i0;
       }
 
-      size_t k = kc;
-      float* ab = acc_buffer;
+      size_t c = channels;
+      float* ab = accumulation_buffer;
       uint32_t* ib = index_buffer;
       do {
         const float vi0 = *i0++;
@@ -272,10 +298,10 @@
 
         *o++ = vout;
         *i++ = vidx;
-      } while (--k != 0);
+      } while (--c != 0);
     }
 
     output = (float*) ((uintptr_t) o + output_increment);
     index = (uint32_t*) i;
-  } while (--n != 0);
+  } while (--output_pixels != 0);
 }
diff --git a/src/f32-argmaxpool/mp9p8q-sse2.c b/src/f32-argmaxpool/9p8x-sse2-c4.c
similarity index 84%
rename from src/f32-argmaxpool/mp9p8q-sse2.c
rename to src/f32-argmaxpool/9p8x-sse2-c4.c
index 31b55bf..30a3443 100644
--- a/src/f32-argmaxpool/mp9p8q-sse2.c
+++ b/src/f32-argmaxpool/9p8x-sse2-c4.c
@@ -10,12 +10,13 @@
 #include <xnnpack/argmaxpool.h>
 
 
-void xnn_f32_argmaxpool_ukernel_mp9p8q__sse2(
-    size_t n,
-    size_t ks,
-    size_t kc,
+void xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4(
+    size_t output_pixels,
+    size_t pooling_elements,
+    size_t channels,
     const float** input,
-    float* acc_buffer,
+    size_t input_offset,
+    float* accumulation_buffer,
     uint32_t* index_buffer,
     float* output,
     uint32_t* index,
@@ -23,16 +24,16 @@
     size_t output_increment,
     const union xnn_f32_output_params params[restrict static 1])
 {
-  assert(n != 0);
-  assert(ks != 0);
-  assert(ks > 9);
-  assert(kc != 0);
+  assert(output_pixels != 0);
+  assert(pooling_elements != 0);
+  assert(pooling_elements > 9);
+  assert(channels != 0);
 
   const __m128 voutput_max = _mm_load_ps(params->sse.max);
   const __m128 voutput_min = _mm_load_ps(params->sse.min);
   do {
     {
-      float* ab = acc_buffer;
+      float* ab = accumulation_buffer;
       uint32_t* ib = index_buffer;
 
       const float* i0 = *input++;
@@ -44,8 +45,17 @@
       const float* i6 = *input++;
       const float* i7 = *input++;
       const float* i8 = *input++;
+      i0 = (const float*) ((uintptr_t) i0 + input_offset);
+      i1 = (const float*) ((uintptr_t) i1 + input_offset);
+      i2 = (const float*) ((uintptr_t) i2 + input_offset);
+      i3 = (const float*) ((uintptr_t) i3 + input_offset);
+      i4 = (const float*) ((uintptr_t) i4 + input_offset);
+      i5 = (const float*) ((uintptr_t) i5 + input_offset);
+      i6 = (const float*) ((uintptr_t) i6 + input_offset);
+      i7 = (const float*) ((uintptr_t) i7 + input_offset);
+      i8 = (const float*) ((uintptr_t) i8 + input_offset);
 
-      for (size_t k = 0; k < kc; k += 4) {
+      for (size_t c = 0; c < channels; c += 4) {
         const __m128 vi0 = _mm_loadu_ps(i0);
         i0 += 4;
         const __m128 vi1 = _mm_loadu_ps(i1);
@@ -110,8 +120,8 @@
     const __m128i v8 = _mm_set1_epi32(8);
     __m128i vidx0 = _mm_add_epi32(v1, v8);
 
-    size_t m = ks;
-    for (m -= 9; m > 8; m -= 8) {
+    size_t k = pooling_elements;
+    for (k -= 9; k > 8; k -= 8) {
       const float* i0 = *input++;
       const float* i1 = *input++;
       const float* i2 = *input++;
@@ -120,11 +130,19 @@
       const float* i5 = *input++;
       const float* i6 = *input++;
       const float* i7 = *input++;
+      i0 = (const float*) ((uintptr_t) i0 + input_offset);
+      i1 = (const float*) ((uintptr_t) i1 + input_offset);
+      i2 = (const float*) ((uintptr_t) i2 + input_offset);
+      i3 = (const float*) ((uintptr_t) i3 + input_offset);
+      i4 = (const float*) ((uintptr_t) i4 + input_offset);
+      i5 = (const float*) ((uintptr_t) i5 + input_offset);
+      i6 = (const float*) ((uintptr_t) i6 + input_offset);
+      i7 = (const float*) ((uintptr_t) i7 + input_offset);
 
-      float* ab = acc_buffer;
+      float* ab = accumulation_buffer;
       uint32_t* ib = index_buffer;
 
-      for (size_t k = 0; k < kc; k += 4) {
+      for (size_t c = 0; c < channels; c += 4) {
         const __m128 vi0 = _mm_loadu_ps(i0);
         i0 += 4;
         const __m128 vi1 = _mm_loadu_ps(i1);
@@ -203,33 +221,41 @@
       const float* i5 = input[5];
       const float* i6 = input[6];
       const float* i7 = input[7];
+      i0 = (const float*) ((uintptr_t) i0 + input_offset);
+      i1 = (const float*) ((uintptr_t) i1 + input_offset);
+      i2 = (const float*) ((uintptr_t) i2 + input_offset);
+      i3 = (const float*) ((uintptr_t) i3 + input_offset);
+      i4 = (const float*) ((uintptr_t) i4 + input_offset);
+      i5 = (const float*) ((uintptr_t) i5 + input_offset);
+      i6 = (const float*) ((uintptr_t) i6 + input_offset);
+      i7 = (const float*) ((uintptr_t) i7 + input_offset);
       input = (const float**) ((uintptr_t) input + input_increment);
-      if (m < 2) {
+      if (k < 2) {
         i1 = i0;
       }
-      if (m <= 2) {
+      if (k <= 2) {
         i2 = i0;
       }
-      if (m < 4) {
+      if (k < 4) {
         i3 = i0;
       }
-      if (m <= 4) {
+      if (k <= 4) {
         i4 = i0;
       }
-      if (m < 6) {
+      if (k < 6) {
         i5 = i0;
       }
-      if (m <= 6) {
+      if (k <= 6) {
         i6 = i0;
       }
-      if (m != 8) {
+      if (k != 8) {
         i7 = i0;
       }
 
-      size_t k = kc;
-      float* ab = acc_buffer;
+      size_t c = channels;
+      float* ab = accumulation_buffer;
       uint32_t* ib = index_buffer;
-      for (; k >= 4; k -= 4) {
+      for (; c >= 4; c -= 4) {
         const __m128 vi0 = _mm_loadu_ps(i0);
         i0 += 4;
         const __m128 vi1 = _mm_loadu_ps(i1);
@@ -298,7 +324,7 @@
         _mm_storeu_si128((__m128i*) i, vidx);
         i += 4;
       }
-      if (k != 0) {
+      if (c != 0) {
         const __m128 vi0 = _mm_loadu_ps(i0);
         const __m128 vi1 = _mm_loadu_ps(i1);
         const __m128 vi2 = _mm_loadu_ps(i2);
@@ -352,7 +378,7 @@
 
         __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
 
-        if (k & 2) {
+        if (c & 2) {
           _mm_store_sd((double*) o, _mm_castps_pd(vout));
           _mm_storel_epi64((__m128i*) i, vidx);
           vout = _mm_movehl_ps(vout, vout);
@@ -360,7 +386,7 @@
           o += 2;
           i += 2;
         }
-        if (k & 1) {
+        if (c & 1) {
           _mm_store_ss(o, vout);
           *i = (uint32_t) _mm_cvtsi128_si32(vidx);
           o += 1;
@@ -371,5 +397,5 @@
 
     output = (float*) ((uintptr_t) o + output_increment);
     index = (uint32_t*) i;
-  } while (--n != 0);
+  } while (--output_pixels != 0);
 }
diff --git a/src/f32-argmaxpool/up9-psimd.c b/src/f32-argmaxpool/9x-psimd-c4.c
similarity index 75%
rename from src/f32-argmaxpool/up9-psimd.c
rename to src/f32-argmaxpool/9x-psimd-c4.c
index 73d7931..4b642d5 100644
--- a/src/f32-argmaxpool/up9-psimd.c
+++ b/src/f32-argmaxpool/9x-psimd-c4.c
@@ -10,28 +10,26 @@
 #include <xnnpack/argmaxpool.h>
 
 
-void xnn_f32_argmaxpool_ukernel_up9__psimd(
-    size_t n,
-    size_t ks,
-    size_t kc,
+void xnn_f32_argmaxpool_ukernel_9x__psimd_c4(
+    size_t output_pixels,
+    size_t pooling_elements,
+    size_t channels,
     const float** input,
+    size_t input_offset,
     float* output,
     uint32_t* index,
     size_t input_increment,
     size_t output_increment,
     const union xnn_f32_output_params params[restrict static 1])
 {
-  assert(n != 0);
-  assert(ks != 0);
-  assert(ks <= 9);
-  assert(kc != 0);
+  assert(output_pixels != 0);
+  assert(pooling_elements != 0);
+  assert(pooling_elements <= 9);
+  assert(channels != 0);
 
   const psimd_f32 voutput_max = psimd_load_splat_f32(&params->scalar.max);
   const psimd_f32 voutput_min = psimd_load_splat_f32(&params->scalar.min);
   do {
-    float* o = output;
-    uint32_t* i = index;
-
     const float* i0 = input[0];
     const float* i1 = input[1];
     const float* i2 = input[2];
@@ -41,33 +39,42 @@
     const float* i6 = input[6];
     const float* i7 = input[7];
     const float* i8 = input[8];
-    if (ks < 2) {
+    i0 = (const float*) ((uintptr_t) i0 + input_offset);
+    i1 = (const float*) ((uintptr_t) i1 + input_offset);
+    i2 = (const float*) ((uintptr_t) i2 + input_offset);
+    i3 = (const float*) ((uintptr_t) i3 + input_offset);
+    i4 = (const float*) ((uintptr_t) i4 + input_offset);
+    i5 = (const float*) ((uintptr_t) i5 + input_offset);
+    i6 = (const float*) ((uintptr_t) i6 + input_offset);
+    i7 = (const float*) ((uintptr_t) i7 + input_offset);
+    i8 = (const float*) ((uintptr_t) i8 + input_offset);
+    if (pooling_elements < 2) {
       i1 = i0;
     }
-    if (ks <= 2) {
+    if (pooling_elements <= 2) {
       i2 = i0;
     }
-    if (ks < 4) {
+    if (pooling_elements < 4) {
       i3 = i0;
     }
-    if (ks <= 4) {
+    if (pooling_elements <= 4) {
       i4 = i0;
     }
-    if (ks < 6) {
+    if (pooling_elements < 6) {
       i5 = i0;
     }
-    if (ks <= 6) {
+    if (pooling_elements <= 6) {
       i6 = i0;
     }
-    if (ks < 8) {
+    if (pooling_elements < 8) {
       i7 = i0;
     }
-    if (ks <= 8) {
+    if (pooling_elements <= 8) {
       i8 = i0;
     }
 
-    size_t k = kc;
-    for (; k >= 4; k -= 4) {
+    size_t c = channels;
+    for (; c >= 4; c -= 4) {
       const psimd_f32 vi0 = psimd_load_f32(i0);
       i0 += 4;
       const psimd_f32 vi1 = psimd_load_f32(i1);
@@ -124,12 +131,12 @@
 
       const psimd_f32 vout = psimd_max_f32(psimd_min_f32(vmax, voutput_max), voutput_min);
 
-      psimd_store_f32(o, vout);
-      o += 4;
-      psimd_store_u32(i, vidx);
-      i += 4;
+      psimd_store_f32(output, vout);
+      output += 4;
+      psimd_store_u32(index, vidx);
+      index += 4;
     }
-    if (k != 0) {
+    if (c != 0) {
       const psimd_f32 vi0 = psimd_load_f32(i0);
       const psimd_f32 vi1 = psimd_load_f32(i1);
       const psimd_f32 vi2 = psimd_load_f32(i2);
@@ -177,23 +184,22 @@
 
       psimd_f32 vout = psimd_max_f32(psimd_min_f32(vmax, voutput_max), voutput_min);
 
-      if (k & 2) {
-        psimd_store2_f32(o, vout);
-        psimd_store2_u32(i, vidx);
+      if (c & 2) {
+        psimd_store2_f32(output, vout);
+        psimd_store2_u32(index, vidx);
         vout = psimd_concat_hi_f32(vout, vout);
         vidx = psimd_concat_hi_u32(vidx, vidx);
-        o += 2;
-        i += 2;
+        output += 2;
+        index += 2;
       }
-      if (k & 1) {
-        psimd_store1_f32(o, vout);
-        psimd_store1_u32(i, vidx);
-        o += 1;
-        i += 1;
+      if (c & 1) {
+        psimd_store1_f32(output, vout);
+        psimd_store1_u32(index, vidx);
+        output += 1;
+        index += 1;
       }
     }
     input = (const float**) ((uintptr_t) input + input_increment);
-    output = (float*) ((uintptr_t) o + output_increment);
-    index = (uint32_t*) i;
-  } while (--n != 0);
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_pixels != 0);
 }
diff --git a/src/f32-argmaxpool/up9-scalar.c b/src/f32-argmaxpool/9x-scalar-c1.c
similarity index 62%
rename from src/f32-argmaxpool/up9-scalar.c
rename to src/f32-argmaxpool/9x-scalar-c1.c
index 08b6fa0..7324e39 100644
--- a/src/f32-argmaxpool/up9-scalar.c
+++ b/src/f32-argmaxpool/9x-scalar-c1.c
@@ -9,28 +9,26 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_argmaxpool_ukernel_up9__scalar(
-    size_t n,
-    size_t ks,
-    size_t kc,
+void xnn_f32_argmaxpool_ukernel_9x__scalar_c1(
+    size_t output_pixels,
+    size_t pooling_elements,
+    size_t channels,
     const float** input,
+    size_t input_offset,
     float* output,
     uint32_t* index,
     size_t input_increment,
     size_t output_increment,
     const union xnn_f32_output_params params[restrict static 1])
 {
-  assert(n != 0);
-  assert(ks != 0);
-  assert(ks <= 9);
-  assert(kc != 0);
+  assert(output_pixels != 0);
+  assert(pooling_elements != 0);
+  assert(pooling_elements <= 9);
+  assert(channels != 0);
 
   const float voutput_max = params->scalar.max;
   const float voutput_min = params->scalar.min;
   do {
-    float* o = output;
-    uint32_t* i = index;
-
     const float* i0 = input[0];
     const float* i1 = input[1];
     const float* i2 = input[2];
@@ -40,32 +38,41 @@
     const float* i6 = input[6];
     const float* i7 = input[7];
     const float* i8 = input[8];
-    if (ks < 2) {
+    i0 = (const float*) ((uintptr_t) i0 + input_offset);
+    i1 = (const float*) ((uintptr_t) i1 + input_offset);
+    i2 = (const float*) ((uintptr_t) i2 + input_offset);
+    i3 = (const float*) ((uintptr_t) i3 + input_offset);
+    i4 = (const float*) ((uintptr_t) i4 + input_offset);
+    i5 = (const float*) ((uintptr_t) i5 + input_offset);
+    i6 = (const float*) ((uintptr_t) i6 + input_offset);
+    i7 = (const float*) ((uintptr_t) i7 + input_offset);
+    i8 = (const float*) ((uintptr_t) i8 + input_offset);
+    if (pooling_elements < 2) {
       i1 = i0;
     }
-    if (ks <= 2) {
+    if (pooling_elements <= 2) {
       i2 = i0;
     }
-    if (ks < 4) {
+    if (pooling_elements < 4) {
       i3 = i0;
     }
-    if (ks <= 4) {
+    if (pooling_elements <= 4) {
       i4 = i0;
     }
-    if (ks < 6) {
+    if (pooling_elements < 6) {
       i5 = i0;
     }
-    if (ks <= 6) {
+    if (pooling_elements <= 6) {
       i6 = i0;
     }
-    if (ks < 8) {
+    if (pooling_elements < 8) {
       i7 = i0;
     }
-    if (ks <= 8) {
+    if (pooling_elements <= 8) {
       i8 = i0;
     }
 
-    size_t k = kc;
+    size_t c = channels;
     do {
       const float vi0 = *i0++;
       const float vi1 = *i1++;
@@ -122,11 +129,10 @@
 
       const float vout = math_max_f32(math_min_f32(vmax, voutput_max), voutput_min);
 
-      *o++ = vout;
-      *i++ = vidx;
-    } while (--k != 0);
+      *output++ = vout;
+      *index++ = vidx;
+    } while (--c != 0);
     input = (const float**) ((uintptr_t) input + input_increment);
-    output = (float*) ((uintptr_t) o + output_increment);
-    index = (uint32_t*) i;
-  } while (--n != 0);
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_pixels != 0);
 }
diff --git a/src/f32-argmaxpool/up9-sse2.c b/src/f32-argmaxpool/9x-sse2-c4.c
similarity index 83%
rename from src/f32-argmaxpool/up9-sse2.c
rename to src/f32-argmaxpool/9x-sse2-c4.c
index acd8609..c0d2075 100644
--- a/src/f32-argmaxpool/up9-sse2.c
+++ b/src/f32-argmaxpool/9x-sse2-c4.c
@@ -10,21 +10,22 @@
 #include <xnnpack/argmaxpool.h>
 
 
-void xnn_f32_argmaxpool_ukernel_up9__sse2(
-    size_t n,
-    size_t ks,
-    size_t kc,
+void xnn_f32_argmaxpool_ukernel_9x__sse2_c4(
+    size_t output_pixels,
+    size_t pooling_elements,
+    size_t channels,
     const float** input,
+    size_t input_offset,
     float* output,
     uint32_t* index,
     size_t input_increment,
     size_t output_increment,
     const union xnn_f32_output_params params[restrict static 1])
 {
-  assert(n != 0);
-  assert(ks != 0);
-  assert(ks <= 9);
-  assert(kc != 0);
+  assert(output_pixels != 0);
+  assert(pooling_elements != 0);
+  assert(pooling_elements <= 9);
+  assert(channels != 0);
 
   const __m128 voutput_max = _mm_load_ps(params->sse.max);
   const __m128 voutput_min = _mm_load_ps(params->sse.min);
@@ -41,33 +42,42 @@
     const float* i6 = input[6];
     const float* i7 = input[7];
     const float* i8 = input[8];
-    if (ks < 2) {
+    i0 = (const float*) ((uintptr_t) i0 + input_offset);
+    i1 = (const float*) ((uintptr_t) i1 + input_offset);
+    i2 = (const float*) ((uintptr_t) i2 + input_offset);
+    i3 = (const float*) ((uintptr_t) i3 + input_offset);
+    i4 = (const float*) ((uintptr_t) i4 + input_offset);
+    i5 = (const float*) ((uintptr_t) i5 + input_offset);
+    i6 = (const float*) ((uintptr_t) i6 + input_offset);
+    i7 = (const float*) ((uintptr_t) i7 + input_offset);
+    i8 = (const float*) ((uintptr_t) i8 + input_offset);
+    if (pooling_elements < 2) {
       i1 = i0;
     }
-    if (ks <= 2) {
+    if (pooling_elements <= 2) {
       i2 = i0;
     }
-    if (ks < 4) {
+    if (pooling_elements < 4) {
       i3 = i0;
     }
-    if (ks <= 4) {
+    if (pooling_elements <= 4) {
       i4 = i0;
     }
-    if (ks < 6) {
+    if (pooling_elements < 6) {
       i5 = i0;
     }
-    if (ks <= 6) {
+    if (pooling_elements <= 6) {
       i6 = i0;
     }
-    if (ks < 8) {
+    if (pooling_elements < 8) {
       i7 = i0;
     }
-    if (ks <= 8) {
+    if (pooling_elements <= 8) {
       i8 = i0;
     }
 
-    size_t k = kc;
-    for (; k >= 4; k -= 4) {
+    size_t c = channels;
+    for (; c >= 4; c -= 4) {
       const __m128 vi0 = _mm_loadu_ps(i0);
       i0 += 4;
       const __m128 vi1 = _mm_loadu_ps(i1);
@@ -129,7 +139,7 @@
       _mm_storeu_si128((__m128i*) i, vidx);
       i += 4;
     }
-    if (k != 0) {
+    if (c != 0) {
       const __m128 vi0 = _mm_loadu_ps(i0);
       const __m128 vi1 = _mm_loadu_ps(i1);
       const __m128 vi2 = _mm_loadu_ps(i2);
@@ -177,7 +187,7 @@
 
       __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
 
-      if (k & 2) {
+      if (c & 2) {
         _mm_store_sd((double*) o, _mm_castps_pd(vout));
         _mm_storel_epi64((__m128i*) i, vidx);
         vout = _mm_movehl_ps(vout, vout);
@@ -185,7 +195,7 @@
         o += 2;
         i += 2;
       }
-      if (k & 1) {
+      if (c & 1) {
         _mm_store_ss(o, vout);
         *i = (uint32_t) _mm_cvtsi128_si32(vidx);
         o += 1;
@@ -195,5 +205,5 @@
     input = (const float**) ((uintptr_t) input + input_increment);
     output = (float*) ((uintptr_t) o + output_increment);
     index = (uint32_t*) i;
-  } while (--n != 0);
+  } while (--output_pixels != 0);
 }
diff --git a/src/f32-maxpool/9p8q-psimd.c b/src/f32-maxpool/9p8x-psimd-c4.c
similarity index 76%
rename from src/f32-maxpool/9p8q-psimd.c
rename to src/f32-maxpool/9p8x-psimd-c4.c
index c973db3..a502c90 100644
--- a/src/f32-maxpool/9p8q-psimd.c
+++ b/src/f32-maxpool/9p8x-psimd-c4.c
@@ -10,19 +10,20 @@
 #include <xnnpack/maxpool.h>
 
 
-void xnn_f32_maxpool_ukernel_9p8q__psimd(
-    size_t n,
-    size_t ks,
-    size_t kc,
+void xnn_f32_maxpool_ukernel_9p8x__psimd_c4(
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
     const float** input,
+    size_t input_offset,
     float* output,
     size_t input_increment,
     size_t output_increment,
     const union xnn_f32_output_params params[restrict static 1])
 {
-  assert(n != 0);
-  assert(ks != 0);
-  assert(kc != 0);
+  assert(output_pixels != 0);
+  assert(kernel_elements != 0);
+  assert(channels != 0);
 
   const psimd_f32 voutput_max = psimd_load_splat_f32(&params->scalar.max);
   const psimd_f32 voutput_min = psimd_load_splat_f32(&params->scalar.min);
@@ -38,33 +39,42 @@
       const float* i6 = *input++;
       const float* i7 = *input++;
       const float* i8 = *input++;
-      if (ks < 2) {
+      i0 = (const float*) ((uintptr_t) i0 + input_offset);
+      i1 = (const float*) ((uintptr_t) i1 + input_offset);
+      i2 = (const float*) ((uintptr_t) i2 + input_offset);
+      i3 = (const float*) ((uintptr_t) i3 + input_offset);
+      i4 = (const float*) ((uintptr_t) i4 + input_offset);
+      i5 = (const float*) ((uintptr_t) i5 + input_offset);
+      i6 = (const float*) ((uintptr_t) i6 + input_offset);
+      i7 = (const float*) ((uintptr_t) i7 + input_offset);
+      i8 = (const float*) ((uintptr_t) i8 + input_offset);
+      if (kernel_elements < 2) {
         i1 = i0;
       }
-      if (ks <= 2) {
+      if (kernel_elements <= 2) {
         i2 = i0;
       }
-      if (ks < 4) {
+      if (kernel_elements < 4) {
         i3 = i0;
       }
-      if (ks <= 4) {
+      if (kernel_elements <= 4) {
         i4 = i0;
       }
-      if (ks < 6) {
+      if (kernel_elements < 6) {
         i5 = i0;
       }
-      if (ks <= 6) {
+      if (kernel_elements <= 6) {
         i6 = i0;
       }
-      if (ks < 8) {
+      if (kernel_elements < 8) {
         i7 = i0;
       }
-      if (ks <= 8) {
+      if (kernel_elements <= 8) {
         i8 = i0;
       }
 
-      size_t k = kc;
-      for (; k >= 4; k -= 4) {
+      size_t c = channels;
+      for (; c >= 4; c -= 4) {
         const psimd_f32 vi0 = psimd_load_f32(i0);
         i0 += 4;
         const psimd_f32 vi1 = psimd_load_f32(i1);
@@ -97,7 +107,7 @@
         psimd_store_f32(o, vout);
         o += 4;
       }
-      if (k != 0) {
+      if (c != 0) {
         const psimd_f32 vi0 = psimd_load_f32(i0);
         i0 += 4;
         const psimd_f32 vi1 = psimd_load_f32(i1);
@@ -127,19 +137,19 @@
         const psimd_f32 vmax = psimd_max_f32(vmax2345, vmax01678);
         psimd_f32 vout = psimd_max_f32(psimd_min_f32(vmax, voutput_max), voutput_min);
 
-        if (k & 2) {
+        if (c & 2) {
           psimd_store2_f32(o, vout);
           vout = psimd_concat_hi_f32(vout, vout);
           o += 2;
         }
-        if (k & 1) {
+        if (c & 1) {
           psimd_store1_f32(o, vout);
           o += 1;
         }
       }
     }
 
-    for (ptrdiff_t m = (ptrdiff_t) ks - 9; m > 0; m -= 8) {
+    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
       const float* i0 = *input++;
       const float* i1 = *input++;
       const float* i2 = *input++;
@@ -148,31 +158,39 @@
       const float* i5 = *input++;
       const float* i6 = *input++;
       const float* i7 = *input++;
-      if (m < 2) {
+      i0 = (const float*) ((uintptr_t) i0 + input_offset);
+      i1 = (const float*) ((uintptr_t) i1 + input_offset);
+      i2 = (const float*) ((uintptr_t) i2 + input_offset);
+      i3 = (const float*) ((uintptr_t) i3 + input_offset);
+      i4 = (const float*) ((uintptr_t) i4 + input_offset);
+      i5 = (const float*) ((uintptr_t) i5 + input_offset);
+      i6 = (const float*) ((uintptr_t) i6 + input_offset);
+      i7 = (const float*) ((uintptr_t) i7 + input_offset);
+      if (k < 2) {
         i1 = i0;
       }
-      if (m <= 2) {
+      if (k <= 2) {
         i2 = i0;
       }
-      if (m < 4) {
+      if (k < 4) {
         i3 = i0;
       }
-      if (m <= 4) {
+      if (k <= 4) {
         i4 = i0;
       }
-      if (m < 6) {
+      if (k < 6) {
         i5 = i0;
       }
-      if (m <= 6) {
+      if (k <= 6) {
         i6 = i0;
       }
-      if (m < 8) {
+      if (k < 8) {
         i7 = i0;
       }
 
       o = output;
-      size_t k = kc;
-      for (; k >= 4; k -= 4) {
+      size_t c = channels;
+      for (; c >= 4; c -= 4) {
         const psimd_f32 vi0 = psimd_load_f32(i0);
         i0 += 4;
         const psimd_f32 vi1 = psimd_load_f32(i1);
@@ -204,7 +222,7 @@
         psimd_store_f32(o, vout);
         o += 4;
       }
-      if (k != 0) {
+      if (c != 0) {
         const psimd_f32 vi0 = psimd_load_f32(i0);
         const psimd_f32 vi1 = psimd_load_f32(i1);
         const psimd_f32 vi2 = psimd_load_f32(i2);
@@ -225,12 +243,12 @@
         const psimd_f32 vmax = psimd_max_f32(vmax2345, vmax0167);
         psimd_f32 vout = psimd_max_f32(psimd_min_f32(vmax, voutput_max), voutput_min);
 
-        if (k & 2) {
+        if (c & 2) {
           psimd_store2_f32(o, vout);
           vout = psimd_concat_hi_f32(vout, vout);
           o += 2;
         }
-        if (k & 1) {
+        if (c & 1) {
           psimd_store1_f32(o, vout);
           o += 1;
         }
@@ -238,5 +256,5 @@
     }
     input = (const float**) ((uintptr_t) input + input_increment);
     output = (float*) ((uintptr_t) o + output_increment);
-  } while (--n != 0);
+  } while (--output_pixels != 0);
 }
diff --git a/src/f32-maxpool/9p8q-scalar.c b/src/f32-maxpool/9p8x-scalar-c1.c
similarity index 64%
rename from src/f32-maxpool/9p8q-scalar.c
rename to src/f32-maxpool/9p8x-scalar-c1.c
index 1108170..fec4483 100644
--- a/src/f32-maxpool/9p8q-scalar.c
+++ b/src/f32-maxpool/9p8x-scalar-c1.c
@@ -9,19 +9,20 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_maxpool_ukernel_9p8q__scalar(
-    size_t n,
-    size_t ks,
-    size_t kc,
+void xnn_f32_maxpool_ukernel_9p8x__scalar_c1(
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
     const float** input,
+    size_t input_offset,
     float* output,
     size_t input_increment,
     size_t output_increment,
     const union xnn_f32_output_params params[restrict static 1])
 {
-  assert(n != 0);
-  assert(ks != 0);
-  assert(kc != 0);
+  assert(output_pixels != 0);
+  assert(kernel_elements != 0);
+  assert(channels != 0);
 
   const float voutput_min = params->scalar.min;
   const float voutput_max = params->scalar.max;
@@ -37,32 +38,41 @@
       const float* i6 = *input++;
       const float* i7 = *input++;
       const float* i8 = *input++;
-      if (ks < 2) {
+      i0 = (const float*) ((uintptr_t) i0 + input_offset);
+      i1 = (const float*) ((uintptr_t) i1 + input_offset);
+      i2 = (const float*) ((uintptr_t) i2 + input_offset);
+      i3 = (const float*) ((uintptr_t) i3 + input_offset);
+      i4 = (const float*) ((uintptr_t) i4 + input_offset);
+      i5 = (const float*) ((uintptr_t) i5 + input_offset);
+      i6 = (const float*) ((uintptr_t) i6 + input_offset);
+      i7 = (const float*) ((uintptr_t) i7 + input_offset);
+      i8 = (const float*) ((uintptr_t) i8 + input_offset);
+      if (kernel_elements < 2) {
         i1 = i0;
       }
-      if (ks <= 2) {
+      if (kernel_elements <= 2) {
         i2 = i0;
       }
-      if (ks < 4) {
+      if (kernel_elements < 4) {
         i3 = i0;
       }
-      if (ks <= 4) {
+      if (kernel_elements <= 4) {
         i4 = i0;
       }
-      if (ks < 6) {
+      if (kernel_elements < 6) {
         i5 = i0;
       }
-      if (ks <= 6) {
+      if (kernel_elements <= 6) {
         i6 = i0;
       }
-      if (ks < 8) {
+      if (kernel_elements < 8) {
         i7 = i0;
       }
-      if (ks <= 8) {
+      if (kernel_elements <= 8) {
         i8 = i0;
       }
 
-      size_t k = kc;
+      size_t c = channels;
       do {
         const float vi0 = *i0++;
         const float vi1 = *i1++;
@@ -87,10 +97,10 @@
         vout = math_min_f32(vout, voutput_max);
 
         *o++ = vout;
-      } while (--k != 0);
+      } while (--c != 0);
     }
 
-    for (ptrdiff_t m = (ptrdiff_t) ks - 9; m > 0; m -= 8) {
+    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
       const float* i0 = *input++;
       const float* i1 = *input++;
       const float* i2 = *input++;
@@ -99,30 +109,38 @@
       const float* i5 = *input++;
       const float* i6 = *input++;
       const float* i7 = *input++;
-      if (m < 2) {
+      i0 = (const float*) ((uintptr_t) i0 + input_offset);
+      i1 = (const float*) ((uintptr_t) i1 + input_offset);
+      i2 = (const float*) ((uintptr_t) i2 + input_offset);
+      i3 = (const float*) ((uintptr_t) i3 + input_offset);
+      i4 = (const float*) ((uintptr_t) i4 + input_offset);
+      i5 = (const float*) ((uintptr_t) i5 + input_offset);
+      i6 = (const float*) ((uintptr_t) i6 + input_offset);
+      i7 = (const float*) ((uintptr_t) i7 + input_offset);
+      if (k < 2) {
         i1 = i0;
       }
-      if (m <= 2) {
+      if (k <= 2) {
         i2 = i0;
       }
-      if (m < 4) {
+      if (k < 4) {
         i3 = i0;
       }
-      if (m <= 4) {
+      if (k <= 4) {
         i4 = i0;
       }
-      if (m < 6) {
+      if (k < 6) {
         i5 = i0;
       }
-      if (m <= 6) {
+      if (k <= 6) {
         i6 = i0;
       }
-      if (m < 8) {
+      if (k < 8) {
         i7 = i0;
       }
 
       o = output;
-      size_t k = kc;
+      size_t c = channels;
       do {
         const float vi0 = *i0++;
         const float vi1 = *i1++;
@@ -147,9 +165,9 @@
         vout = math_min_f32(vout, voutput_max);
 
         *o++ = vout;
-      } while (--k != 0);
+      } while (--c != 0);
     }
     input = (const float**) ((uintptr_t) input + input_increment);
     output = (float*) ((uintptr_t) o + output_increment);
-  } while (--n != 0);
+  } while (--output_pixels != 0);
 }
diff --git a/src/f32-maxpool/9p8q-sse.c b/src/f32-maxpool/9p8x-sse-c4.c
similarity index 75%
rename from src/f32-maxpool/9p8q-sse.c
rename to src/f32-maxpool/9p8x-sse-c4.c
index ecf3f76..54fb78d 100644
--- a/src/f32-maxpool/9p8q-sse.c
+++ b/src/f32-maxpool/9p8x-sse-c4.c
@@ -10,19 +10,20 @@
 #include <xnnpack/maxpool.h>
 
 
-void xnn_f32_maxpool_ukernel_9p8q__sse(
-    size_t n,
-    size_t ks,
-    size_t kc,
+void xnn_f32_maxpool_ukernel_9p8x__sse_c4(
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
     const float** input,
+    size_t input_offset,
     float* output,
     size_t input_increment,
     size_t output_increment,
     const union xnn_f32_output_params params[restrict static 1])
 {
-  assert(n != 0);
-  assert(ks != 0);
-  assert(kc != 0);
+  assert(output_pixels != 0);
+  assert(kernel_elements != 0);
+  assert(channels != 0);
 
   const __m128 voutput_max = _mm_load_ps(params->sse.max);
   const __m128 voutput_min = _mm_load_ps(params->sse.min);
@@ -38,33 +39,42 @@
       const float* i6 = *input++;
       const float* i7 = *input++;
       const float* i8 = *input++;
-      if (ks < 2) {
+      i0 = (const float*) ((uintptr_t) i0 + input_offset);
+      i1 = (const float*) ((uintptr_t) i1 + input_offset);
+      i2 = (const float*) ((uintptr_t) i2 + input_offset);
+      i3 = (const float*) ((uintptr_t) i3 + input_offset);
+      i4 = (const float*) ((uintptr_t) i4 + input_offset);
+      i5 = (const float*) ((uintptr_t) i5 + input_offset);
+      i6 = (const float*) ((uintptr_t) i6 + input_offset);
+      i7 = (const float*) ((uintptr_t) i7 + input_offset);
+      i8 = (const float*) ((uintptr_t) i8 + input_offset);
+      if (kernel_elements < 2) {
         i1 = i0;
       }
-      if (ks <= 2) {
+      if (kernel_elements <= 2) {
         i2 = i0;
       }
-      if (ks < 4) {
+      if (kernel_elements < 4) {
         i3 = i0;
       }
-      if (ks <= 4) {
+      if (kernel_elements <= 4) {
         i4 = i0;
       }
-      if (ks < 6) {
+      if (kernel_elements < 6) {
         i5 = i0;
       }
-      if (ks <= 6) {
+      if (kernel_elements <= 6) {
         i6 = i0;
       }
-      if (ks < 8) {
+      if (kernel_elements < 8) {
         i7 = i0;
       }
-      if (ks <= 8) {
+      if (kernel_elements <= 8) {
         i8 = i0;
       }
 
-      size_t k = kc;
-      for (; k >= 4; k -= 4) {
+      size_t c = channels;
+      for (; c >= 4; c -= 4) {
         const __m128 vi0 = _mm_loadu_ps(i0);
         i0 += 4;
         const __m128 vi1 = _mm_loadu_ps(i1);
@@ -97,7 +107,7 @@
         _mm_storeu_ps(o, vout);
         o += 4;
       }
-      if (k != 0) {
+      if (c != 0) {
         const __m128 vi0 = _mm_loadu_ps(i0);
         i0 += 4;
         const __m128 vi1 = _mm_loadu_ps(i1);
@@ -127,19 +137,19 @@
         const __m128 vmax = _mm_max_ps(vmax2345, vmax01678);
         __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
 
-        if (k & 2) {
+        if (c & 2) {
           _mm_storel_pi((__m64*) o, vout);
           o += 2;
           vout = _mm_movehl_ps(vout, vout);
         }
-        if (k & 1) {
+        if (c & 1) {
           _mm_store_ss(o, vout);
           o += 1;
         }
       }
     }
 
-    for (ptrdiff_t m = (ptrdiff_t) ks - 9; m > 0; m -= 8) {
+    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
       const float* i0 = *input++;
       const float* i1 = *input++;
       const float* i2 = *input++;
@@ -148,31 +158,39 @@
       const float* i5 = *input++;
       const float* i6 = *input++;
       const float* i7 = *input++;
-      if (m < 2) {
+      i0 = (const float*) ((uintptr_t) i0 + input_offset);
+      i1 = (const float*) ((uintptr_t) i1 + input_offset);
+      i2 = (const float*) ((uintptr_t) i2 + input_offset);
+      i3 = (const float*) ((uintptr_t) i3 + input_offset);
+      i4 = (const float*) ((uintptr_t) i4 + input_offset);
+      i5 = (const float*) ((uintptr_t) i5 + input_offset);
+      i6 = (const float*) ((uintptr_t) i6 + input_offset);
+      i7 = (const float*) ((uintptr_t) i7 + input_offset);
+      if (k < 2) {
         i1 = i0;
       }
-      if (m <= 2) {
+      if (k <= 2) {
         i2 = i0;
       }
-      if (m < 4) {
+      if (k < 4) {
         i3 = i0;
       }
-      if (m <= 4) {
+      if (k <= 4) {
         i4 = i0;
       }
-      if (m < 6) {
+      if (k < 6) {
         i5 = i0;
       }
-      if (m <= 6) {
+      if (k <= 6) {
         i6 = i0;
       }
-      if (m < 8) {
+      if (k < 8) {
         i7 = i0;
       }
 
       o = output;
-      size_t k = kc;
-      for (; k >= 4; k -= 4) {
+      size_t c = channels;
+      for (; c >= 4; c -= 4) {
         const __m128 vi0 = _mm_loadu_ps(i0);
         i0 += 4;
         const __m128 vi1 = _mm_loadu_ps(i1);
@@ -204,7 +222,7 @@
         _mm_storeu_ps(o, vout);
         o += 4;
       }
-      if (k != 0) {
+      if (c != 0) {
         const __m128 vi0 = _mm_loadu_ps(i0);
         const __m128 vi1 = _mm_loadu_ps(i1);
         const __m128 vi2 = _mm_loadu_ps(i2);
@@ -225,12 +243,12 @@
         const __m128 vmax = _mm_max_ps(vmax2345, vmax0167);
         __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
 
-        if (k & 2) {
+        if (c & 2) {
           _mm_storel_pi((__m64*) o, vout);
           o += 2;
           vout = _mm_movehl_ps(vout, vout);
         }
-        if (k & 1) {
+        if (c & 1) {
           _mm_store_ss(o, vout);
           o += 1;
         }
@@ -238,5 +256,5 @@
     }
     input = (const float**) ((uintptr_t) input + input_increment);
     output = (float*) ((uintptr_t) o + output_increment);
-  } while (--n != 0);
+  } while (--output_pixels != 0);
 }
diff --git a/src/indirection.c b/src/indirection.c
index 1244eb6..a29e5c1 100644
--- a/src/indirection.c
+++ b/src/indirection.c
@@ -252,7 +252,6 @@
 
 void xnn_indirection_init_maxpool2d(
   xnn_operator_t op,
-  size_t batch_start,
   size_t step_height,
   size_t step_width,
   uint32_t log2_element_size)
@@ -260,7 +259,6 @@
   const void** indirection_buffer = op->indirection_buffer;
   const void* input               = op->input;
   const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
-  const size_t batch_size         = op->batch_size;
   const size_t input_height       = op->input_height;
   const size_t input_width        = op->input_width;
   const size_t output_height      = op->output_height;
@@ -274,18 +272,16 @@
   const size_t input_padding_top  = op->padding_top;
   const size_t input_padding_left = op->padding_left;
 
-  for (size_t image = batch_start; image < batch_size; image++) {
-    for (size_t output_y = 0; output_y < output_height; output_y++) {
-      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
-        const size_t input_y = doz(output_y * stride_height + pooling_y * dilation_height, input_padding_top);
-        const size_t clamped_input_y = min(input_y, input_height - 1);
-        for (size_t output_x = 0; output_x < output_width; output_x++) {
-          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
-            const size_t input_x = doz(output_x * stride_width + pooling_x * dilation_width, input_padding_left);
-            const size_t clamped_input_x = min(input_x, input_width - 1);
-            const size_t index = (image * output_height + output_y) * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
-            indirection_buffer[index] = input + ((image * input_height + clamped_input_y) * input_width + clamped_input_x) * input_pixel_stride;
-          }
+  for (size_t output_y = 0; output_y < output_height; output_y++) {
+    for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
+      const size_t input_y = doz(output_y * stride_height + pooling_y * dilation_height, input_padding_top);
+      const size_t clamped_input_y = min(input_y, input_height - 1);
+      for (size_t output_x = 0; output_x < output_width; output_x++) {
+        for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
+          const size_t input_x = doz(output_x * stride_width + pooling_x * dilation_width, input_padding_left);
+          const size_t clamped_input_x = min(input_x, input_width - 1);
+          const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
+          indirection_buffer[index] = input + (clamped_input_y * input_width + clamped_input_x) * input_pixel_stride;
         }
       }
     }
diff --git a/src/init.c b/src/init.c
index dbcae34..083b978 100644
--- a/src/init.c
+++ b/src/init.c
@@ -109,7 +109,7 @@
   /**************************** U8 micro-kernels ****************************/
   #ifndef XNN_NO_U8_OPERATORS
     xnn_params.u8.maxpool = (struct maxpool_parameters) {
-      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8q__neon,
+      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8x__neon_c16,
       .mr = 9,
       .qr = 8,
     };
@@ -178,20 +178,20 @@
       .mr = 7,
     };
     xnn_params.f32.maxpool = (struct maxpool_parameters) {
-      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8q__psimd,
+      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8x__psimd_c4,
       .mr = 9,
       .qr = 8,
     };
     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
-      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up4__psimd,
+      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__psimd_c4,
       .mr = 4,
     };
     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
-      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up9__psimd,
+      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__psimd_c4,
       .mr = 9,
     };
     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
-      .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_mp9p8q__psimd,
+      .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4,
       .mr = 9,
       .qr = 8,
     };
@@ -268,7 +268,7 @@
   /**************************** U8 micro-kernels ****************************/
   #ifndef XNN_NO_U8_OPERATORS
     xnn_params.u8.maxpool = (struct maxpool_parameters) {
-      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8q__neon,
+      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8x__neon_c16,
       .mr = 9,
       .qr = 8,
     };
@@ -440,20 +440,20 @@
       .mr = 7,
     };
     xnn_params.f32.maxpool = (struct maxpool_parameters) {
-      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8q__psimd,
+      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8x__psimd_c4,
       .mr = 9,
       .qr = 8,
     };
     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
-      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up4__psimd,
+      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__psimd_c4,
       .mr = 4,
     };
     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
-      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up9__psimd,
+      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__psimd_c4,
       .mr = 9,
     };
     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
-      .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_mp9p8q__psimd,
+      .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4,
       .mr = 9,
       .qr = 8,
     };
@@ -588,7 +588,7 @@
   /**************************** U8 micro-kernels ****************************/
   #ifndef XNN_NO_U8_OPERATORS
     xnn_params.u8.maxpool = (struct maxpool_parameters) {
-      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8q__sse2,
+      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8x__sse2_c16,
       .mr = 9,
       .qr = 8,
     };
@@ -658,20 +658,20 @@
       .mr = 7,
     };
     xnn_params.f32.maxpool = (struct maxpool_parameters) {
-      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8q__sse,
+      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8x__sse_c4,
       .mr = 9,
       .qr = 8,
     };
     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
-      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up4__sse2,
+      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__sse2_c4,
       .mr = 4,
     };
     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
-      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up9__sse2,
+      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__sse2_c4,
       .mr = 9,
     };
     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
-      .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_mp9p8q__sse2,
+      .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4,
       .mr = 9,
       .qr = 8,
     };
@@ -778,7 +778,7 @@
   /**************************** U8 micro-kernels ****************************/
   #ifndef XNN_NO_U8_OPERATORS
     xnn_params.u8.maxpool = (struct maxpool_parameters) {
-      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8q__scalar,
+      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8x__scalar_c1,
       .mr = 9,
       .qr = 8,
     };
@@ -860,20 +860,20 @@
       .mr = 7,
     };
     xnn_params.f32.maxpool = (struct maxpool_parameters) {
-      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8q__psimd,
+      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8x__psimd_c4,
       .mr = 9,
       .qr = 8,
     };
     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
-      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up4__psimd,
+      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__psimd_c4,
       .mr = 4,
     };
     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
-      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up9__psimd,
+      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__psimd_c4,
       .mr = 9,
     };
     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
-      .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_mp9p8q__psimd,
+      .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4,
       .mr = 9,
       .qr = 8,
     };
@@ -956,7 +956,7 @@
   /**************************** U8 micro-kernels ****************************/
   #ifndef XNN_NO_U8_OPERATORS
     xnn_params.u8.maxpool = (struct maxpool_parameters) {
-      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8q__scalar,
+      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8x__scalar_c1,
       .mr = 9,
       .qr = 8,
     };
@@ -1036,20 +1036,20 @@
       .mr = 7,
     };
     xnn_params.f32.maxpool = (struct maxpool_parameters) {
-      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8q__scalar,
+      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8x__scalar_c1,
       .mr = 9,
       .qr = 8,
     };
     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
-      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up4__scalar,
+      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
       .mr = 4,
     };
     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
-      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up9__scalar,
+      .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
       .mr = 9,
     };
     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
-      .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_mp9p8q__scalar,
+      .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
       .mr = 9,
       .qr = 8,
     };
diff --git a/src/max-pooling-nhwc.c b/src/max-pooling-nhwc.c
index 8f13af6..46ceae4 100644
--- a/src/max-pooling-nhwc.c
+++ b/src/max-pooling-nhwc.c
@@ -305,140 +305,19 @@
   return status;
 }
 
-enum xnn_status xnn_setup_max_pooling2d_nhwc_u8(
-    xnn_operator_t max_pooling_op,
-    size_t batch_size,
-    size_t input_height,
-    size_t input_width,
-    const uint8_t* input,
-    uint8_t* output,
-    pthreadpool_t threadpool)
+static enum xnn_status setup_max_pooling2d(
+  xnn_operator_t max_pooling_op,
+  size_t batch_size,
+  size_t input_height,
+  size_t input_width,
+  const void* input,
+  void* output,
+  uint32_t log2_input_element_size,
+  uint32_t log2_output_element_size,
+  struct maxpool_parameters maxpool[restrict static 1],
+  const void* params,
+  size_t num_threads)
 {
-  if (max_pooling_op->type != xnn_operator_type_max_pooling_nhwc_u8) {
-    xnn_log_error("failed to setup Max Pooling (NHWC, U8) operator: operator type mismatch");
-    return xnn_status_invalid_parameter;
-  }
-  max_pooling_op->state = xnn_run_state_invalid;
-
-  if (!xnn_params.initialized) {
-    xnn_log_error("failed to setup Max Pooling operator: XNNPACK is not initialized");
-    return xnn_status_uninitialized;
-  }
-
-  if (input_width == 0 || input_height == 0) {
-    xnn_log_error(
-      "failed to setup Max Pooling operator with %zux%zu input: input dimensions must be non-zero",
-      input_width, input_height);
-    return xnn_status_invalid_parameter;
-  }
-
-  if (batch_size == 0) {
-    max_pooling_op->state = xnn_run_state_skip;
-    return xnn_status_success;
-  }
-
-  max_pooling_op->batch_size = batch_size;
-  max_pooling_op->input_height = input_height;
-  max_pooling_op->input_width = input_width;
-  max_pooling_op->input = input;
-
-  max_pooling_op->output_height = compute_output_dimension(
-      max_pooling_op->padding_top + input_height + max_pooling_op->padding_bottom,
-      max_pooling_op->kernel_height,
-      max_pooling_op->dilation_height,
-      max_pooling_op->stride_height);
-  max_pooling_op->output_width = compute_output_dimension(
-      max_pooling_op->padding_left + input_width + max_pooling_op->padding_right,
-      max_pooling_op->kernel_width,
-      max_pooling_op->dilation_width,
-      max_pooling_op->stride_width);
-  max_pooling_op->output = output;
-
-  size_t valid_batch_size = 0;
-  if (input == max_pooling_op->last_input &&
-      input_height == max_pooling_op->last_input_height &&
-      input_width == max_pooling_op->last_input_width)
-  {
-    valid_batch_size = max_pooling_op->valid_batch_size;
-    if (batch_size <= valid_batch_size) {
-      max_pooling_op->compute.range[0] = batch_size;
-      max_pooling_op->state = xnn_run_state_ready;
-      return xnn_status_success;
-    }
-  }
-
-  const size_t pooling_height = max_pooling_op->kernel_height;
-  const size_t pooling_width = max_pooling_op->kernel_width;
-  const size_t pooling_size = pooling_height * pooling_width;
-  const size_t output_height = max_pooling_op->output_height;
-  const size_t output_width = max_pooling_op->output_width;
-  // Micro-kernel may read up to (mr - 1) elements after the end of indirection buffer.
-  const uint32_t mr = xnn_params.u8.maxpool.mr;
-
-  const size_t step_width =
-    max_pooling_op->dilation_width > 1 ? pooling_width : min(max_pooling_op->stride_width, pooling_width);
-  const size_t step_height = pooling_size + (output_width * step_width - 1) * pooling_height;
-  const size_t indirection_buffer_size = sizeof(void*) * ((mr - 1) + batch_size * output_height * step_height);
-
-  const void** indirection_buffer = (const void**) xnn_reallocate_memory(max_pooling_op->indirection_buffer, indirection_buffer_size);
-  if (indirection_buffer == NULL) {
-    xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
-    return xnn_status_out_of_memory;
-  }
-  max_pooling_op->indirection_buffer = indirection_buffer;
-
-  xnn_indirection_init_maxpool2d(max_pooling_op, valid_batch_size, step_height, step_width, 0 /* log2(sizeof(uint8_t)) */);
-
-  const uint32_t qr = xnn_params.u8.maxpool.qr;
-  const size_t channels = max_pooling_op->channels;
-
-  const size_t indirect_input_height_stride = step_height * sizeof(void*);
-  const size_t output_width_stride = max_pooling_op->output_pixel_stride * sizeof(uint8_t);
-  const size_t output_height_stride = output_width * output_width_stride;
-  const size_t multipass_adjustment = round_up(doz(pooling_size, mr), qr) + mr;
-
-  max_pooling_op->context.max_pooling = (struct max_pooling_context) {
-      .indirect_input = indirection_buffer,
-      .indirect_input_batch_stride = output_height * indirect_input_height_stride,
-      .indirect_input_height_stride = indirect_input_height_stride,
-      .output = output,
-      .output_batch_stride = output_height * output_height_stride,
-      .output_height_stride = output_height_stride,
-      .output_width = output_width,
-      .pooling_size = pooling_size,
-      .channels = channels,
-      .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
-      .output_increment = output_width_stride - channels * sizeof(uint8_t),
-      .params.u8 = max_pooling_op->u8_output_params,
-      .ukernel = xnn_params.u8.maxpool.ukernel,
-  };
-  max_pooling_op->compute.type = xnn_parallelization_type_2d;
-  max_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_max_pooling;
-  max_pooling_op->compute.range[0] = batch_size;
-  max_pooling_op->compute.range[1] = output_height;
-  max_pooling_op->state = xnn_run_state_ready;
-
-  max_pooling_op->last_input = input;
-  max_pooling_op->last_input_height = input_height;
-  max_pooling_op->last_input_width = input_width;
-  max_pooling_op->valid_batch_size = max(valid_batch_size, batch_size);
-
-  return xnn_status_success;
-}
-
-enum xnn_status xnn_setup_max_pooling2d_nhwc_f32(
-    xnn_operator_t max_pooling_op,
-    size_t batch_size,
-    size_t input_height,
-    size_t input_width,
-    const float* input,
-    float* output,
-    pthreadpool_t threadpool)
-{
-  if (max_pooling_op->type != xnn_operator_type_max_pooling_nhwc_f32) {
-    xnn_log_error("failed to setup Max Pooling (NHWC, F32) operator: operator type mismatch");
-    return xnn_status_invalid_parameter;
-  }
   max_pooling_op->state = xnn_run_state_invalid;
 
   if (!xnn_params.initialized) {
@@ -459,7 +338,6 @@
     return xnn_status_success;
   }
 
-  max_pooling_op->batch_size = batch_size;
   max_pooling_op->input_height = input_height;
   max_pooling_op->input_width = input_width;
   max_pooling_op->input = input;
@@ -474,76 +352,118 @@
       max_pooling_op->kernel_width,
       max_pooling_op->dilation_width,
       max_pooling_op->stride_width);
-  max_pooling_op->output = output;
-
-  size_t valid_batch_size = 0;
-  if (input == max_pooling_op->last_input &&
-      input_height == max_pooling_op->last_input_height &&
-      input_width == max_pooling_op->last_input_width)
-  {
-    valid_batch_size = max_pooling_op->valid_batch_size;
-    if (batch_size <= valid_batch_size) {
-      max_pooling_op->compute.range[0] = batch_size;
-      max_pooling_op->state = xnn_run_state_ready;
-      return xnn_status_success;
-    }
-  }
 
   const size_t pooling_height = max_pooling_op->kernel_height;
   const size_t pooling_width = max_pooling_op->kernel_width;
   const size_t pooling_size = pooling_height * pooling_width;
   const size_t output_height = max_pooling_op->output_height;
   const size_t output_width = max_pooling_op->output_width;
-  // Micro-kernel may read up to (mr - 1) elements after the end of indirection buffer.
-  const uint32_t mr = xnn_params.f32.maxpool.mr;
+  const uint32_t mr = maxpool->mr;
 
   const size_t step_width =
     max_pooling_op->dilation_width > 1 ? pooling_width : min(max_pooling_op->stride_width, pooling_width);
   const size_t step_height = pooling_size + (output_width * step_width - 1) * pooling_height;
-  const size_t indirection_buffer_size = sizeof(void*) * ((mr - 1) + batch_size * output_height * step_height);
 
-  const void** indirection_buffer = (const void**) xnn_reallocate_memory(max_pooling_op->indirection_buffer, indirection_buffer_size);
-  if (indirection_buffer == NULL) {
-    xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
-    return xnn_status_out_of_memory;
+  if (input_height != max_pooling_op->last_input_height ||
+      input_width != max_pooling_op->last_input_width)
+  {
+    // Micro-kernel may read up to (mr - 1) elements after the end of indirection buffer.
+    const size_t indirection_buffer_size = sizeof(void*) * ((mr - 1) + output_height * step_height);
+    const void** indirection_buffer = (const void**) xnn_reallocate_memory(max_pooling_op->indirection_buffer, indirection_buffer_size);
+    if (indirection_buffer == NULL) {
+      xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
+      return xnn_status_out_of_memory;
+    }
+    max_pooling_op->indirection_buffer = indirection_buffer;
+
+    xnn_indirection_init_maxpool2d(max_pooling_op, step_height, step_width, log2_input_element_size);
+
+    max_pooling_op->last_input = input;
+    max_pooling_op->last_input_height = input_height;
+    max_pooling_op->last_input_width = input_width;
   }
-  max_pooling_op->indirection_buffer = indirection_buffer;
 
-  xnn_indirection_init_maxpool2d(max_pooling_op, valid_batch_size, step_height, step_width, 2 /* log2(sizeof(float)) */);
-
-  const uint32_t qr = xnn_params.f32.maxpool.qr;
+  const uint32_t qr = maxpool->qr;
   const size_t channels = max_pooling_op->channels;
 
   const size_t indirect_input_height_stride = step_height * sizeof(void*);
-  const size_t output_width_stride = max_pooling_op->output_pixel_stride * sizeof(float);
+  const size_t output_width_stride = max_pooling_op->output_pixel_stride << log2_output_element_size;
   const size_t output_height_stride = output_width * output_width_stride;
   const size_t multipass_adjustment = round_up(doz(pooling_size, mr), qr) + mr;
 
   max_pooling_op->context.max_pooling = (struct max_pooling_context) {
-      .indirect_input = indirection_buffer,
-      .indirect_input_batch_stride = output_height * indirect_input_height_stride,
-      .indirect_input_height_stride = indirect_input_height_stride,
-      .output = output,
-      .output_batch_stride = output_height * output_height_stride,
-      .output_height_stride = output_height_stride,
-      .output_width = output_width,
-      .pooling_size = pooling_size,
-      .channels = channels,
-      .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
-      .output_increment = output_width_stride - channels * sizeof(float),
-      .params.f32 = max_pooling_op->f32_output_params,
-      .ukernel = xnn_params.f32.maxpool.ukernel,
+    .indirect_input = max_pooling_op->indirection_buffer,
+    .indirect_input_height_stride = indirect_input_height_stride,
+    .input_offset = (size_t) ((uintptr_t) input - (uintptr_t) max_pooling_op->last_input),
+    .input_batch_stride = (input_height * input_width * max_pooling_op->input_pixel_stride) << log2_input_element_size,
+    .output = output,
+    .output_batch_stride = output_height * output_height_stride,
+    .output_height_stride = output_height_stride,
+    .output_width = output_width,
+    .pooling_size = pooling_size,
+    .channels = channels,
+    .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
+    .output_increment = output_width_stride - (channels << log2_output_element_size),
+    .ukernel = maxpool->ukernel,
   };
+  memcpy(&max_pooling_op->context.max_pooling.params, params, sizeof(max_pooling_op->context.max_pooling.params));
+
   max_pooling_op->compute.type = xnn_parallelization_type_2d;
   max_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_max_pooling;
   max_pooling_op->compute.range[0] = batch_size;
   max_pooling_op->compute.range[1] = output_height;
   max_pooling_op->state = xnn_run_state_ready;
 
-  max_pooling_op->last_input = input;
-  max_pooling_op->last_input_height = input_height;
-  max_pooling_op->last_input_width = input_width;
-  max_pooling_op->valid_batch_size = max(valid_batch_size, batch_size);
-
   return xnn_status_success;
 }
+
+enum xnn_status xnn_setup_max_pooling2d_nhwc_u8(  // Setup entry point for NHWC U8 Max Pooling operators.
+    xnn_operator_t max_pooling_op,
+    size_t batch_size,
+    size_t input_height,
+    size_t input_width,
+    const uint8_t* input,
+    uint8_t* output,
+    pthreadpool_t threadpool)  // Used here only to obtain the thread count forwarded below.
+{
+  if (max_pooling_op->type != xnn_operator_type_max_pooling_nhwc_u8) {  // Guard: accept only operators whose type is NHWC U8 max pooling.
+    xnn_log_error("failed to setup Max Pooling (NHWC, U8) operator: operator type mismatch");
+    return xnn_status_invalid_parameter;
+  }
+
+  return setup_max_pooling2d(  // Helper shared with the F32 entry point; its status is returned unchanged.
+    max_pooling_op,
+    batch_size, input_height, input_width, // Forwarded unchanged from the caller.
+    input, output,
+    0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
+    0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
+    &xnn_params.u8.maxpool,  // U8 micro-kernel selection (ukernel, mr, qr).
+    &max_pooling_op->u8_output_params,  // The helper memcpy's these into the compute context.
+    pthreadpool_get_threads_count(threadpool));  // Becomes the helper's num_threads argument.
+}
+
+enum xnn_status xnn_setup_max_pooling2d_nhwc_f32(  // Setup entry point for NHWC F32 Max Pooling operators.
+    xnn_operator_t max_pooling_op,
+    size_t batch_size,
+    size_t input_height,
+    size_t input_width,
+    const float* input,
+    float* output,
+    pthreadpool_t threadpool)  // Used here only to obtain the thread count forwarded below.
+{
+  if (max_pooling_op->type != xnn_operator_type_max_pooling_nhwc_f32) {  // Guard: accept only operators whose type is NHWC F32 max pooling.
+    xnn_log_error("failed to setup Max Pooling (NHWC, F32) operator: operator type mismatch");
+    return xnn_status_invalid_parameter;
+  }
+
+  return setup_max_pooling2d(  // Helper shared with the U8 entry point; its status is returned unchanged.
+    max_pooling_op,
+    batch_size, input_height, input_width, // Forwarded unchanged from the caller.
+    input, output,
+    2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
+    2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
+    &xnn_params.f32.maxpool,  // F32 micro-kernel selection (ukernel, mr, qr).
+    &max_pooling_op->f32_output_params,  // The helper memcpy's these into the compute context.
+    pthreadpool_get_threads_count(threadpool));  // Becomes the helper's num_threads argument.
+}
+
diff --git a/src/operator-run.c b/src/operator-run.c
index 28c0b5a..b4f8dbe 100644
--- a/src/operator-run.c
+++ b/src/operator-run.c
@@ -275,17 +275,17 @@
     size_t batch_index,
     size_t output_y)
 {
-  const void** indirect_input =
-    (const void**) ((uintptr_t) context->indirect_input +
-      batch_index * context->indirect_input_batch_stride + output_y * context->indirect_input_height_stride);
-  void* output =
-    (void*) ((uintptr_t) context->output + batch_index * context->output_batch_stride + output_y * context->output_height_stride);
-  uint32_t* index =
-    (uint32_t*) ((uintptr_t) context->index + batch_index * context->index_batch_stride + output_y * context->index_height_stride);
+  const void** indirect_input = (const void**) ((uintptr_t) context->indirect_input +
+    output_y * context->indirect_input_height_stride);
+  const size_t input_offset = context->input_offset + batch_index * context->input_batch_stride;
+  void* output = (void*) ((uintptr_t) context->output +
+    batch_index * context->output_batch_stride + output_y * context->output_height_stride);
+  uint32_t* index = (uint32_t*) ((uintptr_t) context->index +
+    batch_index * context->index_batch_stride + output_y * context->index_height_stride);
 
   context->unipass_ukernel(
     context->output_width, context->pooling_size, context->channels,
-    indirect_input, output, index,
+    indirect_input, input_offset, output, index,
     context->input_increment, context->output_increment,
     &context->params);
 }
@@ -295,20 +295,20 @@
     size_t batch_index,
     size_t output_y)
 {
-  const void** indirect_input =
-    (const void**) ((uintptr_t) context->indirect_input +
-      batch_index * context->indirect_input_batch_stride + output_y * context->indirect_input_height_stride);
-  void* output =
-    (void*) ((uintptr_t) context->output + batch_index * context->output_batch_stride + output_y * context->output_height_stride);
-  uint32_t* index =
-    (uint32_t*) ((uintptr_t) context->index + batch_index * context->index_batch_stride + output_y * context->index_height_stride);
+  const void** indirect_input = (const void**) ((uintptr_t) context->indirect_input +
+    output_y * context->indirect_input_height_stride);
+  const size_t input_offset = context->input_offset + batch_index * context->input_batch_stride;  
+  void* output = (void*) ((uintptr_t) context->output +
+    batch_index * context->output_batch_stride + output_y * context->output_height_stride);
+  uint32_t* index = (uint32_t*) ((uintptr_t) context->index +
+    batch_index * context->index_batch_stride + output_y * context->index_height_stride);
 
-  XNN_ALIGN(16) float multipass_output_buffer[context->channels + XNN_EXTRA_BYTES / sizeof(float)];
+  XNN_ALIGN(16) float multipass_accumulation_buffer[context->channels + XNN_EXTRA_BYTES / sizeof(float)];
   XNN_ALIGN(16) uint32_t multipass_index_buffer[context->channels + XNN_EXTRA_BYTES / sizeof(uint32_t)];
 
   context->multipass_ukernel(
     context->output_width, context->pooling_size, context->channels,
-    indirect_input, multipass_output_buffer, multipass_index_buffer, output, index,
+    indirect_input, input_offset, multipass_accumulation_buffer, multipass_index_buffer, output, index,
     context->input_increment, context->output_increment,
     &context->params);
 }
@@ -318,15 +318,15 @@
     size_t batch_index,
     size_t output_y)
 {
-  const void** indirect_input =
-    (const void**) ((uintptr_t) context->indirect_input +
-      batch_index * context->indirect_input_batch_stride + output_y * context->indirect_input_height_stride);
-  void* output =
-    (void*) ((uintptr_t) context->output + batch_index * context->output_batch_stride + output_y * context->output_height_stride);
+  const void** indirect_input = (const void**) ((uintptr_t) context->indirect_input +
+    output_y * context->indirect_input_height_stride);
+  const size_t input_offset = context->input_offset + batch_index * context->input_batch_stride;
+  void* output = (void*) ((uintptr_t) context->output +
+    batch_index * context->output_batch_stride + output_y * context->output_height_stride);
 
   context->ukernel(
     context->output_width, context->pooling_size, context->channels,
-    indirect_input, output,
+    indirect_input, input_offset, output,
     context->input_increment, context->output_increment,
     &context->params);
 }
diff --git a/src/u8-maxpool/9p8q-neon.c b/src/u8-maxpool/9p8x-neon-c16.c
similarity index 75%
rename from src/u8-maxpool/9p8q-neon.c
rename to src/u8-maxpool/9p8x-neon-c16.c
index 9deaf24..8edba53 100644
--- a/src/u8-maxpool/9p8q-neon.c
+++ b/src/u8-maxpool/9p8x-neon-c16.c
@@ -13,19 +13,20 @@
 #include <xnnpack/maxpool.h>
 
 
-void xnn_u8_maxpool_ukernel_9p8q__neon(
-    size_t n,
-    size_t ks,
-    size_t kc,
+void xnn_u8_maxpool_ukernel_9p8x__neon_c16(
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
     const uint8_t** input,
+    size_t input_offset,
     uint8_t* output,
     size_t input_increment,
     size_t output_increment,
     const union xnn_u8_output_params params[restrict static 1])
 {
-  assert(n != 0);
-  assert(ks != 0);
-  assert(kc != 0);
+  assert(output_pixels != 0);
+  assert(kernel_elements != 0);
+  assert(channels != 0);
 
   const uint8x16_t voutput_max = vld1q_dup_u8(&params->neon.max);
   const uint8x16_t voutput_min = vld1q_dup_u8(&params->neon.min);
@@ -41,33 +42,42 @@
       const uint8_t* i6 = *input++;
       const uint8_t* i7 = *input++;
       const uint8_t* i8 = *input++;
-      if (ks < 2) {
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+      if (kernel_elements < 2) {
         i1 = i0;
       }
-      if (ks <= 2) {
+      if (kernel_elements <= 2) {
         i2 = i0;
       }
-      if (ks < 4) {
+      if (kernel_elements < 4) {
         i3 = i0;
       }
-      if (ks <= 4) {
+      if (kernel_elements <= 4) {
         i4 = i0;
       }
-      if (ks < 6) {
+      if (kernel_elements < 6) {
         i5 = i0;
       }
-      if (ks <= 6) {
+      if (kernel_elements <= 6) {
         i6 = i0;
       }
-      if (ks < 8) {
+      if (kernel_elements < 8) {
         i7 = i0;
       }
-      if (ks <= 8) {
+      if (kernel_elements <= 8) {
         i8 = i0;
       }
 
-      size_t k = kc;
-      for (; k >= 16; k -= 16) {
+      size_t c = channels;
+      for (; c >= 16; c -= 16) {
         const uint8x16_t vi0 = vld1q_u8(i0); i0 += 16;
         const uint8x16_t vi1 = vld1q_u8(i1); i1 += 16;
         const uint8x16_t vi2 = vld1q_u8(i2); i2 += 16;
@@ -90,7 +100,7 @@
 
         vst1q_u8(o, vout); o += 16;
       }
-      if (k != 0) {
+      if (c != 0) {
         const uint8x16_t vi0 = vld1q_u8(i0);
         const uint8x16_t vi1 = vld1q_u8(i1);
         const uint8x16_t vi2 = vld1q_u8(i2);
@@ -112,25 +122,25 @@
         const uint8x16_t vout = vmaxq_u8(vminq_u8(vmax, voutput_max), voutput_min);
 
         uint8x8_t vout_lo = vget_low_u8(vout);
-        if (k & 8) {
+        if (c & 8) {
           vst1_u8(o, vout_lo); o += 8;
           vout_lo = vget_high_u8(vout);
         }
-        if (k & 4) {
+        if (c & 4) {
           vst1_lane_u32(__builtin_assume_aligned(o, 1), vreinterpret_u32_u8(vout_lo), 0); o += 4;
           vout_lo = vext_u8(vout_lo, vout_lo, 4);
         }
-        if (k & 2) {
+        if (c & 2) {
           vst1_lane_u16(__builtin_assume_aligned(o, 1), vreinterpret_u16_u8(vout_lo), 0); o += 2;
           vout_lo = vext_u8(vout_lo, vout_lo, 2);
         }
-        if (k & 1) {
+        if (c & 1) {
           vst1_lane_u8(o, vout_lo, 0); o += 1;
         }
       }
     }
 
-    for (ptrdiff_t m = (ptrdiff_t) ks - 9; m > 0; m -= 8) {
+    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
       const uint8_t* i0 = *input++;
       const uint8_t* i1 = *input++;
       const uint8_t* i2 = *input++;
@@ -139,31 +149,39 @@
       const uint8_t* i5 = *input++;
       const uint8_t* i6 = *input++;
       const uint8_t* i7 = *input++;
-      if (m < 2) {
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+      if (k < 2) {
         i1 = i0;
       }
-      if (m <= 2) {
+      if (k <= 2) {
         i2 = i0;
       }
-      if (m < 4) {
+      if (k < 4) {
         i3 = i0;
       }
-      if (m <= 4) {
+      if (k <= 4) {
         i4 = i0;
       }
-      if (m < 6) {
+      if (k < 6) {
         i5 = i0;
       }
-      if (m <= 6) {
+      if (k <= 6) {
         i6 = i0;
       }
-      if (m < 8) {
+      if (k < 8) {
         i7 = i0;
       }
 
       o = output;
-      size_t k = kc;
-      for (; k >= 16; k -= 16) {
+      size_t c = channels;
+      for (; c >= 16; c -= 16) {
         const uint8x16_t vi0 = vld1q_u8(i0); i0 += 16;
         const uint8x16_t vi1 = vld1q_u8(i1); i1 += 16;
         const uint8x16_t vi2 = vld1q_u8(i2); i2 += 16;
@@ -186,7 +204,7 @@
 
         vst1q_u8(o, vout); o += 16;
       }
-      if (k != 0) {
+      if (c != 0) {
         const uint8x16_t vi0 = vld1q_u8(i0);
         const uint8x16_t vi1 = vld1q_u8(i1);
         const uint8x16_t vi2 = vld1q_u8(i2);
@@ -208,24 +226,24 @@
         const uint8x16_t vout = vmaxq_u8(vminq_u8(vmax, voutput_max), voutput_min);
 
         uint8x8_t vout_lo = vget_low_u8(vout);
-        if (k & 8) {
+        if (c & 8) {
           vst1_u8(o, vout_lo); o += 8;
           vout_lo = vget_high_u8(vout);
         }
-        if (k & 4) {
+        if (c & 4) {
           vst1_lane_u32(__builtin_assume_aligned(o, 1), vreinterpret_u32_u8(vout_lo), 0); o += 4;
           vout_lo = vext_u8(vout_lo, vout_lo, 4);
         }
-        if (k & 2) {
+        if (c & 2) {
           vst1_lane_u16(__builtin_assume_aligned(o, 1), vreinterpret_u16_u8(vout_lo), 0); o += 2;
           vout_lo = vext_u8(vout_lo, vout_lo, 2);
         }
-        if (k & 1) {
+        if (c & 1) {
           vst1_lane_u8(o, vout_lo, 0); o += 1;
         }
       }
     }
     input = (const uint8_t**) ((uintptr_t) input + input_increment);
     output = (uint8_t*) ((uintptr_t) o + output_increment);
-  } while (--n != 0);
+  } while (--output_pixels != 0);
 }
diff --git a/src/u8-maxpool/9p8q-scalar.c b/src/u8-maxpool/9p8x-scalar-c1.c
similarity index 65%
rename from src/u8-maxpool/9p8q-scalar.c
rename to src/u8-maxpool/9p8x-scalar-c1.c
index e442f75..10112a8 100644
--- a/src/u8-maxpool/9p8q-scalar.c
+++ b/src/u8-maxpool/9p8x-scalar-c1.c
@@ -8,19 +8,20 @@
 #include <xnnpack/maxpool.h>
 
 
-void xnn_u8_maxpool_ukernel_9p8q__scalar(
-    size_t n,
-    size_t ks,
-    size_t kc,
+void xnn_u8_maxpool_ukernel_9p8x__scalar_c1(
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
     const uint8_t** input,
+    size_t input_offset,
     uint8_t* output,
     size_t input_increment,
     size_t output_increment,
     const union xnn_u8_output_params params[restrict static 1])
 {
-  assert(n != 0);
-  assert(ks != 0);
-  assert(kc != 0);
+  assert(output_pixels != 0);
+  assert(kernel_elements != 0);
+  assert(channels != 0);
 
   const uint8_t voutput_max = params->scalar.max;
   const uint8_t voutput_min = params->scalar.min;
@@ -36,32 +37,41 @@
       const uint8_t* i6 = *input++;
       const uint8_t* i7 = *input++;
       const uint8_t* i8 = *input++;
-      if (ks < 2) {
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+      if (kernel_elements < 2) {
         i1 = i0;
       }
-      if (ks <= 2) {
+      if (kernel_elements <= 2) {
         i2 = i0;
       }
-      if (ks < 4) {
+      if (kernel_elements < 4) {
         i3 = i0;
       }
-      if (ks <= 4) {
+      if (kernel_elements <= 4) {
         i4 = i0;
       }
-      if (ks < 6) {
+      if (kernel_elements < 6) {
         i5 = i0;
       }
-      if (ks <= 6) {
+      if (kernel_elements <= 6) {
         i6 = i0;
       }
-      if (ks < 8) {
+      if (kernel_elements < 8) {
         i7 = i0;
       }
-      if (ks <= 8) {
+      if (kernel_elements <= 8) {
         i8 = i0;
       }
 
-      size_t k = kc;
+      size_t c = channels;
       do {
         const uint8_t vi0 = *i0++;
         const uint8_t vi1 = *i1++;
@@ -87,10 +97,10 @@
         vout = vout < voutput_min ? voutput_min : vout;
 
         *o++ = vout;
-      } while (--k != 0);
+      } while (--c != 0);
     }
 
-    for (ptrdiff_t m = (ptrdiff_t) ks - 9; m > 0; m -= 8) {
+    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
       const uint8_t* i0 = *input++;
       const uint8_t* i1 = *input++;
       const uint8_t* i2 = *input++;
@@ -99,30 +109,38 @@
       const uint8_t* i5 = *input++;
       const uint8_t* i6 = *input++;
       const uint8_t* i7 = *input++;
-      if (m < 2) {
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+      if (k < 2) {
         i1 = i0;
       }
-      if (m <= 2) {
+      if (k <= 2) {
         i2 = i0;
       }
-      if (m < 4) {
+      if (k < 4) {
         i3 = i0;
       }
-      if (m <= 4) {
+      if (k <= 4) {
         i4 = i0;
       }
-      if (m < 6) {
+      if (k < 6) {
         i5 = i0;
       }
-      if (m <= 6) {
+      if (k <= 6) {
         i6 = i0;
       }
-      if (m < 8) {
+      if (k < 8) {
         i7 = i0;
       }
 
       o = output;
-      size_t k = kc;
+      size_t c = channels;
       do {
         const uint8_t vi0 = *i0++;
         const uint8_t vi1 = *i1++;
@@ -148,9 +166,9 @@
         vout = vout < voutput_min ? voutput_min : vout;
 
         *o++ = vout;
-      } while (--k != 0);
+      } while (--c != 0);
     }
     input = (const uint8_t**) ((uintptr_t) input + input_increment);
     output = (uint8_t*) ((uintptr_t) o + output_increment);
-  } while (--n != 0);
+  } while (--output_pixels != 0);
 }
diff --git a/src/u8-maxpool/9p8q-sse2.c b/src/u8-maxpool/9p8x-sse2-c16.c
similarity index 78%
rename from src/u8-maxpool/9p8q-sse2.c
rename to src/u8-maxpool/9p8x-sse2-c16.c
index 8903457..7d3ee15 100644
--- a/src/u8-maxpool/9p8q-sse2.c
+++ b/src/u8-maxpool/9p8x-sse2-c16.c
@@ -13,19 +13,20 @@
 #include <xnnpack/maxpool.h>
 
 
-void xnn_u8_maxpool_ukernel_9p8q__sse2(
-    size_t n,
-    size_t ks,
-    size_t kc,
+void xnn_u8_maxpool_ukernel_9p8x__sse2_c16(
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
     const uint8_t** input,
+    size_t input_offset,
     uint8_t* output,
     size_t input_increment,
     size_t output_increment,
     const union xnn_u8_output_params params[restrict static 1])
 {
-  assert(n != 0);
-  assert(ks != 0);
-  assert(kc != 0);
+  assert(output_pixels != 0);
+  assert(kernel_elements != 0);
+  assert(channels != 0);
 
   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.max);
   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.min);
@@ -42,33 +43,42 @@
       const uint8_t* i6 = *input++;
       const uint8_t* i7 = *input++;
       const uint8_t* i8 = *input++;
-      if (ks < 2) {
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+      i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+      if (kernel_elements < 2) {
         i1 = i0;
       }
-      if (ks <= 2) {
+      if (kernel_elements <= 2) {
         i2 = i0;
       }
-      if (ks < 4) {
+      if (kernel_elements < 4) {
         i3 = i0;
       }
-      if (ks <= 4) {
+      if (kernel_elements <= 4) {
         i4 = i0;
       }
-      if (ks < 6) {
+      if (kernel_elements < 6) {
         i5 = i0;
       }
-      if (ks <= 6) {
+      if (kernel_elements <= 6) {
         i6 = i0;
       }
-      if (ks < 8) {
+      if (kernel_elements < 8) {
         i7 = i0;
       }
-      if (ks <= 8) {
+      if (kernel_elements <= 8) {
         i8 = i0;
       }
 
-      size_t k = kc;
-      for (; k >= 16; k -= 16) {
+      size_t c = channels;
+      for (; c >= 16; c -= 16) {
         const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0); i0 += 16;
         const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1); i1 += 16;
         const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2); i2 += 16;
@@ -91,7 +101,7 @@
 
         _mm_storeu_si128((__m128i*) o, vout); o += 16;
       }
-      if (k != 0) {
+      if (c != 0) {
         const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0);
         const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1);
         const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2);
@@ -112,29 +122,29 @@
         const __m128i vmax = _mm_max_epu8(vmax2345, vmax01678);
         __m128i vout = _mm_max_epu8(_mm_min_epu8(vmax, voutput_max), voutput_min);
 
-        if (k & 8) {
+        if (c & 8) {
           _mm_storel_epi64((__m128i*) o, vout);
           vout = _mm_unpackhi_epi64(vout, vout);
           o += 8;
         }
-        if (k & 4) {
+        if (c & 4) {
           *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vout);
           vout = _mm_srli_epi64(vout, 32);
           o += 4;
         }
-        if (k & 2) {
+        if (c & 2) {
           *((uint16_t*) o) = (uint16_t) _mm_extract_epi16(vout, 0);
           vout = _mm_srli_epi32(vout, 16);
           o += 2;
         }
-        if (k & 1) {
+        if (c & 1) {
           *((uint8_t*) o) = (uint8_t) _mm_cvtsi128_si32(vout);
           o += 1;
         }
       }
     }
 
-    for (ptrdiff_t m = (ptrdiff_t) ks - 9; m > 0; m -= 8) {
+    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
       const uint8_t* i0 = *input++;
       const uint8_t* i1 = *input++;
       const uint8_t* i2 = *input++;
@@ -143,31 +153,39 @@
       const uint8_t* i5 = *input++;
       const uint8_t* i6 = *input++;
       const uint8_t* i7 = *input++;
-      if (m < 2) {
+      i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+      i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+      i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+      i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+      i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+      i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+      i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+      i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+      if (k < 2) {
         i1 = i0;
       }
-      if (m <= 2) {
+      if (k <= 2) {
         i2 = i0;
       }
-      if (m < 4) {
+      if (k < 4) {
         i3 = i0;
       }
-      if (m <= 4) {
+      if (k <= 4) {
         i4 = i0;
       }
-      if (m < 6) {
+      if (k < 6) {
         i5 = i0;
       }
-      if (m <= 6) {
+      if (k <= 6) {
         i6 = i0;
       }
-      if (m < 8) {
+      if (k < 8) {
         i7 = i0;
       }
 
       o = output;
-      size_t k = kc;
-      for (; k >= 16; k -= 16) {
+      size_t c = channels;
+      for (; c >= 16; c -= 16) {
         const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0); i0 += 16;
         const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1); i1 += 16;
         const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2); i2 += 16;
@@ -191,7 +209,7 @@
         _mm_storeu_si128((__m128i*) o, vout);
         o += 16;
       }
-      if (k != 0) {
+      if (c != 0) {
         const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0);
         const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1);
         const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2);
@@ -212,22 +230,22 @@
         const __m128i vmax = _mm_max_epu8(vmax2345, vmax0167);
         __m128i vout = _mm_max_epu8(_mm_min_epu8(vmax, voutput_max), voutput_min);
 
-        if (k & 8) {
+        if (c & 8) {
           _mm_storel_epi64((__m128i*) o, vout);
           vout = _mm_unpackhi_epi64(vout, vout);
           o += 8;
         }
-        if (k & 4) {
+        if (c & 4) {
           *((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vout);
           vout = _mm_srli_epi64(vout, 32);
           o += 4;
         }
-        if (k & 2) {
+        if (c & 2) {
           *((uint16_t*) o) = (uint16_t) _mm_extract_epi16(vout, 0);
           vout = _mm_srli_epi32(vout, 16);
           o += 2;
         }
-        if (k & 1) {
+        if (c & 1) {
           *((uint8_t*) o) = (uint8_t) _mm_cvtsi128_si32(vout);
           o += 1;
         }
@@ -235,5 +253,5 @@
     }
     input = (const uint8_t**) ((uintptr_t) input + input_increment);
     output = (uint8_t*) ((uintptr_t) o + output_increment);
-  } while (--n != 0);
+  } while (--output_pixels != 0);
 }
diff --git a/src/xnnpack/argmaxpool.h b/src/xnnpack/argmaxpool.h
index ce60230..056114c 100644
--- a/src/xnnpack/argmaxpool.h
+++ b/src/xnnpack/argmaxpool.h
@@ -18,41 +18,44 @@
 
 #define DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(fn_name) \
   XNN_INTERNAL void fn_name(                                     \
-      size_t n,                                                  \
-      size_t ks,                                                 \
-      size_t kc,                                                 \
-      const float** x,                                           \
-      float* y,                                                  \
-      uint32_t* i,                                               \
-      size_t x_increment,                                        \
-      size_t y_increment,                                        \
+      size_t output_pixels,                                      \
+      size_t kernel_elements,                                    \
+      size_t channels,                                           \
+      const float** input,                                       \
+      size_t input_offset,                                       \
+      float* output,                                             \
+      uint32_t* index,                                           \
+      size_t input_increment,                                    \
+      size_t output_increment,                                   \
       const union xnn_f32_output_params* params);
 
-DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up4__psimd)
-DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up4__scalar)
-DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up4__sse2)
-DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up9__psimd)
-DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up9__scalar)
-DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up9__sse2)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_4x__psimd_c4)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_4x__sse2_c4)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_4x__scalar_c1)
+
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_9x__psimd_c4)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_9x__sse2_c4)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_9x__scalar_c1)
 
 
 #define DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(fn_name) \
   XNN_INTERNAL void fn_name(                                       \
-      size_t n,                                                    \
-      size_t ks,                                                   \
-      size_t kc,                                                   \
-      const float** x,                                             \
-      float* ab,                                                   \
-      uint32_t* ib,                                                \
-      float* y,                                                    \
-      uint32_t* i,                                                 \
-      size_t x_increment,                                          \
-      size_t y_increment,                                          \
+      size_t output_pixels,                                        \
+      size_t kernel_elements,                                      \
+      size_t channels,                                             \
+      const float** input,                                         \
+      size_t input_offset,                                         \
+      float* accumulation_buffer,                                  \
+      uint32_t* index_buffer,                                      \
+      float* output,                                               \
+      uint32_t* index,                                             \
+      size_t input_increment,                                      \
+      size_t output_increment,                                     \
       const union xnn_f32_output_params* params);
 
-DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd)
-DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar)
-DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2)
+DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4)
+DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4)
+DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1)
 
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h
index 2f7e31a..8eb4fa4 100644
--- a/src/xnnpack/compute.h
+++ b/src/xnnpack/compute.h
@@ -292,8 +292,9 @@
 
 struct max_pooling_context {
   const void** indirect_input;
-  size_t indirect_input_batch_stride;
   size_t indirect_input_height_stride;
+  size_t input_offset;
+  size_t input_batch_stride;
   void* output;
   size_t output_batch_stride;
   size_t output_height_stride;
@@ -341,8 +342,9 @@
 
 struct argmax_pooling_context {
   const void** indirect_input;
-  size_t indirect_input_batch_stride;
   size_t indirect_input_height_stride;
+  size_t input_offset;
+  size_t input_batch_stride;
   void* output;
   size_t output_batch_stride;
   size_t output_height_stride;
diff --git a/src/xnnpack/indirection.h b/src/xnnpack/indirection.h
index d9fd0ed..868da5c 100644
--- a/src/xnnpack/indirection.h
+++ b/src/xnnpack/indirection.h
@@ -43,7 +43,6 @@
 
 XNN_INTERNAL void xnn_indirection_init_maxpool2d(
   xnn_operator_t op,
-  size_t batch_start,
   size_t step_height,
   size_t step_width,
   uint32_t log2_element_size);
diff --git a/src/xnnpack/maxpool.h b/src/xnnpack/maxpool.h
index 1cac764..6013cba 100644
--- a/src/xnnpack/maxpool.h
+++ b/src/xnnpack/maxpool.h
@@ -21,34 +21,36 @@
 
 #define DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(fn_name) \
   XNN_INTERNAL void fn_name(                          \
-      size_t n,                                       \
-      size_t ks,                                      \
-      size_t kc,                                      \
-      const float** x,                                \
-      float* y,                                       \
-      size_t x_increment,                             \
-      size_t y_increment,                             \
+      size_t output_pixels,                           \
+      size_t kernel_elements,                         \
+      size_t channels,                                \
+      const float** input,                            \
+      size_t input_offset,                            \
+      float* output,                                  \
+      size_t input_increment,                         \
+      size_t output_increment,                        \
       const union xnn_f32_output_params* params);
 
-DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8q__psimd)
-DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8q__scalar)
-DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8q__sse)
+DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8x__psimd_c4)
+DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8x__sse_c4)
+DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8x__scalar_c1)
 
 
 #define DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(fn_name) \
   XNN_INTERNAL void fn_name(                         \
-      size_t n,                                      \
-      size_t ks,                                     \
-      size_t kc,                                     \
-      const uint8_t** x,                             \
-      uint8_t* y,                                    \
-      size_t x_increment,                            \
-      size_t y_increment,                            \
+      size_t output_pixels,                          \
+      size_t kernel_elements,                        \
+      size_t channels,                               \
+      const uint8_t** input,                         \
+      size_t input_offset,                           \
+      uint8_t* output,                               \
+      size_t input_increment,                        \
+      size_t output_increment,                       \
       const union xnn_u8_output_params* params);
 
-DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(xnn_u8_maxpool_ukernel_9p8q__neon)
-DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(xnn_u8_maxpool_ukernel_9p8q__sse2)
-DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(xnn_u8_maxpool_ukernel_9p8q__scalar)
+DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(xnn_u8_maxpool_ukernel_9p8x__neon_c16)
+DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(xnn_u8_maxpool_ukernel_9p8x__sse2_c16)
+DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(xnn_u8_maxpool_ukernel_9p8x__scalar_c1)
 
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 88e56d0..c76315e 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -926,81 +926,88 @@
     const union xnn_f32_output_params* params);
 
 typedef void (*xnn_maxpool_ukernel_function)(
-    size_t n,
-    size_t ks,
-    size_t kc,
-    const void** x,
-    void* y,
-    size_t x_increment,
-    size_t y_increment,
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
+    const void** input,  // indirection buffer (array of input pointers)
+    size_t input_offset,  // NOTE(review): u8 kernels add this, in bytes, to each pointer read from `input`; confirm for other types
+    void* output,
+    size_t input_increment,  // NOTE(review): u8 kernels advance `input` by this many bytes per output pixel; confirm for other types
+    size_t output_increment,  // NOTE(review): u8 kernels advance the output pointer by this many bytes per output pixel; confirm
     const void* params);
 
 typedef void (*xnn_f32_maxpool_ukernel_function)(
-    size_t n,
-    size_t ks,
-    size_t kc,
-    const float** x,
-    float* y,
-    size_t x_increment,
-    size_t y_increment,
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
+    const float** input,  // indirection buffer (array of input pointers)
+    size_t input_offset,  // presumably in bytes (not elements), as in the u8 kernels -- TODO confirm against the f32 kernels
+    float* output,
+    size_t input_increment,  // presumably in bytes, as in the u8 kernels -- TODO confirm
+    size_t output_increment,  // presumably in bytes, as in the u8 kernels -- TODO confirm
     const union xnn_f32_output_params* params);
 
 typedef void (*xnn_u8_maxpool_ukernel_function)(
-    size_t n,
-    size_t ks,
-    size_t kc,
-    const uint8_t** x,
-    uint8_t* y,
-    size_t x_increment,
-    size_t y_increment,
+    size_t output_pixels,  // number of output pixels to produce; the u8 9p8x kernels assert it is non-zero
+    size_t kernel_elements,  // pooling window size; 9p8x kernels read 9 pointers in the first pass, 8 in each later pass
+    size_t channels,  // values processed per output pixel; the u8 9p8x kernels assert it is non-zero
+    const uint8_t** input,  // indirection buffer: array of input pointers, consumed in order
+    size_t input_offset,  // bytes added to every pointer loaded from `input` before it is dereferenced
+    uint8_t* output,  // receives `channels` clamped maxima per output pixel
+    size_t input_increment,  // bytes added to `input` after each output pixel
+    size_t output_increment,  // bytes added to the output pointer after each output pixel
     const union xnn_u8_output_params* params);
 
 typedef void (*xnn_argmaxpool_up_ukernel_function)(
-    size_t n,
-    size_t ks,
-    size_t kc,
-    const void** x,
-    void* y,
-    uint32_t* i,
-    size_t x_increment,
-    size_t y_increment,
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
+    const void** input,  // indirection buffer (array of input pointers)
+    size_t input_offset,  // NOTE(review): presumably a byte offset applied to each pointer in `input`, as in the maxpool kernels; confirm
+    void* output,
+    uint32_t* index,
+    size_t input_increment,  // NOTE(review): presumably in bytes, as in the maxpool kernels; confirm
+    size_t output_increment,  // NOTE(review): presumably in bytes, as in the maxpool kernels; confirm
     const void* params);
 
 typedef void (*xnn_f32_argmaxpool_up_ukernel_function)(
-    size_t n,
-    size_t ks,
-    size_t kc,
-    const float** x,
-    float* y,
-    uint32_t* i,
-    size_t x_increment,
-    size_t y_increment,
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
+    const float** input,  // indirection buffer (array of input pointers)
+    size_t input_offset,  // NOTE(review): presumably a byte offset applied to each pointer in `input`, as in the maxpool kernels; confirm
+    float* output,
+    uint32_t* index,
+    size_t input_increment,  // NOTE(review): presumably in bytes, as in the maxpool kernels; confirm
+    size_t output_increment,  // NOTE(review): presumably in bytes, as in the maxpool kernels; confirm
     const union xnn_f32_output_params* params);
 
 typedef void (*xnn_argmaxpool_mp_ukernel_function)(
-    size_t n,
-    size_t ks,
-    size_t kc,
-    const void** x,
-    void* ab,
-    uint32_t* ib,
-    void* y,
-    uint32_t* i,
-    size_t x_increment,
-    size_t y_increment,
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
+    const void** input,  // indirection buffer (array of input pointers)
+    size_t input_offset,  // NOTE(review): presumably a byte offset applied to each pointer in `input`, as in the maxpool kernels; confirm
+    void* accumulation_buffer,  // was `ab` before this rename
+    uint32_t* index_buffer,  // was `ib` before this rename
+    void* output,
+    uint32_t* index,
+    size_t input_increment,  // NOTE(review): presumably in bytes, as in the maxpool kernels; confirm
+    size_t output_increment,  // NOTE(review): presumably in bytes, as in the maxpool kernels; confirm
     const void* params);
 
 typedef void (*xnn_f32_argmaxpool_mp_ukernel_function)(
-    size_t n,
-    size_t ks,
-    size_t kc,
-    const float** x,
-    float* ab,
-    uint32_t* ib,
-    float* y,
-    uint32_t* i,
-    size_t x_increment,
-    size_t y_increment,
+    size_t output_pixels,
+    size_t kernel_elements,
+    size_t channels,
+    const float** input,  // indirection buffer (array of input pointers)
+    size_t input_offset,  // NOTE(review): presumably a byte offset applied to each pointer in `input`, as in the maxpool kernels; confirm
+    float* accumulation_buffer,  // was `ab` before this rename
+    uint32_t* index_buffer,  // was `ib` before this rename
+    float* output,
+    uint32_t* index,
+    size_t input_increment,  // NOTE(review): presumably in bytes, as in the maxpool kernels; confirm
+    size_t output_increment,  // NOTE(review): presumably in bytes, as in the maxpool kernels; confirm
     const union xnn_f32_output_params* params);
 
 typedef void (*xnn_univector_ukernel_function)(
diff --git a/test/argmaxpool-microkernel-tester.h b/test/argmaxpool-microkernel-tester.h
index e84d11a..562d176 100644
--- a/test/argmaxpool-microkernel-tester.h
+++ b/test/argmaxpool-microkernel-tester.h
@@ -21,126 +21,121 @@
 #include <xnnpack/params.h>
 
 
-class ArgmaxPoolMicrokernelTester {
+class ArgMaxPoolMicrokernelTester {
  public:
   enum class Variant {
     Native,
     Scalar,
   };
 
-  inline ArgmaxPoolMicrokernelTester& n(size_t n) {
-    assert(n != 0);
-    this->n_ = n;
+  inline ArgMaxPoolMicrokernelTester& output_pixels(size_t output_pixels) {
+    assert(output_pixels != 0);
+    this->output_pixels_ = output_pixels;
     return *this;
   }
 
-  inline size_t n() const {
-    return this->n_;
+  inline size_t output_pixels() const {
+    return this->output_pixels_;
   }
 
-  inline ArgmaxPoolMicrokernelTester& s(size_t s) {
-    assert(s != 0);
-    this->s_ = s;
+  inline ArgMaxPoolMicrokernelTester& step(size_t step) {
+    assert(step != 0);
+    this->step_ = step;
     return *this;
   }
 
-  inline size_t s() const {
-    return this->s_;
+  inline size_t step() const {
+    return this->step_;
   }
 
-  inline ArgmaxPoolMicrokernelTester& kh(size_t kh) {
-    assert(kh != 0);
-    this->kh_ = kh;
+  inline ArgMaxPoolMicrokernelTester& input_offset(size_t input_offset) {
+    assert(input_offset != 0);
+    this->input_offset_ = input_offset;
     return *this;
   }
 
-  inline size_t kh() const {
-    return this->kh_;
+  inline size_t input_offset() const {
+    return this->input_offset_;
   }
 
-  inline ArgmaxPoolMicrokernelTester& kw(size_t kw) {
-    assert(kw != 0);
-    this->kw_ = kw;
+  inline ArgMaxPoolMicrokernelTester& pooling_elements(size_t pooling_elements) {
+    assert(pooling_elements != 0);
+    this->pooling_elements_ = pooling_elements;
     return *this;
   }
 
-  inline size_t kw() const {
-    return this->kw_;
+  inline size_t pooling_elements() const {
+    return this->pooling_elements_;
   }
 
-  inline size_t ks() const {
-    return kh() * kw();
-  }
-
-  inline size_t packed_ks() const {
-    if (ks() <= mr()) {
-      return mr();
+  inline size_t packed_pooling_elements() const {
+    if (pooling_elements() <= primary_pooling_tile()) {
+      return primary_pooling_tile();
     } else {
-      return (ks() - mr()) % qr() == 0 ? ks() : ((ks() - mr()) / qr() + 1) * qr() + mr();
+      return (pooling_elements() - primary_pooling_tile()) % incremental_pooling_tile() == 0 ? pooling_elements() : ((pooling_elements() - primary_pooling_tile()) / incremental_pooling_tile() + 1) * incremental_pooling_tile() + primary_pooling_tile();
     }
   }
 
-  inline ArgmaxPoolMicrokernelTester& mr(size_t mr) {
-    assert(mr != 0);
-    this->mr_ = mr;
+  inline ArgMaxPoolMicrokernelTester& pooling_tile(size_t primary_tile) {
+    assert(primary_tile != 0);
+    this->primary_pooling_tile_ = primary_tile;
+    this->incremental_pooling_tile_ = 0;
     return *this;
   }
 
-  inline size_t mr() const {
-    return this->mr_;
-  }
-
-  inline ArgmaxPoolMicrokernelTester& qr(size_t qr) {
-    assert(qr != 0);
-    this->qr_ = qr;
+  inline ArgMaxPoolMicrokernelTester& pooling_tile(size_t primary_tile, size_t incremental_tile) {
+    assert(primary_tile != 0);
+    this->primary_pooling_tile_ = primary_tile;
+    this->incremental_pooling_tile_ = incremental_tile;
     return *this;
   }
 
-  inline size_t qr() const {
-    return this->qr_;
-  }
-
-  inline ArgmaxPoolMicrokernelTester& kc(size_t kc) {
-    assert(kc != 0);
-    this->kc_ = kc;
+  inline ArgMaxPoolMicrokernelTester& primary_pooling_tile(size_t primary_pooling_tile) {
+    assert(primary_pooling_tile != 0);
+    this->primary_pooling_tile_ = primary_pooling_tile;
     return *this;
   }
 
-  inline size_t kc() const {
-    return this->kc_;
+  inline size_t primary_pooling_tile() const {
+    return this->primary_pooling_tile_;
   }
 
-  inline ArgmaxPoolMicrokernelTester& x_stride(size_t x_stride) {
-    assert(x_stride != 0);
-    this->x_stride_ = x_stride;
+  inline ArgMaxPoolMicrokernelTester& incremental_pooling_tile(size_t incremental_pooling_tile) {
+    assert(incremental_pooling_tile != 0);
+    this->incremental_pooling_tile_ = incremental_pooling_tile;
     return *this;
   }
 
-  inline size_t x_stride() const {
-    if (this->x_stride_ == 0) {
-      return kc();
+  inline size_t incremental_pooling_tile() const {
+    return this->incremental_pooling_tile_;
+  }
+
+  inline ArgMaxPoolMicrokernelTester& channels(size_t channels) {
+    assert(channels != 0);
+    this->channels_ = channels;
+    return *this;
+  }
+
+  inline size_t channels() const {
+    return this->channels_;
+  }
+
+  inline ArgMaxPoolMicrokernelTester& output_stride(size_t output_stride) {
+    assert(output_stride != 0);
+    this->output_stride_ = output_stride;
+    return *this;
+  }
+
+  inline size_t output_stride() const {
+    if (this->output_stride_ == 0) {
+      return channels();
     } else {
-      assert(this->x_stride_ >= kc());
-      return this->x_stride_;
+      assert(this->output_stride_ >= channels());
+      return this->output_stride_;
     }
   }
 
-  inline ArgmaxPoolMicrokernelTester& y_stride(size_t y_stride) {
-    assert(y_stride != 0);
-    this->y_stride_ = y_stride;
-    return *this;
-  }
-
-  inline size_t y_stride() const {
-    if (this->y_stride_ == 0) {
-      return kc();
-    } else {
-      assert(this->y_stride_ >= kc());
-      return this->y_stride_;
-    }
-  }
-
-  inline ArgmaxPoolMicrokernelTester& qmin(uint8_t qmin) {
+  inline ArgMaxPoolMicrokernelTester& qmin(uint8_t qmin) {
     this->qmin_ = qmin;
     return *this;
   }
@@ -149,7 +144,7 @@
     return this->qmin_;
   }
 
-  inline ArgmaxPoolMicrokernelTester& qmax(uint8_t qmax) {
+  inline ArgMaxPoolMicrokernelTester& qmax(uint8_t qmax) {  // Fluent setter for the upper clamp level: Test() lowers the observed maximum by (255 - qmax)/255 of the observed output range.
     this->qmax_ = qmax;
     return *this;
   }
@@ -158,7 +153,7 @@
     return this->qmax_;
   }
 
-  inline ArgmaxPoolMicrokernelTester& iterations(size_t iterations) {
+  inline ArgMaxPoolMicrokernelTester& iterations(size_t iterations) {  // Fluent setter for how many randomized rounds each Test() call runs; unlike the other setters, 0 is not rejected.
     this->iterations_ = iterations;
     return *this;
   }
@@ -172,87 +167,95 @@
     auto rng = std::mt19937(random_device());
     auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
 
-    std::vector<const float*> indirect_x(packed_ks() + (n() * s() - 1) * kh());
-    std::vector<float> x((indirect_x.size() - 1) * x_stride() + kc() + XNN_EXTRA_BYTES / sizeof(float));
-
-    std::vector<float> y((n() - 1) * y_stride() + kc());
-    std::vector<uint32_t> i(n() * kc());
-    std::vector<float> y_ref(n() * kc());
-    std::vector<uint32_t> i_ref(n() * kc());
+    std::vector<const float*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());  // Indirection buffer: the windows of consecutive output pixels start step() entries apart.
+    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +  // Padding beyond the addressed rows. NOTE(review): presumably lets kernels over-read past the last row -- confirm.
+      ((output_pixels() - 1) * step() + pooling_elements()) * channels());  // One row of channels() floats per initialized indirection entry.
+    std::vector<float> output((output_pixels() - 1) * output_stride() + channels());  // Rows are output_stride() floats apart; the final row only needs channels() floats.
+    std::vector<uint32_t> index(output_pixels() * channels());  // Verified densely below: channels() indices per output pixel.
+    std::vector<float> output_ref(output_pixels() * channels());  // Reference values, stored densely.
+    std::vector<uint32_t> index_ref(output_pixels() * channels());  // Reference argmax positions, stored densely.
     for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      std::generate(x.begin(), x.end(), std::ref(f32rng));
-      std::fill(y.begin(), y.end(), nanf(""));
+      std::generate(input.begin(), input.end(), std::ref(f32rng));  // Fresh uniform random input on every iteration.
+      std::fill(output.begin(), output.end(), nanf(""));  // NaN sentinel: the ASSERT_GE/ASSERT_LE checks below fail on any element left unwritten.
 
-      for (size_t p = 0; p < indirect_x.size(); p++) {
-        indirect_x[p] = x.data() + p * x_stride();
+      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {  // Fill only the entries backed by input rows; any entries past this range stay value-initialized (null).
+        indirect_input[i] = input.data() + i * channels() - input_offset();  // Pre-subtract input_offset() so that pointer + input_offset() addresses the real row.
       }
-      std::shuffle(indirect_x.begin(), indirect_x.end(), rng);
+      std::shuffle(indirect_input.begin(),  // Randomize which row each entry points at ...
+        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);  // ... restricted to the initialized entries.
 
       // Compute reference results, without clamping.
-      for (size_t p = 0; p < n(); p++) {
-        for (size_t k = 0; k < kc(); k++) {
-          float max_value = indirect_x[p * s() * kh()][k];
+      for (size_t x = 0; x < output_pixels(); x++) {
+        for (size_t c = 0; c < channels(); c++) {
+          float max_value = indirect_input[x * step()][c + input_offset()];  // Seed with pooling element 0; max_index below starts at 0 to match.
           uint32_t max_index = 0;
-          for (size_t j = 1; j < ks(); j++) {
-            const float value = indirect_x[p * s() * kh() + j][k];
+          for (size_t p = 0; p < pooling_elements(); p++) {  // p == 0 re-reads the seed element; the strict '>' below makes that pass a no-op.
+            const float value = indirect_input[x * step() + p][c + input_offset()];  // Adding input_offset() undoes the subtraction made when the entry was built.
             if (value > max_value) {
               max_value = value;
-              max_index = j;
+              max_index = p;  // Strict comparison above means the earliest maximum wins on ties.
             }
           }
-          y_ref[p * kc() + k] = max_value;
-          i_ref[p * kc() + k] = max_index;
+          output_ref[x * channels() + c] = max_value;
+          index_ref[x * channels() + c] = max_index;
         }
       }
 
       // Compute clamping parameters.
-      const float accumulated_min = *std::min_element(y_ref.cbegin(), y_ref.cend());
-      const float accumulated_max = *std::max_element(y_ref.cbegin(), y_ref.cend());
+      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());  // Smallest unclamped reference output.
+      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());  // Largest unclamped reference output.
       const float accumulated_range = accumulated_max - accumulated_min;
-      const float y_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
-      const float y_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
+      const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;  // qmin() == 0 leaves the lower bound at the observed minimum.
+      const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;  // qmax() == 255 leaves the upper bound at the observed maximum.
 
       // Prepare output parameters.
       xnn_f32_output_params output_params = { };
       switch (variant) {
         case Variant::Native:
-          output_params = xnn_init_f32_output_params(y_min, y_max);
+          output_params = xnn_init_f32_output_params(output_min, output_max);
           break;
         case Variant::Scalar:
-          output_params = xnn_init_scalar_f32_output_params(y_min, y_max);
+          output_params = xnn_init_scalar_f32_output_params(output_min, output_max);
           break;
       }
 
       // Clamp reference results.
-      for (float& y_value : y_ref) {
-        y_value = std::max(std::min(y_value, y_max), y_min);
+      for (float& output_value : output_ref) {
+        output_value = std::max(std::min(output_value, output_max), output_min);  // Only values are clamped; index_ref keeps the unclamped argmax.
       }
 
       // Call optimized micro-kernel.
-      argmaxpool(n(), ks(), kc(),
-        indirect_x.data(), y.data(), i.data(),
-        kh() * s() * sizeof(void*),
-        (y_stride() - kc()) * sizeof(float),
+      argmaxpool(output_pixels(), pooling_elements(), channels(),
+        indirect_input.data(), input_offset() * sizeof(float), output.data(), index.data(),  // input_offset() is counted in floats here and handed over in bytes.
+        step() * sizeof(void*),  // step() indirection entries, expressed in bytes.
+        (output_stride() - channels()) * sizeof(float),  // Output row padding in bytes: row pitch minus the channels() floats of a row.
         &output_params);
 
       // Verify results.
-      for (size_t p = 0; p < n(); p++) {
-        for (size_t k = 0; k < kc(); k++) {
-          ASSERT_GE(y[p * y_stride() + k], y_min)
-            << "at pixel " << p << ", channel " << k << ", n = " << n()
-            << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
-          ASSERT_LE(y[p * y_stride() + k], y_max)
-            << "at pixel " << p << ", channel " << k << ", n = " << n()
-            << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
-          ASSERT_EQ(y_ref[p * kc() + k], y[p * y_stride() + k])
-            << "at pixel " << p << ", channel " << k << ", n = " << n()
-            << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
-          ASSERT_EQ(indirect_x[p * s() * kh() + i_ref[p * kc() + k]][k], indirect_x[p * s() * kh() + i[p * kc() + k]][k])
-            << "at pixel " << p << ", channel " << k << ", n = " << n()
-            << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
-          ASSERT_EQ(i_ref[p * kc() + k], i[p * kc() + k])
-            << "at pixel " << p << ", channel " << k << ", n = " << n()
-            << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
+      for (size_t x = 0; x < output_pixels(); x++) {
+        for (size_t c = 0; c < channels(); c++) {
+          ASSERT_GE(output[x * output_stride() + c], output_min)  // Lower clamp bound respected (also fails on a leftover NaN).
+            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+            << ", pooling elements = " << pooling_elements() << ", step = " << step()
+            << ", input offset = " << input_offset();
+          ASSERT_LE(output[x * output_stride() + c], output_max)  // Upper clamp bound respected (also fails on a leftover NaN).
+            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+            << ", pooling elements = " << pooling_elements() << ", step = " << step()
+            << ", input offset = " << input_offset();
+          ASSERT_EQ(output_ref[x * channels() + c], output[x * output_stride() + c])  // Exact match with the clamped reference value.
+            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+            << ", pooling elements = " << pooling_elements() << ", step = " << step()
+            << ", input offset = " << input_offset();
+          ASSERT_EQ(  // The kernel's index must select the same input value as the reference index ...
+              indirect_input[x * step() + index_ref[x * channels() + c]][c + input_offset()],
+              indirect_input[x * step() + index[x * channels() + c]][c + input_offset()])
+            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+            << ", pooling elements = " << pooling_elements() << ", step = " << step()
+            << ", input offset = " << input_offset();
+          ASSERT_EQ(index_ref[x * channels() + c], index[x * channels() + c])  // ... and the index itself must match as well.
+            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+            << ", pooling elements = " << pooling_elements() << ", step = " << step()
+            << ", input offset = " << input_offset();
         }
       }
     }
@@ -263,105 +266,116 @@
     auto rng = std::mt19937(random_device());
     auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
 
-    std::vector<const float*> indirect_x(packed_ks() + (n() * s() - 1) * kh());
-    std::vector<float> x((indirect_x.size() - 1) * x_stride() + kc() + XNN_EXTRA_BYTES / sizeof(float));
-
-    std::vector<float> y((n() - 1) * y_stride() + kc());
-    std::vector<uint32_t> i(n() * kc());
-    std::vector<uint32_t, AlignedAllocator<uint32_t, XNN_EXTRA_BYTES>> ib(kc() + XNN_EXTRA_BYTES / sizeof(uint32_t));
-    std::vector<float, AlignedAllocator<float, XNN_EXTRA_BYTES>> yb(kc() + XNN_EXTRA_BYTES / sizeof(float));
-    std::vector<float> y_ref(n() * kc());
-    std::vector<uint32_t> i_ref(n() * kc());
+    std::vector<const float*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());  // Indirection buffer: the windows of consecutive output pixels start step() entries apart.
+    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +  // Padding beyond the addressed rows. NOTE(review): presumably lets kernels over-read past the last row -- confirm.
+      ((output_pixels() - 1) * step() + pooling_elements()) * channels());  // One row of channels() floats per initialized indirection entry.
+    std::vector<float> output((output_pixels() - 1) * output_stride() + channels());  // Rows are output_stride() floats apart; the final row only needs channels() floats.
+    std::vector<uint32_t> index(output_pixels() * channels());  // Verified densely below: channels() indices per output pixel.
+    std::vector<uint32_t, AlignedAllocator<uint32_t, XNN_EXTRA_BYTES>> index_buffer(  // Scratch index buffer handed to the kernel; the test never inspects it.
+      channels() + XNN_EXTRA_BYTES / sizeof(uint32_t));  // channels() entries plus XNN_EXTRA_BYTES worth of tail padding.
+    std::vector<float, AlignedAllocator<float, XNN_EXTRA_BYTES>> output_buffer(  // Scratch value buffer handed to the kernel; the test never inspects it.
+      channels() + XNN_EXTRA_BYTES / sizeof(float));  // channels() entries plus XNN_EXTRA_BYTES worth of tail padding.
+    std::vector<float> output_ref(output_pixels() * channels());  // Reference values, stored densely.
+    std::vector<uint32_t> index_ref(output_pixels() * channels());  // Reference argmax positions, stored densely.
     for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      std::generate(x.begin(), x.end(), std::ref(f32rng));
-      std::fill(y.begin(), y.end(), nanf(""));
+      std::generate(input.begin(), input.end(), std::ref(f32rng));  // Fresh uniform random input on every iteration.
+      std::fill(output.begin(), output.end(), nanf(""));  // NaN sentinel: the ASSERT_GE/ASSERT_LE checks below fail on any element left unwritten.
 
-      for (size_t p = 0; p < indirect_x.size(); p++) {
-        indirect_x[p] = x.data() + p * x_stride();
+      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {  // Fill only the entries backed by input rows; any entries past this range stay value-initialized (null).
+        indirect_input[i] = input.data() + i * channels() - input_offset();  // Pre-subtract input_offset() so that pointer + input_offset() addresses the real row.
       }
-      std::shuffle(indirect_x.begin(), indirect_x.end(), rng);
+      std::shuffle(indirect_input.begin(),  // Randomize which row each entry points at ...
+        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);  // ... restricted to the initialized entries.
 
       // Compute reference results, without clamping.
-      for (size_t p = 0; p < n(); p++) {
-        for (size_t k = 0; k < kc(); k++) {
-          float max_value = indirect_x[p * s() * kh()][k];
+      for (size_t x = 0; x < output_pixels(); x++) {
+        for (size_t c = 0; c < channels(); c++) {
+          float max_value = indirect_input[x * step()][c + input_offset()];  // Seed with pooling element 0; max_index below starts at 0 to match.
           uint32_t max_index = 0;
-          for (size_t j = 1; j < ks(); j++) {
-            const float value = indirect_x[p * s() * kh() + j][k];
+          for (size_t p = 0; p < pooling_elements(); p++) {  // p == 0 re-reads the seed element; the strict '>' below makes that pass a no-op.
+            const float value = indirect_input[x * step() + p][c + input_offset()];  // Adding input_offset() undoes the subtraction made when the entry was built.
             if (value > max_value) {
               max_value = value;
-              max_index = j;
+              max_index = p;  // Strict comparison above means the earliest maximum wins on ties.
             }
           }
-          y_ref[p * kc() + k] = max_value;
-          i_ref[p * kc() + k] = max_index;
+          output_ref[x * channels() + c] = max_value;
+          index_ref[x * channels() + c] = max_index;
         }
       }
 
       // Compute clamping parameters.
-      const float accumulated_min = *std::min_element(y_ref.cbegin(), y_ref.cend());
-      const float accumulated_max = *std::max_element(y_ref.cbegin(), y_ref.cend());
+      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());  // Smallest unclamped reference output.
+      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());  // Largest unclamped reference output.
       const float accumulated_range = accumulated_max - accumulated_min;
-      const float y_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
-      const float y_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
+      const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;  // qmin() == 0 leaves the lower bound at the observed minimum.
+      const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;  // qmax() == 255 leaves the upper bound at the observed maximum.
 
       // Prepare output parameters.
       xnn_f32_output_params output_params = { };
       switch (variant) {
         case Variant::Native:
-          output_params = xnn_init_f32_output_params(y_min, y_max);
+          output_params = xnn_init_f32_output_params(output_min, output_max);
           break;
         case Variant::Scalar:
-          output_params = xnn_init_scalar_f32_output_params(y_min, y_max);
+          output_params = xnn_init_scalar_f32_output_params(output_min, output_max);
           break;
       }
 
       // Clamp reference results.
-      for (float& y_value : y_ref) {
-        y_value = std::max(std::min(y_value, y_max), y_min);
+      for (float& output_value : output_ref) {
+        output_value = std::max(std::min(output_value, output_max), output_min);  // Only values are clamped; index_ref keeps the unclamped argmax.
       }
 
       // Call optimized micro-kernel.
-      argmaxpool(n(), ks(), kc(),
-        indirect_x.data(), yb.data(), ib.data(), y.data(), i.data(),
-        (kh() * s() - (packed_ks() - qr())) * sizeof(void*),
-        (y_stride() - kc()) * sizeof(float),
+      argmaxpool(output_pixels(), pooling_elements(), channels(),
+        indirect_input.data(), input_offset() * sizeof(float),  // input_offset() is counted in floats here and handed over in bytes.
+        output_buffer.data(), index_buffer.data(),  // Scratch buffers (value, then index).
+        output.data(), index.data(),  // Final results that the checks below read.
+        (step() - (packed_pooling_elements() - incremental_pooling_tile())) * sizeof(void*),  // NOTE(review): presumably step() minus the entries the kernel already advanced over between passes -- confirm against the kernel.
+        (output_stride() - channels()) * sizeof(float),  // Output row padding in bytes: row pitch minus the channels() floats of a row.
         &output_params);
 
       // Verify results.
-      for (size_t p = 0; p < n(); p++) {
-        for (size_t k = 0; k < kc(); k++) {
-          ASSERT_GE(y[p * y_stride() + k], y_min)
-            << "at pixel " << p << ", channel " << k << ", n = " << n()
-            << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
-          ASSERT_LE(y[p * y_stride() + k], y_max)
-            << "at pixel " << p << ", channel " << k << ", n = " << n()
-            << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
-          ASSERT_EQ(y_ref[p * kc() + k], y[p * y_stride() + k])
-            << "at pixel " << p << ", channel " << k << ", n = " << n()
-            << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
-          ASSERT_EQ(indirect_x[p * s() * kh() + i_ref[p * kc() + k]][k], indirect_x[p * s() * kh() + i[p * kc() + k]][k])
-            << "at pixel " << p << ", channel " << k << ", n = " << n()
-            << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
-          ASSERT_EQ(i_ref[p * kc() + k], i[p * kc() + k])
-            << "at pixel " << p << ", channel " << k << ", n = " << n()
-            << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
+      for (size_t x = 0; x < output_pixels(); x++) {
+        for (size_t c = 0; c < channels(); c++) {
+          ASSERT_GE(output[x * output_stride() + c], output_min)  // Lower clamp bound respected (also fails on a leftover NaN).
+            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+            << ", pooling elements = " << pooling_elements() << ", step = " << step()
+            << ", input offset = " << input_offset();
+          ASSERT_LE(output[x * output_stride() + c], output_max)  // Upper clamp bound respected (also fails on a leftover NaN).
+            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+            << ", pooling elements = " << pooling_elements() << ", step = " << step()
+            << ", input offset = " << input_offset();
+          ASSERT_EQ(output_ref[x * channels() + c], output[x * output_stride() + c])  // Exact match with the clamped reference value.
+            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+            << ", pooling elements = " << pooling_elements() << ", step = " << step()
+            << ", input offset = " << input_offset();
+          ASSERT_EQ(  // The kernel's index must select the same input value as the reference index ...
+              indirect_input[x * step() + index_ref[x * channels() + c]][c + input_offset()],
+              indirect_input[x * step() + index[x * channels() + c]][c + input_offset()])
+            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+            << ", pooling elements = " << pooling_elements() << ", step = " << step()
+            << ", input offset = " << input_offset();
+          ASSERT_EQ(index_ref[x * channels() + c], index[x * channels() + c])  // ... and the index itself must match as well.
+            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+            << ", pooling elements = " << pooling_elements() << ", step = " << step()
+            << ", input offset = " << input_offset();
         }
       }
     }
   }
 
  private:
-  size_t n_{1};
-  size_t s_{1};
-  size_t kh_{1};
-  size_t kw_{1};
-  size_t mr_{1};
-  size_t qr_{1};
-  size_t kc_{1};
-  size_t x_stride_{0};
-  size_t y_stride_{0};
+  size_t output_pixels_{1};  // Number of output pixels each Test() call produces and verifies.
+  size_t pooling_elements_{1};  // Number of candidates compared per output value.
+  size_t channels_{1};  // Floats per input row and per output pixel.
+  size_t input_offset_{0};  // In floats; Test() scales it by sizeof(float) before passing it to the kernel.
+  size_t step_{1};  // Indirection entries between the windows of consecutive output pixels.
+  size_t primary_pooling_tile_{1};  // Stored by primary_pooling_tile(size_t), which rejects 0.
+  size_t incremental_pooling_tile_{1};  // Enters the input-increment computation of the multi-pass Test().
+  size_t output_stride_{0};  // 0 means "not set": output_stride() then falls back to channels().
   uint8_t qmin_{0};
   uint8_t qmax_{255};
-  size_t iterations_{15};
+  size_t iterations_{3};  // Randomized rounds per Test() call.
 };
diff --git a/test/f32-argmaxpool.cc b/test/f32-argmaxpool.cc
index c1a8a91..1ec2610 100644
--- a/test/f32-argmaxpool.cc
+++ b/test/f32-argmaxpool.cc
@@ -2,6 +2,11 @@
 //
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+//   Specification: test/f32-argmaxpool.yaml
+//   Generator: tools/generate-argmaxpool-test.py
+
 
 #include <gtest/gtest.h>
 
@@ -13,1309 +18,388 @@
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_eq_4_fulltile) {
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_eq_4_unipass_fulltile) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4)
-      .kc(4);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(4)
+      .pooling_tile(4)
+      .channels(4)
+      .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_eq_4_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(4)
+      .pooling_tile(4)
+      .channels(4)
+      .input_offset(7)
+      .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_eq_4_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(4)
+      .pooling_tile(4)
+      .channels(4)
+      .qmin(192)
+      .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_eq_4_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(4)
+      .pooling_tile(4)
+      .channels(4)
+      .qmax(192)
+      .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_eq_4_unipass_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(4)
+        .channels(4)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_eq_4_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(4)
+        .channels(4)
+        .input_offset(7)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_div_4_unipass_fulltile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_div_4_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .input_offset(37)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_div_4_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_div_4_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_div_4_unipass_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(4)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_div_4_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(4)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_lt_4_unipass_fulltile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_lt_4_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .input_offset(5)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_lt_4_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_lt_4_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_lt_4_unipass_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(4)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_lt_4_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(4)
+          .channels(channels)
+          .input_offset(5)
+          .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_gt_4_unipass_fulltile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_gt_4_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .input_offset(11)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_gt_4_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_gt_4_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_gt_4_unipass_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(4)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_gt_4_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(4)
+          .channels(channels)
+          .input_offset(11)
+          .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, few_output_pixels) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(4)
+            .channels(channels)
+            .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
         }
       }
     }
   }
 
-  TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_eq_4_subtile) {
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, few_output_pixels_with_input_offset) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4)
-      .kc(4);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-          }
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(4)
+            .channels(channels)
+            .input_offset(23)
+            .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
         }
       }
     }
   }
 
-  TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_div_4_fulltile) {
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, few_output_pixels_with_qmin) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-          }
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(4)
+            .channels(channels)
+            .qmin(192)
+            .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
         }
       }
     }
   }
 
-  TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_div_4_subtile) {
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, few_output_pixels_with_qmax) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 4; kc < 64; kc += 12) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-            }
-          }
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(4)
+            .channels(channels)
+            .qmax(192)
+            .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
         }
       }
     }
   }
 
-  TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_div_4_fulltile_with_x_stride) {
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, few_output_pixels_with_output_stride) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(131)
-              .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-          }
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(4)
+            .channels(channels)
+            .output_stride(23)
+            .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
         }
       }
     }
   }
 
-  TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_lt_4_fulltile) {
+  TEST(F32_ARGMAXPOOL_4X__SSE2_C4, few_output_pixels_with_step) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_lt_4_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 1; kc < 4; kc++) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_lt_4_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(23)
-              .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_gt_4_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_gt_4_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 5; kc < 8; kc++) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_gt_4_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(23)
-              .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_div_4_with_y_max) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 1; n <= 5; n += 2) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(2)
-          .kw(2)
-          .kc(kc)
-          .qmax(128)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_div_4_with_y_min) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 1; n <= 5; n += 2) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(2)
-          .kw(2)
-          .kc(kc)
-          .qmin(128)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__SSE2, small_n) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t kc = 8; kc < 25; kc += 5) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(2)
-          .kw(2)
-          .kc(kc)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(1)
-          .kw(3)
-          .kc(kc)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(3)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__SSE2, small_n_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t kc = 8; kc < 25; kc += 5) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(2)
-          .kw(2)
-          .kc(kc)
-          .x_stride(29)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(1)
-          .kw(3)
-          .kc(kc)
-          .x_stride(29)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(3)
-          .kw(1)
-          .kc(kc)
-          .x_stride(29)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__SSE2, small_n_with_y_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t kc = 8; kc < 25; kc += 5) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(2)
-          .kw(2)
-          .kc(kc)
-          .y_stride(31)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(1)
-          .kw(3)
-          .kc(kc)
-          .y_stride(31)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(3)
-          .kw(1)
-          .kc(kc)
-          .y_stride(31)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__SSE2, small_n_with_s) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t kc = 8; kc < 25; kc += 5) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(2)
-          .kw(2)
-          .kc(kc)
-          .s(2)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(1)
-          .kw(3)
-          .kc(kc)
-          .s(2)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(3)
-          .kw(1)
-          .kc(kc)
-          .s(2)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_eq_4_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .kc(4);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_eq_4_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .kc(4);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_div_4_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_div_4_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 4; kc < 64; kc += 12) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_div_4_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(131)
-              .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_lt_4_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_lt_4_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 1; kc < 4; kc++) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_lt_4_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(23)
-              .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_gt_4_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_gt_4_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 5; kc < 8; kc++) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_gt_4_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(23)
-              .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_div_4_with_y_max) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 1; n <= 5; n += 2) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(9)
-          .n(n)
-          .kh(3)
-          .kw(3)
-          .kc(kc)
-          .qmax(128)
-          .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_div_4_with_y_min) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 1; n <= 5; n += 2) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(9)
-          .n(n)
-          .kh(3)
-          .kw(3)
-          .kc(kc)
-          .qmin(128)
-          .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__SSE2, small_n) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3}}) {
-        for (size_t kc = 8; kc < 25; kc += 5) {
-          ArgmaxPoolMicrokernelTester()
-            .mr(9)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__SSE2, small_n_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3}}) {
-        for (size_t kc = 8; kc < 25; kc += 5) {
-          ArgmaxPoolMicrokernelTester()
-            .mr(9)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .x_stride(29)
-            .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__SSE2, small_n_with_y_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3}}) {
-        for (size_t kc = 8; kc < 25; kc += 5) {
-          ArgmaxPoolMicrokernelTester()
-            .mr(9)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .y_stride(31)
-            .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__SSE2, small_n_with_s) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3}}) {
-        for (size_t kc = 8; kc < 25; kc += 5) {
-          for (size_t s = 2; s <= ks; s++) {
-            ArgmaxPoolMicrokernelTester()
-              .mr(9)
-              .n(n)
-              .kh(ks)
-              .kw(ks)
-              .kc(kc)
-              .s(s)
-              .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_eq_4_twopass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    const size_t ks = tester.mr() + tester.qr();
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_eq_4_twopass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-      tester
-        .kh(1)
-        .kw(ks)
-        .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_eq_4_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t ks : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .kc(4);
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_eq_4_multipass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .kc(4);
-      for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_div_4_twopass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    const size_t ks = 17;
-    for (size_t kc = 4; kc < 64; kc += 12) {
-      tester
-        .kc(kc)
-        .kh(ks)
-        .kw(1)
-        .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-      tester
-        .kc(kc)
-        .kh(1)
-        .kw(ks)
-        .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_div_4_twopass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        tester
-          .kc(kc)
-          .kh(ks)
-          .kw(1)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-        tester
-          .kc(kc)
-          .kh(1)
-          .kw(ks)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_div_4_twopass_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    const size_t ks = tester.mr() + tester.qr();
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(131)
-              .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_div_4_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t ks : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .iterations(3);
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 4; kc < 64; kc += 12) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_div_4_multipass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .iterations(3);
-      for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
-        for (size_t kc = 4; kc < 64; kc += 12) {
-          tester
-            .kc(kc)
-            .kh(ks)
-            .kw(1)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-          tester
-            .kc(kc)
-            .kh(1)
-            .kw(ks)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_div_4_multipass_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t ks : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .iterations(3);
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 4; kc < 64; kc += 12) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .x_stride(131)
-                .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_lt_4_twopass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    const size_t ks = tester.mr() + tester.qr();
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_lt_4_twopass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
-      for (size_t kc = 1; kc < 4; kc++) {
-        tester
-          .kc(kc)
-          .kh(ks)
-          .kw(1)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-        tester
-          .kc(kc)
-          .kh(1)
-          .kw(ks)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_lt_4_twopass_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    const size_t ks = tester.mr() + tester.qr();
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(23)
-              .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_lt_4_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t ks : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .iterations(3);
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 1; kc < 4; kc++) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_lt_4_multipass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .iterations(3);
-      for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
-        for (size_t kc = 1; kc < 4; kc++) {
-          tester
-            .kc(kc)
-            .kh(ks)
-            .kw(1)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-          tester
-            .kc(kc)
-            .kh(1)
-            .kw(ks)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_lt_4_multipass_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t ks : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .iterations(3);
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 1; kc < 4; kc++) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .x_stride(23)
-                .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_gt_4_twopass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    const size_t ks = tester.mr() + tester.qr();
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_gt_4_twopass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
-      for (size_t kc = 5; kc < 8; kc++) {
-        tester
-          .kc(kc)
-          .kh(ks)
-          .kw(1)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-        tester
-          .kc(kc)
-          .kh(1)
-          .kw(ks)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_gt_4_twopass_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    const size_t ks = tester.mr() + tester.qr();
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(23)
-              .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_gt_4_multipass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t ks : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .iterations(3);
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 5; kc < 8; kc++) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_gt_4_multipass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .iterations(3);
-      for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
-        for (size_t kc = 5; kc < 8; kc++) {
-          tester
-            .kc(kc)
-            .kh(ks)
-            .kw(1)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-          tester
-            .kc(kc)
-            .kh(1)
-            .kw(ks)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_gt_4_multipass_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t ks : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .iterations(3);
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 5; kc < 8; kc++) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .x_stride(23)
-                .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_div_4_with_y_max) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 1; n <= 5; n += 2) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(9)
-          .qr(8)
-          .n(n)
-          .kh(5)
-          .kw(5)
-          .kc(kc)
-          .qmax(128)
-          .iterations(3)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_div_4_with_y_min) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 1; n <= 5; n += 2) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(9)
-          .qr(8)
-          .n(n)
-          .kh(5)
-          .kw(5)
-          .kc(kc)
-          .qmin(128)
-          .iterations(3)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, small_n) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{5, 7}}) {
-        for (size_t kc = 8; kc < 25; kc += 5) {
-          ArgmaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, small_n_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{5, 7}}) {
-        for (size_t kc = 8; kc < 25; kc += 5) {
-          ArgmaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .x_stride(29)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, small_n_with_y_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{5, 7}}) {
-        for (size_t kc = 8; kc < 25; kc += 5) {
-          ArgmaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .y_stride(31)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, small_n_with_s) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{5, 7}}) {
-        for (size_t s = 2; s <= 5; s++) {
-          for (size_t kc = 8; kc < 25; kc += 5) {
-            ArgmaxPoolMicrokernelTester()
-              .mr(9)
-              .qr(8)
-              .n(n)
-              .kh(ks)
-              .kw(ks)
-              .kc(kc)
-              .s(s)
-              .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          for (size_t step = 2; step <= pooling_elements; step++) {
+            ArgMaxPoolMicrokernelTester()
+              .output_pixels(output_pixels)
+              .pooling_elements(pooling_elements)
+              .pooling_tile(4)
+              .step(step)
+              .channels(channels)
+              .output_stride(23)
+              .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
           }
         }
       }
@@ -1324,2061 +408,3144 @@
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
-  TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_eq_4_fulltile) {
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_eq_4_unipass_fulltile) {
     TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4)
-      .kc(4);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(4)
+      .pooling_tile(4)
+      .channels(4)
+      .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_eq_4_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(4)
+      .pooling_tile(4)
+      .channels(4)
+      .input_offset(7)
+      .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_eq_4_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(4)
+      .pooling_tile(4)
+      .channels(4)
+      .qmin(192)
+      .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_eq_4_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(4)
+      .pooling_tile(4)
+      .channels(4)
+      .qmax(192)
+      .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_eq_4_unipass_subtile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(4)
+        .channels(4)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_eq_4_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(4)
+        .channels(4)
+        .input_offset(7)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_div_4_unipass_fulltile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_div_4_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .input_offset(37)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_div_4_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_div_4_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_div_4_unipass_subtile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(4)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_div_4_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(4)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_lt_4_unipass_fulltile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_lt_4_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .input_offset(5)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_lt_4_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_lt_4_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_lt_4_unipass_subtile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(4)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_lt_4_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(4)
+          .channels(channels)
+          .input_offset(5)
+          .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_gt_4_unipass_fulltile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_gt_4_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .input_offset(11)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_gt_4_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_gt_4_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(4)
+        .pooling_tile(4)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_gt_4_unipass_subtile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(4)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_gt_4_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(4)
+          .channels(channels)
+          .input_offset(11)
+          .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, few_output_pixels) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(4)
+            .channels(channels)
+            .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
         }
       }
     }
   }
 
-  TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_eq_4_subtile) {
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, few_output_pixels_with_input_offset) {
     TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4)
-      .kc(4);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(4)
+            .channels(channels)
+            .input_offset(23)
+            .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+        }
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, few_output_pixels_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(4)
+            .channels(channels)
+            .qmin(192)
+            .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+        }
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, few_output_pixels_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(4)
+            .channels(channels)
+            .qmax(192)
+            .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+        }
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, few_output_pixels_with_output_stride) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(4)
+            .channels(channels)
+            .output_stride(23)
+            .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+        }
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, few_output_pixels_with_step) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          for (size_t step = 2; step <= pooling_elements; step++) {
+            ArgMaxPoolMicrokernelTester()
+              .output_pixels(output_pixels)
+              .pooling_elements(pooling_elements)
+              .pooling_tile(4)
+              .step(step)
+              .channels(channels)
+              .output_stride(23)
+              .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
           }
         }
       }
     }
   }
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
-  TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_div_4_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_eq_1_unipass_fulltile) {
+  ArgMaxPoolMicrokernelTester()
+    .pooling_elements(4)
+    .pooling_tile(4)
+    .channels(1)
+    .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_input_offset) {
+  ArgMaxPoolMicrokernelTester()
+    .pooling_elements(4)
+    .pooling_tile(4)
+    .channels(1)
+    .input_offset(3)
+    .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_qmin) {
+  ArgMaxPoolMicrokernelTester()
+    .pooling_elements(4)
+    .pooling_tile(4)
+    .channels(1)
+    .qmin(192)
+    .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_qmax) {
+  ArgMaxPoolMicrokernelTester()
+    .pooling_elements(4)
+    .pooling_tile(4)
+    .channels(1)
+    .qmax(192)
+    .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_eq_1_unipass_subtile) {
+  for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(4)
+      .channels(1)
+      .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_eq_1_unipass_subtile_with_input_offset) {
+  for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(4)
+      .channels(1)
+      .input_offset(3)
+      .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_gt_1_unipass_fulltile) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(4)
+      .pooling_tile(4)
+      .channels(channels)
+      .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_input_offset) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(4)
+      .pooling_tile(4)
+      .channels(channels)
+      .input_offset(3)
+      .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_qmin) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(4)
+      .pooling_tile(4)
+      .channels(channels)
+      .qmin(192)
+      .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_qmax) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(4)
+      .pooling_tile(4)
+      .channels(channels)
+      .qmax(192)
+      .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_gt_1_unipass_subtile) {
+  for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(4)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
+}
 
-  TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_div_4_subtile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 4; kc < 64; kc += 12) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-            }
-          }
-        }
-      }
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_gt_1_unipass_subtile_with_input_offset) {
+  for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(4)
+        .channels(channels)
+        .input_offset(3)
+        .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
+}
 
-  TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_div_4_fulltile_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(131)
-              .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_lt_4_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_lt_4_subtile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 1; kc < 4; kc++) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_lt_4_fulltile_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(23)
-              .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_gt_4_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_gt_4_subtile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 5; kc < 8; kc++) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_gt_4_fulltile_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(4)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(23)
-              .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_div_4_with_y_max) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 1; n <= 5; n += 2) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(2)
-          .kw(2)
-          .kc(kc)
-          .qmax(128)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_div_4_with_y_min) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 1; n <= 5; n += 2) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(2)
-          .kw(2)
-          .kc(kc)
-          .qmin(128)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__PSIMD, small_n) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t kc = 8; kc < 25; kc += 5) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(2)
-          .kw(2)
-          .kc(kc)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(1)
-          .kw(3)
-          .kc(kc)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(3)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__PSIMD, small_n_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t kc = 8; kc < 25; kc += 5) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(2)
-          .kw(2)
-          .kc(kc)
-          .x_stride(29)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(1)
-          .kw(3)
-          .kc(kc)
-          .x_stride(29)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(3)
-          .kw(1)
-          .kc(kc)
-          .x_stride(29)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__PSIMD, small_n_with_y_stride) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t kc = 8; kc < 25; kc += 5) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(2)
-          .kw(2)
-          .kc(kc)
-          .y_stride(31)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(1)
-          .kw(3)
-          .kc(kc)
-          .y_stride(31)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(3)
-          .kw(1)
-          .kc(kc)
-          .y_stride(31)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP4__PSIMD, small_n_with_s) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t kc = 8; kc < 25; kc += 5) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(2)
-          .kw(2)
-          .kc(kc)
-          .s(2)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(1)
-          .kw(3)
-          .kc(kc)
-          .s(2)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        ArgmaxPoolMicrokernelTester()
-          .mr(4)
-          .n(n)
-          .kh(3)
-          .kw(1)
-          .kc(kc)
-          .s(2)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_eq_4_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .kc(4);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_eq_4_subtile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .kc(4);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_div_4_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_div_4_subtile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 4; kc < 64; kc += 12) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_div_4_fulltile_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(131)
-              .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_lt_4_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_lt_4_subtile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 1; kc < 4; kc++) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_lt_4_fulltile_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(23)
-              .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_gt_4_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_gt_4_subtile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 5; kc < 8; kc++) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_gt_4_fulltile_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(23)
-              .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_div_4_with_y_max) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 1; n <= 5; n += 2) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(9)
-          .n(n)
-          .kh(3)
-          .kw(3)
-          .kc(kc)
-          .qmax(128)
-          .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_div_4_with_y_min) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 1; n <= 5; n += 2) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(9)
-          .n(n)
-          .kh(3)
-          .kw(3)
-          .kc(kc)
-          .qmin(128)
-          .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__PSIMD, small_n) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3}}) {
-        for (size_t kc = 8; kc < 25; kc += 5) {
-          ArgmaxPoolMicrokernelTester()
-            .mr(9)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__PSIMD, small_n_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3}}) {
-        for (size_t kc = 8; kc < 25; kc += 5) {
-          ArgmaxPoolMicrokernelTester()
-            .mr(9)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .x_stride(29)
-            .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__PSIMD, small_n_with_y_stride) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3}}) {
-        for (size_t kc = 8; kc < 25; kc += 5) {
-          ArgmaxPoolMicrokernelTester()
-            .mr(9)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .y_stride(31)
-            .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_UP9__PSIMD, small_n_with_s) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3}}) {
-        for (size_t kc = 8; kc < 25; kc += 5) {
-          for (size_t s = 2; s <= ks; s++) {
-            ArgmaxPoolMicrokernelTester()
-              .mr(9)
-              .n(n)
-              .kh(ks)
-              .kw(ks)
-              .kc(kc)
-              .s(s)
-              .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_eq_4_twopass_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    const size_t ks = tester.mr() + tester.qr();
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_eq_4_twopass_subtile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_eq_4_multipass_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t ks : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .kc(4);
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_eq_4_multipass_subtile) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .kc(4);
-      for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_div_4_twopass_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    const size_t ks = 17;
-    for (size_t kc = 4; kc < 64; kc += 12) {
-      tester
-        .kc(kc)
-        .kh(ks)
-        .kw(1)
-        .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kc(kc)
-        .kh(1)
-        .kw(ks)
-        .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_div_4_twopass_subtile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        tester
-          .kc(kc)
-          .kh(ks)
-          .kw(1)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kc(kc)
-          .kh(1)
-          .kw(ks)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_div_4_twopass_fulltile_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    const size_t ks = tester.mr() + tester.qr();
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(131)
-              .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_div_4_multipass_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t ks : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .iterations(3);
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 4; kc < 64; kc += 12) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_div_4_multipass_subtile) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .iterations(3);
-      for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
-        for (size_t kc = 4; kc < 64; kc += 12) {
-          tester
-            .kc(kc)
-            .kh(ks)
-            .kw(1)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          tester
-            .kc(kc)
-            .kh(1)
-            .kw(ks)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_div_4_multipass_fulltile_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t ks : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .iterations(3);
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 4; kc < 64; kc += 12) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .x_stride(131)
-                .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_lt_4_twopass_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    const size_t ks = tester.mr() + tester.qr();
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_lt_4_twopass_subtile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
-      for (size_t kc = 1; kc < 4; kc++) {
-        tester
-          .kc(kc)
-          .kh(ks)
-          .kw(1)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kc(kc)
-          .kh(1)
-          .kw(ks)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_lt_4_twopass_fulltile_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    const size_t ks = tester.mr() + tester.qr();
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(23)
-              .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_lt_4_multipass_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t ks : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .iterations(3);
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 1; kc < 4; kc++) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_lt_4_multipass_subtile) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .iterations(3);
-      for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
-        for (size_t kc = 1; kc < 4; kc++) {
-          tester
-            .kc(kc)
-            .kh(ks)
-            .kw(1)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          tester
-            .kc(kc)
-            .kh(1)
-            .kw(ks)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_lt_4_multipass_fulltile_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t ks : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .iterations(3);
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 1; kc < 4; kc++) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .x_stride(23)
-                .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_gt_4_twopass_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    const size_t ks = tester.mr() + tester.qr();
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_gt_4_twopass_subtile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
-      for (size_t kc = 5; kc < 8; kc++) {
-        tester
-          .kc(kc)
-          .kh(ks)
-          .kw(1)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kc(kc)
-          .kh(1)
-          .kw(ks)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_gt_4_twopass_fulltile_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    const size_t ks = tester.mr() + tester.qr();
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(23)
-              .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_gt_4_multipass_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t ks : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .iterations(3);
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 5; kc < 8; kc++) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_gt_4_multipass_subtile) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .iterations(3);
-      for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
-        for (size_t kc = 5; kc < 8; kc++) {
-          tester
-            .kc(kc)
-            .kh(ks)
-            .kw(1)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          tester
-            .kc(kc)
-            .kh(1)
-            .kw(ks)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_gt_4_multipass_fulltile_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t ks : std::vector<size_t>{{25, 49}}) {
-      auto tester = ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .iterations(3);
-      for (size_t kh = 1; kh <= ks; kh++) {
-        for (size_t kw = 1; kw <= ks; kw++) {
-          if (kh * kw == ks) {
-            for (size_t kc = 5; kc < 8; kc++) {
-              tester
-                .kh(kh)
-                .kw(kw)
-                .kc(kc)
-                .x_stride(23)
-                .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-            }
-          }
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_div_4_with_y_max) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 1; n <= 5; n += 2) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(9)
-          .qr(8)
-          .n(n)
-          .kh(5)
-          .kw(5)
-          .kc(kc)
-          .qmax(128)
-          .iterations(3)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_div_4_with_y_min) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 1; n <= 5; n += 2) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(9)
-          .qr(8)
-          .n(n)
-          .kh(5)
-          .kw(5)
-          .kc(kc)
-          .qmin(128)
-          .iterations(3)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, small_n) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{5, 7}}) {
-        for (size_t kc = 8; kc < 25; kc += 5) {
-          ArgmaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, small_n_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{5, 7}}) {
-        for (size_t kc = 8; kc < 25; kc += 5) {
-          ArgmaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .x_stride(29)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, small_n_with_y_stride) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{5, 7}}) {
-        for (size_t kc = 8; kc < 25; kc += 5) {
-          ArgmaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .y_stride(31)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-
-  TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, small_n_with_s) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{5, 7}}) {
-        for (size_t s = 2; s <= 5; s++) {
-          for (size_t kc = 8; kc < 25; kc += 5) {
-            ArgmaxPoolMicrokernelTester()
-              .mr(9)
-              .qr(8)
-              .n(n)
-              .kh(ks)
-              .kw(ks)
-              .kc(kc)
-              .s(s)
-              .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
-
-
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, kc_eq_1_fulltile) {
-  auto tester = ArgmaxPoolMicrokernelTester()
-    .mr(4)
-    .kc(1);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        tester
-          .kh(kh)
-          .kw(kw)
-          .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, few_output_pixels) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(4)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
       }
     }
   }
 }
 
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, kc_eq_1_subtile) {
-  auto tester = ArgmaxPoolMicrokernelTester()
-    .mr(4)
-    .kc(1);
-  for (size_t ks = 2; ks < tester.mr(); ks++) {
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, few_output_pixels_with_input_offset) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(4)
+          .channels(channels)
+          .input_offset(7)
+          .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, few_output_pixels_with_qmin) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(4)
+          .channels(channels)
+          .qmin(192)
+          .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, few_output_pixels_with_qmax) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(4)
+          .channels(channels)
+          .qmax(192)
+          .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, few_output_pixels_with_output_stride) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(4)
+          .channels(channels)
+          .output_stride(7)
+          .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, few_output_pixels_with_step) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        for (size_t step = 2; step <= pooling_elements; step++) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(4)
+            .step(step)
+            .channels(channels)
+            .output_stride(7)
+            .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
         }
       }
     }
   }
 }
 
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, kc_gt_1_fulltile) {
-  auto tester = ArgmaxPoolMicrokernelTester()
-    .mr(4);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        for (size_t kc = 2; kc < 16; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_eq_4_unipass_fulltile) {
+    TEST_REQUIRES_X86_SSE2;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9)
+      .channels(4)
+      .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_eq_4_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9)
+      .channels(4)
+      .input_offset(7)
+      .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_eq_4_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9)
+      .channels(4)
+      .qmin(192)
+      .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_eq_4_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9)
+      .channels(4)
+      .qmax(192)
+      .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_eq_4_unipass_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9)
+        .channels(4)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_eq_4_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9)
+        .channels(4)
+        .input_offset(7)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_div_4_unipass_fulltile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_div_4_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .input_offset(37)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_div_4_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_div_4_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_div_4_unipass_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_div_4_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_lt_4_unipass_fulltile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_lt_4_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .input_offset(5)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_lt_4_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_lt_4_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_lt_4_unipass_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_lt_4_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9)
+          .channels(channels)
+          .input_offset(5)
+          .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_gt_4_unipass_fulltile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_gt_4_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .input_offset(11)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_gt_4_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_gt_4_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_gt_4_unipass_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_gt_4_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9)
+          .channels(channels)
+          .input_offset(11)
+          .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, few_output_pixels) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9)
+            .channels(channels)
+            .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
         }
       }
     }
   }
-}
 
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, kc_gt_1_subtile) {
-  auto tester = ArgmaxPoolMicrokernelTester()
-    .mr(4)
-    .iterations(3);
-  for (size_t ks = 2; ks < tester.mr(); ks++) {
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          for (size_t kc = 2; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, few_output_pixels_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9)
+            .channels(channels)
+            .input_offset(23)
+            .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+        }
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, few_output_pixels_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9)
+            .channels(channels)
+            .qmin(192)
+            .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+        }
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, few_output_pixels_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9)
+            .channels(channels)
+            .qmax(192)
+            .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+        }
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, few_output_pixels_with_output_stride) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9)
+            .channels(channels)
+            .output_stride(23)
+            .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+        }
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__SSE2_C4, few_output_pixels_with_step) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          for (size_t step = 2; step <= pooling_elements; step++) {
+            ArgMaxPoolMicrokernelTester()
+              .output_pixels(output_pixels)
+              .pooling_elements(pooling_elements)
+              .pooling_tile(9)
+              .step(step)
+              .channels(channels)
+              .output_stride(23)
+              .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
           }
         }
       }
     }
   }
-}
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, kc_gt_1_fulltile_with_x_stride) {
-  auto tester = ArgmaxPoolMicrokernelTester()
-    .mr(4)
-    .iterations(3);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        for (size_t kc = 2; kc < 16; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .x_stride(131)
-            .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_eq_4_unipass_fulltile) {
+    TEST_REQUIRES_PSIMD;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9)
+      .channels(4)
+      .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_eq_4_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9)
+      .channels(4)
+      .input_offset(7)
+      .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_eq_4_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9)
+      .channels(4)
+      .qmin(192)
+      .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_eq_4_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9)
+      .channels(4)
+      .qmax(192)
+      .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_eq_4_unipass_subtile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9)
+        .channels(4)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_eq_4_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9)
+        .channels(4)
+        .input_offset(7)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_div_4_unipass_fulltile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_div_4_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .input_offset(37)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_div_4_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_div_4_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_div_4_unipass_subtile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_div_4_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_lt_4_unipass_fulltile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_lt_4_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .input_offset(5)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_lt_4_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_lt_4_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_lt_4_unipass_subtile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_lt_4_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9)
+          .channels(channels)
+          .input_offset(5)
+          .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_gt_4_unipass_fulltile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_gt_4_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .input_offset(11)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_gt_4_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_gt_4_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_gt_4_unipass_subtile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_gt_4_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9)
+          .channels(channels)
+          .input_offset(11)
+          .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, few_output_pixels) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9)
+            .channels(channels)
+            .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
         }
       }
     }
   }
-}
 
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, y_max) {
-  for (size_t n = 1; n <= 5; n += 2) {
-    for (size_t kc = 1; kc < 16; kc++) {
-      ArgmaxPoolMicrokernelTester()
-        .mr(4)
-        .n(n)
-        .kh(2)
-        .kw(2)
-        .kc(kc)
-        .qmax(128)
-        .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, y_min) {
-  for (size_t n = 1; n <= 5; n += 2) {
-    for (size_t kc = 1; kc < 16; kc++) {
-      ArgmaxPoolMicrokernelTester()
-        .mr(4)
-        .n(n)
-        .kh(2)
-        .kw(2)
-        .kc(kc)
-        .qmin(128)
-        .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, small_n) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t kc = 1; kc < 15; kc += 3) {
-      ArgmaxPoolMicrokernelTester()
-        .mr(4)
-        .n(n)
-        .kh(2)
-        .kw(2)
-        .kc(kc)
-        .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      ArgmaxPoolMicrokernelTester()
-        .mr(4)
-        .n(n)
-        .kh(1)
-        .kw(3)
-        .kc(kc)
-        .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      ArgmaxPoolMicrokernelTester()
-        .mr(4)
-        .n(n)
-        .kh(3)
-        .kw(1)
-        .kc(kc)
-        .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, small_n_with_x_stride) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t kc = 1; kc < 15; kc += 3) {
-      ArgmaxPoolMicrokernelTester()
-        .mr(4)
-        .n(n)
-        .kh(2)
-        .kw(2)
-        .kc(kc)
-        .x_stride(29)
-        .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      ArgmaxPoolMicrokernelTester()
-        .mr(4)
-        .n(n)
-        .kh(1)
-        .kw(3)
-        .kc(kc)
-        .x_stride(29)
-        .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      ArgmaxPoolMicrokernelTester()
-        .mr(4)
-        .n(n)
-        .kh(3)
-        .kw(1)
-        .kc(kc)
-        .x_stride(29)
-        .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, small_n_with_y_stride) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t kc = 1; kc < 15; kc += 3) {
-      ArgmaxPoolMicrokernelTester()
-        .mr(4)
-        .n(n)
-        .kh(2)
-        .kw(2)
-        .kc(kc)
-        .y_stride(31)
-        .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      ArgmaxPoolMicrokernelTester()
-        .mr(4)
-        .n(n)
-        .kh(1)
-        .kw(3)
-        .kc(kc)
-        .y_stride(31)
-        .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      ArgmaxPoolMicrokernelTester()
-        .mr(4)
-        .n(n)
-        .kh(3)
-        .kw(1)
-        .kc(kc)
-        .y_stride(31)
-        .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, small_n_with_s) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t kc = 1; kc < 15; kc += 3) {
-      ArgmaxPoolMicrokernelTester()
-        .mr(4)
-        .n(n)
-        .kh(2)
-        .kw(2)
-        .kc(kc)
-        .s(2)
-        .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      ArgmaxPoolMicrokernelTester()
-        .mr(4)
-        .n(n)
-        .kh(1)
-        .kw(3)
-        .kc(kc)
-        .s(2)
-        .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      ArgmaxPoolMicrokernelTester()
-        .mr(4)
-        .n(n)
-        .kh(3)
-        .kw(1)
-        .kc(kc)
-        .s(2)
-        .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, kc_eq_1_fulltile) {
-  auto tester = ArgmaxPoolMicrokernelTester()
-    .mr(9)
-    .kc(1);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        tester
-          .kh(kh)
-          .kw(kw)
-          .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, kc_eq_1_subtile) {
-  auto tester = ArgmaxPoolMicrokernelTester()
-    .mr(9)
-    .kc(1);
-  for (size_t ks = 2; ks < tester.mr(); ks++) {
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, few_output_pixels_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9)
+            .channels(channels)
+            .input_offset(23)
+            .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
         }
       }
     }
   }
-}
 
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, kc_gt_1_fulltile) {
-  auto tester = ArgmaxPoolMicrokernelTester()
-    .mr(9);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        for (size_t kc = 2; kc < 16; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, few_output_pixels_with_qmin) {  // Sweeps output_pixels 2..5 x pooling_elements 2..9 x channels 1..19 (step 3) with qmin(192).
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9)
+            .channels(channels)
+            .qmin(192)
+            .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
         }
       }
     }
   }
-}
 
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, kc_gt_1_subtile) {
-  auto tester = ArgmaxPoolMicrokernelTester()
-    .mr(9)
-    .iterations(3);
-  for (size_t ks = 2; ks < tester.mr(); ks++) {
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          for (size_t kc = 2; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, few_output_pixels_with_qmax) {  // Sweeps output_pixels 2..5 x pooling_elements 2..9 x channels 1..19 (step 3) with qmax(192).
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9)
+            .channels(channels)
+            .qmax(192)
+            .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+        }
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, few_output_pixels_with_output_stride) {  // Sweeps output_pixels 2..5 x pooling_elements 2..9 x channels 1..19 (step 3) with output_stride(23).
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9)
+            .channels(channels)
+            .output_stride(23)
+            .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+        }
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, few_output_pixels_with_step) {  // Sweeps output_pixels 2..5 x pooling_elements 2..9 x channels 1..19 (step 3) x step 2..pooling_elements.
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          for (size_t step = 2; step <= pooling_elements; step++) {
+            ArgMaxPoolMicrokernelTester()
+              .output_pixels(output_pixels)
+              .pooling_elements(pooling_elements)
+              .pooling_tile(9)
+              .step(step)
+              .channels(channels)
+              .output_stride(23)  // NOTE(review): the step test also sets an output stride; confirm this is intended.
+              .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
           }
         }
       }
     }
   }
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_eq_1_unipass_fulltile) {  // Single run: 9 pooling elements, pooling_tile(9), 1 channel.
+  ArgMaxPoolMicrokernelTester()
+    .pooling_elements(9)
+    .pooling_tile(9)
+    .channels(1)
+    .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
 }
 
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, kc_gt_1_fulltile_with_x_stride) {
-  auto tester = ArgmaxPoolMicrokernelTester()
-    .mr(9)
-    .iterations(3);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        for (size_t kc = 2; kc < 16; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .x_stride(131)
-            .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_input_offset) {  // Single run: 9 pooling elements, pooling_tile(9), 1 channel, input_offset(3).
+  ArgMaxPoolMicrokernelTester()
+    .pooling_elements(9)
+    .pooling_tile(9)
+    .channels(1)
+    .input_offset(3)
+    .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_qmin) {  // Single run: 9 pooling elements, pooling_tile(9), 1 channel, qmin(192).
+  ArgMaxPoolMicrokernelTester()
+    .pooling_elements(9)
+    .pooling_tile(9)
+    .channels(1)
+    .qmin(192)
+    .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_qmax) {  // Single run: 9 pooling elements, pooling_tile(9), 1 channel, qmax(192).
+  ArgMaxPoolMicrokernelTester()
+    .pooling_elements(9)
+    .pooling_tile(9)
+    .channels(1)
+    .qmax(192)
+    .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_eq_1_unipass_subtile) {  // Sweeps pooling_elements 2..8 with pooling_tile(9) and 1 channel.
+  for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(9)
+      .channels(1)
+      .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_eq_1_unipass_subtile_with_input_offset) {  // Sweeps pooling_elements 2..8 with pooling_tile(9), 1 channel and input_offset(3).
+  for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(9)
+      .channels(1)
+      .input_offset(3)
+      .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_gt_1_unipass_fulltile) {  // Sweeps channels 2..9 with 9 pooling elements and pooling_tile(9).
+  for (size_t channels = 2; channels < 10; channels++) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9)
+      .channels(channels)
+      .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_input_offset) {  // Sweeps channels 2..9 with 9 pooling elements, pooling_tile(9) and input_offset(3).
+  for (size_t channels = 2; channels < 10; channels++) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9)
+      .channels(channels)
+      .input_offset(3)
+      .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_qmin) {  // Sweeps channels 2..9 with 9 pooling elements, pooling_tile(9) and qmin(192).
+  for (size_t channels = 2; channels < 10; channels++) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9)
+      .channels(channels)
+      .qmin(192)
+      .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_qmax) {  // Sweeps channels 2..9 with 9 pooling elements, pooling_tile(9) and qmax(192).
+  for (size_t channels = 2; channels < 10; channels++) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9)
+      .channels(channels)
+      .qmax(192)
+      .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_gt_1_unipass_subtile) {  // Sweeps pooling_elements 2..8 x channels 2..9 with pooling_tile(9).
+  for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_gt_1_unipass_subtile_with_input_offset) {  // Sweeps pooling_elements 2..8 x channels 2..9 with pooling_tile(9) and input_offset(3).
+  for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9)
+        .channels(channels)
+        .input_offset(3)
+        .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, few_output_pixels) {  // Sweeps output_pixels 2..5 x pooling_elements 2..9 x channels 1..5 with pooling_tile(9).
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, few_output_pixels_with_input_offset) {  // Sweeps output_pixels 2..5 x pooling_elements 2..9 x channels 1..5 with input_offset(7).
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9)
+          .channels(channels)
+          .input_offset(7)
+          .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, few_output_pixels_with_qmin) {  // Sweeps output_pixels 2..5 x pooling_elements 2..9 x channels 1..5 with qmin(192).
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9)
+          .channels(channels)
+          .qmin(192)
+          .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, few_output_pixels_with_qmax) {  // Sweeps output_pixels 2..5 x pooling_elements 2..9 x channels 1..5 with qmax(192).
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9)
+          .channels(channels)
+          .qmax(192)
+          .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, few_output_pixels_with_output_stride) {  // Sweeps output_pixels 2..5 x pooling_elements 2..9 x channels 1..5 with output_stride(7).
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9)
+          .channels(channels)
+          .output_stride(7)
+          .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, few_output_pixels_with_step) {  // Sweeps output_pixels 2..5 x pooling_elements 2..9 x channels 1..5 x step 2..pooling_elements.
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        for (size_t step = 2; step <= pooling_elements; step++) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9)
+            .step(step)
+            .channels(channels)
+            .output_stride(7)  // NOTE(review): the step test also sets an output stride; confirm this is intended.
+            .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
         }
       }
     }
   }
 }
 
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, y_max) {
-  for (size_t n = 1; n <= 5; n += 2) {
-    for (size_t kc = 1; kc < 16; kc++) {
-      ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .n(n)
-        .kh(3)
-        .kw(3)
-        .kc(kc)
-        .qmax(128)
-        .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_twopass_fulltile) {  // Single run: 17 pooling elements, pooling_tile(9, 8), 4 channels.
+    TEST_REQUIRES_X86_SSE2;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(4)
+      .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_twopass_fulltile_with_input_offset) {  // Single run: 17 pooling elements, pooling_tile(9, 8), 4 channels, input_offset(7).
+    TEST_REQUIRES_X86_SSE2;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(4)
+      .input_offset(7)
+      .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_twopass_fulltile_with_qmin) {  // Single run: 17 pooling elements, pooling_tile(9, 8), 4 channels, qmin(192).
+    TEST_REQUIRES_X86_SSE2;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(4)
+      .qmin(192)
+      .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_twopass_fulltile_with_qmax) {  // Single run: 17 pooling elements, pooling_tile(9, 8), 4 channels, qmax(192).
+    TEST_REQUIRES_X86_SSE2;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(4)
+      .qmax(192)
+      .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_twopass_subtile) {  // Sweeps pooling_elements 10..16 with pooling_tile(9, 8) and 4 channels.
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(4)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
     }
   }
-}
 
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, y_min) {
-  for (size_t n = 1; n <= 5; n += 2) {
-    for (size_t kc = 1; kc < 16; kc++) {
-      ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .n(n)
-        .kh(3)
-        .kw(3)
-        .kc(kc)
-        .qmin(128)
-        .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_twopass_subtile_with_input_offset) {  // Sweeps pooling_elements 10..16 with pooling_tile(9, 8), 4 channels and input_offset(7).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(4)
+        .input_offset(7)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
     }
   }
-}
 
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, small_n) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t ks : std::vector<size_t>{{2, 3}}) {
-      for (size_t kc = 1; kc < 15; kc += 3) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(9)
-          .n(n)
-          .kh(ks)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_twopass_fulltile) {  // Sweeps channels 8..28 (step 4) with 17 pooling elements and pooling_tile(9, 8).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_twopass_fulltile_with_input_offset) {  // Sweeps channels 8..28 (step 4) with 17 pooling elements, pooling_tile(9, 8) and input_offset(23).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(23)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_twopass_fulltile_with_qmin) {  // Sweeps channels 8..28 (step 4) with 17 pooling elements, pooling_tile(9, 8) and qmin(192).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_twopass_fulltile_with_qmax) {  // Sweeps channels 8..28 (step 4) with 17 pooling elements, pooling_tile(9, 8) and qmax(192).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_twopass_subtile) {  // Sweeps pooling_elements 10..16 x channels 8..28 (step 4) with pooling_tile(9, 8).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 only re-runs the full-tile case.
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
       }
     }
   }
-}
 
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, small_n_with_x_stride) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t ks : std::vector<size_t>{{2, 3}}) {
-      for (size_t kc = 1; kc < 15; kc += 3) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(9)
-          .n(n)
-          .kh(ks)
-          .kw(ks)
-          .kc(kc)
-          .x_stride(29)
-          .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_twopass_subtile_with_input_offset) {  // Sweeps pooling_elements 10..16 x channels 8..28 (step 4) with pooling_tile(9, 8) and input_offset(37).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 only re-runs the full-tile case.
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
       }
     }
   }
-}
 
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, small_n_with_y_stride) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t ks : std::vector<size_t>{{2, 3}}) {
-      for (size_t kc = 1; kc < 15; kc += 3) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(9)
-          .n(n)
-          .kh(ks)
-          .kw(ks)
-          .kc(kc)
-          .y_stride(31)
-          .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_twopass_fulltile) {  // Sweeps channels 1..3 with 17 pooling elements and pooling_tile(9, 8).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_twopass_fulltile_with_input_offset) {  // Sweeps channels 1..3 with 17 pooling elements, pooling_tile(9, 8) and input_offset(5).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(5)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_twopass_fulltile_with_qmin) {  // Sweeps channels 1..3 with 17 pooling elements, pooling_tile(9, 8) and qmin(192).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_twopass_fulltile_with_qmax) {  // Sweeps channels 1..3 with 17 pooling elements, pooling_tile(9, 8) and qmax(192).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 4; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_twopass_subtile) {  // Sweeps pooling_elements 10..16 x channels 1..3 with pooling_tile(9, 8).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 only re-runs the full-tile case.
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
       }
     }
   }
-}
 
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, small_n_with_s) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t ks : std::vector<size_t>{{2, 3}}) {
-      for (size_t kc = 1; kc < 15; kc += 3) {
-        for (size_t s = 2; s <= ks; s++) {
-          ArgmaxPoolMicrokernelTester()
-            .mr(9)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .s(s)
-            .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_twopass_subtile_with_input_offset) {  // Sweeps pooling_elements 10..16 x channels 1..3 with pooling_tile(9, 8) and input_offset(5).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 only re-runs the full-tile case.
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(5)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_twopass_fulltile) {  // Sweeps channels 5..7 with 17 pooling elements and pooling_tile(9, 8).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_twopass_fulltile_with_input_offset) {  // Sweeps channels 5..7 with 17 pooling elements, pooling_tile(9, 8) and input_offset(11).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(11)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_twopass_fulltile_with_qmin) {  // Sweeps channels 5..7 with 17 pooling elements, pooling_tile(9, 8) and qmin(192).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_twopass_fulltile_with_qmax) {  // Sweeps channels 5..7 with 17 pooling elements, pooling_tile(9, 8) and qmax(192).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 5; channels < 8; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_twopass_subtile) {  // Sweeps pooling_elements 10..16 x channels 5..7 with pooling_tile(9, 8).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 only re-runs the full-tile case.
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_twopass_subtile_with_input_offset) {  // Sweeps pooling_elements 10..16 x channels 5..7 with pooling_tile(9, 8) and input_offset(11).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 only re-runs the full-tile case.
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(11)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_multipass) {  // Sweeps pooling_elements 18..33 (step 3) with pooling_tile(9, 8) and 4 channels.
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 never exceeds the 9+8 two-pass size.
+        .pooling_tile(9, 8)
+        .channels(4)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_multipass_with_input_offset) {  // Sweeps pooling_elements 18..33 (step 3) with pooling_tile(9, 8), 4 channels and input_offset(7).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 never exceeds the 9+8 two-pass size.
+        .pooling_tile(9, 8)
+        .channels(4)
+        .input_offset(7)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_multipass_with_qmin) {  // Sweeps pooling_elements 18..33 (step 3) with pooling_tile(9, 8), 4 channels and qmin(192).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 never exceeds the 9+8 two-pass size.
+        .pooling_tile(9, 8)
+        .channels(4)
+        .qmin(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_multipass_with_qmax) {  // Sweeps pooling_elements 18..33 (step 3) with pooling_tile(9, 8), 4 channels and qmax(192).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 never exceeds the 9+8 two-pass size.
+        .pooling_tile(9, 8)
+        .channels(4)
+        .qmax(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_multipass) {  // Sweeps pooling_elements 18..33 (step 3) x channels 8..28 (step 4) with pooling_tile(9, 8).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 never exceeds the 9+8 two-pass size.
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_multipass_with_input_offset) {  // Sweeps pooling_elements 18..33 (step 3) x channels 8..28 (step 4) with pooling_tile(9, 8) and input_offset(37).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 never exceeds the 9+8 two-pass size.
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_multipass_with_qmin) {  // Sweeps pooling_elements 18..33 (step 3) x channels 8..28 (step 4) with pooling_tile(9, 8) and qmin(192).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 never exceeds the 9+8 two-pass size.
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmin(192)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_multipass_with_qmax) {  // Sweeps pooling_elements 18..33 (step 3) x channels 8..28 (step 4) with pooling_tile(9, 8) and qmax(192).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 never exceeds the 9+8 two-pass size.
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmax(192)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_multipass) {  // Sweeps pooling_elements 18..33 (step 3) x channels 1..3 with pooling_tile(9, 8).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 never exceeds the 9+8 two-pass size.
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_multipass_with_input_offset) {  // Sweeps pooling_elements 18..33 (step 3) x channels 1..3 with pooling_tile(9, 8) and input_offset(4).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 never exceeds the 9+8 two-pass size.
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(4)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_multipass_with_qmin) {  // Sweeps pooling_elements 18..33 (step 3) x channels 1..3 with pooling_tile(9, 8) and qmin(192).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 never exceeds the 9+8 two-pass size.
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmin(192)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_multipass_with_qmax) {  // Sweeps pooling_elements 18..33 (step 3) x channels 1..3 with pooling_tile(9, 8) and qmax(192).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 never exceeds the 9+8 two-pass size.
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmax(192)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_multipass) {  // Sweeps pooling_elements 18..33 (step 3) x channels 5..7 with pooling_tile(9, 8).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 never exceeds the 9+8 two-pass size.
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_multipass_with_input_offset) {  // Sweeps pooling_elements 18..33 (step 3) x channels 5..7 with pooling_tile(9, 8) and input_offset(11).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 never exceeds the 9+8 two-pass size.
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(11)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_multipass_with_qmin) {  // Sweeps pooling_elements 18..33 (step 3) x channels 5..7 with pooling_tile(9, 8) and qmin(192).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 never exceeds the 9+8 two-pass size.
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmin(192)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_multipass_with_qmax) {  // Sweeps pooling_elements 18..33 (step 3) x channels 5..7 with pooling_tile(9, 8) and qmax(192).
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // Use the loop variable; a hard-coded 17 never exceeds the 9+8 two-pass size.
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmax(192)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, few_output_pixels) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // sweep 2..5 output pixels
+      for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
         }
       }
     }
   }
-}
 
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_eq_1_twopass_fulltile) {
-  auto tester = ArgmaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  const size_t ks = tester.mr() + tester.qr();
-  for (size_t kh = 1; kh <= ks; kh++) {
-    for (size_t kw = 1; kw <= ks; kw++) {
-      if (kh * kw == ks) {
-        tester
-          .kh(kh)
-          .kw(kw)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_eq_1_twopass_subtile) {
-  auto tester = ArgmaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
-    tester
-      .kh(ks)
-      .kw(1)
-      .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-    tester
-      .kh(1)
-      .kw(ks)
-      .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_eq_1_multipass_fulltile) {
-  for (size_t ks : std::vector<size_t>{{25, 49}}) {
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(1);
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, few_output_pixels_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // sweep 2..5 output pixels
+      for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .input_offset(23)  // non-zero input_offset set on the tester
+            .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
         }
       }
     }
   }
-}
 
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_eq_1_multipass_subtile) {
-  for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(1);
-    for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_gt_1_twopass_fulltile) {
-  auto tester = ArgmaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .iterations(3);
-  const size_t ks = 17;
-  for (size_t kc = 2; kc < 16; kc++) {
-    tester
-      .kc(kc)
-      .kh(ks)
-      .kw(1)
-      .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-    tester
-      .kc(kc)
-      .kh(1)
-      .kw(ks)
-      .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_gt_1_twopass_subtile) {
-  auto tester = ArgmaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .iterations(3);
-  for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
-    for (size_t kc = 2; kc < 16; kc++) {
-      tester
-        .kc(kc)
-        .kh(ks)
-        .kw(1)
-        .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kc(kc)
-        .kh(1)
-        .kw(ks)
-        .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_gt_1_twopass_fulltile_with_x_stride) {
-  auto tester = ArgmaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .iterations(3);
-  const size_t ks = tester.mr() + tester.qr();
-  for (size_t kh = 1; kh <= ks; kh++) {
-    for (size_t kw = 1; kw <= ks; kw++) {
-      if (kh * kw == ks) {
-        for (size_t kc = 2; kc < 16; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .x_stride(131)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, few_output_pixels_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // sweep 2..5 output pixels
+      for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .qmin(192)  // NOTE(review): presumably a lower clamping bound; confirm in the tester
+            .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
         }
       }
     }
   }
-}
 
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_gt_1_multipass_fulltile) {
-  for (size_t ks : std::vector<size_t>{{25, 49}}) {
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          for (size_t kc = 2; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, few_output_pixels_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // sweep 2..5 output pixels
+      for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .qmax(192)  // NOTE(review): presumably an upper clamping bound; confirm in the tester
+            .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+        }
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, few_output_pixels_with_output_stride) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // sweep 2..5 output pixels
+      for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .output_stride(23)  // larger than the biggest channel count looped here (19)
+            .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+        }
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, few_output_pixels_with_step) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          for (size_t step = 2; step <= pooling_elements; step++) {  // step runs from 2 up to the pooling size
+            ArgMaxPoolMicrokernelTester()
+              .output_pixels(output_pixels)
+              .pooling_elements(pooling_elements)
+              .pooling_tile(9, 8)
+              .step(step)
+              .channels(channels)
+              .output_stride(23)
+              .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
           }
         }
       }
     }
   }
-}
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_gt_1_multipass_subtile) {
-  for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
-      for (size_t kc = 2; kc < 16; kc++) {
-        tester
-          .kc(kc)
-          .kh(ks)
-          .kw(1)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kc(kc)
-          .kh(1)
-          .kw(ks)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_fulltile) {
+    TEST_REQUIRES_PSIMD;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(17)  // 17 = 9 + 8, the two pooling_tile arguments summed
+      .pooling_tile(9, 8)
+      .channels(4)
+      .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_fulltile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(17)  // 17 = 9 + 8, the two pooling_tile arguments summed
+      .pooling_tile(9, 8)
+      .channels(4)
+      .input_offset(7)  // non-zero input_offset set on the tester
+      .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_fulltile_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(17)  // 17 = 9 + 8, the two pooling_tile arguments summed
+      .pooling_tile(9, 8)
+      .channels(4)
+      .qmin(192)  // NOTE(review): presumably a lower clamping bound; confirm in the tester
+      .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_fulltile_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(17)  // 17 = 9 + 8, the two pooling_tile arguments summed
+      .pooling_tile(9, 8)
+      .channels(4)
+      .qmax(192)  // NOTE(review): presumably an upper clamping bound; confirm in the tester
+      .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_subtile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // sizes 10..16, below the 9 + 8 = 17 tile sum
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(4)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_subtile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // sizes 10..16, below the 9 + 8 = 17 tile sum
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(4)
+        .input_offset(7)  // non-zero input_offset set on the tester
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_fulltile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 8; channels < 32; channels += 4) {  // multiples of 4 from 8 to 28
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 = 9 + 8, the two pooling_tile arguments summed
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_fulltile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 8; channels < 32; channels += 4) {  // multiples of 4 from 8 to 28
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 = 9 + 8, the two pooling_tile arguments summed
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(23)  // non-zero input_offset set on the tester
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_fulltile_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 8; channels < 32; channels += 4) {  // multiples of 4 from 8 to 28
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 = 9 + 8, the two pooling_tile arguments summed
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)  // NOTE(review): presumably a lower clamping bound; confirm in the tester
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_fulltile_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 8; channels < 32; channels += 4) {  // multiples of 4 from 8 to 28
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 = 9 + 8, the two pooling_tile arguments summed
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)  // NOTE(review): presumably an upper clamping bound; confirm in the tester
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_subtile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // use the loop value so each subtile size is exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
       }
     }
   }
-}
 
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_gt_1_multipass_fulltile_with_x_stride) {
-  for (size_t ks : std::vector<size_t>{{25, 49}}) {
-    auto tester = ArgmaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= ks; kh++) {
-      for (size_t kw = 1; kw <= ks; kw++) {
-        if (kh * kw == ks) {
-          for (size_t kc = 2; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(131)
-              .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_subtile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // use the loop value so each subtile size is exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_fulltile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {  // channel counts 1..3
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 = 9 + 8, the two pooling_tile arguments summed
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_fulltile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {  // channel counts 1..3
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 = 9 + 8, the two pooling_tile arguments summed
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(5)  // non-zero input_offset set on the tester
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_fulltile_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {  // channel counts 1..3
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 = 9 + 8, the two pooling_tile arguments summed
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)  // NOTE(review): presumably a lower clamping bound; confirm in the tester
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_fulltile_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {  // channel counts 1..3
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 = 9 + 8, the two pooling_tile arguments summed
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)  // NOTE(review): presumably an upper clamping bound; confirm in the tester
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_subtile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // use the loop value so each subtile size is exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_subtile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // use the loop value so each subtile size is exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(5)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_fulltile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {  // channel counts 5..7
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 = 9 + 8, the two pooling_tile arguments summed
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_fulltile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {  // channel counts 5..7
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 = 9 + 8, the two pooling_tile arguments summed
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(11)  // non-zero input_offset set on the tester
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_fulltile_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {  // channel counts 5..7
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 = 9 + 8, the two pooling_tile arguments summed
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)  // NOTE(review): presumably a lower clamping bound; confirm in the tester
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_fulltile_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {  // channel counts 5..7
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 = 9 + 8, the two pooling_tile arguments summed
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)  // NOTE(review): presumably an upper clamping bound; confirm in the tester
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_subtile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // use the loop value so each subtile size is exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_subtile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // use the loop value so each subtile size is exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(11)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_multipass) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // use the loop value so each multipass size is exercised
+        .pooling_tile(9, 8)
+        .channels(4)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_multipass_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // use the loop value so each multipass size is exercised
+        .pooling_tile(9, 8)
+        .channels(4)
+        .input_offset(7)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_multipass_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // use the loop value so each multipass size is exercised
+        .pooling_tile(9, 8)
+        .channels(4)
+        .qmin(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_multipass_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // use the loop value so each multipass size is exercised
+        .pooling_tile(9, 8)
+        .channels(4)
+        .qmax(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_multipass) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // use the loop value so each multipass size is exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_multipass_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // use the loop value so each multipass size is exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_multipass_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // use the loop value so each multipass size is exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmin(192)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_multipass_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // use the loop value so each multipass size is exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmax(192)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_multipass) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // use the loop value so each multipass size is exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_multipass_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // use the loop value so each multipass size is exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(4)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_multipass_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // use the loop value so each multipass size is exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmin(192)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_multipass_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // use the loop value so each multipass size is exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmax(192)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_multipass) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // use the loop value so each multipass size is exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_multipass_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // use the loop value so each multipass size is exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(11)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_multipass_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // use the loop value so each multipass size is exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmin(192)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_multipass_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // use the loop value so each multipass size is exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmax(192)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, few_output_pixels) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // sweep 2..5 output pixels
+      for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+        }
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // sweep 2..5 output pixels
+      for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .input_offset(23)  // non-zero input_offset set on the tester
+            .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+        }
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // sweep 2..5 output pixels
+      for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .qmin(192)  // NOTE(review): presumably a lower clamping bound; confirm in the tester
+            .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+        }
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // sweep 2..5 output pixels
+      for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .qmax(192)  // NOTE(review): presumably an upper clamping bound; confirm in the tester
+            .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+        }
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_output_stride) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // sweep 2..5 output pixels
+      for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .output_stride(23)  // larger than the biggest channel count looped here (19)
+            .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+        }
+      }
+    }
+  }
+
+  TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_step) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          for (size_t step = 2; step <= pooling_elements; step++) {
+            ArgMaxPoolMicrokernelTester()
+              .output_pixels(output_pixels)
+              .pooling_elements(pooling_elements)
+              .pooling_tile(9, 8)
+              .step(step)
+              .channels(channels)
+              .output_stride(23)
+              .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
           }
         }
       }
     }
   }
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+
+
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile) {
+  ArgMaxPoolMicrokernelTester()
+    .pooling_elements(17)
+    .pooling_tile(9, 8)
+    .channels(1)
+    .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
 }
 
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, y_max) {
-  for (size_t n = 1; n <= 5; n += 2) {
-    for (size_t kc = 1; kc < 15; kc += 3) {
-      ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .n(n)
-        .kh(5)
-        .kw(5)
-        .kc(kc)
-        .qmax(128)
-        .iterations(3)
-        .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile_with_input_offset) {
+  ArgMaxPoolMicrokernelTester()
+    .pooling_elements(17)
+    .pooling_tile(9, 8)
+    .channels(1)
+    .input_offset(3)
+    .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile_with_qmin) {
+  ArgMaxPoolMicrokernelTester()
+    .pooling_elements(17)
+    .pooling_tile(9, 8)
+    .channels(1)
+    .qmin(192)
+    .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile_with_qmax) {
+  ArgMaxPoolMicrokernelTester()
+    .pooling_elements(17)
+    .pooling_tile(9, 8)
+    .channels(1)
+    .qmax(192)
+    .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_subtile) {
+  for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(9, 8)
+      .channels(1)
+      .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_subtile_with_input_offset) {
+  for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(9, 8)
+      .channels(1)
+      .input_offset(3)
+      .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(channels)
+      .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile_with_input_offset) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(channels)
+      .input_offset(3)
+      .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile_with_qmin) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(channels)
+      .qmin(192)
+      .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile_with_qmax) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(channels)
+      .qmax(192)
+      .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_subtile) {  // 10..16 pooling elements, 2..9 channels
+  for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // was a constant 17, which repeated the twopass_fulltile case on every iteration
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
 }
 
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, y_min) {
-  for (size_t n = 1; n <= 5; n += 2) {
-    for (size_t kc = 1; kc < 15; kc += 3) {
-      ArgmaxPoolMicrokernelTester()
-        .mr(9)
-        .qr(8)
-        .n(n)
-        .kh(5)
-        .kw(5)
-        .kc(kc)
-        .qmin(128)
-        .iterations(3)
-        .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_subtile_with_input_offset) {  // 10..16 pooling elements, 2..9 channels, input_offset 3
+  for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // was a constant 17, which repeated the twopass_fulltile case on every iteration
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(3)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
 }
 
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, small_n) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t ks : std::vector<size_t>{{5, 7}}) {
-      for (size_t kc = 1; kc < 15; kc += 3) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(9)
-          .qr(8)
-          .n(n)
-          .kh(ks)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass) {  // single channel, pooling sizes 18, 21, ..., 33
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)  // was a constant 17, so no size above 17 was ever exercised
+      .pooling_tile(9, 8)
+      .channels(1)
+      .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass_with_input_offset) {  // single channel, pooling sizes 18, 21, ..., 33, input_offset 3
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)  // was a constant 17, so no size above 17 was ever exercised
+      .pooling_tile(9, 8)
+      .channels(1)
+      .input_offset(3)
+      .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass_with_qmin) {  // single channel, pooling sizes 18, 21, ..., 33, qmin 192
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)  // was a constant 17, so no size above 17 was ever exercised
+      .pooling_tile(9, 8)
+      .channels(1)
+      .qmin(192)
+      .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass_with_qmax) {  // single channel, pooling sizes 18, 21, ..., 33, qmax 192
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)  // was a constant 17, so no size above 17 was ever exercised
+      .pooling_tile(9, 8)
+      .channels(1)
+      .qmax(192)
+      .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass) {  // 2..9 channels, pooling sizes 18, 21, ..., 33
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // was a constant 17, so no size above 17 was ever exercised
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass_with_input_offset) {  // 2..9 channels, pooling sizes 18, 21, ..., 33, input_offset 3
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // was a constant 17, so no size above 17 was ever exercised
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(3)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass_with_qmin) {  // 2..9 channels, pooling sizes 18, 21, ..., 33, qmin 192
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // was a constant 17, so no size above 17 was ever exercised
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass_with_qmax) {  // 2..9 channels, pooling sizes 18, 21, ..., 33, qmax 192
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // was a constant 17, so no size above 17 was ever exercised
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, few_output_pixels) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
       }
     }
   }
 }
 
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, small_n_with_x_stride) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t ks : std::vector<size_t>{{5, 7}}) {
-      for (size_t kc = 1; kc < 15; kc += 3) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(9)
-          .qr(8)
-          .n(n)
-          .kh(ks)
-          .kw(ks)
-          .kc(kc)
-          .x_stride(29)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_input_offset) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(7)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
       }
     }
   }
 }
 
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, small_n_with_y_stride) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t ks : std::vector<size_t>{{5, 7}}) {
-      for (size_t kc = 1; kc < 15; kc += 3) {
-        ArgmaxPoolMicrokernelTester()
-          .mr(9)
-          .qr(8)
-          .n(n)
-          .kh(ks)
-          .kw(ks)
-          .kc(kc)
-          .y_stride(31)
-          .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_qmin) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmin(192)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
       }
     }
   }
 }
 
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, small_n_with_s) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t ks : std::vector<size_t>{{5, 7}}) {
-      for (size_t s = 2; s <= 5; s++) {
-        for (size_t kc = 1; kc < 15; kc += 3) {
-          ArgmaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .s(s)
-            .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_qmax) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmax(192)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_output_stride) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .output_stride(7)
+          .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_step) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        for (size_t step = 2; step <= pooling_elements; step++) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .step(step)
+            .channels(channels)
+            .output_stride(7)
+            .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
         }
       }
     }
   }
-}
+}
\ No newline at end of file
diff --git a/test/f32-argmaxpool.yaml b/test/f32-argmaxpool.yaml
new file mode 100644
index 0000000..7903bc7
--- /dev/null
+++ b/test/f32-argmaxpool.yaml
@@ -0,0 +1,13 @@
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+- name: xnn_f32_argmaxpool_ukernel_4x__sse2_c4
+- name: xnn_f32_argmaxpool_ukernel_4x__psimd_c4
+- name: xnn_f32_argmaxpool_ukernel_4x__scalar_c1
+- name: xnn_f32_argmaxpool_ukernel_9x__sse2_c4
+- name: xnn_f32_argmaxpool_ukernel_9x__psimd_c4
+- name: xnn_f32_argmaxpool_ukernel_9x__scalar_c1
+- name: xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4
+- name: xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4
+- name: xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1
diff --git a/test/f32-maxpool.cc b/test/f32-maxpool.cc
index 159a907..a8e453e 100644
--- a/test/f32-maxpool.cc
+++ b/test/f32-maxpool.cc
@@ -1,7 +1,15 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
 // Copyright 2019 Google LLC
 //
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+//   Specification: test/f32-maxpool.yaml
+//   Generator: tools/generate-maxpool-test.py
+
 
 #include <gtest/gtest.h>
 
@@ -13,1202 +21,884 @@
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_unipass_fulltile) {
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_unipass_fulltile) {
     TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        }
-      }
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(4)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(4)
+      .input_offset(7)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(4)
+      .qmin(192)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(4)
+      .qmax(192)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_unipass_subtile) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(4)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_unipass_fulltile_with_qmin) {
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_unipass_subtile_with_input_offset) {
     TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .qmin(192)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        }
-      }
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(4)
+        .input_offset(7)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_unipass_fulltile_with_qmax) {
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_unipass_fulltile) {
     TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .qmax(192)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        }
-      }
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_unipass_subtile) {
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_unipass_fulltile_with_input_offset) {
     TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-      tester
-        .kh(1)
-        .kw(ks)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(37)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_unipass_fulltile) {
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_unipass_fulltile_with_qmin) {
     TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_unipass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_unipass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_unipass_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_unipass_subtile) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_unipass_fulltile) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_unipass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_unipass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_unipass_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_unipass_subtile) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kc = 1; kc < 4; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_unipass_fulltile) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_unipass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_unipass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_unipass_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_unipass_subtile) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kc = 5; kc < 8; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_twopass_fulltile) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_twopass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .qmin(192)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_twopass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .qmax(192)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_twopass_subtile) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-      tester
-        .kh(1)
-        .kw(ks)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_twopass_fulltile) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_twopass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_twopass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_twopass_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_twopass_subtile) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_twopass_fulltile) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_twopass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_twopass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_twopass_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_twopass_subtile) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-      for (size_t kc = 1; kc < 4; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_twopass_fulltile) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_twopass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_twopass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_twopass_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_twopass_subtile) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-      for (size_t kc = 5; kc < 8; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_multipass) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-      tester
-        .kh(1)
-        .kw(ks)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_multipass_with_qmin) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      tester
-        .kh(ks)
-        .kw(1)
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmin(192)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-      tester
-        .kh(1)
-        .kw(ks)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_unipass_fulltile_with_qmax) {  // 9 elements, qmax(192)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)  // equals the first pooling_tile argument
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_unipass_subtile) {  // fewer elements than the first tile value
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {  // 2..8
+      for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_unipass_subtile_with_input_offset) {  // subtile sweep plus input_offset(37)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {  // 2..8
+      for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_unipass_fulltile) {  // 9 elements, fewer than 4 channels
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)  // equals the first pooling_tile argument
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_unipass_fulltile_with_input_offset) {  // 9 elements, input_offset(5)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)  // equals the first pooling_tile argument
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(5)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_unipass_fulltile_with_qmin) {  // 9 elements, qmin(192)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)  // equals the first pooling_tile argument
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmin(192)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_multipass_with_qmax) {
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_unipass_fulltile_with_qmax) {  // 9 elements, qmax(192)
     TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      tester
-        .kh(ks)
-        .kw(1)
+    for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)  // equals the first pooling_tile argument
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmax(192)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-      tester
-        .kh(1)
-        .kw(ks)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_unipass_subtile) {  // fewer elements than the first tile value
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {  // 2..8
+      for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_unipass_subtile_with_input_offset) {  // subtile sweep plus input_offset(5)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {  // 2..8
+      for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(5)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_unipass_fulltile) {  // 9 elements, channels between 4 and 8
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)  // equals the first pooling_tile argument
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_unipass_fulltile_with_input_offset) {  // 9 elements, input_offset(11)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)  // equals the first pooling_tile argument
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(11)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_unipass_fulltile_with_qmin) {  // 9 elements, qmin(192)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)  // equals the first pooling_tile argument
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_unipass_fulltile_with_qmax) {  // 9 elements, qmax(192)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)  // equals the first pooling_tile argument
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmax(192)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_multipass) {
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_unipass_subtile) {  // fewer elements than the first tile value
     TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {  // 2..8
+      for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_multipass_with_qmin) {
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_unipass_subtile_with_input_offset) {  // subtile sweep plus input_offset(11)
     TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {  // 2..8
+      for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(11)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_twopass_fulltile) {  // 17 elements, exactly 4 channels
+    TEST_REQUIRES_X86_SSE;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)  // 17 == 9 + 8, the sum of the pooling_tile arguments
+      .pooling_tile(9, 8)
+      .channels(4)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_twopass_fulltile_with_input_offset) {  // 17 elements, input_offset(7)
+    TEST_REQUIRES_X86_SSE;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)  // 17 == 9 + 8, the sum of the pooling_tile arguments
+      .pooling_tile(9, 8)
+      .channels(4)
+      .input_offset(7)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_twopass_fulltile_with_qmin) {  // 17 elements, qmin(192)
+    TEST_REQUIRES_X86_SSE;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)  // 17 == 9 + 8, the sum of the pooling_tile arguments
+      .pooling_tile(9, 8)
+      .channels(4)
+      .qmin(192)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_twopass_fulltile_with_qmax) {  // 17 elements, qmax(192)
+    TEST_REQUIRES_X86_SSE;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)  // 17 == 9 + 8, the sum of the pooling_tile arguments
+      .pooling_tile(9, 8)
+      .channels(4)
+      .qmax(192)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_twopass_subtile) {  // element counts strictly between 9 and 9 + 8
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // 10..16
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(4)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_twopass_subtile_with_input_offset) {  // 10..16 elements, input_offset(7)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // 10..16
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(4)
+        .input_offset(7)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_twopass_fulltile) {  // 17 elements, channels in multiples of 4
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 == 9 + 8, the sum of the pooling_tile arguments
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_twopass_fulltile_with_input_offset) {  // 17 elements, input_offset(23)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 == 9 + 8, the sum of the pooling_tile arguments
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(23)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_twopass_fulltile_with_qmin) {  // 17 elements, qmin(192)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 == 9 + 8, the sum of the pooling_tile arguments
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_twopass_fulltile_with_qmax) {  // 17 elements, qmax(192)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 == 9 + 8, the sum of the pooling_tile arguments
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_twopass_subtile) {  // element counts strictly between 9 and 9 + 8
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // 10..16
+      for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept by the outer loop
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_twopass_subtile_with_input_offset) {  // 10..16 elements, input_offset(37)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // 10..16
+      for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept by the outer loop
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_twopass_fulltile) {  // 17 elements, fewer than 4 channels
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 == 9 + 8, the sum of the pooling_tile arguments
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_twopass_fulltile_with_input_offset) {  // 17 elements, input_offset(5)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 == 9 + 8, the sum of the pooling_tile arguments
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(5)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_twopass_fulltile_with_qmin) {  // 17 elements, qmin(192)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 == 9 + 8, the sum of the pooling_tile arguments
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_twopass_fulltile_with_qmax) {  // 17 elements, qmax(192)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 == 9 + 8, the sum of the pooling_tile arguments
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_twopass_subtile) {  // element counts strictly between 9 and 9 + 8
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // 10..16
+      for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept by the outer loop
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_twopass_subtile_with_input_offset) {  // 10..16 elements, input_offset(5)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // 10..16
+      for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept by the outer loop
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(5)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_twopass_fulltile) {  // 17 elements, channels between 4 and 8
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 == 9 + 8, the sum of the pooling_tile arguments
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_twopass_fulltile_with_input_offset) {  // 17 elements, input_offset(11)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 == 9 + 8, the sum of the pooling_tile arguments
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(11)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_twopass_fulltile_with_qmin) {  // 17 elements, qmin(192)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 == 9 + 8, the sum of the pooling_tile arguments
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_twopass_fulltile_with_qmax) {  // 17 elements, qmax(192)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)  // 17 == 9 + 8, the sum of the pooling_tile arguments
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_twopass_subtile) {  // element counts strictly between 9 and 9 + 8
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // 10..16
+      for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept by the outer loop
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_twopass_subtile_with_input_offset) {  // 10..16 elements, input_offset(11)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // 10..16
+      for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept by the outer loop
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(11)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_multipass) {  // element counts above 9 + 8, exactly 4 channels
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18, 21, ..., 33
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // swept by the loop
+        .pooling_tile(9, 8)
+        .channels(4)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_multipass_with_input_offset) {  // 18..33 elements, input_offset(7)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18, 21, ..., 33
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // swept by the loop
+        .pooling_tile(9, 8)
+        .channels(4)
+        .input_offset(7)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_multipass_with_qmin) {  // 18..33 elements, qmin(192)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18, 21, ..., 33
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // swept by the loop
+        .pooling_tile(9, 8)
+        .channels(4)
+        .qmin(192)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_multipass_with_qmax) {  // 18..33 elements, qmax(192)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18, 21, ..., 33
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // swept by the loop
+        .pooling_tile(9, 8)
+        .channels(4)
+        .qmax(192)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_multipass) {  // element counts above 9 + 8, channels in multiples of 4
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18, 21, ..., 33
+      for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept by the outer loop
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_multipass_with_input_offset) {  // 18..33 elements, input_offset(37)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18, 21, ..., 33
+      for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept by the outer loop
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_multipass_with_qmin) {  // 18..33 elements, qmin(192)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18, 21, ..., 33
+      for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept by the outer loop
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmin(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_multipass_with_qmax) {  // 18..33 elements, qmax(192)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18, 21, ..., 33
+      for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept by the outer loop
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmax(192)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_multipass) {  // element counts above 9 + 8, fewer than 4 channels
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18, 21, ..., 33
+      for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept by the outer loop
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_multipass_with_input_offset) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // loop value (18..33), each larger than the 9+8 tile
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(4)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_multipass_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // loop value (18..33), each larger than the 9+8 tile
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmin(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_multipass_with_qmax) {
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_multipass_with_qmax) {
     TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 4; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // loop value (18..33), each larger than the 9+8 tile
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmax(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmax(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_multipass_with_x_stride) {
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_multipass) {
     TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // loop value (18..33), each larger than the 9+8 tile
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_multipass) {
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_multipass_with_input_offset) {
     TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 1; kc < 4; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // loop value (18..33), each larger than the 9+8 tile
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(11)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_multipass_with_qmin) {
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_multipass_with_qmin) {
     TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 1; kc < 4; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // loop value (18..33), each larger than the 9+8 tile
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmin(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmin(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_multipass_with_qmax) {
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_multipass_with_qmax) {
     TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 1; kc < 4; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 5; channels < 8; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // loop value (18..33), each larger than the 9+8 tile
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmax(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmax(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_multipass_with_x_stride) {
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, few_output_pixels) {
     TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 1; kc < 4; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_multipass) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 5; kc < 8; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_multipass_with_qmin) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 5; kc < 8; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .qmin(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmin(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_multipass_with_qmax) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 5; kc < 8; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .qmax(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmax(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_multipass_with_x_stride) {
-    TEST_REQUIRES_X86_SSE;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 5; kc < 8; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__SSE, small_n) {
-    TEST_REQUIRES_X86_SSE;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
-        for (size_t kc = 1; kc < 51; kc += 5) {
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
           MaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .iterations(3)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
         }
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__SSE, small_n_with_x_stride) {
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, few_output_pixels_with_input_offset) {
     TEST_REQUIRES_X86_SSE;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
-        for (size_t kc = 1; kc < 51; kc += 5) {
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
           MaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .x_stride(101)
-            .iterations(1)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .input_offset(23)
+            .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
         }
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__SSE, small_n_with_y_stride) {
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, few_output_pixels_with_qmin) {
     TEST_REQUIRES_X86_SSE;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
-        for (size_t kc = 1; kc < 51; kc += 5) {
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
           MaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .y_stride(103)
-            .iterations(1)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .qmin(192)
+            .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
         }
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__SSE, small_n_with_s) {
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, few_output_pixels_with_qmax) {
     TEST_REQUIRES_X86_SSE;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3, 5}}) {
-        for (size_t kc = 1; kc < 51; kc += 5) {
-          for (size_t s = 2; s <= ks; s++) {
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          MaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .qmax(192)
+            .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+        }
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, few_output_pixels_with_output_stride) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          MaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .output_stride(23)
+            .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+        }
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, few_output_pixels_with_step) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
+          for (size_t step = 2; step <= pooling_elements; step++) {
             MaxPoolMicrokernelTester()
-              .mr(9)
-              .qr(8)
-              .n(n)
-              .kh(ks)
-              .kw(ks)
-              .kc(kc)
-              .s(s)
-              .iterations(1)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+              .output_pixels(output_pixels)
+              .pooling_elements(pooling_elements)
+              .pooling_tile(9, 8)
+              .step(step)
+              .channels(channels)
+              .output_stride(23)
+              .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
           }
         }
       }
@@ -1217,1768 +907,1329 @@
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
-#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_unipass_fulltile) {
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_unipass_fulltile) {
     TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(4)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(4)
+      .input_offset(7)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(4)
+      .qmin(192)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(4)
+      .qmax(192)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_unipass_subtile) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(4)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_unipass_fulltile_with_qmin) {
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_unipass_subtile_with_input_offset) {
     TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .qmin(192)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(4)
+        .input_offset(7)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_unipass_fulltile_with_qmax) {
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_unipass_fulltile) {
     TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .qmax(192)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_unipass_subtile) {
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_unipass_fulltile_with_input_offset) {
     TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(37)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_unipass_fulltile) {
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_unipass_fulltile_with_qmin) {
     TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_unipass_fulltile_with_qmin) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_unipass_fulltile_with_qmax) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_unipass_fulltile_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_unipass_subtile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_unipass_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_unipass_fulltile_with_qmin) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_unipass_fulltile_with_qmax) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_unipass_fulltile_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_unipass_subtile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kc = 1; kc < 4; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_unipass_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_unipass_fulltile_with_qmin) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_unipass_fulltile_with_qmax) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_unipass_fulltile_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_unipass_subtile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kc = 5; kc < 8; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_twopass_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_twopass_fulltile_with_qmin) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .qmin(192)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_twopass_fulltile_with_qmax) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .qmax(192)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_twopass_subtile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_twopass_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_twopass_fulltile_with_qmin) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_twopass_fulltile_with_qmax) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_twopass_fulltile_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 4; kc < 64; kc += 12) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_twopass_subtile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_twopass_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_twopass_fulltile_with_qmin) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_twopass_fulltile_with_qmax) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_twopass_fulltile_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 1; kc < 4; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_twopass_subtile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-      for (size_t kc = 1; kc < 4; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_twopass_fulltile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_twopass_fulltile_with_qmin) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_twopass_fulltile_with_qmax) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_twopass_fulltile_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 5; kc < 8; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_twopass_subtile) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-      for (size_t kc = 5; kc < 8; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_multipass) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_multipass_with_qmin) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      tester
-        .kh(ks)
-        .kw(1)
+    for (size_t channels = 8; channels < 32; channels += 4) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmin(192)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_unipass_fulltile_with_qmax) {  // Window of 9 elements, equal to the first pooling_tile value.
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28 (multiples of 4)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)  // NOTE(review): presumably lowers the upper output clamp - confirm in MaxPoolMicrokernelTester
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_unipass_subtile) {  // Windows of 2..8 elements, all below the first pooling_tile value (9).
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28 (multiples of 4)
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_unipass_subtile_with_input_offset) {  // Windows of 2..8 elements with a non-zero input_offset.
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28 (multiples of 4)
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(37)  // larger than every channel count tried in this test
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_unipass_fulltile) {  // Window of 9 elements, equal to the first pooling_tile value.
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3 (fewer than 4)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_unipass_fulltile_with_input_offset) {  // Window of 9 elements with a non-zero input_offset.
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3 (fewer than 4)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(5)  // larger than every channel count tried in this test
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_unipass_fulltile_with_qmin) {  // Window of 9 elements with qmin overridden to 192.
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3 (fewer than 4)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmin(192)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_multipass_with_qmax) {
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_unipass_fulltile_with_qmax) {  // Window of 9 elements with qmax overridden to 192.
     TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(4);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      tester
-        .kh(ks)
-        .kw(1)
+    for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3 (fewer than 4)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmax(192)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_unipass_subtile) {  // Windows of 2..8 elements, all below the first pooling_tile value (9).
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3 (fewer than 4)
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_unipass_subtile_with_input_offset) {  // Windows of 2..8 elements with a non-zero input_offset.
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3 (fewer than 4)
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(5)  // larger than every channel count tried in this test
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_unipass_fulltile) {  // Window of 9 elements, equal to the first pooling_tile value.
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7 (4 plus a remainder)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_unipass_fulltile_with_input_offset) {  // Window of 9 elements with a non-zero input_offset.
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7 (4 plus a remainder)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(11)  // larger than every channel count tried in this test
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_unipass_fulltile_with_qmin) {  // Window of 9 elements with qmin overridden to 192.
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7 (4 plus a remainder)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)  // NOTE(review): presumably raises the lower output clamp - confirm in MaxPoolMicrokernelTester
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_unipass_fulltile_with_qmax) {  // Window of 9 elements with qmax overridden to 192.
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7 (4 plus a remainder)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmax(192)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_multipass) {
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_unipass_subtile) {  // Windows of 2..8 elements, all below the first pooling_tile value (9).
     TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7 (4 plus a remainder)
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_multipass_with_qmin) {
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_unipass_subtile_with_input_offset) {  // Windows of 2..8 elements with a non-zero input_offset.
     TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7 (4 plus a remainder)
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(11)  // larger than every channel count tried in this test
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_fulltile) {  // Window of 17 elements, equal to the sum of the pooling_tile values (9 + 8).
+    TEST_REQUIRES_PSIMD;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(4)  // exactly 4 channels
+      .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_fulltile_with_input_offset) {  // Window of 17 elements (9 + 8) with a non-zero input_offset.
+    TEST_REQUIRES_PSIMD;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(4)  // exactly 4 channels
+      .input_offset(7)  // larger than the channel count
+      .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_fulltile_with_qmin) {  // Window of 17 elements (9 + 8) with qmin overridden to 192.
+    TEST_REQUIRES_PSIMD;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(4)  // exactly 4 channels
+      .qmin(192)  // NOTE(review): presumably raises the lower output clamp - confirm in MaxPoolMicrokernelTester
+      .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_fulltile_with_qmax) {  // Window of 17 elements (9 + 8) with qmax overridden to 192.
+    TEST_REQUIRES_PSIMD;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(4)  // exactly 4 channels
+      .qmax(192)  // NOTE(review): presumably lowers the upper output clamp - confirm in MaxPoolMicrokernelTester
+      .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_subtile) {  // Windows of 10..16 elements: above 9, below 9 + 8.
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(4)  // exactly 4 channels
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_subtile_with_input_offset) {  // Windows of 10..16 elements with a non-zero input_offset.
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(4)  // exactly 4 channels
+        .input_offset(7)  // larger than the channel count
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_fulltile) {  // Window of 17 elements, equal to the sum of the pooling_tile values (9 + 8).
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28 (multiples of 4)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_fulltile_with_input_offset) {  // Window of 17 elements (9 + 8) with a non-zero input_offset.
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28 (multiples of 4)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(23)  // NOTE(review): smaller than the largest channel count (28), unlike the sibling subtile test's 37 - confirm intended
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_fulltile_with_qmin) {  // Window of 17 elements (9 + 8) with qmin overridden to 192.
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28 (multiples of 4)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)  // NOTE(review): presumably raises the lower output clamp - confirm in MaxPoolMicrokernelTester
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_fulltile_with_qmax) {  // Window of 17 elements (9 + 8) with qmax overridden to 192.
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28 (multiples of 4)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)  // NOTE(review): presumably lowers the upper output clamp - confirm in MaxPoolMicrokernelTester
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_subtile) {  // Windows of 10..16 elements: above 9, below 9 + 8.
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28 (multiples of 4)
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // pass the loop value so every window size 10..16 is actually exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_subtile_with_input_offset) {  // Windows of 10..16 elements with a non-zero input_offset.
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 8; channels < 32; channels += 4) {  // channels = 8, 12, ..., 28 (multiples of 4)
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // pass the loop value so every window size 10..16 is actually exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(37)  // larger than every channel count tried in this test
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_fulltile) {  // Window of 17 elements, equal to the sum of the pooling_tile values (9 + 8).
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3 (fewer than 4)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_fulltile_with_input_offset) {  // Window of 17 elements (9 + 8) with a non-zero input_offset.
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3 (fewer than 4)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(5)  // larger than every channel count tried in this test
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_fulltile_with_qmin) {  // Window of 17 elements (9 + 8) with qmin overridden to 192.
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3 (fewer than 4)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)  // NOTE(review): presumably raises the lower output clamp - confirm in MaxPoolMicrokernelTester
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_fulltile_with_qmax) {  // Window of 17 elements (9 + 8) with qmax overridden to 192.
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3 (fewer than 4)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)  // NOTE(review): presumably lowers the upper output clamp - confirm in MaxPoolMicrokernelTester
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_subtile) {  // Windows of 10..16 elements: above 9, below 9 + 8.
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3 (fewer than 4)
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // pass the loop value so every window size 10..16 is actually exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_subtile_with_input_offset) {  // Windows of 10..16 elements with a non-zero input_offset.
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3 (fewer than 4)
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // pass the loop value so every window size 10..16 is actually exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(5)  // larger than every channel count tried in this test
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_fulltile) {  // Window of 17 elements, equal to the sum of the pooling_tile values (9 + 8).
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7 (4 plus a remainder)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_fulltile_with_input_offset) {  // Window of 17 elements (9 + 8) with a non-zero input_offset.
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7 (4 plus a remainder)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(11)  // larger than every channel count tried in this test
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_fulltile_with_qmin) {  // Window of 17 elements (9 + 8) with qmin overridden to 192.
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7 (4 plus a remainder)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)  // NOTE(review): presumably raises the lower output clamp - confirm in MaxPoolMicrokernelTester
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_fulltile_with_qmax) {  // Window of 17 elements (9 + 8) with qmax overridden to 192.
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7 (4 plus a remainder)
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)  // NOTE(review): presumably lowers the upper output clamp - confirm in MaxPoolMicrokernelTester
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_subtile) {  // Windows of 10..16 elements: above 9, below 9 + 8.
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7 (4 plus a remainder)
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // pass the loop value so every window size 10..16 is actually exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_subtile_with_input_offset) {  // Windows of 10..16 elements with a non-zero input_offset.
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7 (4 plus a remainder)
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // pass the loop value so every window size 10..16 is actually exercised
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(11)  // larger than every channel count tried in this test
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_multipass) {  // Windows of 18, 21, ..., 33 elements: all above 9 + 8.
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // pass the loop value; a fixed 17 never exceeds the two tiles (9 + 8)
+        .pooling_tile(9, 8)
+        .channels(4)  // exactly 4 channels
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_multipass_with_input_offset) {  // Windows of 18, 21, ..., 33 elements with a non-zero input_offset.
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // pass the loop value; a fixed 17 never exceeds the two tiles (9 + 8)
+        .pooling_tile(9, 8)
+        .channels(4)  // exactly 4 channels
+        .input_offset(7)  // larger than the channel count
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_multipass_with_qmin) {  // Windows of 18, 21, ..., 33 elements with qmin overridden to 192.
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // pass the loop value; a fixed 17 never exceeds the two tiles (9 + 8)
+        .pooling_tile(9, 8)
+        .channels(4)  // exactly 4 channels
+        .qmin(192)  // NOTE(review): presumably raises the lower output clamp - confirm in MaxPoolMicrokernelTester
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_multipass_with_qmax) {  // Windows of 18, 21, ..., 33 elements with qmax overridden to 192.
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // pass the loop value; a fixed 17 never exceeds the two tiles (9 + 8)
+        .pooling_tile(9, 8)
+        .channels(4)  // exactly 4 channels
+        .qmax(192)  // NOTE(review): presumably lowers the upper output clamp - confirm in MaxPoolMicrokernelTester
+        .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_multipass) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18..33: always larger than the 9+8 pooling tile
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_multipass_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18..33: always larger than the 9+8 pooling tile
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_multipass_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18..33: always larger than the 9+8 pooling tile
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmin(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_multipass_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18..33: always larger than the 9+8 pooling tile
+      for (size_t channels = 8; channels < 32; channels += 4) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmax(192)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_multipass) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18..33: always larger than the 9+8 pooling tile
+      for (size_t channels = 1; channels < 4; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_multipass_with_input_offset) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18..33: always larger than the 9+8 pooling tile
+      for (size_t channels = 1; channels < 4; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(4)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_multipass_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18..33: always larger than the 9+8 pooling tile
+      for (size_t channels = 1; channels < 4; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmin(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_multipass_with_qmax) {
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_multipass_with_qmax) {
     TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18..33: always larger than the 9+8 pooling tile
+      for (size_t channels = 1; channels < 4; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmax(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmax(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_multipass_with_x_stride) {
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_multipass) {
     TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 4; kc < 64; kc += 12) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18..33: always larger than the 9+8 pooling tile
+      for (size_t channels = 5; channels < 8; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_multipass) {
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_multipass_with_input_offset) {
     TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 1; kc < 4; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18..33: always larger than the 9+8 pooling tile
+      for (size_t channels = 5; channels < 8; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(11)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_multipass_with_qmin) {
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_multipass_with_qmin) {
     TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 1; kc < 4; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18..33: always larger than the 9+8 pooling tile
+      for (size_t channels = 5; channels < 8; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmin(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmin(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_multipass_with_qmax) {
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_multipass_with_qmax) {
     TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 1; kc < 4; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // 18..33: always larger than the 9+8 pooling tile
+      for (size_t channels = 5; channels < 8; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmax(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmax(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+          .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_multipass_with_x_stride) {
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, few_output_pixels) {
     TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 1; kc < 4; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_multipass) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 5; kc < 8; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_multipass_with_qmin) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 5; kc < 8; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .qmin(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmin(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_multipass_with_qmax) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 5; kc < 8; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .qmax(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmax(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_multipass_with_x_stride) {
-    TEST_REQUIRES_PSIMD;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 5; kc < 8; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-
-  TEST(SMAXPOOL_9P8Q__PSIMD, small_n) {
-    TEST_REQUIRES_PSIMD;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
-        for (size_t kc = 1; kc < 51; kc += 5) {
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 20; channels += 3) {
           MaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .iterations(3)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
         }
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__PSIMD, small_n_with_x_stride) {
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_input_offset) {
     TEST_REQUIRES_PSIMD;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
-        for (size_t kc = 1; kc < 51; kc += 5) {
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {  // below, equal to, and above the first pooling_tile value (9); all under 9+8
+        for (size_t channels = 1; channels <= 20; channels += 3) {  // channels = 1, 4, 7, ..., 19
           MaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .x_stride(101)
-            .iterations(1)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .input_offset(23)  // larger than every channel count in this sweep (max 19)
+            .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
         }
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__PSIMD, small_n_with_y_stride) {
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_qmin) {
     TEST_REQUIRES_PSIMD;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
-        for (size_t kc = 1; kc < 51; kc += 5) {
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {  // below, equal to, and above the first pooling_tile value (9); all under 9+8
+        for (size_t channels = 1; channels <= 20; channels += 3) {  // channels = 1, 4, 7, ..., 19
           MaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .y_stride(103)
-            .iterations(1)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .qmin(192)  // qmax is not overridden here and keeps the tester's default
+            .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
         }
       }
     }
   }
 
-  TEST(SMAXPOOL_9P8Q__PSIMD, small_n_with_s) {
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_qmax) {
     TEST_REQUIRES_PSIMD;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3, 5}}) {
-        for (size_t kc = 1; kc < 51; kc += 5) {
-          for (size_t s = 2; s <= ks; s++) {
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {  // below, equal to, and above the first pooling_tile value (9); all under 9+8
+        for (size_t channels = 1; channels <= 20; channels += 3) {  // channels = 1, 4, 7, ..., 19
+          MaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .qmax(192)  // qmin is not overridden here and keeps the tester's default
+            .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+        }
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_output_stride) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {  // below, equal to, and above the first pooling_tile value (9); all under 9+8
+        for (size_t channels = 1; channels <= 20; channels += 3) {  // channels = 1, 4, 7, ..., 19
+          MaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .output_stride(23)  // larger than every channel count in this sweep (max 19)
+            .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+        }
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_step) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {  // below, equal to, and above the first pooling_tile value (9); all under 9+8
+        for (size_t channels = 1; channels <= 20; channels += 3) {  // channels = 1, 4, 7, ..., 19
+          for (size_t step = 2; step <= pooling_elements; step++) {  // step swept from 2 up to the pooling size itself
             MaxPoolMicrokernelTester()
-              .mr(9)
-              .qr(8)
-              .n(n)
-              .kh(ks)
-              .kw(ks)
-              .kc(kc)
-              .s(s)
-              .iterations(1)
-              .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+              .output_pixels(output_pixels)
+              .pooling_elements(pooling_elements)
+              .pooling_tile(9, 8)
+              .step(step)
+              .channels(channels)
+              .output_stride(23)  // this test also sets an output stride, larger than every channel count swept (max 19)
+              .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
           }
         }
       }
     }
   }
-#endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_unipass_fulltile) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        tester
-          .kh(kh)
-          .kw(kw)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_fulltile) {
+  MaxPoolMicrokernelTester()
+    .pooling_elements(9)  // equals the first pooling_tile value below
+    .pooling_tile(9, 8)
+    .channels(1)
+    .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_input_offset) {
+  MaxPoolMicrokernelTester()
+    .pooling_elements(9)  // equals the first pooling_tile value below
+    .pooling_tile(9, 8)
+    .channels(1)
+    .input_offset(3)  // non-zero offset, larger than the single channel used here
+    .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_qmin) {
+  MaxPoolMicrokernelTester()
+    .pooling_elements(9)  // equals the first pooling_tile value below
+    .pooling_tile(9, 8)
+    .channels(1)
+    .qmin(192)  // qmax is not overridden here and keeps the tester's default
+    .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_qmax) {
+  MaxPoolMicrokernelTester()
+    .pooling_elements(9)  // equals the first pooling_tile value below
+    .pooling_tile(9, 8)
+    .channels(1)
+    .qmax(192)  // qmin is not overridden here and keeps the tester's default
+    .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_subtile) {
+  for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {  // 2..8: strictly fewer than the first pooling_tile value (9)
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(9, 8)
+      .channels(1)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_unipass_fulltile_with_qmin) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        tester
-          .kh(kh)
-          .kw(kw)
-          .qmin(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_subtile_with_input_offset) {
+  for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {  // 2..8: strictly fewer than the first pooling_tile value (9)
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(9, 8)
+      .channels(1)
+      .input_offset(3)  // non-zero offset, larger than the single channel used here
+      .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_unipass_fulltile_with_qmax) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        tester
-          .kh(kh)
-          .kw(kw)
-          .qmax(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_fulltile) {
+  for (size_t channels = 2; channels < 10; channels++) {  // channels = 2..9
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)  // equals the first pooling_tile value below
+      .pooling_tile(9, 8)
+      .channels(channels)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_unipass_subtile) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t ks = 2; ks < tester.mr(); ks++) {
-    tester
-      .kh(ks)
-      .kw(1)
-      .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-    tester
-      .kh(1)
-      .kw(ks)
-      .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_input_offset) {
+  for (size_t channels = 2; channels < 10; channels++) {  // channels = 2..9
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)  // equals the first pooling_tile value below
+      .pooling_tile(9, 8)
+      .channels(channels)
+      .input_offset(3)  // NOTE(review): smaller than most channel counts swept here (2..9) — confirm this is intended
+      .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_fulltile) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        for (size_t kc = 2; kc < 5; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_fulltile_with_qmin) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        for (size_t kc = 2; kc < 5; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .qmin(192)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_fulltile_with_qmax) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        for (size_t kc = 2; kc < 5; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .qmax(192)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_fulltile_with_x_stride) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .iterations(3);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        for (size_t kc = 2; kc < 5; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .x_stride(257)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_subtile) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .iterations(3);
-  for (size_t ks = 2; ks < tester.mr(); ks++) {
-    for (size_t kc = 2; kc < 5; kc++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .kc(kc)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .kc(kc)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_twopass_fulltile) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-      if (kh * kw == tester.mr() + tester.qr()) {
-        tester
-          .kh(kh)
-          .kw(kw)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_twopass_fulltile_with_qmin) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-      if (kh * kw == tester.mr() + tester.qr()) {
-        tester
-          .kh(kh)
-          .kw(kw)
-          .qmin(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_twopass_fulltile_with_qmax) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-      if (kh * kw == tester.mr() + tester.qr()) {
-        tester
-          .kh(kh)
-          .kw(kw)
-          .qmax(192)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_twopass_subtile) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-    tester
-      .kh(ks)
-      .kw(1)
-      .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-    tester
-      .kh(1)
-      .kw(ks)
-      .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_fulltile) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8);
-  for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-      if (kh * kw == tester.mr() + tester.qr()) {
-        for (size_t kc = 2; kc < 5; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_fulltile_with_qmin) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8);
-  for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-      if (kh * kw == tester.mr() + tester.qr()) {
-        for (size_t kc = 2; kc < 5; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .qmin(192)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_fulltile_with_qmax) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8);
-  for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-      if (kh * kw == tester.mr() + tester.qr()) {
-        for (size_t kc = 2; kc < 5; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .qmax(192)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_fulltile_with_x_stride) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .iterations(3);
-  for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-      if (kh * kw == tester.mr() + tester.qr()) {
-        for (size_t kc = 2; kc < 5; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .x_stride(257)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_subtile) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .iterations(3);
-  for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-    for (size_t kc = 2; kc < 5; kc++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .kc(kc)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .kc(kc)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_multipass) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-    tester
-      .kh(ks)
-      .kw(1)
-      .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-    tester
-      .kh(1)
-      .kw(ks)
-      .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_multipass_with_qmin) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-    tester
-      .kh(ks)
-      .kw(1)
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_qmin) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(channels)
       .qmin(192)
-      .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-    tester
-      .kh(1)
-      .kw(ks)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_qmax) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(channels)
+      .qmax(192)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_subtile) {
+  for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_subtile_with_input_offset) {
+  for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(3)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile) {
+  MaxPoolMicrokernelTester()
+    .pooling_elements(17)
+    .pooling_tile(9, 8)
+    .channels(1)
+    .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile_with_input_offset) {
+  MaxPoolMicrokernelTester()
+    .pooling_elements(17)
+    .pooling_tile(9, 8)
+    .channels(1)
+    .input_offset(3)
+    .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile_with_qmin) {
+  MaxPoolMicrokernelTester()
+    .pooling_elements(17)
+    .pooling_tile(9, 8)
+    .channels(1)
+    .qmin(192)
+    .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile_with_qmax) {
+  MaxPoolMicrokernelTester()
+    .pooling_elements(17)
+    .pooling_tile(9, 8)
+    .channels(1)
+    .qmax(192)
+    .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_subtile) {
+  for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(9, 8)
+      .channels(1)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_subtile_with_input_offset) {
+  for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(9, 8)
+      .channels(1)
+      .input_offset(3)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(channels)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile_with_input_offset) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(channels)
+      .input_offset(3)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile_with_qmin) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(channels)
       .qmin(192)
-      .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_multipass_with_qmax) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-    tester
-      .kh(ks)
-      .kw(1)
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile_with_qmax) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(channels)
       .qmax(192)
-      .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-    tester
-      .kh(1)
-      .kw(ks)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_subtile) {
+  for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // above the primary tile (9), below primary + incremental (17)
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_subtile_with_input_offset) {
+  for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // above the primary tile (9), below primary + incremental (17)
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(3)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // more than primary (9) + one incremental (8) tile
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(9, 8)
+      .channels(1)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass_with_input_offset) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // more than primary (9) + one incremental (8) tile
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(9, 8)
+      .channels(1)
+      .input_offset(3)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass_with_qmin) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // more than primary (9) + one incremental (8) tile
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(9, 8)
+      .channels(1)
+      .qmin(192)
+      .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass_with_qmax) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // more than primary (9) + one incremental (8) tile
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(9, 8)
+      .channels(1)
       .qmax(192)
-      .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_multipass) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8);
-  for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-    for (size_t kc = 2; kc < 5; kc++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .kc(kc)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .kc(kc)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // more than primary (9) + one incremental (8) tile
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
 }
 
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_multipass_with_qmin) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8);
-  for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-    for (size_t kc = 2; kc < 5; kc++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .kc(kc)
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass_with_input_offset) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // more than primary (9) + one incremental (8) tile
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(3)
+        .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass_with_qmin) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // more than primary (9) + one incremental (8) tile
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmin(192)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .kc(kc)
-        .qmin(192)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+        .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
 }
 
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_multipass_with_qmax) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8);
-  for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-    for (size_t kc = 2; kc < 5; kc++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .kc(kc)
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass_with_qmax) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // more than primary (9) + one incremental (8) tile
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmax(192)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .kc(kc)
-        .qmax(192)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+        .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
 }
 
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_multipass_with_x_stride) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .iterations(3);
-  for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-    for (size_t kc = 2; kc < 5; kc++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .kc(kc)
-        .x_stride(257)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .kc(kc)
-        .x_stride(257)
-        .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, small_n) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
-      for (size_t kc = 1; kc < 5; kc++) {
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
         MaxPoolMicrokernelTester()
-          .mr(9)
-          .qr(8)
-          .n(n)
-          .kh(ks)
-          .kw(ks)
-          .kc(kc)
-          .iterations(3)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
       }
     }
   }
 }
 
-TEST(SMAXPOOL_9P8Q__SCALAR, small_n_with_x_stride) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
-      for (size_t kc = 1; kc < 5; kc++) {
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_input_offset) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
         MaxPoolMicrokernelTester()
-          .mr(9)
-          .qr(8)
-          .n(n)
-          .kh(ks)
-          .kw(ks)
-          .kc(kc)
-          .x_stride(101)
-          .iterations(1)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(7)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
       }
     }
   }
 }
 
-TEST(SMAXPOOL_9P8Q__SCALAR, small_n_with_y_stride) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
-      for (size_t kc = 1; kc < 5; kc++) {
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_qmin) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
         MaxPoolMicrokernelTester()
-          .mr(9)
-          .qr(8)
-          .n(n)
-          .kh(ks)
-          .kw(ks)
-          .kc(kc)
-          .y_stride(103)
-          .iterations(1)
-          .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmin(192)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
       }
     }
   }
 }
 
-TEST(SMAXPOOL_9P8Q__SCALAR, small_n_with_s) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t ks : std::vector<size_t>{{2, 3, 5}}) {
-      for (size_t kc = 1; kc < 5; kc++) {
-        for (size_t s = 2; s <= ks; s++) {
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_qmax) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        MaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmax(192)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_output_stride) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        MaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .output_stride(7)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_step) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        for (size_t step = 2; step <= pooling_elements; step++) {
           MaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .s(s)
-            .iterations(1)
-            .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .step(step)
+            .channels(channels)
+            .output_stride(7)
+            .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
         }
       }
     }
   }
-}
+}
\ No newline at end of file
diff --git a/test/f32-maxpool.yaml b/test/f32-maxpool.yaml
new file mode 100644
index 0000000..e6db8b5
--- /dev/null
+++ b/test/f32-maxpool.yaml
@@ -0,0 +1,7 @@
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+- name: xnn_f32_maxpool_ukernel_9p8x__sse_c4
+- name: xnn_f32_maxpool_ukernel_9p8x__psimd_c4
+- name: xnn_f32_maxpool_ukernel_9p8x__scalar_c1
diff --git a/test/maxpool-microkernel-tester.h b/test/maxpool-microkernel-tester.h
index 94132ed..68d57d5 100644
--- a/test/maxpool-microkernel-tester.h
+++ b/test/maxpool-microkernel-tester.h
@@ -30,115 +30,103 @@
     Scalar,
   };
 
-  inline MaxPoolMicrokernelTester& n(size_t n) {
-    assert(n != 0);
-    this->n_ = n;
+  inline MaxPoolMicrokernelTester& output_pixels(size_t output_pixels) {
+    assert(output_pixels != 0);
+    this->output_pixels_ = output_pixels;
     return *this;
   }
 
-  inline size_t n() const {
-    return this->n_;
+  inline size_t output_pixels() const {
+    return this->output_pixels_;
   }
 
-  inline MaxPoolMicrokernelTester& s(size_t s) {
-    assert(s != 0);
-    this->s_ = s;
+  inline MaxPoolMicrokernelTester& step(size_t step) {
+    assert(step != 0);
+    this->step_ = step;
     return *this;
   }
 
-  inline size_t s() const {
-    return this->s_;
+  inline size_t step() const {
+    return this->step_;
   }
 
-  inline MaxPoolMicrokernelTester& kh(size_t kh) {
-    assert(kh != 0);
-    this->kh_ = kh;
+  inline MaxPoolMicrokernelTester& input_offset(size_t input_offset) {
+    assert(input_offset != 0);
+    this->input_offset_ = input_offset;
     return *this;
   }
 
-  inline size_t kh() const {
-    return this->kh_;
+  inline size_t input_offset() const {
+    return this->input_offset_;
   }
 
-  inline MaxPoolMicrokernelTester& kw(size_t kw) {
-    assert(kw != 0);
-    this->kw_ = kw;
+  inline MaxPoolMicrokernelTester& pooling_elements(size_t pooling_elements) {
+    assert(pooling_elements != 0);
+    this->pooling_elements_ = pooling_elements;
     return *this;
   }
 
-  inline size_t kw() const {
-    return this->kw_;
+  inline size_t pooling_elements() const {
+    return this->pooling_elements_;
   }
 
-  inline size_t ks() const {
-    return kh() * kw();
-  }
-
-  inline size_t packed_ks() const {
-    if (ks() <= mr()) {
-      return mr();
+  inline size_t packed_pooling_elements() const {
+    if (pooling_elements() <= primary_pooling_tile()) {
+      return primary_pooling_tile();
     } else {
-      return (ks() - mr()) % qr() == 0 ? ks() : ((ks() - mr()) / qr() + 1) * qr() + mr();
+      return (pooling_elements() - primary_pooling_tile()) % incremental_pooling_tile() == 0 ? pooling_elements() : ((pooling_elements() - primary_pooling_tile()) / incremental_pooling_tile() + 1) * incremental_pooling_tile() + primary_pooling_tile();
     }
   }
 
-  inline MaxPoolMicrokernelTester& mr(size_t mr) {
-    assert(mr != 0);
-    this->mr_ = mr;
+  inline MaxPoolMicrokernelTester& pooling_tile(size_t primary_tile, size_t incremental_tile) {  // sets both pooling tile sizes in one call
+    assert(primary_tile != 0 && incremental_tile != 0);  // packed_pooling_elements() divides by the incremental tile
+    this->primary_pooling_tile_ = primary_tile;
+    this->incremental_pooling_tile_ = incremental_tile;
     return *this;
   }
 
-  inline size_t mr() const {
-    return this->mr_;
-  }
-
-  inline MaxPoolMicrokernelTester& qr(size_t qr) {
-    assert(qr != 0);
-    this->qr_ = qr;
+  inline MaxPoolMicrokernelTester& primary_pooling_tile(size_t primary_pooling_tile) {
+    assert(primary_pooling_tile != 0);
+    this->primary_pooling_tile_ = primary_pooling_tile;
     return *this;
   }
 
-  inline size_t qr() const {
-    return this->qr_;
+  inline size_t primary_pooling_tile() const {
+    return this->primary_pooling_tile_;
   }
 
-  inline MaxPoolMicrokernelTester& kc(size_t kc) {
-    assert(kc != 0);
-    this->kc_ = kc;
+  inline MaxPoolMicrokernelTester& incremental_pooling_tile(size_t incremental_pooling_tile) {
+    assert(incremental_pooling_tile != 0);
+    this->incremental_pooling_tile_ = incremental_pooling_tile;
     return *this;
   }
 
-  inline size_t kc() const {
-    return this->kc_;
+  inline size_t incremental_pooling_tile() const {
+    return this->incremental_pooling_tile_;
   }
 
-  inline MaxPoolMicrokernelTester& x_stride(size_t x_stride) {
-    assert(x_stride != 0);
-    this->x_stride_ = x_stride;
+  inline MaxPoolMicrokernelTester& channels(size_t channels) {
+    assert(channels != 0);
+    this->channels_ = channels;
     return *this;
   }
 
-  inline size_t x_stride() const {
-    if (this->x_stride_ == 0) {
-      return kc();
+  inline size_t channels() const {
+    return this->channels_;
+  }
+
+  inline MaxPoolMicrokernelTester& output_stride(size_t output_stride) {
+    assert(output_stride != 0);
+    this->output_stride_ = output_stride;
+    return *this;
+  }
+
+  inline size_t output_stride() const {
+    if (this->output_stride_ == 0) {
+      return channels();
     } else {
-      assert(this->x_stride_ >= kc());
-      return this->x_stride_;
-    }
-  }
-
-  inline MaxPoolMicrokernelTester& y_stride(size_t y_stride) {
-    assert(y_stride != 0);
-    this->y_stride_ = y_stride;
-    return *this;
-  }
-
-  inline size_t y_stride() const {
-    if (this->y_stride_ == 0) {
-      return kc();
-    } else {
-      assert(this->y_stride_ >= kc());
-      return this->y_stride_;
+      assert(this->output_stride_ >= channels());
+      return this->output_stride_;
     }
   }
 
@@ -174,19 +162,23 @@
     auto rng = std::mt19937(random_device());
     auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng);
 
-    std::vector<const uint8_t*> indirect_x(packed_ks() + (n() * s() - 1) * kh());
-    std::vector<uint8_t> x((indirect_x.size() - 1) * x_stride() + kc() + XNN_EXTRA_BYTES / sizeof(uint8_t));
-
-    std::vector<uint8_t> y((n() - 1) * y_stride() + kc() + XNN_EXTRA_BYTES / sizeof(uint8_t));
-    std::vector<uint8_t> y_ref(n() * kc());
+    std::vector<const uint8_t*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
+    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
+      indirect_input.size() * channels());
+    std::vector<uint8_t> output(XNN_EXTRA_BYTES / sizeof(uint8_t) +
+      (output_pixels() - 1) * output_stride() + channels());
+    std::vector<uint8_t> output_ref(output_pixels() * channels());
     for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      std::generate(x.begin(), x.end(), std::ref(u8rng));
-      std::fill(y.begin(), y.end(), 0xA5);
+      do {
+        std::generate(input.begin(), input.end(), std::ref(u8rng));
+      } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend()));
+      std::fill(output.begin(), output.end(), 0xA5);
 
-      for (size_t i = 0; i < indirect_x.size(); i++) {
-        indirect_x[i] = x.data() + i * x_stride();
+      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
+        indirect_input[i] = input.data() + i * channels() - input_offset();
       }
-      std::shuffle(indirect_x.begin(), indirect_x.end(), rng);
+      std::shuffle(indirect_input.begin(),
+        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
 
       // Prepare output parameters.
       xnn_u8_output_params output_params = { };
@@ -200,32 +192,40 @@
       }
 
       // Compute reference results.
-      for (size_t i = 0; i < n(); i++) {
-        for (size_t k = 0; k < kc(); k++) {
+      for (size_t x = 0; x < output_pixels(); x++) {
+        for (size_t c = 0; c < channels(); c++) {
           uint8_t max_value = 0;
-          for (size_t j = 0; j < ks(); j++) {
-            max_value = std::max(max_value,
-              indirect_x[i * s() * kh() + j][k]);
+          for (size_t p = 0; p < pooling_elements(); p++) {
+            max_value = std::max(max_value, indirect_input[x * step() + p][c + input_offset()]);
           }
           max_value = std::min(max_value, qmax());
           max_value = std::max(max_value, qmin());
-          y_ref[i * kc() + k] = max_value;
+          output_ref[x * channels() + c] = max_value;
         }
       }
 
       // Call optimized micro-kernel.
-      maxpool(n(), ks(), kc(),
-        indirect_x.data(), y.data(),
-        (kh() * s() - packed_ks()) * sizeof(void*),
-        (y_stride() - kc()) * sizeof(uint8_t),
+      maxpool(output_pixels(), pooling_elements(), channels(),
+        indirect_input.data(), input_offset() * sizeof(uint8_t), output.data(),
+        (step() - packed_pooling_elements()) * sizeof(void*),
+        (output_stride() - channels()) * sizeof(uint8_t),
         &output_params);
 
       // Verify results.
-      for (size_t i = 0; i < n(); i++) {
-        for (size_t k = 0; k < kc(); k++) {
-          ASSERT_EQ(uint32_t(y_ref[i * kc() + k]), uint32_t(y[i * y_stride() + k]))
-            << "at pixel " << i << ", channel " << k << ", n = " << n()
-            << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
+      for (size_t x = 0; x < output_pixels(); x++) {
+        for (size_t c = 0; c < channels(); c++) {
+          ASSERT_GE(uint32_t(output[x * output_stride() + c]), uint32_t(qmin()))
+            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+            << ", pooling elements = " << pooling_elements() << ", step = " << step()
+            << ", input offset = " << input_offset();
+          ASSERT_LE(uint32_t(output[x * output_stride() + c]), uint32_t(qmax()))
+            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+            << ", pooling elements = " << pooling_elements() << ", step = " << step()
+            << ", input offset = " << input_offset();
+          ASSERT_EQ(uint32_t(output_ref[x * channels() + c]), uint32_t(output[x * output_stride() + c]))
+            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+            << ", pooling elements = " << pooling_elements() << ", step = " << step()
+            << ", input offset = " << input_offset();
         }
       }
     }
@@ -236,87 +236,94 @@
     auto rng = std::mt19937(random_device());
     auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
 
-    std::vector<const float*> indirect_x(packed_ks() + (n() * s() - 1) * kh());
-    std::vector<float> x((indirect_x.size() - 1) * x_stride() + kc() + XNN_EXTRA_BYTES / sizeof(float));
-
-    std::vector<float> y((n() - 1) * y_stride() + kc() + XNN_EXTRA_BYTES / sizeof(float));
-    std::vector<float> y_ref(n() * kc());
+    std::vector<const float*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
+    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
+      ((output_pixels() - 1) * step() + pooling_elements()) * channels());
+    std::vector<float> output(XNN_EXTRA_BYTES / sizeof(float) +
+      (output_pixels() - 1) * output_stride() + channels());
+    std::vector<float> output_ref(output_pixels() * channels());
     for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      std::generate(x.begin(), x.end(), std::ref(f32rng));
-      std::fill(y.begin(), y.end(), nanf(""));
+      std::generate(input.begin(), input.end(), std::ref(f32rng));
+      std::fill(output.begin(), output.end(), nanf(""));
 
-      for (size_t i = 0; i < indirect_x.size(); i++) {
-        indirect_x[i] = x.data() + i * x_stride();
+      for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
+        indirect_input[i] = input.data() + i * channels() - input_offset();
       }
-      std::shuffle(indirect_x.begin(), indirect_x.end(), rng);
+      std::shuffle(indirect_input.begin(),
+        indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
 
       // Compute reference results, without clamping.
-      for (size_t i = 0; i < n(); i++) {
-        for (size_t k = 0; k < kc(); k++) {
+      for (size_t x = 0; x < output_pixels(); x++) {
+        for (size_t c = 0; c < channels(); c++) {
           float max_value = -std::numeric_limits<float>::infinity();
-          for (size_t j = 0; j < ks(); j++) {
-            max_value = std::max(max_value,
-              indirect_x[i * s() * kh() + j][k]);
+          for (size_t p = 0; p < pooling_elements(); p++) {
+            max_value = std::max(max_value, indirect_input[x * step() + p][c + input_offset()]);
           }
-          y_ref[i * kc() + k] = max_value;
+          output_ref[x * channels() + c] = max_value;
         }
       }
 
       // Compute clamping parameters.
-      const float accumulated_min = *std::min_element(y_ref.cbegin(), y_ref.cend());
-      const float accumulated_max = *std::max_element(y_ref.cbegin(), y_ref.cend());
+      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
+      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
       const float accumulated_range = accumulated_max - accumulated_min;
-      const float y_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
-      const float y_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
+      const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
+      const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
 
 
       // Prepare output parameters.
       xnn_f32_output_params output_params = { };
       switch (variant) {
         case Variant::Native:
-          output_params = xnn_init_f32_output_params(y_min, y_max);
+          output_params = xnn_init_f32_output_params(output_min, output_max);
           break;
         case Variant::Scalar:
-          output_params = xnn_init_scalar_f32_output_params(y_min, y_max);
+          output_params = xnn_init_scalar_f32_output_params(output_min, output_max);
           break;
       }
 
       // Clamp reference results.
-      for (size_t i = 0; i < n(); i++) {
-        for (size_t k = 0; k < kc(); k++) {
-          y_ref[i * kc() + k] = std::max(std::min(y_ref[i * kc() + k], y_max), y_min);
-        }
+      for (float& output_value : output_ref) {
+        output_value = std::max(std::min(output_value, output_max), output_min);
       }
 
       // Call optimized micro-kernel.
-      maxpool(n(), ks(), kc(),
-        indirect_x.data(), y.data(),
-        (kh() * s() - packed_ks()) * sizeof(void*),
-        (y_stride() - kc()) * sizeof(float),
+      maxpool(output_pixels(), pooling_elements(), channels(),
+        indirect_input.data(), input_offset() * sizeof(float), output.data(),
+        (step() - packed_pooling_elements()) * sizeof(void*),
+        (output_stride() - channels()) * sizeof(float),
         &output_params);
 
       // Verify results.
-      for (size_t i = 0; i < n(); i++) {
-        for (size_t k = 0; k < kc(); k++) {
-          ASSERT_EQ(y_ref[i * kc() + k], y[i * y_stride() + k])
-            << "at pixel " << i << ", channel " << k << ", n = " << n()
-            << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
+      for (size_t x = 0; x < output_pixels(); x++) {
+        for (size_t c = 0; c < channels(); c++) {
+          ASSERT_GE(output[x * output_stride() + c], output_min)
+            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+            << ", pooling elements = " << pooling_elements() << ", step = " << step()
+            << ", input offset = " << input_offset();
+          ASSERT_LE(output[x * output_stride() + c], output_max)
+            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+            << ", pooling elements = " << pooling_elements() << ", step = " << step()
+            << ", input offset = " << input_offset();
+          ASSERT_EQ(output_ref[x * channels() + c], output[x * output_stride() + c])
+            << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+            << ", pooling elements = " << pooling_elements() << ", step = " << step()
+            << ", input offset = " << input_offset();
         }
       }
     }
   }
 
  private:
-  size_t n_{1};
-  size_t s_{1};
-  size_t kh_{1};
-  size_t kw_{1};
-  size_t mr_{1};
-  size_t qr_{1};
-  size_t kc_{1};
-  size_t x_stride_{0};
-  size_t y_stride_{0};
+  size_t output_pixels_{1};
+  size_t pooling_elements_{1};
+  size_t channels_{1};
+  size_t input_offset_{0};
+  size_t step_{1};
+  size_t primary_pooling_tile_{1};
+  size_t incremental_pooling_tile_{1};
+  size_t output_stride_{0};
   uint8_t qmin_{0};
   uint8_t qmax_{255};
-  size_t iterations_{15};
+  size_t iterations_{3};
 };
diff --git a/test/u8-maxpool.cc b/test/u8-maxpool.cc
index b5832dc..a089944 100644
--- a/test/u8-maxpool.cc
+++ b/test/u8-maxpool.cc
@@ -5,6 +5,11 @@
 //
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+//   Specification: test/u8-maxpool.yaml
+//   Generator: tools/generate-maxpool-test.py
+
 
 #include <gtest/gtest.h>
 
@@ -16,1202 +21,884 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_unipass_fulltile) {
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_unipass_fulltile) {
     TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        }
-      }
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(16)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(16)
+      .input_offset(19)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(16)
+      .qmin(192)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(16)
+      .qmax(192)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_unipass_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(16)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_unipass_fulltile_with_qmin) {
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_unipass_subtile_with_input_offset) {
     TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .qmin(192)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        }
-      }
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(16)
+        .input_offset(19)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_unipass_fulltile_with_qmax) {
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_unipass_fulltile) {
     TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .qmax(192)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        }
-      }
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_unipass_subtile) {
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_unipass_fulltile_with_input_offset) {
     TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-      tester
-        .kh(1)
-        .kw(ks)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(131)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_unipass_fulltile) {
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_unipass_fulltile_with_qmin) {
     TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 16; kc < 256; kc += 48) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_unipass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 16; kc < 256; kc += 48) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_unipass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 16; kc < 256; kc += 48) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_unipass_fulltile_with_x_stride) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 16; kc < 256; kc += 48) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_unipass_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kc = 16; kc < 256; kc += 48) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_unipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_unipass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_unipass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_unipass_fulltile_with_x_stride) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_unipass_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kc = 1; kc < 16; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_unipass_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 17; kc < 32; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_unipass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 17; kc < 32; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_unipass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 17; kc < 32; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_unipass_fulltile_with_x_stride) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 17; kc < 32; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_unipass_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kc = 17; kc < 32; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_twopass_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_twopass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .qmin(192)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_twopass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .qmax(192)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_twopass_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-      tester
-        .kh(1)
-        .kw(ks)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_twopass_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 16; kc < 256; kc += 48) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_twopass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 16; kc < 256; kc += 48) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_twopass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 16; kc < 256; kc += 48) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_twopass_fulltile_with_x_stride) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 16; kc < 256; kc += 48) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_twopass_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-      for (size_t kc = 16; kc < 256; kc += 48) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_twopass_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 1; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_twopass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 1; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_twopass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 1; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_twopass_fulltile_with_x_stride) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 1; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_twopass_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-      for (size_t kc = 1; kc < 16; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_twopass_fulltile) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 17; kc < 32; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_twopass_fulltile_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 17; kc < 32; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_twopass_fulltile_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 17; kc < 32; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_twopass_fulltile_with_x_stride) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 17; kc < 32; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_twopass_subtile) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-      for (size_t kc = 17; kc < 32; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_multipass) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-      tester
-        .kh(1)
-        .kw(ks)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_multipass_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      tester
-        .kh(ks)
-        .kw(1)
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmin(192)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-      tester
-        .kh(1)
-        .kw(ks)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_unipass_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 32; channels < 128; channels += 16) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 32; channels < 128; channels += 16) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(131)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_unipass_fulltile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels < 16; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels < 16; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(17)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels < 16; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmin(192)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_multipass_with_qmax) {
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_unipass_fulltile_with_qmax) {
     TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      tester
-        .kh(ks)
-        .kw(1)
+    for (size_t channels = 1; channels < 16; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmax(192)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-      tester
-        .kh(1)
-        .kw(ks)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_unipass_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 1; channels < 16; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 1; channels < 16; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(17)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_unipass_fulltile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 17; channels < 32; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 17; channels < 32; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(37)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 17; channels < 32; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 17; channels < 32; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmax(192)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_multipass) {
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_unipass_subtile) {
     TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 16; kc < 256; kc += 48) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 17; channels < 32; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_multipass_with_qmin) {
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_unipass_subtile_with_input_offset) {
     TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 16; kc < 256; kc += 48) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 17; channels < 32; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_twopass_fulltile) {
+    TEST_REQUIRES_ARM_NEON;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(16)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_twopass_fulltile_with_input_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(16)
+      .input_offset(19)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_twopass_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(16)
+      .qmin(192)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_twopass_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(16)
+      .qmax(192)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_twopass_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(16)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_twopass_subtile_with_input_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(16)
+        .input_offset(19)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_twopass_fulltile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_twopass_fulltile_with_input_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(83)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_twopass_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_twopass_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_twopass_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 32; channels < 128; channels += 16) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept size; a fixed 17 only repeats the full-tile case
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_twopass_subtile_with_input_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 32; channels < 128; channels += 16) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept size; a fixed 17 only repeats the full-tile case
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(131)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_twopass_fulltile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels < 16; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_twopass_fulltile_with_input_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels < 16; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(17)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_twopass_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels < 16; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_twopass_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels < 16; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_twopass_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 1; channels < 16; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept size; a fixed 17 only repeats the full-tile case
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_twopass_subtile_with_input_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 1; channels < 16; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept size; a fixed 17 only repeats the full-tile case
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(17)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_twopass_fulltile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 17; channels < 32; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_twopass_fulltile_with_input_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 17; channels < 32; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(37)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_twopass_fulltile_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 17; channels < 32; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_twopass_fulltile_with_qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 17; channels < 32; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_twopass_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 17; channels < 32; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept size; a fixed 17 only repeats the full-tile case
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_twopass_subtile_with_input_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 17; channels < 32; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept size; a fixed 17 only repeats the full-tile case
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_multipass) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // swept size; a fixed 17 is outside the 18..33 sweep
+        .pooling_tile(9, 8)
+        .channels(16)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_multipass_with_input_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // swept size; a fixed 17 is outside the 18..33 sweep
+        .pooling_tile(9, 8)
+        .channels(16)
+        .input_offset(19)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_multipass_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // swept size; a fixed 17 is outside the 18..33 sweep
+        .pooling_tile(9, 8)
+        .channels(16)
+        .qmin(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_multipass_with_qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // swept size; a fixed 17 is outside the 18..33 sweep
+        .pooling_tile(9, 8)
+        .channels(16)
+        .qmax(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_multipass) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 32; channels < 128; channels += 16) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept size; a fixed 17 is outside the 18..33 sweep
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_multipass_with_input_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 32; channels < 128; channels += 16) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept size; a fixed 17 is outside the 18..33 sweep
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(131)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_multipass_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 32; channels < 128; channels += 16) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept size; a fixed 17 is outside the 18..33 sweep
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmin(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_multipass_with_qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 32; channels < 128; channels += 16) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept size; a fixed 17 is outside the 18..33 sweep
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmax(192)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_multipass) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 16; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept size; a fixed 17 is outside the 18..33 sweep
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_multipass_with_input_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 16; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept size; a fixed 17 is outside the 18..33 sweep
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(16)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_multipass_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 16; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept size; a fixed 17 is outside the 18..33 sweep
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmin(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_multipass_with_qmax) {
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_multipass_with_qmax) {
     TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 16; kc < 256; kc += 48) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 16; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept size; a fixed 17 is outside the 18..33 sweep
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmax(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmax(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_multipass_with_x_stride) {
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_multipass) {
     TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 16; kc < 256; kc += 48) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 17; channels < 32; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept size; a fixed 17 is outside the 18..33 sweep
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_multipass) {
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_multipass_with_input_offset) {
     TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 1; kc < 16; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 17; channels < 32; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept size; a fixed 17 is outside the 18..33 sweep
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_multipass_with_qmin) {
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_multipass_with_qmin) {
     TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 1; kc < 16; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 17; channels < 32; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept size; a fixed 17 is outside the 18..33 sweep
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmin(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmin(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_multipass_with_qmax) {
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_multipass_with_qmax) {
     TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 1; kc < 16; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 17; channels < 32; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // swept size; a fixed 17 is outside the 18..33 sweep
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmax(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmax(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+          .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_multipass_with_x_stride) {
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, few_output_pixels) {
     TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 1; kc < 16; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_multipass) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 17; kc < 32; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_multipass_with_qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 17; kc < 32; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .qmin(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmin(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_multipass_with_qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 17; kc < 32; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .qmax(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmax(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_multipass_with_x_stride) {
-    TEST_REQUIRES_ARM_NEON;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 17; kc < 32; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__NEON, small_n) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
-        for (size_t kc = 1; kc < 51; kc += 5) {
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 80; channels += 15) {
           MaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .iterations(3)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
         }
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__NEON, small_n_with_x_stride) {
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, few_output_pixels_with_input_offset) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
-        for (size_t kc = 1; kc < 51; kc += 5) {
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 80; channels += 15) {
           MaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .x_stride(101)
-            .iterations(1)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .input_offset(83)
+            .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
         }
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__NEON, small_n_with_y_stride) {
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, few_output_pixels_with_qmin) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
-        for (size_t kc = 1; kc < 51; kc += 5) {
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 80; channels += 15) {
           MaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .y_stride(103)
-            .iterations(1)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .qmin(192)
+            .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
         }
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__NEON, small_n_with_s) {
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, few_output_pixels_with_qmax) {
     TEST_REQUIRES_ARM_NEON;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3, 5}}) {
-        for (size_t kc = 1; kc < 51; kc += 5) {
-          for (size_t s = 2; s <= ks; s++) {
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 80; channels += 15) {
+          MaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .qmax(192)
+            .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+        }
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, few_output_pixels_with_output_stride) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 80; channels += 15) {
+          MaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .output_stride(83)
+            .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+        }
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__NEON_C16, few_output_pixels_with_step) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 80; channels += 15) {
+          for (size_t step = 2; step <= pooling_elements; step++) {
             MaxPoolMicrokernelTester()
-              .mr(9)
-              .qr(8)
-              .n(n)
-              .kh(ks)
-              .kw(ks)
-              .kc(kc)
-              .s(s)
-              .iterations(1)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+              .output_pixels(output_pixels)
+              .pooling_elements(pooling_elements)
+              .pooling_tile(9, 8)
+              .step(step)
+              .channels(channels)
+              .output_stride(83)
+              .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
           }
         }
       }
@@ -1219,1203 +906,886 @@
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
+
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_unipass_fulltile) {
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_unipass_fulltile) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        }
-      }
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(16)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(16)
+      .input_offset(19)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(16)
+      .qmin(192)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(16)
+      .qmax(192)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_unipass_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(16)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_unipass_fulltile_with_qmin) {
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_unipass_subtile_with_input_offset) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .qmin(192)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        }
-      }
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(16)
+        .input_offset(19)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_unipass_fulltile_with_qmax) {
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_unipass_fulltile) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .qmax(192)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        }
-      }
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_unipass_subtile) {
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_unipass_fulltile_with_input_offset) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-      tester
-        .kh(1)
-        .kw(ks)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(131)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_unipass_fulltile) {
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_unipass_fulltile_with_qmin) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 16; kc < 256; kc += 48) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_unipass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 16; kc < 256; kc += 48) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_unipass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 16; kc < 256; kc += 48) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_unipass_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 16; kc < 256; kc += 48) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_unipass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kc = 16; kc < 256; kc += 48) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_unipass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_unipass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_unipass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_unipass_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 1; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_unipass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kc = 1; kc < 16; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_unipass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 17; kc < 32; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_unipass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 17; kc < 32; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_unipass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 17; kc < 32; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_unipass_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr(); kw++) {
-        if (kh * kw == tester.mr()) {
-          for (size_t kc = 17; kc < 32; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_unipass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = 2; ks < tester.mr(); ks++) {
-      for (size_t kc = 17; kc < 32; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_twopass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_twopass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .qmin(192)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_twopass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .qmax(192)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_twopass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-      tester
-        .kh(1)
-        .kw(ks)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_twopass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 16; kc < 256; kc += 48) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_twopass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 16; kc < 256; kc += 48) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_twopass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 16; kc < 256; kc += 48) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_twopass_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 16; kc < 256; kc += 48) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_twopass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-      for (size_t kc = 16; kc < 256; kc += 48) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_twopass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 1; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_twopass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 1; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_twopass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 1; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_twopass_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 1; kc < 16; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_twopass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-      for (size_t kc = 1; kc < 16; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_twopass_fulltile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 17; kc < 32; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_twopass_fulltile_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 17; kc < 32; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmin(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_twopass_fulltile_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 17; kc < 32; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .qmax(192)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_twopass_fulltile_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-      for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-        if (kh * kw == tester.mr() + tester.qr()) {
-          for (size_t kc = 17; kc < 32; kc++) {
-            tester
-              .kh(kh)
-              .kw(kw)
-              .kc(kc)
-              .x_stride(257)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-          }
-        }
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_twopass_subtile) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-      for (size_t kc = 17; kc < 32; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_multipass) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-      tester
-        .kh(1)
-        .kw(ks)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_multipass_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      tester
-        .kh(ks)
-        .kw(1)
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmin(192)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-      tester
-        .kh(1)
-        .kw(ks)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_unipass_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 32; channels < 128; channels += 16) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 32; channels < 128; channels += 16) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(131)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_unipass_fulltile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 16; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 16; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(17)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 16; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmin(192)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_multipass_with_qmax) {
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_unipass_fulltile_with_qmax) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .kc(16);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      tester
-        .kh(ks)
-        .kw(1)
+    for (size_t channels = 1; channels < 16; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmax(192)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-      tester
-        .kh(1)
-        .kw(ks)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_unipass_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 1; channels < 16; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_unipass_subtile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 1; channels < 16; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(17)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_unipass_fulltile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 17; channels < 32; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_unipass_fulltile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 17; channels < 32; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(37)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_unipass_fulltile_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 17; channels < 32; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_unipass_fulltile_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 17; channels < 32; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(9)
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmax(192)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_multipass) {
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_unipass_subtile) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 16; kc < 256; kc += 48) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 17; channels < 32; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_multipass_with_qmin) {
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_unipass_subtile_with_input_offset) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 16; kc < 256; kc += 48) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
+    for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+      for (size_t channels = 17; channels < 32; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_twopass_fulltile) {
+    TEST_REQUIRES_X86_SSE2;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(16)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_twopass_fulltile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(16)
+      .input_offset(19)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_twopass_fulltile_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(16)
+      .qmin(192)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_twopass_fulltile_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(16)
+      .qmax(192)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_twopass_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(16)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_twopass_subtile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(16)
+        .input_offset(19)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_twopass_fulltile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_twopass_fulltile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(83)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_twopass_fulltile_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_twopass_fulltile_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 32; channels < 128; channels += 16) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_twopass_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 32; channels < 128; channels += 16) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_twopass_subtile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 32; channels < 128; channels += 16) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(131)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_twopass_fulltile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 16; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_twopass_fulltile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 16; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(17)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_twopass_fulltile_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 16; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_twopass_fulltile_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 1; channels < 16; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_twopass_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 1; channels < 16; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_twopass_subtile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 1; channels < 16; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(17)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_twopass_fulltile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 17; channels < 32; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_twopass_fulltile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 17; channels < 32; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(37)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_twopass_fulltile_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 17; channels < 32; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmin(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_twopass_fulltile_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t channels = 17; channels < 32; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(17)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .qmax(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_twopass_subtile) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 17; channels < 32; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_twopass_subtile_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 17; channels < 32; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_multipass) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(16)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_multipass_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(16)
+        .input_offset(19)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_multipass_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(16)
+        .qmin(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_multipass_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(16)
+        .qmax(192)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_multipass) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 32; channels < 128; channels += 16) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_multipass_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 32; channels < 128; channels += 16) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(131)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_multipass_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 32; channels < 128; channels += 16) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmin(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_multipass_with_qmax) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 32; channels < 128; channels += 16) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmax(192)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_multipass) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 16; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_multipass_with_input_offset) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 16; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(16)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_multipass_with_qmin) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 16; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmin(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_multipass_with_qmax) {
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_multipass_with_qmax) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 16; kc < 256; kc += 48) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 1; channels < 16; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmax(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmax(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_multipass_with_x_stride) {
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_multipass) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 16; kc < 256; kc += 48) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 17; channels < 32; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_multipass) {
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_multipass_with_input_offset) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 1; kc < 16; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 17; channels < 32; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_multipass_with_qmin) {
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_multipass_with_qmin) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 1; kc < 16; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 17; channels < 32; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmin(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmin(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_multipass_with_qmax) {
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_multipass_with_qmax) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 1; kc < 16; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
+    for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+      for (size_t channels = 17; channels < 32; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
           .qmax(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmax(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+          .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_multipass_with_x_stride) {
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, few_output_pixels) {
     TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 1; kc < 16; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_multipass) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 17; kc < 32; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_multipass_with_qmin) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 17; kc < 32; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .qmin(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmin(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_multipass_with_qmax) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 17; kc < 32; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .qmax(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .qmax(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_multipass_with_x_stride) {
-    TEST_REQUIRES_X86_SSE2;
-    auto tester = MaxPoolMicrokernelTester()
-      .mr(9)
-      .qr(8)
-      .iterations(3);
-    for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-      for (size_t kc = 17; kc < 32; kc++) {
-        tester
-          .kh(ks)
-          .kw(1)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-        tester
-          .kh(1)
-          .kw(ks)
-          .kc(kc)
-          .x_stride(257)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
-      }
-    }
-  }
-
-  TEST(U8_MAXPOOL_9P8Q__SSE2, small_n) {
-    TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
-        for (size_t kc = 1; kc < 51; kc += 5) {
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 80; channels += 15) {
           MaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .iterations(3)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
         }
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__SSE2, small_n_with_x_stride) {
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, few_output_pixels_with_input_offset) {
     TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
-        for (size_t kc = 1; kc < 51; kc += 5) {
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 80; channels += 15) {
           MaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .x_stride(101)
-            .iterations(1)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .input_offset(83)
+            .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
         }
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__SSE2, small_n_with_y_stride) {
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, few_output_pixels_with_qmin) {
     TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
-        for (size_t kc = 1; kc < 51; kc += 5) {
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 80; channels += 15) {
           MaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .y_stride(103)
-            .iterations(1)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .qmin(192)
+            .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
         }
       }
     }
   }
 
-  TEST(U8_MAXPOOL_9P8Q__SSE2, small_n_with_s) {
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, few_output_pixels_with_qmax) {
     TEST_REQUIRES_X86_SSE2;
-    for (size_t n = 2; n < 5; n++) {
-      for (size_t ks : std::vector<size_t>{{2, 3, 5}}) {
-        for (size_t kc = 1; kc < 51; kc += 5) {
-          for (size_t s = 2; s <= ks; s++) {
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 80; channels += 15) {
+          MaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .qmax(192)
+            .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+        }
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, few_output_pixels_with_output_stride) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 80; channels += 15) {
+          MaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .channels(channels)
+            .output_stride(83)
+            .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+        }
+      }
+    }
+  }
+
+  TEST(U8_MAXPOOL_9P8X__SSE2_C16, few_output_pixels_with_step) {
+    TEST_REQUIRES_X86_SSE2;
+    for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+      for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+        for (size_t channels = 1; channels <= 80; channels += 15) {
+          for (size_t step = 2; step <= pooling_elements; step++) {
             MaxPoolMicrokernelTester()
-              .mr(9)
-              .qr(8)
-              .n(n)
-              .kh(ks)
-              .kw(ks)
-              .kc(kc)
-              .s(s)
-              .iterations(1)
-              .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+              .output_pixels(output_pixels)
+              .pooling_elements(pooling_elements)
+              .pooling_tile(9, 8)
+              .step(step)
+              .channels(channels)
+              .output_stride(83)
+              .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
           }
         }
       }
@@ -2423,566 +1793,443 @@
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_unipass_fulltile) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        tester
-          .kh(kh)
-          .kw(kw)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_fulltile) {
+  MaxPoolMicrokernelTester()
+    .pooling_elements(9)
+    .pooling_tile(9, 8)
+    .channels(1)
+    .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_input_offset) {
+  MaxPoolMicrokernelTester()
+    .pooling_elements(9)
+    .pooling_tile(9, 8)
+    .channels(1)
+    .input_offset(3)
+    .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_qmin) {
+  MaxPoolMicrokernelTester()
+    .pooling_elements(9)
+    .pooling_tile(9, 8)
+    .channels(1)
+    .qmin(192)
+    .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_qmax) {
+  MaxPoolMicrokernelTester()
+    .pooling_elements(9)
+    .pooling_tile(9, 8)
+    .channels(1)
+    .qmax(192)
+    .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_subtile) {
+  for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(9, 8)
+      .channels(1)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_unipass_fulltile_with_qmin) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        tester
-          .kh(kh)
-          .kw(kw)
-          .qmin(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_subtile_with_input_offset) {
+  for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(9, 8)
+      .channels(1)
+      .input_offset(3)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_unipass_fulltile_with_qmax) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        tester
-          .kh(kh)
-          .kw(kw)
-          .qmax(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_fulltile) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(channels)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_unipass_subtile) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t ks = 2; ks < tester.mr(); ks++) {
-    tester
-      .kh(ks)
-      .kw(1)
-      .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-    tester
-      .kh(1)
-      .kw(ks)
-      .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_input_offset) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(channels)
+      .input_offset(3)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_fulltile) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        for (size_t kc = 2; kc < 8; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_fulltile_with_qmin) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        for (size_t kc = 2; kc < 8; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .qmin(192)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_fulltile_with_qmax) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        for (size_t kc = 2; kc < 8; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .qmax(192)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_fulltile_with_x_stride) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .iterations(3);
-  for (size_t kh = 1; kh <= tester.mr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr(); kw++) {
-      if (kh * kw == tester.mr()) {
-        for (size_t kc = 2; kc < 8; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .x_stride(257)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_subtile) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .iterations(3);
-  for (size_t ks = 2; ks < tester.mr(); ks++) {
-    for (size_t kc = 2; kc < 8; kc++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .kc(kc)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .kc(kc)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_twopass_fulltile) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-      if (kh * kw == tester.mr() + tester.qr()) {
-        tester
-          .kh(kh)
-          .kw(kw)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_twopass_fulltile_with_qmin) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-      if (kh * kw == tester.mr() + tester.qr()) {
-        tester
-          .kh(kh)
-          .kw(kw)
-          .qmin(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_twopass_fulltile_with_qmax) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-      if (kh * kw == tester.mr() + tester.qr()) {
-        tester
-          .kh(kh)
-          .kw(kw)
-          .qmax(192)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      }
-    }
-  }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_twopass_subtile) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-    tester
-      .kh(ks)
-      .kw(1)
-      .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-    tester
-      .kh(1)
-      .kw(ks)
-      .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_fulltile) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8);
-  for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-      if (kh * kw == tester.mr() + tester.qr()) {
-        for (size_t kc = 2; kc < 8; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_fulltile_with_qmin) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8);
-  for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-      if (kh * kw == tester.mr() + tester.qr()) {
-        for (size_t kc = 2; kc < 8; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .qmin(192)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_fulltile_with_qmax) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8);
-  for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-      if (kh * kw == tester.mr() + tester.qr()) {
-        for (size_t kc = 2; kc < 8; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .qmax(192)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_fulltile_with_x_stride) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .iterations(3);
-  for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
-    for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
-      if (kh * kw == tester.mr() + tester.qr()) {
-        for (size_t kc = 2; kc < 8; kc++) {
-          tester
-            .kh(kh)
-            .kw(kw)
-            .kc(kc)
-            .x_stride(257)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-        }
-      }
-    }
-  }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_subtile) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .iterations(3);
-  for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
-    for (size_t kc = 2; kc < 8; kc++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .kc(kc)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .kc(kc)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_multipass) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-    tester
-      .kh(ks)
-      .kw(1)
-      .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-    tester
-      .kh(1)
-      .kw(ks)
-      .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_multipass_with_qmin) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-    tester
-      .kh(ks)
-      .kw(1)
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_qmin) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(channels)
       .qmin(192)
-      .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-    tester
-      .kh(1)
-      .kw(ks)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_qmax) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(9)
+      .pooling_tile(9, 8)
+      .channels(channels)
+      .qmax(192)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_subtile) {
+  for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_subtile_with_input_offset) {
+  for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(3)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile) {
+  MaxPoolMicrokernelTester()
+    .pooling_elements(17)
+    .pooling_tile(9, 8)
+    .channels(1)
+    .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile_with_input_offset) {
+  MaxPoolMicrokernelTester()
+    .pooling_elements(17)
+    .pooling_tile(9, 8)
+    .channels(1)
+    .input_offset(3)
+    .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile_with_qmin) {
+  MaxPoolMicrokernelTester()
+    .pooling_elements(17)
+    .pooling_tile(9, 8)
+    .channels(1)
+    .qmin(192)
+    .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile_with_qmax) {
+  MaxPoolMicrokernelTester()
+    .pooling_elements(17)
+    .pooling_tile(9, 8)
+    .channels(1)
+    .qmax(192)
+    .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_subtile) {
+  for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(9, 8)
+      .channels(1)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_subtile_with_input_offset) {
+  for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(9, 8)
+      .channels(1)
+      .input_offset(3)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(channels)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile_with_input_offset) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(channels)
+      .input_offset(3)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile_with_qmin) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(channels)
       .qmin(192)
-      .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_multipass_with_qmax) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .kc(1);
-  for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-    tester
-      .kh(ks)
-      .kw(1)
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile_with_qmax) {
+  for (size_t channels = 2; channels < 10; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(17)
+      .pooling_tile(9, 8)
+      .channels(channels)
       .qmax(192)
-      .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-    tester
-      .kh(1)
-      .kw(ks)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_subtile) {
+  for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // loop variable, so sizes 10..16 are all exercised
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_subtile_with_input_offset) {
+  for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // loop variable, so sizes 10..16 are all exercised
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(3)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)  // loop variable, so sizes 18..33 (step 3) are all exercised
+      .pooling_tile(9, 8)
+      .channels(1)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass_with_input_offset) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)  // loop variable, so sizes 18..33 (step 3) are all exercised
+      .pooling_tile(9, 8)
+      .channels(1)
+      .input_offset(3)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass_with_qmin) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)  // loop variable, so sizes 18..33 (step 3) are all exercised
+      .pooling_tile(9, 8)
+      .channels(1)
+      .qmin(192)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass_with_qmax) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)  // loop variable, so sizes 18..33 (step 3) are all exercised
+      .pooling_tile(9, 8)
+      .channels(1)
       .qmax(192)
-      .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_multipass) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .iterations(3);
-  for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-    for (size_t kc = 2; kc < 8; kc++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .kc(kc)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .kc(kc)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // loop variable, so sizes 18..33 (step 3) are all exercised
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
 }
 
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_multipass_with_qmin) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .iterations(3);
-  for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-    for (size_t kc = 2; kc < 8; kc++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .kc(kc)
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass_with_input_offset) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // loop variable, so sizes 18..33 (step 3) are all exercised
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(3)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass_with_qmin) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // loop variable, so sizes 18..33 (step 3) are all exercised
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmin(192)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .kc(kc)
-        .qmin(192)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+        .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
 }
 
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_multipass_with_qmax) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .iterations(3);
-  for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-    for (size_t kc = 2; kc < 8; kc++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .kc(kc)
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass_with_qmax) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // loop variable, so sizes 18..33 (step 3) are all exercised
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmax(192)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .kc(kc)
-        .qmax(192)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+        .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
 }
 
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_multipass_with_x_stride) {
-  auto tester = MaxPoolMicrokernelTester()
-    .mr(9)
-    .qr(8)
-    .iterations(3);
-  for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
-    for (size_t kc = 2; kc < 8; kc++) {
-      tester
-        .kh(ks)
-        .kw(1)
-        .kc(kc)
-        .x_stride(257)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .kc(kc)
-        .x_stride(257)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, small_n) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
-      for (size_t kc = 1; kc < 16; kc += 5) {
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
         MaxPoolMicrokernelTester()
-          .mr(9)
-          .qr(8)
-          .n(n)
-          .kh(ks)
-          .kw(ks)
-          .kc(kc)
-          .iterations(3)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
       }
     }
   }
 }
 
-TEST(U8_MAXPOOL_9P8Q__SCALAR, small_n_with_x_stride) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
-      for (size_t kc = 1; kc < 16; kc += 5) {
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_input_offset) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
         MaxPoolMicrokernelTester()
-          .mr(9)
-          .qr(8)
-          .n(n)
-          .kh(ks)
-          .kw(ks)
-          .kc(kc)
-          .x_stride(101)
-          .iterations(1)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(7)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
       }
     }
   }
 }
 
-TEST(U8_MAXPOOL_9P8Q__SCALAR, small_n_with_y_stride) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
-      for (size_t kc = 1; kc < 16; kc += 5) {
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_qmin) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
         MaxPoolMicrokernelTester()
-          .mr(9)
-          .qr(8)
-          .n(n)
-          .kh(ks)
-          .kw(ks)
-          .kc(kc)
-          .y_stride(103)
-          .iterations(1)
-          .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmin(192)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
       }
     }
   }
 }
 
-TEST(U8_MAXPOOL_9P8Q__SCALAR, small_n_with_s) {
-  for (size_t n = 2; n < 5; n++) {
-    for (size_t ks : std::vector<size_t>{{2, 3, 5}}) {
-      for (size_t kc = 1; kc < 16; kc += 5) {
-        for (size_t s = 2; s <= ks; s++) {
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_qmax) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        MaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .qmax(192)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_output_stride) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        MaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .output_stride(7)
+          .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_step) {
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+      for (size_t channels = 1; channels <= 5; channels += 1) {
+        for (size_t step = 2; step <= pooling_elements; step++) {
           MaxPoolMicrokernelTester()
-            .mr(9)
-            .qr(8)
-            .n(n)
-            .kh(ks)
-            .kw(ks)
-            .kc(kc)
-            .s(s)
-            .iterations(1)
-            .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(9, 8)
+            .step(step)
+            .channels(channels)
+            .output_stride(7)
+            .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
         }
       }
     }
   }
-}
+}
\ No newline at end of file
diff --git a/test/u8-maxpool.yaml b/test/u8-maxpool.yaml
new file mode 100644
index 0000000..d9c894b
--- /dev/null
+++ b/test/u8-maxpool.yaml
@@ -0,0 +1,7 @@
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+- name: xnn_u8_maxpool_ukernel_9p8x__neon_c16
+- name: xnn_u8_maxpool_ukernel_9p8x__sse2_c16
+- name: xnn_u8_maxpool_ukernel_9p8x__scalar_c1
diff --git a/tools/generate-argmaxpool-test.py b/tools/generate-argmaxpool-test.py
new file mode 100755
index 0000000..3160b27
--- /dev/null
+++ b/tools/generate-argmaxpool-test.py
@@ -0,0 +1,1095 @@
+#!/usr/bin/env python
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import codecs
+import math
+import os
+import re
+import sys
+import yaml
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from primes import next_prime
+import xngen
+import xnncommon
+
+
+parser = argparse.ArgumentParser(
+  description='ArgMaxPool microkernel test generator')
+parser.add_argument("-s", "--spec", metavar="FILE", required=True,
+                    help="Specification (YAML) file")
+parser.add_argument("-o", "--output", metavar="FILE", required=True,
+                    help='Output (C++ source) file')
+parser.set_defaults(defines=list())
+
+
+def split_ukernel_name(name):
+  match = re.match(r"^xnn_(f16|f32)_argmaxpool_ukernel_((\d+)p)?(\d+)x__(.+)_c(\d+)$", name)
+  if match is None:
+    raise ValueError("Unexpected microkernel name: " + name)
+
+  if match.group(2):
+    primary_tile = int(match.group(3))
+    incremental_tile = int(match.group(4))
+  else:
+    primary_tile = int(match.group(4))
+    incremental_tile = 0
+
+  channel_tile = int(match.group(6))
+
+  arch, isa = xnncommon.parse_target_name(target_name=match.group(5))
+  return primary_tile, incremental_tile, channel_tile, arch, isa
+
+
+ARGMAXPOOL_TEST_TEMPLATE = """\
+$if INCREMENTAL_TILE == 0:
+  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_fulltile) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE})
+      .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+      .channels(${CHANNEL_TILE})
+      .Test(${", ".join(TEST_ARGS)});
+  }
+
+  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_fulltile_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE})
+      .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+      .channels(${CHANNEL_TILE})
+      .input_offset(${next_prime(CHANNEL_TILE+1)})
+      .Test(${", ".join(TEST_ARGS)});
+  }
+
+  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_fulltile_with_qmin) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE})
+      .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+      .channels(${CHANNEL_TILE})
+      .qmin(192)
+      .Test(${", ".join(TEST_ARGS)});
+  }
+
+  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_fulltile_with_qmax) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE})
+      .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+      .channels(${CHANNEL_TILE})
+      .qmax(192)
+      .Test(${", ".join(TEST_ARGS)});
+  }
+
+  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_subtile) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+        .channels(${CHANNEL_TILE})
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_subtile_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+        .channels(${CHANNEL_TILE})
+        .input_offset(${next_prime(CHANNEL_TILE+1)})
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  $if CHANNEL_TILE > 1:
+    TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_fulltile) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_fulltile_with_input_offset) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .input_offset(${next_prime(CHANNEL_TILE*8)})
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_fulltile_with_qmin) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .qmin(192)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_fulltile_with_qmax) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .qmax(192)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_subtile) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+        for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+          ArgMaxPoolMicrokernelTester()
+            .pooling_elements(pooling_elements)
+            .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+            .channels(channels)
+            .Test(${", ".join(TEST_ARGS)});
+        }
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_subtile_with_input_offset) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+        for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+          ArgMaxPoolMicrokernelTester()
+            .pooling_elements(pooling_elements)
+            .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+            .channels(channels)
+            .input_offset(${next_prime(CHANNEL_TILE*8)})
+            .Test(${", ".join(TEST_ARGS)});
+        }
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_fulltile) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_fulltile_with_input_offset) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .input_offset(${next_prime(CHANNEL_TILE)})
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_fulltile_with_qmin) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .qmin(192)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_fulltile_with_qmax) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .qmax(192)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_subtile) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+        for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+          ArgMaxPoolMicrokernelTester()
+            .pooling_elements(pooling_elements)
+            .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+            .channels(channels)
+            .Test(${", ".join(TEST_ARGS)});
+        }
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_subtile_with_input_offset) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+        for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+          ArgMaxPoolMicrokernelTester()
+            .pooling_elements(pooling_elements)
+            .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+            .channels(channels)
+            .input_offset(${next_prime(CHANNEL_TILE)})
+            .Test(${", ".join(TEST_ARGS)});
+        }
+      }
+    }
+
+  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_fulltile) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE})
+        .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+        .channels(channels)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_fulltile_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE})
+        .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+        .channels(channels)
+        .input_offset(${next_prime(CHANNEL_TILE*2)})
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_fulltile_with_qmin) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE})
+        .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+        .channels(channels)
+        .qmin(192)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_fulltile_with_qmax) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE})
+        .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+        .channels(channels)
+        .qmax(192)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_subtile) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+      for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_subtile_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+      for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .input_offset(${next_prime(CHANNEL_TILE*2)})
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+$if INCREMENTAL_TILE != 0:
+  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_fulltile) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+      .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+      .channels(${CHANNEL_TILE})
+      .Test(${", ".join(TEST_ARGS)});
+  }
+
+  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_fulltile_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+      .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+      .channels(${CHANNEL_TILE})
+      .input_offset(${next_prime(CHANNEL_TILE+1)})
+      .Test(${", ".join(TEST_ARGS)});
+  }
+
+  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_fulltile_with_qmin) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+      .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+      .channels(${CHANNEL_TILE})
+      .qmin(192)
+      .Test(${", ".join(TEST_ARGS)});
+  }
+
+  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_fulltile_with_qmax) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    ArgMaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+      .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+      .channels(${CHANNEL_TILE})
+      .qmax(192)
+      .Test(${", ".join(TEST_ARGS)});
+  }
+
+  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_subtile) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+        .channels(${CHANNEL_TILE})
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_subtile_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+        .channels(${CHANNEL_TILE})
+        .input_offset(${next_prime(CHANNEL_TILE+1)})
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  $if CHANNEL_TILE > 1:
+    TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_fulltile) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_fulltile_with_input_offset) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .input_offset(${next_prime(CHANNEL_TILE*5)})
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_fulltile_with_qmin) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .qmin(192)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_fulltile_with_qmax) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .qmax(192)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_subtile) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+        for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+          ArgMaxPoolMicrokernelTester()
+            .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+            .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+            .channels(channels)
+            .Test(${", ".join(TEST_ARGS)});
+        }
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_subtile_with_input_offset) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+        for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+          ArgMaxPoolMicrokernelTester()
+            .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+            .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+            .channels(channels)
+            .input_offset(${next_prime(CHANNEL_TILE*8)})
+            .Test(${", ".join(TEST_ARGS)});
+        }
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_fulltile) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_fulltile_with_input_offset) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .input_offset(${next_prime(CHANNEL_TILE)})
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_fulltile_with_qmin) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .qmin(192)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_fulltile_with_qmax) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .qmax(192)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_subtile) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+        for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+          ArgMaxPoolMicrokernelTester()
+            .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+            .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+            .channels(channels)
+            .Test(${", ".join(TEST_ARGS)});
+        }
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_subtile_with_input_offset) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+        for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+          ArgMaxPoolMicrokernelTester()
+            .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+            .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+            .channels(channels)
+            .input_offset(${next_prime(CHANNEL_TILE)})
+            .Test(${", ".join(TEST_ARGS)});
+        }
+      }
+    }
+
+  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_fulltile) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+        .channels(channels)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_fulltile_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+        .channels(channels)
+        .input_offset(${next_prime(CHANNEL_TILE*2)})
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_fulltile_with_qmin) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+        .channels(channels)
+        .qmin(192)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_fulltile_with_qmax) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+        .channels(channels)
+        .qmax(192)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_subtile) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+      for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_subtile_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+      for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .input_offset(${next_prime(CHANNEL_TILE*2)})
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_multipass) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+        .channels(${CHANNEL_TILE})
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_multipass_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+        .channels(${CHANNEL_TILE})
+        .input_offset(${next_prime(CHANNEL_TILE+1)})
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_multipass_with_qmin) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+        .channels(${CHANNEL_TILE})
+        .qmin(192)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_multipass_with_qmax) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+      ArgMaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+        .channels(${CHANNEL_TILE})
+        .qmax(192)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  $if CHANNEL_TILE > 1:
+    TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_multipass) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+        for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+          ArgMaxPoolMicrokernelTester()
+            .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+            .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+            .channels(channels)
+            .Test(${", ".join(TEST_ARGS)});
+        }
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_multipass_with_input_offset) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+        for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+          ArgMaxPoolMicrokernelTester()
+            .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+            .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+            .channels(channels)
+            .input_offset(${next_prime(CHANNEL_TILE*8)})
+            .Test(${", ".join(TEST_ARGS)});
+        }
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_multipass_with_qmin) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+        for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+          ArgMaxPoolMicrokernelTester()
+            .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+            .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+            .channels(channels)
+            .qmin(192)
+            .Test(${", ".join(TEST_ARGS)});
+        }
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_multipass_with_qmax) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+        for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+          ArgMaxPoolMicrokernelTester()
+            .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+            .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+            .channels(channels)
+            .qmax(192)
+            .Test(${", ".join(TEST_ARGS)});
+        }
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_multipass) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+        for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+          ArgMaxPoolMicrokernelTester()
+            .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+            .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+            .channels(channels)
+            .Test(${", ".join(TEST_ARGS)});
+        }
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_multipass_with_input_offset) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+        for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+          ArgMaxPoolMicrokernelTester()
+            .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+            .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+            .channels(channels)
+            .input_offset(${CHANNEL_TILE})
+            .Test(${", ".join(TEST_ARGS)});
+        }
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_multipass_with_qmin) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+        for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+          ArgMaxPoolMicrokernelTester()
+            .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+            .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+            .channels(channels)
+            .qmin(192)
+            .Test(${", ".join(TEST_ARGS)});
+        }
+      }
+    }
+
+    TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_multipass_with_qmax) {
+      $if ISA_CHECK:
+        ${ISA_CHECK};
+      for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+        for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+          ArgMaxPoolMicrokernelTester()
+            .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+            .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+            .channels(channels)
+            .qmax(192)
+            .Test(${", ".join(TEST_ARGS)});
+        }
+      }
+    }
+
+  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_multipass) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+      for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_multipass_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+      for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .input_offset(${next_prime(CHANNEL_TILE*2)})
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_multipass_with_qmin) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+      for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .qmin(192)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_multipass_with_qmax) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+      for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+        ArgMaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .qmax(192)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+$if INCREMENTAL_TILE == 0:
+  $MIN_POOLING, MAX_POOLING = 2, PRIMARY_TILE
+$else:
+  $MIN_POOLING, MAX_POOLING = PRIMARY_TILE + 1, PRIMARY_TILE + INCREMENTAL_TILE
+
+TEST(${TEST_NAME}, few_output_pixels) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = ${MIN_POOLING}; pooling_elements <= ${MAX_POOLING}; pooling_elements++) {
+      for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_input_offset) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = ${MIN_POOLING}; pooling_elements <= ${MAX_POOLING}; pooling_elements++) {
+      for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .input_offset(${next_prime(CHANNEL_TILE*5+1)})
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_qmin) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = ${MIN_POOLING}; pooling_elements <= ${MAX_POOLING}; pooling_elements++) {
+      for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .qmin(192)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_qmax) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = ${MIN_POOLING}; pooling_elements <= ${MAX_POOLING}; pooling_elements++) {
+      for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .qmax(192)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_output_stride) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = ${MIN_POOLING}; pooling_elements <= ${MAX_POOLING}; pooling_elements++) {
+      for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+        ArgMaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+          .channels(channels)
+          .output_stride(${next_prime(CHANNEL_TILE*5+1)})
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_step) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements = ${MIN_POOLING}; pooling_elements <= ${MAX_POOLING}; pooling_elements++) {
+      for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+        for (size_t step = 2; step <= pooling_elements; step++) {
+          ArgMaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+            .step(step)
+            .channels(channels)
+            .output_stride(${next_prime(CHANNEL_TILE*5+1)})
+            .Test(${", ".join(TEST_ARGS)});
+        }
+      }
+    }
+  }
+}
+"""
+
+
+def generate_test_cases(ukernel, primary_tile, incremental_tile, channel_tile,
+                        isa):
+  """Generates all test cases for an ARGMAXPOOL micro-kernel.
+
+  Args:
+    ukernel: C name of the micro-kernel function.
+    primary_tile: Number of rows (pixels) processed per one iteration of the
+                  primary outer loop of the micro-kernel.
+    incremental_tile: Number of rows (pixels) processed per one iteration of
+                      the incremental outer loop of the micro-kernel.
+    channel_tile: Number of channels processed per one iteration of the inner
+                  loops of the micro-kernel.
+    isa: instruction set required to run the micro-kernel. Generated unit test
+         will skip execution if the host processor doesn't support this ISA.
+
+  Returns:
+    Code for the test case.
+  """
+  _, test_name = ukernel.split("_", 1)
+  _, datatype, _, _ = ukernel.split("_", 3)
+  test_args = [ukernel]
+  if not isa or isa == "psimd":
+    test_args.append("ArgMaxPoolMicrokernelTester::Variant::Scalar")
+  return xngen.preprocess(ARGMAXPOOL_TEST_TEMPLATE, {
+      "TEST_NAME": test_name.upper().replace("UKERNEL_", ""),
+      "TEST_ARGS": test_args,
+      "DATATYPE": datatype,
+      "PRIMARY_TILE": primary_tile,
+      "INCREMENTAL_TILE": incremental_tile,
+      "CHANNEL_TILE": channel_tile,
+      "ISA_CHECK": xnncommon.generate_isa_check_macro(isa),
+      "next_prime": next_prime,
+    })
+
+
+def main(args):
+  """Writes ARGMAXPOOL tests for every micro-kernel in the YAML spec."""
+  options = parser.parse_args(args)
+  with codecs.open(options.spec, "r", encoding="utf-8") as spec_file:
+    ukernel_specs = yaml.safe_load(spec_file)
+    if not isinstance(ukernel_specs, list):
+      raise ValueError("expected a list of micro-kernels in the spec")
+
+    output = """\
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+//   Specification: {specification}
+//   Generator: {generator}
+
+
+#include <gtest/gtest.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
+#include <xnnpack/argmaxpool.h>
+#include "argmaxpool-microkernel-tester.h"
+""".format(specification=options.spec, generator=sys.argv[0])
+
+    for spec in ukernel_specs:
+      ukernel = spec["name"]
+      primary_tile, incremental_tile, channel_tile, parsed_arch, isa = \
+        split_ukernel_name(ukernel)
+      # An "arch" entry in the spec wins over the one parsed from the name.
+      arch = spec.get("arch", parsed_arch)
+
+      test_case = generate_test_cases(
+        ukernel, primary_tile, incremental_tile, channel_tile, isa)
+      test_case = xnncommon.postprocess_test_case(test_case, arch, isa)
+      output = output + "\n\n" + test_case
+
+    with codecs.open(options.output, "w", encoding="utf-8") as output_file:
+      output_file.write(output)
+
+
+if __name__ == "__main__":  # Run the generator only when invoked as a script.
+  main(sys.argv[1:])
diff --git a/tools/generate-maxpool-test.py b/tools/generate-maxpool-test.py
new file mode 100755
index 0000000..83edaf2
--- /dev/null
+++ b/tools/generate-maxpool-test.py
@@ -0,0 +1,1085 @@
+#!/usr/bin/env python
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import codecs
+import math
+import os
+import re
+import sys
+import yaml
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from primes import next_prime
+import xngen
+import xnncommon
+
+
+parser = argparse.ArgumentParser(description="MaxPool microkernel test generator")
+parser.add_argument("-s", "--spec", required=True, metavar="FILE",
+                    help="Specification (YAML) file")
+parser.add_argument("-o", "--output", required=True, metavar="FILE",
+                    help="Output (C++ source) file")
+parser.set_defaults(defines=[])  # "defines" is not exposed as a flag here.
+
+
+def split_ukernel_name(name):
+  """Splits a MaxPool micro-kernel name into tile sizes, arch, and ISA."""
+  parsed = re.match(r"^xnn_(s8|u8|s16|f16|f32)_maxpool_ukernel_(\d+)p(\d+)x__(.+)_c(\d+)$", name)
+  if parsed is None:
+    raise ValueError("Unexpected microkernel name: " + name)
+
+  # Group 1 (the datatype) is validated by the pattern but not returned.
+  tiles = [int(parsed.group(index)) for index in (2, 3, 5)]
+  primary_tile, incremental_tile, channel_tile = tiles
+  arch, isa = xnncommon.parse_target_name(target_name=parsed.group(4))
+  return primary_tile, incremental_tile, channel_tile, arch, isa
+
+
+MAXPOOL_TEST_TEMPLATE = """\
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_fulltile) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  MaxPoolMicrokernelTester()
+    .pooling_elements(${PRIMARY_TILE})
+    .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+    .channels(${CHANNEL_TILE})
+    .Test(${", ".join(TEST_ARGS)});
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_fulltile_with_input_offset) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  MaxPoolMicrokernelTester()
+    .pooling_elements(${PRIMARY_TILE})
+    .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+    .channels(${CHANNEL_TILE})
+    .input_offset(${next_prime(CHANNEL_TILE+1)})
+    .Test(${", ".join(TEST_ARGS)});
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_fulltile_with_qmin) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  MaxPoolMicrokernelTester()
+    .pooling_elements(${PRIMARY_TILE})
+    .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+    .channels(${CHANNEL_TILE})
+    .qmin(192)
+    .Test(${", ".join(TEST_ARGS)});
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_fulltile_with_qmax) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  MaxPoolMicrokernelTester()
+    .pooling_elements(${PRIMARY_TILE})
+    .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+    .channels(${CHANNEL_TILE})
+    .qmax(192)
+    .Test(${", ".join(TEST_ARGS)});
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_subtile) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+      .channels(${CHANNEL_TILE})
+      .Test(${", ".join(TEST_ARGS)});
+  }
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_subtile_with_input_offset) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+      .channels(${CHANNEL_TILE})
+      .input_offset(${next_prime(CHANNEL_TILE+1)})
+      .Test(${", ".join(TEST_ARGS)});
+  }
+}
+
+$if CHANNEL_TILE > 1:
+  TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_fulltile) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_fulltile_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .input_offset(${next_prime(CHANNEL_TILE*8)})
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_fulltile_with_qmin) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .qmin(192)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_fulltile_with_qmax) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .qmax(192)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_subtile) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+      for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_subtile_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+      for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .input_offset(${next_prime(CHANNEL_TILE*8)})
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_fulltile) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_fulltile_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .input_offset(${next_prime(CHANNEL_TILE)})
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_fulltile_with_qmin) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .qmin(192)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_fulltile_with_qmax) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .qmax(192)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_subtile) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+      for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_subtile_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+      for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .input_offset(${next_prime(CHANNEL_TILE)})
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_fulltile) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE})
+      .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+      .channels(channels)
+      .Test(${", ".join(TEST_ARGS)});
+  }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_fulltile_with_input_offset) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE})
+      .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+      .channels(channels)
+      .input_offset(${next_prime(CHANNEL_TILE*2)})
+      .Test(${", ".join(TEST_ARGS)});
+  }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_fulltile_with_qmin) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE})
+      .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+      .channels(channels)
+      .qmin(192)
+      .Test(${", ".join(TEST_ARGS)});
+  }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_fulltile_with_qmax) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE})
+      .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+      .channels(channels)
+      .qmax(192)
+      .Test(${", ".join(TEST_ARGS)});
+  }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_subtile) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+    for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_subtile_with_input_offset) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+    for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .input_offset(${next_prime(CHANNEL_TILE*2)})
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_fulltile) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  MaxPoolMicrokernelTester()
+    .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+    .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+    .channels(${CHANNEL_TILE})
+    .Test(${", ".join(TEST_ARGS)});
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_fulltile_with_input_offset) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  MaxPoolMicrokernelTester()
+    .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+    .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+    .channels(${CHANNEL_TILE})
+    .input_offset(${next_prime(CHANNEL_TILE+1)})
+    .Test(${", ".join(TEST_ARGS)});
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_fulltile_with_qmin) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  MaxPoolMicrokernelTester()
+    .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+    .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+    .channels(${CHANNEL_TILE})
+    .qmin(192)
+    .Test(${", ".join(TEST_ARGS)});
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_fulltile_with_qmax) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  MaxPoolMicrokernelTester()
+    .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+    .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+    .channels(${CHANNEL_TILE})
+    .qmax(192)
+    .Test(${", ".join(TEST_ARGS)});
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_subtile) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+      .channels(${CHANNEL_TILE})
+      .Test(${", ".join(TEST_ARGS)});
+  }
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_subtile_with_input_offset) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)
+      .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+      .channels(${CHANNEL_TILE})
+      .input_offset(${next_prime(CHANNEL_TILE+1)})
+      .Test(${", ".join(TEST_ARGS)});
+  }
+}
+
+$if CHANNEL_TILE > 1:
+  TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_fulltile) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_fulltile_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .input_offset(${next_prime(CHANNEL_TILE*5)})
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_fulltile_with_qmin) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .qmin(192)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_fulltile_with_qmax) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .qmax(192)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_subtile) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+      for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_subtile_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+      for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .input_offset(${next_prime(CHANNEL_TILE*8)})
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_fulltile) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_fulltile_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .input_offset(${next_prime(CHANNEL_TILE)})
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_fulltile_with_qmin) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .qmin(192)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_fulltile_with_qmax) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .qmax(192)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_subtile) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+      for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_subtile_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+      for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .input_offset(${next_prime(CHANNEL_TILE)})
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_fulltile) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+      .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+      .channels(channels)
+      .Test(${", ".join(TEST_ARGS)});
+  }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_fulltile_with_input_offset) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+      .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+      .channels(channels)
+      .input_offset(${next_prime(CHANNEL_TILE*2)})
+      .Test(${", ".join(TEST_ARGS)});
+  }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_fulltile_with_qmin) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+      .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+      .channels(channels)
+      .qmin(192)
+      .Test(${", ".join(TEST_ARGS)});
+  }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_fulltile_with_qmax) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+      .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+      .channels(channels)
+      .qmax(192)
+      .Test(${", ".join(TEST_ARGS)});
+  }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_subtile) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+    for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_subtile_with_input_offset) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+    for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .input_offset(${next_prime(CHANNEL_TILE*2)})
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_multipass) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+      .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+      .channels(${CHANNEL_TILE})
+      .Test(${", ".join(TEST_ARGS)});
+  }
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_multipass_with_input_offset) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+      .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+      .channels(${CHANNEL_TILE})
+      .input_offset(${next_prime(CHANNEL_TILE+1)})
+      .Test(${", ".join(TEST_ARGS)});
+  }
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_multipass_with_qmin) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+      .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+      .channels(${CHANNEL_TILE})
+      .qmin(192)
+      .Test(${", ".join(TEST_ARGS)});
+  }
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_multipass_with_qmax) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+      .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+      .channels(${CHANNEL_TILE})
+      .qmax(192)
+      .Test(${", ".join(TEST_ARGS)});
+  }
+}
+
+$if CHANNEL_TILE > 1:
+  TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_multipass) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+      for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_multipass_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+      for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .input_offset(${next_prime(CHANNEL_TILE*8)})
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_multipass_with_qmin) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+      for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .qmin(192)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_multipass_with_qmax) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+      for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .qmax(192)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_multipass) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+      for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_multipass_with_input_offset) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+      for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .input_offset(${CHANNEL_TILE})
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_multipass_with_qmin) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+      for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .qmin(192)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+  TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_multipass_with_qmax) {
+    $if ISA_CHECK:
+      ${ISA_CHECK};
+    for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+      for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+        MaxPoolMicrokernelTester()
+          .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .qmax(192)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_multipass) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+    for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_multipass_with_input_offset) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+    for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .input_offset(${next_prime(CHANNEL_TILE*2)})
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_multipass_with_qmin) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+    for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .qmin(192)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_multipass_with_qmax) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+    for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+        .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+        .channels(channels)
+        .qmax(192)
+        .Test(${", ".join(TEST_ARGS)});
+    }
+  }
+}
+
+TEST(${TEST_NAME}, few_output_pixels) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements : std::vector<size_t>{{2, ${PRIMARY_TILE}, ${PRIMARY_TILE+INCREMENTAL_TILE-1}}}) {
+      for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+        MaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_input_offset) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements : std::vector<size_t>{{2, ${PRIMARY_TILE}, ${PRIMARY_TILE+INCREMENTAL_TILE-1}}}) {
+      for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+        MaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .input_offset(${next_prime(CHANNEL_TILE*5+1)})
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_qmin) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements : std::vector<size_t>{{2, ${PRIMARY_TILE}, ${PRIMARY_TILE+INCREMENTAL_TILE-1}}}) {
+      for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+        MaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .qmin(192)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_qmax) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements : std::vector<size_t>{{2, ${PRIMARY_TILE}, ${PRIMARY_TILE+INCREMENTAL_TILE-1}}}) {
+      for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+        MaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .qmax(192)
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_output_stride) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements : std::vector<size_t>{{2, ${PRIMARY_TILE}, ${PRIMARY_TILE+INCREMENTAL_TILE-1}}}) {
+      for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+        MaxPoolMicrokernelTester()
+          .output_pixels(output_pixels)
+          .pooling_elements(pooling_elements)
+          .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+          .channels(channels)
+          .output_stride(${next_prime(CHANNEL_TILE*5+1)})
+          .Test(${", ".join(TEST_ARGS)});
+      }
+    }
+  }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_step) {
+  $if ISA_CHECK:
+    ${ISA_CHECK};
+  for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+    for (size_t pooling_elements : std::vector<size_t>{{2, ${PRIMARY_TILE}, ${PRIMARY_TILE+INCREMENTAL_TILE-1}}}) {
+      for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+        for (size_t step = 2; step <= pooling_elements; step++) {
+          MaxPoolMicrokernelTester()
+            .output_pixels(output_pixels)
+            .pooling_elements(pooling_elements)
+            .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+            .step(step)
+            .channels(channels)
+            .output_stride(${next_prime(CHANNEL_TILE*5+1)})
+            .Test(${", ".join(TEST_ARGS)});
+        }
+      }
+    }
+  }
+}
+"""
+
+
+def generate_test_cases(ukernel, primary_tile, incremental_tile, channel_tile,
+                        isa):
+  """Generates all test cases for a MAXPOOL micro-kernel.
+
+  Args:
+    ukernel: C name of the micro-kernel function.
+    primary_tile: Rows (pixels) per iteration of the primary outer loop.
+    incremental_tile: Rows (pixels) per iteration of the incremental outer loop.
+    channel_tile: Channels per iteration of the inner loops.
+    isa: Required ISA; generated tests skip execution on unsupported hosts.
+
+  Returns:
+    Code for the test case.
+
+  Raises:
+    ValueError: If `ukernel` has fewer than four "_"-separated fields.
+  """
+  _, test_name = ukernel.split("_", 1)
+  _, datatype, _, _ = ukernel.split("_", 3)
+  test_args = [ukernel]
+  # Kernels with no ISA, or targeting psimd, also get the Scalar variant arg.
+  if not isa or isa == "psimd":
+    test_args.append("MaxPoolMicrokernelTester::Variant::Scalar")
+  return xngen.preprocess(MAXPOOL_TEST_TEMPLATE, {
+      "TEST_NAME": test_name.upper().replace("UKERNEL_", ""),
+      "TEST_ARGS": test_args,
+      "DATATYPE": datatype,
+      "PRIMARY_TILE": primary_tile,
+      "INCREMENTAL_TILE": incremental_tile,
+      "CHANNEL_TILE": channel_tile,
+      "ISA_CHECK": xnncommon.generate_isa_check_macro(isa),
+      "next_prime": next_prime,
+    })
+
+
+def main(args):
+  """Generates the MAXPOOL unit-test source selected by command-line `args`.
+
+  Raises:
+    ValueError: If the top level of the YAML specification is not a list.
+  """
+  options = parser.parse_args(args)
+  with codecs.open(options.spec, "r", encoding="utf-8") as spec_file:
+    spec_yaml = yaml.safe_load(spec_file)
+    if not isinstance(spec_yaml, list):
+      raise ValueError("expected a list of micro-kernels in the spec")
+
+    header = """\
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+//   Specification: {specification}
+//   Generator: {generator}
+
+
+#include <gtest/gtest.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
+#include <xnnpack/maxpool.h>
+#include "maxpool-microkernel-tester.h"
+""".format(specification=options.spec, generator=sys.argv[0])
+    # Collect the pieces in a list; they are joined once when written out.
+    pieces = [header]
+    for entry in spec_yaml:
+      kernel = entry["name"]
+      primary, incremental, channels, arch, isa = split_ukernel_name(kernel)
+      arch = entry.get("arch", arch)  # the spec may override the architecture
+      body = generate_test_cases(kernel, primary, incremental, channels, isa)
+      pieces.append("\n\n" + xnncommon.postprocess_test_case(body, arch, isa))
+
+    with codecs.open(options.output, "w", encoding="utf-8") as output_file:
+      output_file.write("".join(pieces))
+
+
+if __name__ == "__main__":
+  main(sys.argv[1:])  # forward the CLI arguments without the program name