Refactor MaxPool and ArgMaxPool micro-kernels
- Support input_offset argument in MaxPool and ArgMaxPool micro-kernels
- Use input_offset to make indirection buffer independent of batch size
- Simplify and auto-generate unit tests
- Use more descriptive names for micro-kernel parameters
PiperOrigin-RevId: 281447682
diff --git a/BUILD.bazel b/BUILD.bazel
index 463b36a..31f569d 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -74,9 +74,9 @@
]
SCALAR_UKERNELS = [
- "src/f32-argmaxpool/mp9p8q-scalar.c",
- "src/f32-argmaxpool/up4-scalar.c",
- "src/f32-argmaxpool/up9-scalar.c",
+ "src/f32-argmaxpool/9p8x-scalar-c1.c",
+ "src/f32-argmaxpool/4x-scalar-c1.c",
+ "src/f32-argmaxpool/9x-scalar-c1.c",
"src/f32-avgpool/mp9p8q-scalar.c",
"src/f32-avgpool/up9-scalar.c",
"src/f32-bilinear/scalar-c1.c",
@@ -136,7 +136,7 @@
"src/f32-gemminc/2x4-scalar.c",
"src/f32-gemminc/4x4-scalar.c",
"src/f32-hswish/scalar.c",
- "src/f32-maxpool/9p8q-scalar.c",
+ "src/f32-maxpool/9p8x-scalar-c1.c",
"src/f32-pavgpool/mp9p8q-scalar.c",
"src/f32-pavgpool/up9-scalar.c",
"src/f32-ppmm/2x4-scalar.c",
@@ -169,7 +169,7 @@
"src/q8-vadd/scalar.c",
"src/u8-clamp/scalar.c",
"src/u8-lut32norm/scalar.c",
- "src/u8-maxpool/9p8q-scalar.c",
+ "src/u8-maxpool/9p8x-scalar-c1.c",
"src/u8-rmax/scalar.c",
"src/x32-packx/x2-scalar.c",
"src/x32-packx/x3-scalar.c",
@@ -188,9 +188,9 @@
]
PSIMD_UKERNELS = [
- "src/f32-argmaxpool/mp9p8q-psimd.c",
- "src/f32-argmaxpool/up4-psimd.c",
- "src/f32-argmaxpool/up9-psimd.c",
+ "src/f32-argmaxpool/9p8x-psimd-c4.c",
+ "src/f32-argmaxpool/4x-psimd-c4.c",
+ "src/f32-argmaxpool/9x-psimd-c4.c",
"src/f32-avgpool/mp9p8q-psimd.c",
"src/f32-avgpool/up9-psimd.c",
"src/f32-bilinear/psimd-c4.c",
@@ -253,7 +253,7 @@
"src/f32-gemminc/6x8-psimd-splat.c",
"src/f32-gemminc/6x8s4-psimd.c",
"src/f32-hswish/psimd.c",
- "src/f32-maxpool/9p8q-psimd.c",
+ "src/f32-maxpool/9p8x-psimd-c4.c",
"src/f32-pavgpool/mp9p8q-psimd.c",
"src/f32-pavgpool/up9-psimd.c",
"src/f32-ppmm/4x8-psimd.c",
@@ -337,7 +337,7 @@
"src/q8-igemm/8x8-neon.c",
"src/q8-vadd/neon.c",
"src/u8-clamp/neon.c",
- "src/u8-maxpool/9p8q-neon.c",
+ "src/u8-maxpool/9p8x-neon-c16.c",
"src/u8-rmax/neon.c",
"src/x32-packx/x4-neon-st4.c",
"src/x32-pad/x2-neon.c",
@@ -476,7 +476,7 @@
"src/f32-gemminc/4x8-sse-load1.c",
"src/f32-gemminc/4x8s4-sse.c",
"src/f32-hswish/sse.c",
- "src/f32-maxpool/9p8q-sse.c",
+ "src/f32-maxpool/9p8x-sse-c4.c",
"src/f32-pavgpool/mp9p8q-sse.c",
"src/f32-pavgpool/up9-sse.c",
"src/f32-dwconv-spchw/3x3p1-sse.c",
@@ -491,9 +491,9 @@
]
SSE2_UKERNELS = [
- "src/f32-argmaxpool/mp9p8q-sse2.c",
- "src/f32-argmaxpool/up4-sse2.c",
- "src/f32-argmaxpool/up9-sse2.c",
+ "src/f32-argmaxpool/9p8x-sse2-c4.c",
+ "src/f32-argmaxpool/4x-sse2-c4.c",
+ "src/f32-argmaxpool/9x-sse2-c4.c",
"src/f32-prelu/sse2-2x4.c",
"src/f32-prelu/sse2-2x8.c",
"src/f32-sigmoid/sse2-p5-div-x8.c",
@@ -508,7 +508,7 @@
"src/q8-gemm/4x4c2-sse2.c",
"src/q8-vadd/sse2.c",
"src/u8-clamp/sse2.c",
- "src/u8-maxpool/9p8q-sse2.c",
+ "src/u8-maxpool/9p8x-sse2-c16.c",
"src/u8-rmax/sse2.c",
"src/x32-pad/x2-sse2.c",
"src/x32-zip/x2-sse2.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 62d732c..4cdae83 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -225,7 +225,7 @@
src/f32-gemminc/2x4-scalar.c
src/f32-gemminc/4x4-scalar.c
src/f32-hswish/scalar.c
- src/f32-maxpool/9p8q-scalar.c
+ src/f32-maxpool/9p8x-scalar-c1.c
src/f32-pavgpool/mp9p8q-scalar.c
src/f32-pavgpool/up9-scalar.c
src/f32-ppmm/2x4-scalar.c
@@ -258,7 +258,7 @@
src/q8-vadd/scalar.c
src/u8-clamp/scalar.c
src/u8-lut32norm/scalar.c
- src/u8-maxpool/9p8q-scalar.c
+ src/u8-maxpool/9p8x-scalar-c1.c
src/u8-rmax/scalar.c
src/x32-packx/x2-scalar.c
src/x32-packx/x3-scalar.c
@@ -341,7 +341,7 @@
src/f32-gemminc/6x8-psimd-splat.c
src/f32-gemminc/6x8s4-psimd.c
src/f32-hswish/psimd.c
- src/f32-maxpool/9p8q-psimd.c
+ src/f32-maxpool/9p8x-psimd-c4.c
src/f32-pavgpool/mp9p8q-psimd.c
src/f32-pavgpool/up9-psimd.c
src/f32-ppmm/4x8-psimd.c
@@ -423,7 +423,7 @@
src/q8-gemm/8x8-neon.c
src/q8-vadd/neon.c
src/u8-clamp/neon.c
- src/u8-maxpool/9p8q-neon.c
+ src/u8-maxpool/9p8x-neon-c16.c
src/u8-rmax/neon.c
src/x32-packx/x4-neon-st4.c
src/x32-pad/x2-neon.c
@@ -558,7 +558,7 @@
src/f32-gemminc/4x8-sse-load1.c
src/f32-gemminc/4x8s4-sse.c
src/f32-hswish/sse.c
- src/f32-maxpool/9p8q-sse.c
+ src/f32-maxpool/9p8x-sse-c4.c
src/f32-pavgpool/mp9p8q-sse.c
src/f32-pavgpool/up9-sse.c
src/f32-dwconv-spchw/3x3p1-sse.c
@@ -589,7 +589,7 @@
src/q8-gemm/4x4c2-sse2.c
src/q8-vadd/sse2.c
src/u8-clamp/sse2.c
- src/u8-maxpool/9p8q-sse2.c
+ src/u8-maxpool/9p8x-sse2-c16.c
src/u8-rmax/sse2.c
src/x32-pad/x2-sse2.c
src/x32-zip/x2-sse2.c
diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh
index 821f456..7c03298 100755
--- a/scripts/generate-tests.sh
+++ b/scripts/generate-tests.sh
@@ -12,3 +12,9 @@
### Tests for packing micro-kernels
tools/generate-pack-test.py --spec test/x32-packx.yaml --output test/x32-packx.cc
+### Tests for MaxPool micro-kernels
+tools/generate-maxpool-test.py --spec test/u8-maxpool.yaml --output test/u8-maxpool.cc
+tools/generate-maxpool-test.py --spec test/f32-maxpool.yaml --output test/f32-maxpool.cc
+
+### Tests for ArgMaxPool micro-kernels
+tools/generate-argmaxpool-test.py --spec test/f32-argmaxpool.yaml --output test/f32-argmaxpool.cc
diff --git a/src/argmax-pooling-nhwc.c b/src/argmax-pooling-nhwc.c
index e4c359a..7c02073 100644
--- a/src/argmax-pooling-nhwc.c
+++ b/src/argmax-pooling-nhwc.c
@@ -208,20 +208,6 @@
argmax_pooling_op->output_width = compute_output_dimension(
argmax_pooling_op->padding_left + input_width + argmax_pooling_op->padding_right,
argmax_pooling_op->kernel_width);
- argmax_pooling_op->output = output;
-
- size_t valid_batch_size = 0;
- if (input == argmax_pooling_op->last_input &&
- input_height == argmax_pooling_op->last_input_height &&
- input_width == argmax_pooling_op->last_input_width)
- {
- valid_batch_size = argmax_pooling_op->valid_batch_size;
- if (batch_size <= valid_batch_size) {
- argmax_pooling_op->compute.range[0] = batch_size;
- argmax_pooling_op->state = xnn_run_state_ready;
- return xnn_status_success;
- }
- }
const size_t pooling_height = argmax_pooling_op->kernel_height;
const size_t pooling_width = argmax_pooling_op->kernel_width;
@@ -233,17 +219,26 @@
const size_t step_width = pooling_width;
const size_t step_height = pooling_size + (output_width * step_width - 1) * pooling_height;
- // Micro-kernel may read up to (mr - 1) elements after the end of indirection buffer.
- const size_t indirection_buffer_size = sizeof(void*) * ((mr - 1) + batch_size * output_height * step_height);
- const void** indirection_buffer = (const void**) xnn_reallocate_memory(argmax_pooling_op->indirection_buffer, indirection_buffer_size);
- if (indirection_buffer == NULL) {
- xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
- return xnn_status_out_of_memory;
+ if (input_height != argmax_pooling_op->last_input_height ||
+ input_width != argmax_pooling_op->last_input_width)
+ {
+ // Micro-kernel may read up to (mr - 1) elements after the end of indirection buffer.
+ const size_t indirection_buffer_size = sizeof(void*) * ((mr - 1) + output_height * step_height);
+
+ const void** indirection_buffer = (const void**) xnn_reallocate_memory(argmax_pooling_op->indirection_buffer, indirection_buffer_size);
+ if (indirection_buffer == NULL) {
+ xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
+ return xnn_status_out_of_memory;
+ }
+ argmax_pooling_op->indirection_buffer = indirection_buffer;
+
+ xnn_indirection_init_maxpool2d(argmax_pooling_op, step_height, step_width, 2 /* log2(sizeof(float)) */);
+
+ argmax_pooling_op->last_input = input;
+ argmax_pooling_op->last_input_height = input_height;
+ argmax_pooling_op->last_input_width = input_width;
}
- argmax_pooling_op->indirection_buffer = indirection_buffer;
-
- xnn_indirection_init_maxpool2d(argmax_pooling_op, valid_batch_size, step_height, step_width, 2 /* log2(sizeof(float)) */);
const size_t channels = argmax_pooling_op->channels;
@@ -255,21 +250,22 @@
const uint32_t qr = ukernel->qr;
const size_t multipass_adjustment = qr == 0 ? 0 : round_up(pooling_size - mr, qr) + mr - qr;
argmax_pooling_op->context.argmax_pooling = (struct argmax_pooling_context) {
- .indirect_input = indirection_buffer,
- .indirect_input_batch_stride = output_height * indirect_input_height_stride,
- .indirect_input_height_stride = indirect_input_height_stride,
- .output = output,
- .output_batch_stride = output_height * output_height_stride,
- .output_height_stride = output_height_stride,
- .output_width = output_width,
- .index = index,
- .index_batch_stride = output_height * index_height_stride,
- .index_height_stride = index_height_stride,
- .pooling_size = pooling_size,
- .channels = channels,
- .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
- .output_increment = output_width_stride - channels * sizeof(float),
- .params.f32 = argmax_pooling_op->f32_output_params,
+ .indirect_input = argmax_pooling_op->indirection_buffer,
+ .indirect_input_height_stride = indirect_input_height_stride,
+ .input_offset = (size_t) ((uintptr_t) input - (uintptr_t) argmax_pooling_op->last_input),
+ .input_batch_stride = input_height * input_width * argmax_pooling_op->input_pixel_stride * sizeof(float),
+ .output = output,
+ .output_batch_stride = output_height * output_height_stride,
+ .output_height_stride = output_height_stride,
+ .output_width = output_width,
+ .index = index,
+ .index_batch_stride = output_height * index_height_stride,
+ .index_height_stride = index_height_stride,
+ .pooling_size = pooling_size,
+ .channels = channels,
+ .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
+ .output_increment = output_width_stride - channels * sizeof(float),
+ .params.f32 = argmax_pooling_op->f32_output_params,
};
argmax_pooling_op->compute.type = xnn_parallelization_type_2d;
argmax_pooling_op->compute.range[0] = batch_size;
@@ -284,10 +280,6 @@
}
argmax_pooling_op->state = xnn_run_state_ready;
- argmax_pooling_op->last_input = input;
- argmax_pooling_op->last_input_height = input_height;
- argmax_pooling_op->last_input_width = input_width;
- argmax_pooling_op->valid_batch_size = max(valid_batch_size, batch_size);
-
return xnn_status_success;
}
+
diff --git a/src/f32-argmaxpool/up4-psimd.c b/src/f32-argmaxpool/4x-psimd-c4.c
similarity index 69%
rename from src/f32-argmaxpool/up4-psimd.c
rename to src/f32-argmaxpool/4x-psimd-c4.c
index fe0f1fc..197ff72 100644
--- a/src/f32-argmaxpool/up4-psimd.c
+++ b/src/f32-argmaxpool/4x-psimd-c4.c
@@ -10,44 +10,46 @@
#include <xnnpack/argmaxpool.h>
-void xnn_f32_argmaxpool_ukernel_up4__psimd(
- size_t n,
- size_t ks,
- size_t kc,
+void xnn_f32_argmaxpool_ukernel_4x__psimd_c4(
+ size_t output_pixels,
+ size_t pooling_elements,
+ size_t channels,
const float** input,
+ size_t input_offset,
float* output,
uint32_t* index,
size_t input_increment,
size_t output_increment,
const union xnn_f32_output_params params[restrict static 1])
{
- assert(n != 0);
- assert(ks != 0);
- assert(ks <= 4);
- assert(kc != 0);
+ assert(output_pixels != 0);
+ assert(pooling_elements != 0);
+ assert(pooling_elements <= 4);
+ assert(channels != 0);
const psimd_f32 voutput_max = psimd_load_splat_f32(¶ms->scalar.max);
const psimd_f32 voutput_min = psimd_load_splat_f32(¶ms->scalar.min);
do {
- float* o = output;
- uint32_t* i = index;
-
const float* i0 = input[0];
const float* i1 = input[1];
const float* i2 = input[2];
const float* i3 = input[3];
- if (ks < 2) {
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ if (pooling_elements < 2) {
i1 = i0;
}
- if (ks <= 2) {
+ if (pooling_elements <= 2) {
i2 = i0;
}
- if (ks != 4) {
+ if (pooling_elements != 4) {
i3 = i0;
}
- size_t k = kc;
- for (; k >= 4; k -= 4) {
+ size_t c = channels;
+ for (; c >= 4; c -= 4) {
const psimd_f32 vi0 = psimd_load_f32(i0);
i0 += 4;
const psimd_f32 vi1 = psimd_load_f32(i1);
@@ -74,12 +76,12 @@
const psimd_f32 vout = psimd_max_f32(psimd_min_f32(vmax, voutput_max), voutput_min);
- psimd_store_f32(o, vout);
- o += 4;
- psimd_store_u32(i, vidx);
- i += 4;
+ psimd_store_f32(output, vout);
+ output += 4;
+ psimd_store_u32(index, vidx);
+ index += 4;
}
- if (k != 0) {
+ if (c != 0) {
const psimd_f32 vi0 = psimd_load_f32(i0);
const psimd_f32 vi1 = psimd_load_f32(i1);
const psimd_f32 vi2 = psimd_load_f32(i2);
@@ -102,23 +104,22 @@
psimd_f32 vout = psimd_max_f32(psimd_min_f32(vmax, voutput_max), voutput_min);
- if (k & 2) {
- psimd_store2_f32(o, vout);
- psimd_store2_u32(i, vidx);
+ if (c & 2) {
+ psimd_store2_f32(output, vout);
+ psimd_store2_u32(index, vidx);
vout = psimd_concat_hi_f32(vout, vout);
vidx = psimd_concat_hi_u32(vidx, vidx);
- o += 2;
- i += 2;
+ output += 2;
+ index += 2;
}
- if (k & 1) {
- psimd_store1_f32(o, vout);
- psimd_store1_u32(i, vidx);
- o += 1;
- i += 1;
+ if (c & 1) {
+ psimd_store1_f32(output, vout);
+ psimd_store1_u32(index, vidx);
+ output += 1;
+ index += 1;
}
}
input = (const float**) ((uintptr_t) input + input_increment);
- output = (float*) ((uintptr_t) o + output_increment);
- index = (uint32_t*) i;
- } while (--n != 0);
+ output = (float*) ((uintptr_t) output + output_increment);
+ } while (--output_pixels != 0);
}
diff --git a/src/f32-argmaxpool/up4-scalar.c b/src/f32-argmaxpool/4x-scalar-c1.c
similarity index 61%
rename from src/f32-argmaxpool/up4-scalar.c
rename to src/f32-argmaxpool/4x-scalar-c1.c
index 8b668b0..999ff5e 100644
--- a/src/f32-argmaxpool/up4-scalar.c
+++ b/src/f32-argmaxpool/4x-scalar-c1.c
@@ -9,43 +9,45 @@
#include <xnnpack/math.h>
-void xnn_f32_argmaxpool_ukernel_up4__scalar(
- size_t n,
- size_t ks,
- size_t kc,
+void xnn_f32_argmaxpool_ukernel_4x__scalar_c1(
+ size_t output_pixels,
+ size_t pooling_elements,
+ size_t channels,
const float** input,
+ size_t input_offset,
float* output,
uint32_t* index,
size_t input_increment,
size_t output_increment,
const union xnn_f32_output_params params[restrict static 1])
{
- assert(n != 0);
- assert(ks != 0);
- assert(ks <= 4);
- assert(kc != 0);
+ assert(output_pixels != 0);
+ assert(pooling_elements != 0);
+ assert(pooling_elements <= 4);
+ assert(channels != 0);
const float voutput_max = params->scalar.max;
const float voutput_min = params->scalar.min;
do {
- float* o = output;
- uint32_t* i = index;
-
const float* i0 = input[0];
const float* i1 = input[1];
const float* i2 = input[2];
const float* i3 = input[3];
- if (ks < 2) {
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ if (pooling_elements < 2) {
i1 = i0;
}
- if (ks <= 2) {
+ if (pooling_elements <= 2) {
i2 = i0;
}
- if (ks != 4) {
+ if (pooling_elements != 4) {
i3 = i0;
}
- size_t k = kc;
+ size_t c = channels;
do {
const float vi0 = *i0++;
const float vi1 = *i1++;
@@ -72,11 +74,10 @@
const float vout = math_max_f32(math_min_f32(vmax, voutput_max), voutput_min);
- *o++ = vout;
- *i++ = vidx;
- } while (--k != 0);
+ *output++ = vout;
+ *index++ = vidx;
+ } while (--c != 0);
input = (const float**) ((uintptr_t) input + input_increment);
- output = (float*) ((uintptr_t) o + output_increment);
- index = (uint32_t*) i;
- } while (--n != 0);
+ output = (float*) ((uintptr_t) output + output_increment);
+ } while (--output_pixels != 0);
}
diff --git a/src/f32-argmaxpool/up4-sse2.c b/src/f32-argmaxpool/4x-sse2-c4.c
similarity index 70%
rename from src/f32-argmaxpool/up4-sse2.c
rename to src/f32-argmaxpool/4x-sse2-c4.c
index 1f0e3cf..14051c8 100644
--- a/src/f32-argmaxpool/up4-sse2.c
+++ b/src/f32-argmaxpool/4x-sse2-c4.c
@@ -10,44 +10,46 @@
#include <xnnpack/argmaxpool.h>
-void xnn_f32_argmaxpool_ukernel_up4__sse2(
- size_t n,
- size_t ks,
- size_t kc,
+void xnn_f32_argmaxpool_ukernel_4x__sse2_c4(
+ size_t output_pixels,
+ size_t pooling_elements,
+ size_t channels,
const float** input,
+ size_t input_offset,
float* output,
uint32_t* index,
size_t input_increment,
size_t output_increment,
const union xnn_f32_output_params params[restrict static 1])
{
- assert(n != 0);
- assert(ks != 0);
- assert(ks <= 4);
- assert(kc != 0);
+ assert(output_pixels != 0);
+ assert(pooling_elements != 0);
+ assert(pooling_elements <= 4);
+ assert(channels != 0);
const __m128 voutput_max = _mm_load_ps(params->sse.max);
const __m128 voutput_min = _mm_load_ps(params->sse.min);
do {
- float* o = output;
- uint32_t* i = index;
-
const float* i0 = input[0];
const float* i1 = input[1];
const float* i2 = input[2];
const float* i3 = input[3];
- if (ks < 2) {
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ if (pooling_elements < 2) {
i1 = i0;
}
- if (ks <= 2) {
+ if (pooling_elements <= 2) {
i2 = i0;
}
- if (ks != 4) {
+ if (pooling_elements != 4) {
i3 = i0;
}
- size_t k = kc;
- for (; k >= 4; k -= 4) {
+ size_t c = channels;
+ for (; c >= 4; c -= 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
@@ -74,12 +76,12 @@
const __m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
- _mm_storeu_ps(o, vout);
- o += 4;
- _mm_storeu_si128((__m128i*) i, vidx);
- i += 4;
+ _mm_storeu_ps(output, vout);
+ output += 4;
+ _mm_storeu_si128((__m128i*) index, vidx);
+ index += 4;
}
- if (k != 0) {
+ if (c != 0) {
const __m128 vi0 = _mm_loadu_ps(i0);
const __m128 vi1 = _mm_loadu_ps(i1);
const __m128 vi2 = _mm_loadu_ps(i2);
@@ -102,23 +104,22 @@
__m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
- if (k & 2) {
- _mm_store_sd((double*) o, _mm_castps_pd(vout));
- _mm_storel_epi64((__m128i*) i, vidx);
+ if (c & 2) {
+ _mm_store_sd((double*) output, _mm_castps_pd(vout));
+ _mm_storel_epi64((__m128i*) index, vidx);
vout = _mm_movehl_ps(vout, vout);
vidx = _mm_unpackhi_epi64(vidx, vidx);
- o += 2;
- i += 2;
+ output += 2;
+ index += 2;
}
- if (k & 1) {
- _mm_store_ss(o, vout);
- *i = (uint32_t) _mm_cvtsi128_si32(vidx);
- o += 1;
- i += 1;
+ if (c & 1) {
+ _mm_store_ss(output, vout);
+ *index = (uint32_t) _mm_cvtsi128_si32(vidx);
+ output += 1;
+ index += 1;
}
}
input = (const float**) ((uintptr_t) input + input_increment);
- output = (float*) ((uintptr_t) o + output_increment);
- index = (uint32_t*) i;
- } while (--n != 0);
+ output = (float*) ((uintptr_t) output + output_increment);
+ } while (--output_pixels != 0);
}
diff --git a/src/f32-argmaxpool/mp9p8q-psimd.c b/src/f32-argmaxpool/9p8x-psimd-c4.c
similarity index 82%
rename from src/f32-argmaxpool/mp9p8q-psimd.c
rename to src/f32-argmaxpool/9p8x-psimd-c4.c
index 2e2564a8..74c6cd3 100644
--- a/src/f32-argmaxpool/mp9p8q-psimd.c
+++ b/src/f32-argmaxpool/9p8x-psimd-c4.c
@@ -10,12 +10,13 @@
#include <xnnpack/argmaxpool.h>
-void xnn_f32_argmaxpool_ukernel_mp9p8q__psimd(
- size_t n,
- size_t ks,
- size_t kc,
+void xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4(
+ size_t output_pixels,
+ size_t pooling_elements,
+ size_t channels,
const float** input,
- float* acc_buffer,
+ size_t input_offset,
+ float* accumulation_buffer,
uint32_t* index_buffer,
float* output,
uint32_t* index,
@@ -23,16 +24,16 @@
size_t output_increment,
const union xnn_f32_output_params params[restrict static 1])
{
- assert(n != 0);
- assert(ks != 0);
- assert(ks > 9);
- assert(kc != 0);
+ assert(output_pixels != 0);
+ assert(pooling_elements != 0);
+ assert(pooling_elements > 9);
+ assert(channels != 0);
const psimd_f32 voutput_max = psimd_load_splat_f32(¶ms->scalar.max);
const psimd_f32 voutput_min = psimd_load_splat_f32(¶ms->scalar.min);
do {
{
- float* ab = acc_buffer;
+ float* ab = accumulation_buffer;
uint32_t* ib = index_buffer;
const float* i0 = *input++;
@@ -44,8 +45,17 @@
const float* i6 = *input++;
const float* i7 = *input++;
const float* i8 = *input++;
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ i4 = (const float*) ((uintptr_t) i4 + input_offset);
+ i5 = (const float*) ((uintptr_t) i5 + input_offset);
+ i6 = (const float*) ((uintptr_t) i6 + input_offset);
+ i7 = (const float*) ((uintptr_t) i7 + input_offset);
+ i8 = (const float*) ((uintptr_t) i8 + input_offset);
- for (size_t k = 0; k < kc; k += 4) {
+ for (size_t c = 0; c < channels; c += 4) {
const psimd_f32 vi0 = psimd_load_f32(i0);
i0 += 4;
const psimd_f32 vi1 = psimd_load_f32(i1);
@@ -110,8 +120,8 @@
const psimd_u32 v8 = psimd_splat_u32(8);
psimd_u32 vidx0 = psimd_add_u32(v1, v8);
- size_t m = ks;
- for (m -= 9; m > 8; m -= 8) {
+ size_t k = pooling_elements;
+ for (k -= 9; k > 8; k -= 8) {
const float* i0 = *input++;
const float* i1 = *input++;
const float* i2 = *input++;
@@ -120,11 +130,19 @@
const float* i5 = *input++;
const float* i6 = *input++;
const float* i7 = *input++;
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ i4 = (const float*) ((uintptr_t) i4 + input_offset);
+ i5 = (const float*) ((uintptr_t) i5 + input_offset);
+ i6 = (const float*) ((uintptr_t) i6 + input_offset);
+ i7 = (const float*) ((uintptr_t) i7 + input_offset);
- float* ab = acc_buffer;
+ float* ab = accumulation_buffer;
uint32_t* ib = index_buffer;
- for (size_t k = 0; k < kc; k += 4) {
+ for (size_t c = 0; c < channels; c += 4) {
const psimd_f32 vi0 = psimd_load_f32(i0);
i0 += 4;
const psimd_f32 vi1 = psimd_load_f32(i1);
@@ -203,33 +221,41 @@
const float* i5 = input[5];
const float* i6 = input[6];
const float* i7 = input[7];
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ i4 = (const float*) ((uintptr_t) i4 + input_offset);
+ i5 = (const float*) ((uintptr_t) i5 + input_offset);
+ i6 = (const float*) ((uintptr_t) i6 + input_offset);
+ i7 = (const float*) ((uintptr_t) i7 + input_offset);
input = (const float**) ((uintptr_t) input + input_increment);
- if (m < 2) {
+ if (k < 2) {
i1 = i0;
}
- if (m <= 2) {
+ if (k <= 2) {
i2 = i0;
}
- if (m < 4) {
+ if (k < 4) {
i3 = i0;
}
- if (m <= 4) {
+ if (k <= 4) {
i4 = i0;
}
- if (m < 6) {
+ if (k < 6) {
i5 = i0;
}
- if (m <= 6) {
+ if (k <= 6) {
i6 = i0;
}
- if (m != 8) {
+ if (k != 8) {
i7 = i0;
}
- size_t k = kc;
- float* ab = acc_buffer;
+ size_t c = channels;
+ float* ab = accumulation_buffer;
uint32_t* ib = index_buffer;
- for (; k >= 4; k -= 4) {
+ for (; c >= 4; c -= 4) {
const psimd_f32 vi0 = psimd_load_f32(i0);
i0 += 4;
const psimd_f32 vi1 = psimd_load_f32(i1);
@@ -298,7 +324,7 @@
psimd_store_u32(i, vidx);
i += 4;
}
- if (k != 0) {
+ if (c != 0) {
const psimd_f32 vi0 = psimd_load_f32(i0);
const psimd_f32 vi1 = psimd_load_f32(i1);
const psimd_f32 vi2 = psimd_load_f32(i2);
@@ -352,7 +378,7 @@
psimd_f32 vout = psimd_max_f32(psimd_min_f32(vmax, voutput_max), voutput_min);
- if (k & 2) {
+ if (c & 2) {
psimd_store2_f32(o, vout);
psimd_store2_u32(i, vidx);
vout = psimd_concat_hi_f32(vout, vout);
@@ -360,7 +386,7 @@
o += 2;
i += 2;
}
- if (k & 1) {
+ if (c & 1) {
psimd_store1_f32(o, vout);
psimd_store1_u32(i, vidx);
o += 1;
@@ -371,5 +397,5 @@
output = (float*) ((uintptr_t) o + output_increment);
index = (uint32_t*) i;
- } while (--n != 0);
+ } while (--output_pixels != 0);
}
diff --git a/src/f32-argmaxpool/mp9p8q-scalar.c b/src/f32-argmaxpool/9p8x-scalar-c1.c
similarity index 69%
rename from src/f32-argmaxpool/mp9p8q-scalar.c
rename to src/f32-argmaxpool/9p8x-scalar-c1.c
index 0f9f832..10bb965 100644
--- a/src/f32-argmaxpool/mp9p8q-scalar.c
+++ b/src/f32-argmaxpool/9p8x-scalar-c1.c
@@ -9,12 +9,13 @@
#include <xnnpack/math.h>
-void xnn_f32_argmaxpool_ukernel_mp9p8q__scalar(
- size_t n,
- size_t ks,
- size_t kc,
+void xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1(
+ size_t output_pixels,
+ size_t pooling_elements,
+ size_t channels,
const float** input,
- float* acc_buffer,
+ size_t input_offset,
+ float* accumulation_buffer,
uint32_t* index_buffer,
float* output,
uint32_t* index,
@@ -22,16 +23,16 @@
size_t output_increment,
const union xnn_f32_output_params params[restrict static 1])
{
- assert(n != 0);
- assert(ks != 0);
- assert(ks > 9);
- assert(kc != 0);
+ assert(output_pixels != 0);
+ assert(pooling_elements != 0);
+ assert(pooling_elements > 9);
+ assert(channels != 0);
const float voutput_max = params->scalar.max;
const float voutput_min = params->scalar.min;
do {
{
- float* ab = acc_buffer;
+ float* ab = accumulation_buffer;
uint32_t* ib = index_buffer;
const float* i0 = *input++;
@@ -43,8 +44,17 @@
const float* i6 = *input++;
const float* i7 = *input++;
const float* i8 = *input++;
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ i4 = (const float*) ((uintptr_t) i4 + input_offset);
+ i5 = (const float*) ((uintptr_t) i5 + input_offset);
+ i6 = (const float*) ((uintptr_t) i6 + input_offset);
+ i7 = (const float*) ((uintptr_t) i7 + input_offset);
+ i8 = (const float*) ((uintptr_t) i8 + input_offset);
- size_t k = kc;
+ size_t c = channels;
do {
const float vi0 = *i0++;
const float vi1 = *i1++;
@@ -101,11 +111,11 @@
*ab++ = vmax;
*ib++ = vidx;
- } while (--k != 0);
+ } while (--c != 0);
}
uint32_t vidx0 = 9;
- size_t m = ks;
- for (m -= 9; m > 8; m -= 8) {
+ size_t k = pooling_elements;
+ for (k -= 9; k > 8; k -= 8) {
const float* i0 = *input++;
const float* i1 = *input++;
const float* i2 = *input++;
@@ -114,11 +124,19 @@
const float* i5 = *input++;
const float* i6 = *input++;
const float* i7 = *input++;
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ i4 = (const float*) ((uintptr_t) i4 + input_offset);
+ i5 = (const float*) ((uintptr_t) i5 + input_offset);
+ i6 = (const float*) ((uintptr_t) i6 + input_offset);
+ i7 = (const float*) ((uintptr_t) i7 + input_offset);
- float* ab = acc_buffer;
+ float* ab = accumulation_buffer;
uint32_t* ib = index_buffer;
- size_t k = kc;
+ size_t c = channels;
do {
const float vi0 = *i0++;
const float vi1 = *i1++;
@@ -174,7 +192,7 @@
*ab++ = vmax;
*ib++ = vidx;
- } while (--k != 0);
+ } while (--c != 0);
vidx0 += 8;
}
@@ -189,31 +207,39 @@
const float* i5 = input[5];
const float* i6 = input[6];
const float* i7 = input[7];
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ i4 = (const float*) ((uintptr_t) i4 + input_offset);
+ i5 = (const float*) ((uintptr_t) i5 + input_offset);
+ i6 = (const float*) ((uintptr_t) i6 + input_offset);
+ i7 = (const float*) ((uintptr_t) i7 + input_offset);
input = (const float**) ((uintptr_t) input + input_increment);
- if (m < 2) {
+ if (k < 2) {
i1 = i0;
}
- if (m <= 2) {
+ if (k <= 2) {
i2 = i0;
}
- if (m < 4) {
+ if (k < 4) {
i3 = i0;
}
- if (m <= 4) {
+ if (k <= 4) {
i4 = i0;
}
- if (m < 6) {
+ if (k < 6) {
i5 = i0;
}
- if (m <= 6) {
+ if (k <= 6) {
i6 = i0;
}
- if (m != 8) {
+ if (k != 8) {
i7 = i0;
}
- size_t k = kc;
- float* ab = acc_buffer;
+ size_t c = channels;
+ float* ab = accumulation_buffer;
uint32_t* ib = index_buffer;
do {
const float vi0 = *i0++;
@@ -272,10 +298,10 @@
*o++ = vout;
*i++ = vidx;
- } while (--k != 0);
+ } while (--c != 0);
}
output = (float*) ((uintptr_t) o + output_increment);
index = (uint32_t*) i;
- } while (--n != 0);
+ } while (--output_pixels != 0);
}
diff --git a/src/f32-argmaxpool/mp9p8q-sse2.c b/src/f32-argmaxpool/9p8x-sse2-c4.c
similarity index 84%
rename from src/f32-argmaxpool/mp9p8q-sse2.c
rename to src/f32-argmaxpool/9p8x-sse2-c4.c
index 31b55bf..30a3443 100644
--- a/src/f32-argmaxpool/mp9p8q-sse2.c
+++ b/src/f32-argmaxpool/9p8x-sse2-c4.c
@@ -10,12 +10,13 @@
#include <xnnpack/argmaxpool.h>
-void xnn_f32_argmaxpool_ukernel_mp9p8q__sse2(
- size_t n,
- size_t ks,
- size_t kc,
+void xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4(
+ size_t output_pixels,
+ size_t pooling_elements,
+ size_t channels,
const float** input,
- float* acc_buffer,
+ size_t input_offset,
+ float* accumulation_buffer,
uint32_t* index_buffer,
float* output,
uint32_t* index,
@@ -23,16 +24,16 @@
size_t output_increment,
const union xnn_f32_output_params params[restrict static 1])
{
- assert(n != 0);
- assert(ks != 0);
- assert(ks > 9);
- assert(kc != 0);
+ assert(output_pixels != 0);
+ assert(pooling_elements != 0);
+ assert(pooling_elements > 9);
+ assert(channels != 0);
const __m128 voutput_max = _mm_load_ps(params->sse.max);
const __m128 voutput_min = _mm_load_ps(params->sse.min);
do {
{
- float* ab = acc_buffer;
+ float* ab = accumulation_buffer;
uint32_t* ib = index_buffer;
const float* i0 = *input++;
@@ -44,8 +45,17 @@
const float* i6 = *input++;
const float* i7 = *input++;
const float* i8 = *input++;
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ i4 = (const float*) ((uintptr_t) i4 + input_offset);
+ i5 = (const float*) ((uintptr_t) i5 + input_offset);
+ i6 = (const float*) ((uintptr_t) i6 + input_offset);
+ i7 = (const float*) ((uintptr_t) i7 + input_offset);
+ i8 = (const float*) ((uintptr_t) i8 + input_offset);
- for (size_t k = 0; k < kc; k += 4) {
+ for (size_t c = 0; c < channels; c += 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
@@ -110,8 +120,8 @@
const __m128i v8 = _mm_set1_epi32(8);
__m128i vidx0 = _mm_add_epi32(v1, v8);
- size_t m = ks;
- for (m -= 9; m > 8; m -= 8) {
+ size_t k = pooling_elements;
+ for (k -= 9; k > 8; k -= 8) {
const float* i0 = *input++;
const float* i1 = *input++;
const float* i2 = *input++;
@@ -120,11 +130,19 @@
const float* i5 = *input++;
const float* i6 = *input++;
const float* i7 = *input++;
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ i4 = (const float*) ((uintptr_t) i4 + input_offset);
+ i5 = (const float*) ((uintptr_t) i5 + input_offset);
+ i6 = (const float*) ((uintptr_t) i6 + input_offset);
+ i7 = (const float*) ((uintptr_t) i7 + input_offset);
- float* ab = acc_buffer;
+ float* ab = accumulation_buffer;
uint32_t* ib = index_buffer;
- for (size_t k = 0; k < kc; k += 4) {
+ for (size_t c = 0; c < channels; c += 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
@@ -203,33 +221,41 @@
const float* i5 = input[5];
const float* i6 = input[6];
const float* i7 = input[7];
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ i4 = (const float*) ((uintptr_t) i4 + input_offset);
+ i5 = (const float*) ((uintptr_t) i5 + input_offset);
+ i6 = (const float*) ((uintptr_t) i6 + input_offset);
+ i7 = (const float*) ((uintptr_t) i7 + input_offset);
input = (const float**) ((uintptr_t) input + input_increment);
- if (m < 2) {
+ if (k < 2) {
i1 = i0;
}
- if (m <= 2) {
+ if (k <= 2) {
i2 = i0;
}
- if (m < 4) {
+ if (k < 4) {
i3 = i0;
}
- if (m <= 4) {
+ if (k <= 4) {
i4 = i0;
}
- if (m < 6) {
+ if (k < 6) {
i5 = i0;
}
- if (m <= 6) {
+ if (k <= 6) {
i6 = i0;
}
- if (m != 8) {
+ if (k != 8) {
i7 = i0;
}
- size_t k = kc;
- float* ab = acc_buffer;
+ size_t c = channels;
+ float* ab = accumulation_buffer;
uint32_t* ib = index_buffer;
- for (; k >= 4; k -= 4) {
+ for (; c >= 4; c -= 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
@@ -298,7 +324,7 @@
_mm_storeu_si128((__m128i*) i, vidx);
i += 4;
}
- if (k != 0) {
+ if (c != 0) {
const __m128 vi0 = _mm_loadu_ps(i0);
const __m128 vi1 = _mm_loadu_ps(i1);
const __m128 vi2 = _mm_loadu_ps(i2);
@@ -352,7 +378,7 @@
__m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
- if (k & 2) {
+ if (c & 2) {
_mm_store_sd((double*) o, _mm_castps_pd(vout));
_mm_storel_epi64((__m128i*) i, vidx);
vout = _mm_movehl_ps(vout, vout);
@@ -360,7 +386,7 @@
o += 2;
i += 2;
}
- if (k & 1) {
+ if (c & 1) {
_mm_store_ss(o, vout);
*i = (uint32_t) _mm_cvtsi128_si32(vidx);
o += 1;
@@ -371,5 +397,5 @@
output = (float*) ((uintptr_t) o + output_increment);
index = (uint32_t*) i;
- } while (--n != 0);
+ } while (--output_pixels != 0);
}
diff --git a/src/f32-argmaxpool/up9-psimd.c b/src/f32-argmaxpool/9x-psimd-c4.c
similarity index 75%
rename from src/f32-argmaxpool/up9-psimd.c
rename to src/f32-argmaxpool/9x-psimd-c4.c
index 73d7931..4b642d5 100644
--- a/src/f32-argmaxpool/up9-psimd.c
+++ b/src/f32-argmaxpool/9x-psimd-c4.c
@@ -10,28 +10,26 @@
#include <xnnpack/argmaxpool.h>
-void xnn_f32_argmaxpool_ukernel_up9__psimd(
- size_t n,
- size_t ks,
- size_t kc,
+void xnn_f32_argmaxpool_ukernel_9x__psimd_c4(
+ size_t output_pixels,
+ size_t pooling_elements,
+ size_t channels,
const float** input,
+ size_t input_offset,
float* output,
uint32_t* index,
size_t input_increment,
size_t output_increment,
const union xnn_f32_output_params params[restrict static 1])
{
- assert(n != 0);
- assert(ks != 0);
- assert(ks <= 9);
- assert(kc != 0);
+ assert(output_pixels != 0);
+ assert(pooling_elements != 0);
+ assert(pooling_elements <= 9);
+ assert(channels != 0);
const psimd_f32 voutput_max = psimd_load_splat_f32(&params->scalar.max);
const psimd_f32 voutput_min = psimd_load_splat_f32(&params->scalar.min);
do {
- float* o = output;
- uint32_t* i = index;
-
const float* i0 = input[0];
const float* i1 = input[1];
const float* i2 = input[2];
@@ -41,33 +39,42 @@
const float* i6 = input[6];
const float* i7 = input[7];
const float* i8 = input[8];
- if (ks < 2) {
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ i4 = (const float*) ((uintptr_t) i4 + input_offset);
+ i5 = (const float*) ((uintptr_t) i5 + input_offset);
+ i6 = (const float*) ((uintptr_t) i6 + input_offset);
+ i7 = (const float*) ((uintptr_t) i7 + input_offset);
+ i8 = (const float*) ((uintptr_t) i8 + input_offset);
+ if (pooling_elements < 2) {
i1 = i0;
}
- if (ks <= 2) {
+ if (pooling_elements <= 2) {
i2 = i0;
}
- if (ks < 4) {
+ if (pooling_elements < 4) {
i3 = i0;
}
- if (ks <= 4) {
+ if (pooling_elements <= 4) {
i4 = i0;
}
- if (ks < 6) {
+ if (pooling_elements < 6) {
i5 = i0;
}
- if (ks <= 6) {
+ if (pooling_elements <= 6) {
i6 = i0;
}
- if (ks < 8) {
+ if (pooling_elements < 8) {
i7 = i0;
}
- if (ks <= 8) {
+ if (pooling_elements <= 8) {
i8 = i0;
}
- size_t k = kc;
- for (; k >= 4; k -= 4) {
+ size_t c = channels;
+ for (; c >= 4; c -= 4) {
const psimd_f32 vi0 = psimd_load_f32(i0);
i0 += 4;
const psimd_f32 vi1 = psimd_load_f32(i1);
@@ -124,12 +131,12 @@
const psimd_f32 vout = psimd_max_f32(psimd_min_f32(vmax, voutput_max), voutput_min);
- psimd_store_f32(o, vout);
- o += 4;
- psimd_store_u32(i, vidx);
- i += 4;
+ psimd_store_f32(output, vout);
+ output += 4;
+ psimd_store_u32(index, vidx);
+ index += 4;
}
- if (k != 0) {
+ if (c != 0) {
const psimd_f32 vi0 = psimd_load_f32(i0);
const psimd_f32 vi1 = psimd_load_f32(i1);
const psimd_f32 vi2 = psimd_load_f32(i2);
@@ -177,23 +184,22 @@
psimd_f32 vout = psimd_max_f32(psimd_min_f32(vmax, voutput_max), voutput_min);
- if (k & 2) {
- psimd_store2_f32(o, vout);
- psimd_store2_u32(i, vidx);
+ if (c & 2) {
+ psimd_store2_f32(output, vout);
+ psimd_store2_u32(index, vidx);
vout = psimd_concat_hi_f32(vout, vout);
vidx = psimd_concat_hi_u32(vidx, vidx);
- o += 2;
- i += 2;
+ output += 2;
+ index += 2;
}
- if (k & 1) {
- psimd_store1_f32(o, vout);
- psimd_store1_u32(i, vidx);
- o += 1;
- i += 1;
+ if (c & 1) {
+ psimd_store1_f32(output, vout);
+ psimd_store1_u32(index, vidx);
+ output += 1;
+ index += 1;
}
}
input = (const float**) ((uintptr_t) input + input_increment);
- output = (float*) ((uintptr_t) o + output_increment);
- index = (uint32_t*) i;
- } while (--n != 0);
+ output = (float*) ((uintptr_t) output + output_increment);
+ } while (--output_pixels != 0);
}
diff --git a/src/f32-argmaxpool/up9-scalar.c b/src/f32-argmaxpool/9x-scalar-c1.c
similarity index 62%
rename from src/f32-argmaxpool/up9-scalar.c
rename to src/f32-argmaxpool/9x-scalar-c1.c
index 08b6fa0..7324e39 100644
--- a/src/f32-argmaxpool/up9-scalar.c
+++ b/src/f32-argmaxpool/9x-scalar-c1.c
@@ -9,28 +9,26 @@
#include <xnnpack/math.h>
-void xnn_f32_argmaxpool_ukernel_up9__scalar(
- size_t n,
- size_t ks,
- size_t kc,
+void xnn_f32_argmaxpool_ukernel_9x__scalar_c1(
+ size_t output_pixels,
+ size_t pooling_elements,
+ size_t channels,
const float** input,
+ size_t input_offset,
float* output,
uint32_t* index,
size_t input_increment,
size_t output_increment,
const union xnn_f32_output_params params[restrict static 1])
{
- assert(n != 0);
- assert(ks != 0);
- assert(ks <= 9);
- assert(kc != 0);
+ assert(output_pixels != 0);
+ assert(pooling_elements != 0);
+ assert(pooling_elements <= 9);
+ assert(channels != 0);
const float voutput_max = params->scalar.max;
const float voutput_min = params->scalar.min;
do {
- float* o = output;
- uint32_t* i = index;
-
const float* i0 = input[0];
const float* i1 = input[1];
const float* i2 = input[2];
@@ -40,32 +38,41 @@
const float* i6 = input[6];
const float* i7 = input[7];
const float* i8 = input[8];
- if (ks < 2) {
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ i4 = (const float*) ((uintptr_t) i4 + input_offset);
+ i5 = (const float*) ((uintptr_t) i5 + input_offset);
+ i6 = (const float*) ((uintptr_t) i6 + input_offset);
+ i7 = (const float*) ((uintptr_t) i7 + input_offset);
+ i8 = (const float*) ((uintptr_t) i8 + input_offset);
+ if (pooling_elements < 2) {
i1 = i0;
}
- if (ks <= 2) {
+ if (pooling_elements <= 2) {
i2 = i0;
}
- if (ks < 4) {
+ if (pooling_elements < 4) {
i3 = i0;
}
- if (ks <= 4) {
+ if (pooling_elements <= 4) {
i4 = i0;
}
- if (ks < 6) {
+ if (pooling_elements < 6) {
i5 = i0;
}
- if (ks <= 6) {
+ if (pooling_elements <= 6) {
i6 = i0;
}
- if (ks < 8) {
+ if (pooling_elements < 8) {
i7 = i0;
}
- if (ks <= 8) {
+ if (pooling_elements <= 8) {
i8 = i0;
}
- size_t k = kc;
+ size_t c = channels;
do {
const float vi0 = *i0++;
const float vi1 = *i1++;
@@ -122,11 +129,10 @@
const float vout = math_max_f32(math_min_f32(vmax, voutput_max), voutput_min);
- *o++ = vout;
- *i++ = vidx;
- } while (--k != 0);
+ *output++ = vout;
+ *index++ = vidx;
+ } while (--c != 0);
input = (const float**) ((uintptr_t) input + input_increment);
- output = (float*) ((uintptr_t) o + output_increment);
- index = (uint32_t*) i;
- } while (--n != 0);
+ output = (float*) ((uintptr_t) output + output_increment);
+ } while (--output_pixels != 0);
}
diff --git a/src/f32-argmaxpool/up9-sse2.c b/src/f32-argmaxpool/9x-sse2-c4.c
similarity index 83%
rename from src/f32-argmaxpool/up9-sse2.c
rename to src/f32-argmaxpool/9x-sse2-c4.c
index acd8609..c0d2075 100644
--- a/src/f32-argmaxpool/up9-sse2.c
+++ b/src/f32-argmaxpool/9x-sse2-c4.c
@@ -10,21 +10,22 @@
#include <xnnpack/argmaxpool.h>
-void xnn_f32_argmaxpool_ukernel_up9__sse2(
- size_t n,
- size_t ks,
- size_t kc,
+void xnn_f32_argmaxpool_ukernel_9x__sse2_c4(
+ size_t output_pixels,
+ size_t pooling_elements,
+ size_t channels,
const float** input,
+ size_t input_offset,
float* output,
uint32_t* index,
size_t input_increment,
size_t output_increment,
const union xnn_f32_output_params params[restrict static 1])
{
- assert(n != 0);
- assert(ks != 0);
- assert(ks <= 9);
- assert(kc != 0);
+ assert(output_pixels != 0);
+ assert(pooling_elements != 0);
+ assert(pooling_elements <= 9);
+ assert(channels != 0);
const __m128 voutput_max = _mm_load_ps(params->sse.max);
const __m128 voutput_min = _mm_load_ps(params->sse.min);
@@ -41,33 +42,42 @@
const float* i6 = input[6];
const float* i7 = input[7];
const float* i8 = input[8];
- if (ks < 2) {
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ i4 = (const float*) ((uintptr_t) i4 + input_offset);
+ i5 = (const float*) ((uintptr_t) i5 + input_offset);
+ i6 = (const float*) ((uintptr_t) i6 + input_offset);
+ i7 = (const float*) ((uintptr_t) i7 + input_offset);
+ i8 = (const float*) ((uintptr_t) i8 + input_offset);
+ if (pooling_elements < 2) {
i1 = i0;
}
- if (ks <= 2) {
+ if (pooling_elements <= 2) {
i2 = i0;
}
- if (ks < 4) {
+ if (pooling_elements < 4) {
i3 = i0;
}
- if (ks <= 4) {
+ if (pooling_elements <= 4) {
i4 = i0;
}
- if (ks < 6) {
+ if (pooling_elements < 6) {
i5 = i0;
}
- if (ks <= 6) {
+ if (pooling_elements <= 6) {
i6 = i0;
}
- if (ks < 8) {
+ if (pooling_elements < 8) {
i7 = i0;
}
- if (ks <= 8) {
+ if (pooling_elements <= 8) {
i8 = i0;
}
- size_t k = kc;
- for (; k >= 4; k -= 4) {
+ size_t c = channels;
+ for (; c >= 4; c -= 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
@@ -129,7 +139,7 @@
_mm_storeu_si128((__m128i*) i, vidx);
i += 4;
}
- if (k != 0) {
+ if (c != 0) {
const __m128 vi0 = _mm_loadu_ps(i0);
const __m128 vi1 = _mm_loadu_ps(i1);
const __m128 vi2 = _mm_loadu_ps(i2);
@@ -177,7 +187,7 @@
__m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
- if (k & 2) {
+ if (c & 2) {
_mm_store_sd((double*) o, _mm_castps_pd(vout));
_mm_storel_epi64((__m128i*) i, vidx);
vout = _mm_movehl_ps(vout, vout);
@@ -185,7 +195,7 @@
o += 2;
i += 2;
}
- if (k & 1) {
+ if (c & 1) {
_mm_store_ss(o, vout);
*i = (uint32_t) _mm_cvtsi128_si32(vidx);
o += 1;
@@ -195,5 +205,5 @@
input = (const float**) ((uintptr_t) input + input_increment);
output = (float*) ((uintptr_t) o + output_increment);
index = (uint32_t*) i;
- } while (--n != 0);
+ } while (--output_pixels != 0);
}
diff --git a/src/f32-maxpool/9p8q-psimd.c b/src/f32-maxpool/9p8x-psimd-c4.c
similarity index 76%
rename from src/f32-maxpool/9p8q-psimd.c
rename to src/f32-maxpool/9p8x-psimd-c4.c
index c973db3..a502c90 100644
--- a/src/f32-maxpool/9p8q-psimd.c
+++ b/src/f32-maxpool/9p8x-psimd-c4.c
@@ -10,19 +10,20 @@
#include <xnnpack/maxpool.h>
-void xnn_f32_maxpool_ukernel_9p8q__psimd(
- size_t n,
- size_t ks,
- size_t kc,
+void xnn_f32_maxpool_ukernel_9p8x__psimd_c4(
+ size_t output_pixels,
+ size_t kernel_elements,
+ size_t channels,
const float** input,
+ size_t input_offset,
float* output,
size_t input_increment,
size_t output_increment,
const union xnn_f32_output_params params[restrict static 1])
{
- assert(n != 0);
- assert(ks != 0);
- assert(kc != 0);
+ assert(output_pixels != 0);
+ assert(kernel_elements != 0);
+ assert(channels != 0);
const psimd_f32 voutput_max = psimd_load_splat_f32(&params->scalar.max);
const psimd_f32 voutput_min = psimd_load_splat_f32(&params->scalar.min);
@@ -38,33 +39,42 @@
const float* i6 = *input++;
const float* i7 = *input++;
const float* i8 = *input++;
- if (ks < 2) {
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ i4 = (const float*) ((uintptr_t) i4 + input_offset);
+ i5 = (const float*) ((uintptr_t) i5 + input_offset);
+ i6 = (const float*) ((uintptr_t) i6 + input_offset);
+ i7 = (const float*) ((uintptr_t) i7 + input_offset);
+ i8 = (const float*) ((uintptr_t) i8 + input_offset);
+ if (kernel_elements < 2) {
i1 = i0;
}
- if (ks <= 2) {
+ if (kernel_elements <= 2) {
i2 = i0;
}
- if (ks < 4) {
+ if (kernel_elements < 4) {
i3 = i0;
}
- if (ks <= 4) {
+ if (kernel_elements <= 4) {
i4 = i0;
}
- if (ks < 6) {
+ if (kernel_elements < 6) {
i5 = i0;
}
- if (ks <= 6) {
+ if (kernel_elements <= 6) {
i6 = i0;
}
- if (ks < 8) {
+ if (kernel_elements < 8) {
i7 = i0;
}
- if (ks <= 8) {
+ if (kernel_elements <= 8) {
i8 = i0;
}
- size_t k = kc;
- for (; k >= 4; k -= 4) {
+ size_t c = channels;
+ for (; c >= 4; c -= 4) {
const psimd_f32 vi0 = psimd_load_f32(i0);
i0 += 4;
const psimd_f32 vi1 = psimd_load_f32(i1);
@@ -97,7 +107,7 @@
psimd_store_f32(o, vout);
o += 4;
}
- if (k != 0) {
+ if (c != 0) {
const psimd_f32 vi0 = psimd_load_f32(i0);
i0 += 4;
const psimd_f32 vi1 = psimd_load_f32(i1);
@@ -127,19 +137,19 @@
const psimd_f32 vmax = psimd_max_f32(vmax2345, vmax01678);
psimd_f32 vout = psimd_max_f32(psimd_min_f32(vmax, voutput_max), voutput_min);
- if (k & 2) {
+ if (c & 2) {
psimd_store2_f32(o, vout);
vout = psimd_concat_hi_f32(vout, vout);
o += 2;
}
- if (k & 1) {
+ if (c & 1) {
psimd_store1_f32(o, vout);
o += 1;
}
}
}
- for (ptrdiff_t m = (ptrdiff_t) ks - 9; m > 0; m -= 8) {
+ for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
const float* i0 = *input++;
const float* i1 = *input++;
const float* i2 = *input++;
@@ -148,31 +158,39 @@
const float* i5 = *input++;
const float* i6 = *input++;
const float* i7 = *input++;
- if (m < 2) {
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ i4 = (const float*) ((uintptr_t) i4 + input_offset);
+ i5 = (const float*) ((uintptr_t) i5 + input_offset);
+ i6 = (const float*) ((uintptr_t) i6 + input_offset);
+ i7 = (const float*) ((uintptr_t) i7 + input_offset);
+ if (k < 2) {
i1 = i0;
}
- if (m <= 2) {
+ if (k <= 2) {
i2 = i0;
}
- if (m < 4) {
+ if (k < 4) {
i3 = i0;
}
- if (m <= 4) {
+ if (k <= 4) {
i4 = i0;
}
- if (m < 6) {
+ if (k < 6) {
i5 = i0;
}
- if (m <= 6) {
+ if (k <= 6) {
i6 = i0;
}
- if (m < 8) {
+ if (k < 8) {
i7 = i0;
}
o = output;
- size_t k = kc;
- for (; k >= 4; k -= 4) {
+ size_t c = channels;
+ for (; c >= 4; c -= 4) {
const psimd_f32 vi0 = psimd_load_f32(i0);
i0 += 4;
const psimd_f32 vi1 = psimd_load_f32(i1);
@@ -204,7 +222,7 @@
psimd_store_f32(o, vout);
o += 4;
}
- if (k != 0) {
+ if (c != 0) {
const psimd_f32 vi0 = psimd_load_f32(i0);
const psimd_f32 vi1 = psimd_load_f32(i1);
const psimd_f32 vi2 = psimd_load_f32(i2);
@@ -225,12 +243,12 @@
const psimd_f32 vmax = psimd_max_f32(vmax2345, vmax0167);
psimd_f32 vout = psimd_max_f32(psimd_min_f32(vmax, voutput_max), voutput_min);
- if (k & 2) {
+ if (c & 2) {
psimd_store2_f32(o, vout);
vout = psimd_concat_hi_f32(vout, vout);
o += 2;
}
- if (k & 1) {
+ if (c & 1) {
psimd_store1_f32(o, vout);
o += 1;
}
@@ -238,5 +256,5 @@
}
input = (const float**) ((uintptr_t) input + input_increment);
output = (float*) ((uintptr_t) o + output_increment);
- } while (--n != 0);
+ } while (--output_pixels != 0);
}
diff --git a/src/f32-maxpool/9p8q-scalar.c b/src/f32-maxpool/9p8x-scalar-c1.c
similarity index 64%
rename from src/f32-maxpool/9p8q-scalar.c
rename to src/f32-maxpool/9p8x-scalar-c1.c
index 1108170..fec4483 100644
--- a/src/f32-maxpool/9p8q-scalar.c
+++ b/src/f32-maxpool/9p8x-scalar-c1.c
@@ -9,19 +9,20 @@
#include <xnnpack/math.h>
-void xnn_f32_maxpool_ukernel_9p8q__scalar(
- size_t n,
- size_t ks,
- size_t kc,
+void xnn_f32_maxpool_ukernel_9p8x__scalar_c1(
+ size_t output_pixels,
+ size_t kernel_elements,
+ size_t channels,
const float** input,
+ size_t input_offset,
float* output,
size_t input_increment,
size_t output_increment,
const union xnn_f32_output_params params[restrict static 1])
{
- assert(n != 0);
- assert(ks != 0);
- assert(kc != 0);
+ assert(output_pixels != 0);
+ assert(kernel_elements != 0);
+ assert(channels != 0);
const float voutput_min = params->scalar.min;
const float voutput_max = params->scalar.max;
@@ -37,32 +38,41 @@
const float* i6 = *input++;
const float* i7 = *input++;
const float* i8 = *input++;
- if (ks < 2) {
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ i4 = (const float*) ((uintptr_t) i4 + input_offset);
+ i5 = (const float*) ((uintptr_t) i5 + input_offset);
+ i6 = (const float*) ((uintptr_t) i6 + input_offset);
+ i7 = (const float*) ((uintptr_t) i7 + input_offset);
+ i8 = (const float*) ((uintptr_t) i8 + input_offset);
+ if (kernel_elements < 2) {
i1 = i0;
}
- if (ks <= 2) {
+ if (kernel_elements <= 2) {
i2 = i0;
}
- if (ks < 4) {
+ if (kernel_elements < 4) {
i3 = i0;
}
- if (ks <= 4) {
+ if (kernel_elements <= 4) {
i4 = i0;
}
- if (ks < 6) {
+ if (kernel_elements < 6) {
i5 = i0;
}
- if (ks <= 6) {
+ if (kernel_elements <= 6) {
i6 = i0;
}
- if (ks < 8) {
+ if (kernel_elements < 8) {
i7 = i0;
}
- if (ks <= 8) {
+ if (kernel_elements <= 8) {
i8 = i0;
}
- size_t k = kc;
+ size_t c = channels;
do {
const float vi0 = *i0++;
const float vi1 = *i1++;
@@ -87,10 +97,10 @@
vout = math_min_f32(vout, voutput_max);
*o++ = vout;
- } while (--k != 0);
+ } while (--c != 0);
}
- for (ptrdiff_t m = (ptrdiff_t) ks - 9; m > 0; m -= 8) {
+ for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
const float* i0 = *input++;
const float* i1 = *input++;
const float* i2 = *input++;
@@ -99,30 +109,38 @@
const float* i5 = *input++;
const float* i6 = *input++;
const float* i7 = *input++;
- if (m < 2) {
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ i4 = (const float*) ((uintptr_t) i4 + input_offset);
+ i5 = (const float*) ((uintptr_t) i5 + input_offset);
+ i6 = (const float*) ((uintptr_t) i6 + input_offset);
+ i7 = (const float*) ((uintptr_t) i7 + input_offset);
+ if (k < 2) {
i1 = i0;
}
- if (m <= 2) {
+ if (k <= 2) {
i2 = i0;
}
- if (m < 4) {
+ if (k < 4) {
i3 = i0;
}
- if (m <= 4) {
+ if (k <= 4) {
i4 = i0;
}
- if (m < 6) {
+ if (k < 6) {
i5 = i0;
}
- if (m <= 6) {
+ if (k <= 6) {
i6 = i0;
}
- if (m < 8) {
+ if (k < 8) {
i7 = i0;
}
o = output;
- size_t k = kc;
+ size_t c = channels;
do {
const float vi0 = *i0++;
const float vi1 = *i1++;
@@ -147,9 +165,9 @@
vout = math_min_f32(vout, voutput_max);
*o++ = vout;
- } while (--k != 0);
+ } while (--c != 0);
}
input = (const float**) ((uintptr_t) input + input_increment);
output = (float*) ((uintptr_t) o + output_increment);
- } while (--n != 0);
+ } while (--output_pixels != 0);
}
diff --git a/src/f32-maxpool/9p8q-sse.c b/src/f32-maxpool/9p8x-sse-c4.c
similarity index 75%
rename from src/f32-maxpool/9p8q-sse.c
rename to src/f32-maxpool/9p8x-sse-c4.c
index ecf3f76..54fb78d 100644
--- a/src/f32-maxpool/9p8q-sse.c
+++ b/src/f32-maxpool/9p8x-sse-c4.c
@@ -10,19 +10,20 @@
#include <xnnpack/maxpool.h>
-void xnn_f32_maxpool_ukernel_9p8q__sse(
- size_t n,
- size_t ks,
- size_t kc,
+void xnn_f32_maxpool_ukernel_9p8x__sse_c4(
+ size_t output_pixels,
+ size_t kernel_elements,
+ size_t channels,
const float** input,
+ size_t input_offset,
float* output,
size_t input_increment,
size_t output_increment,
const union xnn_f32_output_params params[restrict static 1])
{
- assert(n != 0);
- assert(ks != 0);
- assert(kc != 0);
+ assert(output_pixels != 0);
+ assert(kernel_elements != 0);
+ assert(channels != 0);
const __m128 voutput_max = _mm_load_ps(params->sse.max);
const __m128 voutput_min = _mm_load_ps(params->sse.min);
@@ -38,33 +39,42 @@
const float* i6 = *input++;
const float* i7 = *input++;
const float* i8 = *input++;
- if (ks < 2) {
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ i4 = (const float*) ((uintptr_t) i4 + input_offset);
+ i5 = (const float*) ((uintptr_t) i5 + input_offset);
+ i6 = (const float*) ((uintptr_t) i6 + input_offset);
+ i7 = (const float*) ((uintptr_t) i7 + input_offset);
+ i8 = (const float*) ((uintptr_t) i8 + input_offset);
+ if (kernel_elements < 2) {
i1 = i0;
}
- if (ks <= 2) {
+ if (kernel_elements <= 2) {
i2 = i0;
}
- if (ks < 4) {
+ if (kernel_elements < 4) {
i3 = i0;
}
- if (ks <= 4) {
+ if (kernel_elements <= 4) {
i4 = i0;
}
- if (ks < 6) {
+ if (kernel_elements < 6) {
i5 = i0;
}
- if (ks <= 6) {
+ if (kernel_elements <= 6) {
i6 = i0;
}
- if (ks < 8) {
+ if (kernel_elements < 8) {
i7 = i0;
}
- if (ks <= 8) {
+ if (kernel_elements <= 8) {
i8 = i0;
}
- size_t k = kc;
- for (; k >= 4; k -= 4) {
+ size_t c = channels;
+ for (; c >= 4; c -= 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
@@ -97,7 +107,7 @@
_mm_storeu_ps(o, vout);
o += 4;
}
- if (k != 0) {
+ if (c != 0) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
@@ -127,19 +137,19 @@
const __m128 vmax = _mm_max_ps(vmax2345, vmax01678);
__m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
- if (k & 2) {
+ if (c & 2) {
_mm_storel_pi((__m64*) o, vout);
o += 2;
vout = _mm_movehl_ps(vout, vout);
}
- if (k & 1) {
+ if (c & 1) {
_mm_store_ss(o, vout);
o += 1;
}
}
}
- for (ptrdiff_t m = (ptrdiff_t) ks - 9; m > 0; m -= 8) {
+ for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
const float* i0 = *input++;
const float* i1 = *input++;
const float* i2 = *input++;
@@ -148,31 +158,39 @@
const float* i5 = *input++;
const float* i6 = *input++;
const float* i7 = *input++;
- if (m < 2) {
+ i0 = (const float*) ((uintptr_t) i0 + input_offset);
+ i1 = (const float*) ((uintptr_t) i1 + input_offset);
+ i2 = (const float*) ((uintptr_t) i2 + input_offset);
+ i3 = (const float*) ((uintptr_t) i3 + input_offset);
+ i4 = (const float*) ((uintptr_t) i4 + input_offset);
+ i5 = (const float*) ((uintptr_t) i5 + input_offset);
+ i6 = (const float*) ((uintptr_t) i6 + input_offset);
+ i7 = (const float*) ((uintptr_t) i7 + input_offset);
+ if (k < 2) {
i1 = i0;
}
- if (m <= 2) {
+ if (k <= 2) {
i2 = i0;
}
- if (m < 4) {
+ if (k < 4) {
i3 = i0;
}
- if (m <= 4) {
+ if (k <= 4) {
i4 = i0;
}
- if (m < 6) {
+ if (k < 6) {
i5 = i0;
}
- if (m <= 6) {
+ if (k <= 6) {
i6 = i0;
}
- if (m < 8) {
+ if (k < 8) {
i7 = i0;
}
o = output;
- size_t k = kc;
- for (; k >= 4; k -= 4) {
+ size_t c = channels;
+ for (; c >= 4; c -= 4) {
const __m128 vi0 = _mm_loadu_ps(i0);
i0 += 4;
const __m128 vi1 = _mm_loadu_ps(i1);
@@ -204,7 +222,7 @@
_mm_storeu_ps(o, vout);
o += 4;
}
- if (k != 0) {
+ if (c != 0) {
const __m128 vi0 = _mm_loadu_ps(i0);
const __m128 vi1 = _mm_loadu_ps(i1);
const __m128 vi2 = _mm_loadu_ps(i2);
@@ -225,12 +243,12 @@
const __m128 vmax = _mm_max_ps(vmax2345, vmax0167);
__m128 vout = _mm_max_ps(_mm_min_ps(vmax, voutput_max), voutput_min);
- if (k & 2) {
+ if (c & 2) {
_mm_storel_pi((__m64*) o, vout);
o += 2;
vout = _mm_movehl_ps(vout, vout);
}
- if (k & 1) {
+ if (c & 1) {
_mm_store_ss(o, vout);
o += 1;
}
@@ -238,5 +256,5 @@
}
input = (const float**) ((uintptr_t) input + input_increment);
output = (float*) ((uintptr_t) o + output_increment);
- } while (--n != 0);
+ } while (--output_pixels != 0);
}
diff --git a/src/indirection.c b/src/indirection.c
index 1244eb6..a29e5c1 100644
--- a/src/indirection.c
+++ b/src/indirection.c
@@ -252,7 +252,6 @@
void xnn_indirection_init_maxpool2d(
xnn_operator_t op,
- size_t batch_start,
size_t step_height,
size_t step_width,
uint32_t log2_element_size)
@@ -260,7 +259,6 @@
const void** indirection_buffer = op->indirection_buffer;
const void* input = op->input;
const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
- const size_t batch_size = op->batch_size;
const size_t input_height = op->input_height;
const size_t input_width = op->input_width;
const size_t output_height = op->output_height;
@@ -274,18 +272,16 @@
const size_t input_padding_top = op->padding_top;
const size_t input_padding_left = op->padding_left;
- for (size_t image = batch_start; image < batch_size; image++) {
- for (size_t output_y = 0; output_y < output_height; output_y++) {
- for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
- const size_t input_y = doz(output_y * stride_height + pooling_y * dilation_height, input_padding_top);
- const size_t clamped_input_y = min(input_y, input_height - 1);
- for (size_t output_x = 0; output_x < output_width; output_x++) {
- for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
- const size_t input_x = doz(output_x * stride_width + pooling_x * dilation_width, input_padding_left);
- const size_t clamped_input_x = min(input_x, input_width - 1);
- const size_t index = (image * output_height + output_y) * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
- indirection_buffer[index] = input + ((image * input_height + clamped_input_y) * input_width + clamped_input_x) * input_pixel_stride;
- }
+ for (size_t output_y = 0; output_y < output_height; output_y++) {
+ for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
+ const size_t input_y = doz(output_y * stride_height + pooling_y * dilation_height, input_padding_top);
+ const size_t clamped_input_y = min(input_y, input_height - 1);
+ for (size_t output_x = 0; output_x < output_width; output_x++) {
+ for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
+ const size_t input_x = doz(output_x * stride_width + pooling_x * dilation_width, input_padding_left);
+ const size_t clamped_input_x = min(input_x, input_width - 1);
+ const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
+ indirection_buffer[index] = input + (clamped_input_y * input_width + clamped_input_x) * input_pixel_stride;
}
}
}
diff --git a/src/init.c b/src/init.c
index dbcae34..083b978 100644
--- a/src/init.c
+++ b/src/init.c
@@ -109,7 +109,7 @@
/**************************** U8 micro-kernels ****************************/
#ifndef XNN_NO_U8_OPERATORS
xnn_params.u8.maxpool = (struct maxpool_parameters) {
- .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8q__neon,
+ .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8x__neon_c16,
.mr = 9,
.qr = 8,
};
@@ -178,20 +178,20 @@
.mr = 7,
};
xnn_params.f32.maxpool = (struct maxpool_parameters) {
- .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8q__psimd,
+ .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8x__psimd_c4,
.mr = 9,
.qr = 8,
};
xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
- .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up4__psimd,
+ .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__psimd_c4,
.mr = 4,
};
xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
- .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up9__psimd,
+ .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__psimd_c4,
.mr = 9,
};
xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
- .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_mp9p8q__psimd,
+ .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4,
.mr = 9,
.qr = 8,
};
@@ -268,7 +268,7 @@
/**************************** U8 micro-kernels ****************************/
#ifndef XNN_NO_U8_OPERATORS
xnn_params.u8.maxpool = (struct maxpool_parameters) {
- .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8q__neon,
+ .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8x__neon_c16,
.mr = 9,
.qr = 8,
};
@@ -440,20 +440,20 @@
.mr = 7,
};
xnn_params.f32.maxpool = (struct maxpool_parameters) {
- .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8q__psimd,
+ .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8x__psimd_c4,
.mr = 9,
.qr = 8,
};
xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
- .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up4__psimd,
+ .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__psimd_c4,
.mr = 4,
};
xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
- .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up9__psimd,
+ .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__psimd_c4,
.mr = 9,
};
xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
- .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_mp9p8q__psimd,
+ .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4,
.mr = 9,
.qr = 8,
};
@@ -588,7 +588,7 @@
/**************************** U8 micro-kernels ****************************/
#ifndef XNN_NO_U8_OPERATORS
xnn_params.u8.maxpool = (struct maxpool_parameters) {
- .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8q__sse2,
+ .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8x__sse2_c16,
.mr = 9,
.qr = 8,
};
@@ -658,20 +658,20 @@
.mr = 7,
};
xnn_params.f32.maxpool = (struct maxpool_parameters) {
- .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8q__sse,
+ .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8x__sse_c4,
.mr = 9,
.qr = 8,
};
xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
- .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up4__sse2,
+ .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__sse2_c4,
.mr = 4,
};
xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
- .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up9__sse2,
+ .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__sse2_c4,
.mr = 9,
};
xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
- .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_mp9p8q__sse2,
+ .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4,
.mr = 9,
.qr = 8,
};
@@ -778,7 +778,7 @@
/**************************** U8 micro-kernels ****************************/
#ifndef XNN_NO_U8_OPERATORS
xnn_params.u8.maxpool = (struct maxpool_parameters) {
- .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8q__scalar,
+ .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8x__scalar_c1,
.mr = 9,
.qr = 8,
};
@@ -860,20 +860,20 @@
.mr = 7,
};
xnn_params.f32.maxpool = (struct maxpool_parameters) {
- .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8q__psimd,
+ .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8x__psimd_c4,
.mr = 9,
.qr = 8,
};
xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
- .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up4__psimd,
+ .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__psimd_c4,
.mr = 4,
};
xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
- .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up9__psimd,
+ .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__psimd_c4,
.mr = 9,
};
xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
- .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_mp9p8q__psimd,
+ .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4,
.mr = 9,
.qr = 8,
};
@@ -956,7 +956,7 @@
/**************************** U8 micro-kernels ****************************/
#ifndef XNN_NO_U8_OPERATORS
xnn_params.u8.maxpool = (struct maxpool_parameters) {
- .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8q__scalar,
+ .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8x__scalar_c1,
.mr = 9,
.qr = 8,
};
@@ -1036,20 +1036,20 @@
.mr = 7,
};
xnn_params.f32.maxpool = (struct maxpool_parameters) {
- .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8q__scalar,
+ .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8x__scalar_c1,
.mr = 9,
.qr = 8,
};
xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
- .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up4__scalar,
+ .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
.mr = 4,
};
xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
- .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_up9__scalar,
+ .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
.mr = 9,
};
xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
- .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_mp9p8q__scalar,
+ .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
.mr = 9,
.qr = 8,
};
diff --git a/src/max-pooling-nhwc.c b/src/max-pooling-nhwc.c
index 8f13af6..46ceae4 100644
--- a/src/max-pooling-nhwc.c
+++ b/src/max-pooling-nhwc.c
@@ -305,140 +305,19 @@
return status;
}
-enum xnn_status xnn_setup_max_pooling2d_nhwc_u8(
- xnn_operator_t max_pooling_op,
- size_t batch_size,
- size_t input_height,
- size_t input_width,
- const uint8_t* input,
- uint8_t* output,
- pthreadpool_t threadpool)
+static enum xnn_status setup_max_pooling2d(
+ xnn_operator_t max_pooling_op,
+ size_t batch_size,
+ size_t input_height,
+ size_t input_width,
+ const void* input,
+ void* output,
+ uint32_t log2_input_element_size,
+ uint32_t log2_output_element_size,
+ struct maxpool_parameters maxpool[restrict static 1],
+ const void* params,
+ size_t num_threads)
{
- if (max_pooling_op->type != xnn_operator_type_max_pooling_nhwc_u8) {
- xnn_log_error("failed to setup Max Pooling (NHWC, U8) operator: operator type mismatch");
- return xnn_status_invalid_parameter;
- }
- max_pooling_op->state = xnn_run_state_invalid;
-
- if (!xnn_params.initialized) {
- xnn_log_error("failed to setup Max Pooling operator: XNNPACK is not initialized");
- return xnn_status_uninitialized;
- }
-
- if (input_width == 0 || input_height == 0) {
- xnn_log_error(
- "failed to setup Max Pooling operator with %zux%zu input: input dimensions must be non-zero",
- input_width, input_height);
- return xnn_status_invalid_parameter;
- }
-
- if (batch_size == 0) {
- max_pooling_op->state = xnn_run_state_skip;
- return xnn_status_success;
- }
-
- max_pooling_op->batch_size = batch_size;
- max_pooling_op->input_height = input_height;
- max_pooling_op->input_width = input_width;
- max_pooling_op->input = input;
-
- max_pooling_op->output_height = compute_output_dimension(
- max_pooling_op->padding_top + input_height + max_pooling_op->padding_bottom,
- max_pooling_op->kernel_height,
- max_pooling_op->dilation_height,
- max_pooling_op->stride_height);
- max_pooling_op->output_width = compute_output_dimension(
- max_pooling_op->padding_left + input_width + max_pooling_op->padding_right,
- max_pooling_op->kernel_width,
- max_pooling_op->dilation_width,
- max_pooling_op->stride_width);
- max_pooling_op->output = output;
-
- size_t valid_batch_size = 0;
- if (input == max_pooling_op->last_input &&
- input_height == max_pooling_op->last_input_height &&
- input_width == max_pooling_op->last_input_width)
- {
- valid_batch_size = max_pooling_op->valid_batch_size;
- if (batch_size <= valid_batch_size) {
- max_pooling_op->compute.range[0] = batch_size;
- max_pooling_op->state = xnn_run_state_ready;
- return xnn_status_success;
- }
- }
-
- const size_t pooling_height = max_pooling_op->kernel_height;
- const size_t pooling_width = max_pooling_op->kernel_width;
- const size_t pooling_size = pooling_height * pooling_width;
- const size_t output_height = max_pooling_op->output_height;
- const size_t output_width = max_pooling_op->output_width;
- // Micro-kernel may read up to (mr - 1) elements after the end of indirection buffer.
- const uint32_t mr = xnn_params.u8.maxpool.mr;
-
- const size_t step_width =
- max_pooling_op->dilation_width > 1 ? pooling_width : min(max_pooling_op->stride_width, pooling_width);
- const size_t step_height = pooling_size + (output_width * step_width - 1) * pooling_height;
- const size_t indirection_buffer_size = sizeof(void*) * ((mr - 1) + batch_size * output_height * step_height);
-
- const void** indirection_buffer = (const void**) xnn_reallocate_memory(max_pooling_op->indirection_buffer, indirection_buffer_size);
- if (indirection_buffer == NULL) {
- xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
- return xnn_status_out_of_memory;
- }
- max_pooling_op->indirection_buffer = indirection_buffer;
-
- xnn_indirection_init_maxpool2d(max_pooling_op, valid_batch_size, step_height, step_width, 0 /* log2(sizeof(uint8_t)) */);
-
- const uint32_t qr = xnn_params.u8.maxpool.qr;
- const size_t channels = max_pooling_op->channels;
-
- const size_t indirect_input_height_stride = step_height * sizeof(void*);
- const size_t output_width_stride = max_pooling_op->output_pixel_stride * sizeof(uint8_t);
- const size_t output_height_stride = output_width * output_width_stride;
- const size_t multipass_adjustment = round_up(doz(pooling_size, mr), qr) + mr;
-
- max_pooling_op->context.max_pooling = (struct max_pooling_context) {
- .indirect_input = indirection_buffer,
- .indirect_input_batch_stride = output_height * indirect_input_height_stride,
- .indirect_input_height_stride = indirect_input_height_stride,
- .output = output,
- .output_batch_stride = output_height * output_height_stride,
- .output_height_stride = output_height_stride,
- .output_width = output_width,
- .pooling_size = pooling_size,
- .channels = channels,
- .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
- .output_increment = output_width_stride - channels * sizeof(uint8_t),
- .params.u8 = max_pooling_op->u8_output_params,
- .ukernel = xnn_params.u8.maxpool.ukernel,
- };
- max_pooling_op->compute.type = xnn_parallelization_type_2d;
- max_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_max_pooling;
- max_pooling_op->compute.range[0] = batch_size;
- max_pooling_op->compute.range[1] = output_height;
- max_pooling_op->state = xnn_run_state_ready;
-
- max_pooling_op->last_input = input;
- max_pooling_op->last_input_height = input_height;
- max_pooling_op->last_input_width = input_width;
- max_pooling_op->valid_batch_size = max(valid_batch_size, batch_size);
-
- return xnn_status_success;
-}
-
-enum xnn_status xnn_setup_max_pooling2d_nhwc_f32(
- xnn_operator_t max_pooling_op,
- size_t batch_size,
- size_t input_height,
- size_t input_width,
- const float* input,
- float* output,
- pthreadpool_t threadpool)
-{
- if (max_pooling_op->type != xnn_operator_type_max_pooling_nhwc_f32) {
- xnn_log_error("failed to setup Max Pooling (NHWC, F32) operator: operator type mismatch");
- return xnn_status_invalid_parameter;
- }
max_pooling_op->state = xnn_run_state_invalid;
if (!xnn_params.initialized) {
@@ -459,7 +338,6 @@
return xnn_status_success;
}
- max_pooling_op->batch_size = batch_size;
max_pooling_op->input_height = input_height;
max_pooling_op->input_width = input_width;
max_pooling_op->input = input;
@@ -474,76 +352,118 @@
max_pooling_op->kernel_width,
max_pooling_op->dilation_width,
max_pooling_op->stride_width);
- max_pooling_op->output = output;
-
- size_t valid_batch_size = 0;
- if (input == max_pooling_op->last_input &&
- input_height == max_pooling_op->last_input_height &&
- input_width == max_pooling_op->last_input_width)
- {
- valid_batch_size = max_pooling_op->valid_batch_size;
- if (batch_size <= valid_batch_size) {
- max_pooling_op->compute.range[0] = batch_size;
- max_pooling_op->state = xnn_run_state_ready;
- return xnn_status_success;
- }
- }
const size_t pooling_height = max_pooling_op->kernel_height;
const size_t pooling_width = max_pooling_op->kernel_width;
const size_t pooling_size = pooling_height * pooling_width;
const size_t output_height = max_pooling_op->output_height;
const size_t output_width = max_pooling_op->output_width;
- // Micro-kernel may read up to (mr - 1) elements after the end of indirection buffer.
- const uint32_t mr = xnn_params.f32.maxpool.mr;
+ const uint32_t mr = maxpool->mr;
const size_t step_width =
max_pooling_op->dilation_width > 1 ? pooling_width : min(max_pooling_op->stride_width, pooling_width);
const size_t step_height = pooling_size + (output_width * step_width - 1) * pooling_height;
- const size_t indirection_buffer_size = sizeof(void*) * ((mr - 1) + batch_size * output_height * step_height);
- const void** indirection_buffer = (const void**) xnn_reallocate_memory(max_pooling_op->indirection_buffer, indirection_buffer_size);
- if (indirection_buffer == NULL) {
- xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
- return xnn_status_out_of_memory;
+ if (input_height != max_pooling_op->last_input_height ||
+ input_width != max_pooling_op->last_input_width)
+ {
+ // Micro-kernel may read up to (mr - 1) elements after the end of indirection buffer.
+ const size_t indirection_buffer_size = sizeof(void*) * ((mr - 1) + output_height * step_height);
+ const void** indirection_buffer = (const void**) xnn_reallocate_memory(max_pooling_op->indirection_buffer, indirection_buffer_size);
+ if (indirection_buffer == NULL) {
+ xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
+ return xnn_status_out_of_memory;
+ }
+ max_pooling_op->indirection_buffer = indirection_buffer;
+
+ xnn_indirection_init_maxpool2d(max_pooling_op, step_height, step_width, log2_input_element_size);
+
+ max_pooling_op->last_input = input;
+ max_pooling_op->last_input_height = input_height;
+ max_pooling_op->last_input_width = input_width;
}
- max_pooling_op->indirection_buffer = indirection_buffer;
- xnn_indirection_init_maxpool2d(max_pooling_op, valid_batch_size, step_height, step_width, 2 /* log2(sizeof(float)) */);
-
- const uint32_t qr = xnn_params.f32.maxpool.qr;
+ const uint32_t qr = maxpool->qr;
const size_t channels = max_pooling_op->channels;
const size_t indirect_input_height_stride = step_height * sizeof(void*);
- const size_t output_width_stride = max_pooling_op->output_pixel_stride * sizeof(float);
+ const size_t output_width_stride = max_pooling_op->output_pixel_stride << log2_output_element_size;
const size_t output_height_stride = output_width * output_width_stride;
const size_t multipass_adjustment = round_up(doz(pooling_size, mr), qr) + mr;
max_pooling_op->context.max_pooling = (struct max_pooling_context) {
- .indirect_input = indirection_buffer,
- .indirect_input_batch_stride = output_height * indirect_input_height_stride,
- .indirect_input_height_stride = indirect_input_height_stride,
- .output = output,
- .output_batch_stride = output_height * output_height_stride,
- .output_height_stride = output_height_stride,
- .output_width = output_width,
- .pooling_size = pooling_size,
- .channels = channels,
- .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
- .output_increment = output_width_stride - channels * sizeof(float),
- .params.f32 = max_pooling_op->f32_output_params,
- .ukernel = xnn_params.f32.maxpool.ukernel,
+ .indirect_input = max_pooling_op->indirection_buffer,
+ .indirect_input_height_stride = indirect_input_height_stride,
+ .input_offset = (size_t) ((uintptr_t) input - (uintptr_t) max_pooling_op->last_input),
+ .input_batch_stride = (input_height * input_width * max_pooling_op->input_pixel_stride) << log2_input_element_size,
+ .output = output,
+ .output_batch_stride = output_height * output_height_stride,
+ .output_height_stride = output_height_stride,
+ .output_width = output_width,
+ .pooling_size = pooling_size,
+ .channels = channels,
+ .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
+ .output_increment = output_width_stride - (channels << log2_output_element_size),
+ .ukernel = maxpool->ukernel,
};
+ memcpy(&max_pooling_op->context.max_pooling.params, params, sizeof(max_pooling_op->context.max_pooling.params));
+
max_pooling_op->compute.type = xnn_parallelization_type_2d;
max_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_max_pooling;
max_pooling_op->compute.range[0] = batch_size;
max_pooling_op->compute.range[1] = output_height;
max_pooling_op->state = xnn_run_state_ready;
- max_pooling_op->last_input = input;
- max_pooling_op->last_input_height = input_height;
- max_pooling_op->last_input_width = input_width;
- max_pooling_op->valid_batch_size = max(valid_batch_size, batch_size);
-
return xnn_status_success;
}
+
+enum xnn_status xnn_setup_max_pooling2d_nhwc_u8(
+ xnn_operator_t max_pooling_op,
+ size_t batch_size,
+ size_t input_height,
+ size_t input_width,
+ const uint8_t* input,
+ uint8_t* output,
+ pthreadpool_t threadpool)
+{
+ if (max_pooling_op->type != xnn_operator_type_max_pooling_nhwc_u8) {
+ xnn_log_error("failed to setup Max Pooling (NHWC, U8) operator: operator type mismatch");
+ return xnn_status_invalid_parameter;
+ }
+
+ return setup_max_pooling2d(
+ max_pooling_op,
+ batch_size, input_height, input_width,
+ input, output,
+ 0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
+ 0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
+ &xnn_params.u8.maxpool,
+ &max_pooling_op->u8_output_params,
+ pthreadpool_get_threads_count(threadpool));
+}
+
+enum xnn_status xnn_setup_max_pooling2d_nhwc_f32(
+ xnn_operator_t max_pooling_op,
+ size_t batch_size,
+ size_t input_height,
+ size_t input_width,
+ const float* input,
+ float* output,
+ pthreadpool_t threadpool)
+{
+ if (max_pooling_op->type != xnn_operator_type_max_pooling_nhwc_f32) {
+ xnn_log_error("failed to setup Max Pooling (NHWC, F32) operator: operator type mismatch");
+ return xnn_status_invalid_parameter;
+ }
+
+ return setup_max_pooling2d(
+ max_pooling_op,
+ batch_size, input_height, input_width,
+ input, output,
+ 2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
+ 2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
+ &xnn_params.f32.maxpool,
+ &max_pooling_op->f32_output_params,
+ pthreadpool_get_threads_count(threadpool));
+}
+
diff --git a/src/operator-run.c b/src/operator-run.c
index 28c0b5a..b4f8dbe 100644
--- a/src/operator-run.c
+++ b/src/operator-run.c
@@ -275,17 +275,17 @@
size_t batch_index,
size_t output_y)
{
- const void** indirect_input =
- (const void**) ((uintptr_t) context->indirect_input +
- batch_index * context->indirect_input_batch_stride + output_y * context->indirect_input_height_stride);
- void* output =
- (void*) ((uintptr_t) context->output + batch_index * context->output_batch_stride + output_y * context->output_height_stride);
- uint32_t* index =
- (uint32_t*) ((uintptr_t) context->index + batch_index * context->index_batch_stride + output_y * context->index_height_stride);
+ const void** indirect_input = (const void**) ((uintptr_t) context->indirect_input +
+ output_y * context->indirect_input_height_stride);
+ const size_t input_offset = context->input_offset + batch_index * context->input_batch_stride;
+ void* output = (void*) ((uintptr_t) context->output +
+ batch_index * context->output_batch_stride + output_y * context->output_height_stride);
+ uint32_t* index = (uint32_t*) ((uintptr_t) context->index +
+ batch_index * context->index_batch_stride + output_y * context->index_height_stride);
context->unipass_ukernel(
context->output_width, context->pooling_size, context->channels,
- indirect_input, output, index,
+ indirect_input, input_offset, output, index,
context->input_increment, context->output_increment,
&context->params);
}
@@ -295,20 +295,20 @@
size_t batch_index,
size_t output_y)
{
- const void** indirect_input =
- (const void**) ((uintptr_t) context->indirect_input +
- batch_index * context->indirect_input_batch_stride + output_y * context->indirect_input_height_stride);
- void* output =
- (void*) ((uintptr_t) context->output + batch_index * context->output_batch_stride + output_y * context->output_height_stride);
- uint32_t* index =
- (uint32_t*) ((uintptr_t) context->index + batch_index * context->index_batch_stride + output_y * context->index_height_stride);
+ const void** indirect_input = (const void**) ((uintptr_t) context->indirect_input +
+ output_y * context->indirect_input_height_stride);
+ const size_t input_offset = context->input_offset + batch_index * context->input_batch_stride;
+ void* output = (void*) ((uintptr_t) context->output +
+ batch_index * context->output_batch_stride + output_y * context->output_height_stride);
+ uint32_t* index = (uint32_t*) ((uintptr_t) context->index +
+ batch_index * context->index_batch_stride + output_y * context->index_height_stride);
- XNN_ALIGN(16) float multipass_output_buffer[context->channels + XNN_EXTRA_BYTES / sizeof(float)];
+ XNN_ALIGN(16) float multipass_accumulation_buffer[context->channels + XNN_EXTRA_BYTES / sizeof(float)];
XNN_ALIGN(16) uint32_t multipass_index_buffer[context->channels + XNN_EXTRA_BYTES / sizeof(uint32_t)];
context->multipass_ukernel(
context->output_width, context->pooling_size, context->channels,
- indirect_input, multipass_output_buffer, multipass_index_buffer, output, index,
+ indirect_input, input_offset, multipass_accumulation_buffer, multipass_index_buffer, output, index,
context->input_increment, context->output_increment,
&context->params);
}
@@ -318,15 +318,15 @@
size_t batch_index,
size_t output_y)
{
- const void** indirect_input =
- (const void**) ((uintptr_t) context->indirect_input +
- batch_index * context->indirect_input_batch_stride + output_y * context->indirect_input_height_stride);
- void* output =
- (void*) ((uintptr_t) context->output + batch_index * context->output_batch_stride + output_y * context->output_height_stride);
+ const void** indirect_input = (const void**) ((uintptr_t) context->indirect_input +
+ output_y * context->indirect_input_height_stride);
+ const size_t input_offset = context->input_offset + batch_index * context->input_batch_stride;
+ void* output = (void*) ((uintptr_t) context->output +
+ batch_index * context->output_batch_stride + output_y * context->output_height_stride);
context->ukernel(
context->output_width, context->pooling_size, context->channels,
- indirect_input, output,
+ indirect_input, input_offset, output,
context->input_increment, context->output_increment,
&context->params);
}
diff --git a/src/u8-maxpool/9p8q-neon.c b/src/u8-maxpool/9p8x-neon-c16.c
similarity index 75%
rename from src/u8-maxpool/9p8q-neon.c
rename to src/u8-maxpool/9p8x-neon-c16.c
index 9deaf24..8edba53 100644
--- a/src/u8-maxpool/9p8q-neon.c
+++ b/src/u8-maxpool/9p8x-neon-c16.c
@@ -13,19 +13,20 @@
#include <xnnpack/maxpool.h>
-void xnn_u8_maxpool_ukernel_9p8q__neon(
- size_t n,
- size_t ks,
- size_t kc,
+void xnn_u8_maxpool_ukernel_9p8x__neon_c16(
+ size_t output_pixels,
+ size_t kernel_elements,
+ size_t channels,
const uint8_t** input,
+ size_t input_offset,
uint8_t* output,
size_t input_increment,
size_t output_increment,
const union xnn_u8_output_params params[restrict static 1])
{
- assert(n != 0);
- assert(ks != 0);
- assert(kc != 0);
+ assert(output_pixels != 0);
+ assert(kernel_elements != 0);
+ assert(channels != 0);
const uint8x16_t voutput_max = vld1q_dup_u8(&params->neon.max);
const uint8x16_t voutput_min = vld1q_dup_u8(&params->neon.min);
@@ -41,33 +42,42 @@
const uint8_t* i6 = *input++;
const uint8_t* i7 = *input++;
const uint8_t* i8 = *input++;
- if (ks < 2) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+ i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+ i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+ if (kernel_elements < 2) {
i1 = i0;
}
- if (ks <= 2) {
+ if (kernel_elements <= 2) {
i2 = i0;
}
- if (ks < 4) {
+ if (kernel_elements < 4) {
i3 = i0;
}
- if (ks <= 4) {
+ if (kernel_elements <= 4) {
i4 = i0;
}
- if (ks < 6) {
+ if (kernel_elements < 6) {
i5 = i0;
}
- if (ks <= 6) {
+ if (kernel_elements <= 6) {
i6 = i0;
}
- if (ks < 8) {
+ if (kernel_elements < 8) {
i7 = i0;
}
- if (ks <= 8) {
+ if (kernel_elements <= 8) {
i8 = i0;
}
- size_t k = kc;
- for (; k >= 16; k -= 16) {
+ size_t c = channels;
+ for (; c >= 16; c -= 16) {
const uint8x16_t vi0 = vld1q_u8(i0); i0 += 16;
const uint8x16_t vi1 = vld1q_u8(i1); i1 += 16;
const uint8x16_t vi2 = vld1q_u8(i2); i2 += 16;
@@ -90,7 +100,7 @@
vst1q_u8(o, vout); o += 16;
}
- if (k != 0) {
+ if (c != 0) {
const uint8x16_t vi0 = vld1q_u8(i0);
const uint8x16_t vi1 = vld1q_u8(i1);
const uint8x16_t vi2 = vld1q_u8(i2);
@@ -112,25 +122,25 @@
const uint8x16_t vout = vmaxq_u8(vminq_u8(vmax, voutput_max), voutput_min);
uint8x8_t vout_lo = vget_low_u8(vout);
- if (k & 8) {
+ if (c & 8) {
vst1_u8(o, vout_lo); o += 8;
vout_lo = vget_high_u8(vout);
}
- if (k & 4) {
+ if (c & 4) {
vst1_lane_u32(__builtin_assume_aligned(o, 1), vreinterpret_u32_u8(vout_lo), 0); o += 4;
vout_lo = vext_u8(vout_lo, vout_lo, 4);
}
- if (k & 2) {
+ if (c & 2) {
vst1_lane_u16(__builtin_assume_aligned(o, 1), vreinterpret_u16_u8(vout_lo), 0); o += 2;
vout_lo = vext_u8(vout_lo, vout_lo, 2);
}
- if (k & 1) {
+ if (c & 1) {
vst1_lane_u8(o, vout_lo, 0); o += 1;
}
}
}
- for (ptrdiff_t m = (ptrdiff_t) ks - 9; m > 0; m -= 8) {
+ for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
const uint8_t* i0 = *input++;
const uint8_t* i1 = *input++;
const uint8_t* i2 = *input++;
@@ -139,31 +149,39 @@
const uint8_t* i5 = *input++;
const uint8_t* i6 = *input++;
const uint8_t* i7 = *input++;
- if (m < 2) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+ i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+ if (k < 2) {
i1 = i0;
}
- if (m <= 2) {
+ if (k <= 2) {
i2 = i0;
}
- if (m < 4) {
+ if (k < 4) {
i3 = i0;
}
- if (m <= 4) {
+ if (k <= 4) {
i4 = i0;
}
- if (m < 6) {
+ if (k < 6) {
i5 = i0;
}
- if (m <= 6) {
+ if (k <= 6) {
i6 = i0;
}
- if (m < 8) {
+ if (k < 8) {
i7 = i0;
}
o = output;
- size_t k = kc;
- for (; k >= 16; k -= 16) {
+ size_t c = channels;
+ for (; c >= 16; c -= 16) {
const uint8x16_t vi0 = vld1q_u8(i0); i0 += 16;
const uint8x16_t vi1 = vld1q_u8(i1); i1 += 16;
const uint8x16_t vi2 = vld1q_u8(i2); i2 += 16;
@@ -186,7 +204,7 @@
vst1q_u8(o, vout); o += 16;
}
- if (k != 0) {
+ if (c != 0) {
const uint8x16_t vi0 = vld1q_u8(i0);
const uint8x16_t vi1 = vld1q_u8(i1);
const uint8x16_t vi2 = vld1q_u8(i2);
@@ -208,24 +226,24 @@
const uint8x16_t vout = vmaxq_u8(vminq_u8(vmax, voutput_max), voutput_min);
uint8x8_t vout_lo = vget_low_u8(vout);
- if (k & 8) {
+ if (c & 8) {
vst1_u8(o, vout_lo); o += 8;
vout_lo = vget_high_u8(vout);
}
- if (k & 4) {
+ if (c & 4) {
vst1_lane_u32(__builtin_assume_aligned(o, 1), vreinterpret_u32_u8(vout_lo), 0); o += 4;
vout_lo = vext_u8(vout_lo, vout_lo, 4);
}
- if (k & 2) {
+ if (c & 2) {
vst1_lane_u16(__builtin_assume_aligned(o, 1), vreinterpret_u16_u8(vout_lo), 0); o += 2;
vout_lo = vext_u8(vout_lo, vout_lo, 2);
}
- if (k & 1) {
+ if (c & 1) {
vst1_lane_u8(o, vout_lo, 0); o += 1;
}
}
}
input = (const uint8_t**) ((uintptr_t) input + input_increment);
output = (uint8_t*) ((uintptr_t) o + output_increment);
- } while (--n != 0);
+ } while (--output_pixels != 0);
}
diff --git a/src/u8-maxpool/9p8q-scalar.c b/src/u8-maxpool/9p8x-scalar-c1.c
similarity index 65%
rename from src/u8-maxpool/9p8q-scalar.c
rename to src/u8-maxpool/9p8x-scalar-c1.c
index e442f75..10112a8 100644
--- a/src/u8-maxpool/9p8q-scalar.c
+++ b/src/u8-maxpool/9p8x-scalar-c1.c
@@ -8,19 +8,20 @@
#include <xnnpack/maxpool.h>
-void xnn_u8_maxpool_ukernel_9p8q__scalar(
- size_t n,
- size_t ks,
- size_t kc,
+void xnn_u8_maxpool_ukernel_9p8x__scalar_c1(
+ size_t output_pixels,
+ size_t kernel_elements,
+ size_t channels,
const uint8_t** input,
+ size_t input_offset,
uint8_t* output,
size_t input_increment,
size_t output_increment,
const union xnn_u8_output_params params[restrict static 1])
{
- assert(n != 0);
- assert(ks != 0);
- assert(kc != 0);
+ assert(output_pixels != 0);
+ assert(kernel_elements != 0);
+ assert(channels != 0);
const uint8_t voutput_max = params->scalar.max;
const uint8_t voutput_min = params->scalar.min;
@@ -36,32 +37,41 @@
const uint8_t* i6 = *input++;
const uint8_t* i7 = *input++;
const uint8_t* i8 = *input++;
- if (ks < 2) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+ i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+ i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+ if (kernel_elements < 2) {
i1 = i0;
}
- if (ks <= 2) {
+ if (kernel_elements <= 2) {
i2 = i0;
}
- if (ks < 4) {
+ if (kernel_elements < 4) {
i3 = i0;
}
- if (ks <= 4) {
+ if (kernel_elements <= 4) {
i4 = i0;
}
- if (ks < 6) {
+ if (kernel_elements < 6) {
i5 = i0;
}
- if (ks <= 6) {
+ if (kernel_elements <= 6) {
i6 = i0;
}
- if (ks < 8) {
+ if (kernel_elements < 8) {
i7 = i0;
}
- if (ks <= 8) {
+ if (kernel_elements <= 8) {
i8 = i0;
}
- size_t k = kc;
+ size_t c = channels;
do {
const uint8_t vi0 = *i0++;
const uint8_t vi1 = *i1++;
@@ -87,10 +97,10 @@
vout = vout < voutput_min ? voutput_min : vout;
*o++ = vout;
- } while (--k != 0);
+ } while (--c != 0);
}
- for (ptrdiff_t m = (ptrdiff_t) ks - 9; m > 0; m -= 8) {
+ for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
const uint8_t* i0 = *input++;
const uint8_t* i1 = *input++;
const uint8_t* i2 = *input++;
@@ -99,30 +109,38 @@
const uint8_t* i5 = *input++;
const uint8_t* i6 = *input++;
const uint8_t* i7 = *input++;
- if (m < 2) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+ i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+ if (k < 2) {
i1 = i0;
}
- if (m <= 2) {
+ if (k <= 2) {
i2 = i0;
}
- if (m < 4) {
+ if (k < 4) {
i3 = i0;
}
- if (m <= 4) {
+ if (k <= 4) {
i4 = i0;
}
- if (m < 6) {
+ if (k < 6) {
i5 = i0;
}
- if (m <= 6) {
+ if (k <= 6) {
i6 = i0;
}
- if (m < 8) {
+ if (k < 8) {
i7 = i0;
}
o = output;
- size_t k = kc;
+ size_t c = channels;
do {
const uint8_t vi0 = *i0++;
const uint8_t vi1 = *i1++;
@@ -148,9 +166,9 @@
vout = vout < voutput_min ? voutput_min : vout;
*o++ = vout;
- } while (--k != 0);
+ } while (--c != 0);
}
input = (const uint8_t**) ((uintptr_t) input + input_increment);
output = (uint8_t*) ((uintptr_t) o + output_increment);
- } while (--n != 0);
+ } while (--output_pixels != 0);
}
diff --git a/src/u8-maxpool/9p8q-sse2.c b/src/u8-maxpool/9p8x-sse2-c16.c
similarity index 78%
rename from src/u8-maxpool/9p8q-sse2.c
rename to src/u8-maxpool/9p8x-sse2-c16.c
index 8903457..7d3ee15 100644
--- a/src/u8-maxpool/9p8q-sse2.c
+++ b/src/u8-maxpool/9p8x-sse2-c16.c
@@ -13,19 +13,20 @@
#include <xnnpack/maxpool.h>
-void xnn_u8_maxpool_ukernel_9p8q__sse2(
- size_t n,
- size_t ks,
- size_t kc,
+void xnn_u8_maxpool_ukernel_9p8x__sse2_c16(
+ size_t output_pixels,
+ size_t kernel_elements,
+ size_t channels,
const uint8_t** input,
+ size_t input_offset,
uint8_t* output,
size_t input_increment,
size_t output_increment,
const union xnn_u8_output_params params[restrict static 1])
{
- assert(n != 0);
- assert(ks != 0);
- assert(kc != 0);
+ assert(output_pixels != 0);
+ assert(kernel_elements != 0);
+ assert(channels != 0);
const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.max);
const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.min);
@@ -42,33 +43,42 @@
const uint8_t* i6 = *input++;
const uint8_t* i7 = *input++;
const uint8_t* i8 = *input++;
- if (ks < 2) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+ i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+ i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
+ if (kernel_elements < 2) {
i1 = i0;
}
- if (ks <= 2) {
+ if (kernel_elements <= 2) {
i2 = i0;
}
- if (ks < 4) {
+ if (kernel_elements < 4) {
i3 = i0;
}
- if (ks <= 4) {
+ if (kernel_elements <= 4) {
i4 = i0;
}
- if (ks < 6) {
+ if (kernel_elements < 6) {
i5 = i0;
}
- if (ks <= 6) {
+ if (kernel_elements <= 6) {
i6 = i0;
}
- if (ks < 8) {
+ if (kernel_elements < 8) {
i7 = i0;
}
- if (ks <= 8) {
+ if (kernel_elements <= 8) {
i8 = i0;
}
- size_t k = kc;
- for (; k >= 16; k -= 16) {
+ size_t c = channels;
+ for (; c >= 16; c -= 16) {
const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0); i0 += 16;
const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1); i1 += 16;
const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2); i2 += 16;
@@ -91,7 +101,7 @@
_mm_storeu_si128((__m128i*) o, vout); o += 16;
}
- if (k != 0) {
+ if (c != 0) {
const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0);
const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1);
const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2);
@@ -112,29 +122,29 @@
const __m128i vmax = _mm_max_epu8(vmax2345, vmax01678);
__m128i vout = _mm_max_epu8(_mm_min_epu8(vmax, voutput_max), voutput_min);
- if (k & 8) {
+ if (c & 8) {
_mm_storel_epi64((__m128i*) o, vout);
vout = _mm_unpackhi_epi64(vout, vout);
o += 8;
}
- if (k & 4) {
+ if (c & 4) {
*((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vout);
vout = _mm_srli_epi64(vout, 32);
o += 4;
}
- if (k & 2) {
+ if (c & 2) {
*((uint16_t*) o) = (uint16_t) _mm_extract_epi16(vout, 0);
vout = _mm_srli_epi32(vout, 16);
o += 2;
}
- if (k & 1) {
+ if (c & 1) {
*((uint8_t*) o) = (uint8_t) _mm_cvtsi128_si32(vout);
o += 1;
}
}
}
- for (ptrdiff_t m = (ptrdiff_t) ks - 9; m > 0; m -= 8) {
+ for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
const uint8_t* i0 = *input++;
const uint8_t* i1 = *input++;
const uint8_t* i2 = *input++;
@@ -143,31 +153,39 @@
const uint8_t* i5 = *input++;
const uint8_t* i6 = *input++;
const uint8_t* i7 = *input++;
- if (m < 2) {
+ i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
+ i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
+ i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
+ i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
+ i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
+ i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
+ i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
+ i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
+ if (k < 2) {
i1 = i0;
}
- if (m <= 2) {
+ if (k <= 2) {
i2 = i0;
}
- if (m < 4) {
+ if (k < 4) {
i3 = i0;
}
- if (m <= 4) {
+ if (k <= 4) {
i4 = i0;
}
- if (m < 6) {
+ if (k < 6) {
i5 = i0;
}
- if (m <= 6) {
+ if (k <= 6) {
i6 = i0;
}
- if (m < 8) {
+ if (k < 8) {
i7 = i0;
}
o = output;
- size_t k = kc;
- for (; k >= 16; k -= 16) {
+ size_t c = channels;
+ for (; c >= 16; c -= 16) {
const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0); i0 += 16;
const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1); i1 += 16;
const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2); i2 += 16;
@@ -191,7 +209,7 @@
_mm_storeu_si128((__m128i*) o, vout);
o += 16;
}
- if (k != 0) {
+ if (c != 0) {
const __m128i vi0 = _mm_loadu_si128((const __m128i*) i0);
const __m128i vi1 = _mm_loadu_si128((const __m128i*) i1);
const __m128i vi2 = _mm_loadu_si128((const __m128i*) i2);
@@ -212,22 +230,22 @@
const __m128i vmax = _mm_max_epu8(vmax2345, vmax0167);
__m128i vout = _mm_max_epu8(_mm_min_epu8(vmax, voutput_max), voutput_min);
- if (k & 8) {
+ if (c & 8) {
_mm_storel_epi64((__m128i*) o, vout);
vout = _mm_unpackhi_epi64(vout, vout);
o += 8;
}
- if (k & 4) {
+ if (c & 4) {
*((uint32_t*) o) = (uint32_t) _mm_cvtsi128_si32(vout);
vout = _mm_srli_epi64(vout, 32);
o += 4;
}
- if (k & 2) {
+ if (c & 2) {
*((uint16_t*) o) = (uint16_t) _mm_extract_epi16(vout, 0);
vout = _mm_srli_epi32(vout, 16);
o += 2;
}
- if (k & 1) {
+ if (c & 1) {
*((uint8_t*) o) = (uint8_t) _mm_cvtsi128_si32(vout);
o += 1;
}
@@ -235,5 +253,5 @@
}
input = (const uint8_t**) ((uintptr_t) input + input_increment);
output = (uint8_t*) ((uintptr_t) o + output_increment);
- } while (--n != 0);
+ } while (--output_pixels != 0);
}
diff --git a/src/xnnpack/argmaxpool.h b/src/xnnpack/argmaxpool.h
index ce60230..056114c 100644
--- a/src/xnnpack/argmaxpool.h
+++ b/src/xnnpack/argmaxpool.h
@@ -18,41 +18,44 @@
#define DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(fn_name) \
XNN_INTERNAL void fn_name( \
- size_t n, \
- size_t ks, \
- size_t kc, \
- const float** x, \
- float* y, \
- uint32_t* i, \
- size_t x_increment, \
- size_t y_increment, \
+ size_t output_pixels, \
+ size_t kernel_elements, \
+ size_t channels, \
+ const float** input, \
+ size_t input_offset, \
+ float* output, \
+ uint32_t* index, \
+ size_t input_increment, \
+ size_t output_increment, \
const union xnn_f32_output_params* params);
-DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up4__psimd)
-DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up4__scalar)
-DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up4__sse2)
-DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up9__psimd)
-DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up9__scalar)
-DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_up9__sse2)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_4x__psimd_c4)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_4x__sse2_c4)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_4x__scalar_c1)
+
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_9x__psimd_c4)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_9x__sse2_c4)
+DECLARE_F32_ARGMAXPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_9x__scalar_c1)
#define DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(fn_name) \
XNN_INTERNAL void fn_name( \
- size_t n, \
- size_t ks, \
- size_t kc, \
- const float** x, \
- float* ab, \
- uint32_t* ib, \
- float* y, \
- uint32_t* i, \
- size_t x_increment, \
- size_t y_increment, \
+ size_t output_pixels, \
+ size_t kernel_elements, \
+ size_t channels, \
+ const float** input, \
+ size_t input_offset, \
+ float* accumulation_buffer, \
+ uint32_t* index_buffer, \
+ float* output, \
+ uint32_t* index, \
+ size_t input_increment, \
+ size_t output_increment, \
const union xnn_f32_output_params* params);
-DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd)
-DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar)
-DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2)
+DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4)
+DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4)
+DECLARE_F32_ARGMAXPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1)
#ifdef __cplusplus
diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h
index 2f7e31a..8eb4fa4 100644
--- a/src/xnnpack/compute.h
+++ b/src/xnnpack/compute.h
@@ -292,8 +292,9 @@
struct max_pooling_context {
const void** indirect_input;
- size_t indirect_input_batch_stride;
size_t indirect_input_height_stride;
+ size_t input_offset;
+ size_t input_batch_stride;
void* output;
size_t output_batch_stride;
size_t output_height_stride;
@@ -341,8 +342,9 @@
struct argmax_pooling_context {
const void** indirect_input;
- size_t indirect_input_batch_stride;
size_t indirect_input_height_stride;
+ size_t input_offset;
+ size_t input_batch_stride;
void* output;
size_t output_batch_stride;
size_t output_height_stride;
diff --git a/src/xnnpack/indirection.h b/src/xnnpack/indirection.h
index d9fd0ed..868da5c 100644
--- a/src/xnnpack/indirection.h
+++ b/src/xnnpack/indirection.h
@@ -43,7 +43,6 @@
XNN_INTERNAL void xnn_indirection_init_maxpool2d(
xnn_operator_t op,
- size_t batch_start,
size_t step_height,
size_t step_width,
uint32_t log2_element_size);
diff --git a/src/xnnpack/maxpool.h b/src/xnnpack/maxpool.h
index 1cac764..6013cba 100644
--- a/src/xnnpack/maxpool.h
+++ b/src/xnnpack/maxpool.h
@@ -21,34 +21,36 @@
#define DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(fn_name) \
XNN_INTERNAL void fn_name( \
- size_t n, \
- size_t ks, \
- size_t kc, \
- const float** x, \
- float* y, \
- size_t x_increment, \
- size_t y_increment, \
+ size_t output_pixels, \
+ size_t kernel_elements, \
+ size_t channels, \
+ const float** input, \
+ size_t input_offset, \
+ float* output, \
+ size_t input_increment, \
+ size_t output_increment, \
const union xnn_f32_output_params* params);
-DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8q__psimd)
-DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8q__scalar)
-DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8q__sse)
+DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8x__psimd_c4)
+DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8x__sse_c4)
+DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8x__scalar_c1)
#define DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(fn_name) \
XNN_INTERNAL void fn_name( \
- size_t n, \
- size_t ks, \
- size_t kc, \
- const uint8_t** x, \
- uint8_t* y, \
- size_t x_increment, \
- size_t y_increment, \
+ size_t output_pixels, \
+ size_t kernel_elements, \
+ size_t channels, \
+ const uint8_t** input, \
+ size_t input_offset, \
+ uint8_t* output, \
+ size_t input_increment, \
+ size_t output_increment, \
const union xnn_u8_output_params* params);
-DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(xnn_u8_maxpool_ukernel_9p8q__neon)
-DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(xnn_u8_maxpool_ukernel_9p8q__sse2)
-DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(xnn_u8_maxpool_ukernel_9p8q__scalar)
+DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(xnn_u8_maxpool_ukernel_9p8x__neon_c16)
+DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(xnn_u8_maxpool_ukernel_9p8x__sse2_c16)
+DECLARE_U8_MAXPOOL_UKERNEL_FUNCTION(xnn_u8_maxpool_ukernel_9p8x__scalar_c1)
#ifdef __cplusplus
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 88e56d0..c76315e 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -926,81 +926,88 @@
const union xnn_f32_output_params* params);
typedef void (*xnn_maxpool_ukernel_function)(
- size_t n,
- size_t ks,
- size_t kc,
- const void** x,
- void* y,
- size_t x_increment,
- size_t y_increment,
+ size_t output_pixels,
+ size_t kernel_elements,
+ size_t channels,
+ const void** input,
+ size_t input_offset,
+ void* output,
+ size_t input_increment,
+ size_t output_increment,
const void* params);
typedef void (*xnn_f32_maxpool_ukernel_function)(
- size_t n,
- size_t ks,
- size_t kc,
- const float** x,
- float* y,
- size_t x_increment,
- size_t y_increment,
+ size_t output_pixels,
+ size_t kernel_elements,
+ size_t channels,
+ const float** input,
+ size_t input_offset,
+ float* output,
+ size_t input_increment,
+ size_t output_increment,
const union xnn_f32_output_params* params);
typedef void (*xnn_u8_maxpool_ukernel_function)(
- size_t n,
- size_t ks,
- size_t kc,
- const uint8_t** x,
- uint8_t* y,
- size_t x_increment,
- size_t y_increment,
+ size_t output_pixels,
+ size_t kernel_elements,
+ size_t channels,
+ const uint8_t** input,
+ size_t input_offset,
+ uint8_t* output,
+ size_t input_increment,
+ size_t output_increment,
const union xnn_u8_output_params* params);
typedef void (*xnn_argmaxpool_up_ukernel_function)(
- size_t n,
- size_t ks,
- size_t kc,
- const void** x,
- void* y,
- uint32_t* i,
- size_t x_increment,
- size_t y_increment,
+ size_t output_pixels,
+ size_t kernel_elements,
+ size_t channels,
+ const void** input,
+ size_t input_offset,
+ void* output,
+ uint32_t* index,
+ size_t input_increment,
+ size_t output_increment,
const void* params);
typedef void (*xnn_f32_argmaxpool_up_ukernel_function)(
- size_t n,
- size_t ks,
- size_t kc,
- const float** x,
- float* y,
- uint32_t* i,
- size_t x_increment,
- size_t y_increment,
+ size_t output_pixels,
+ size_t kernel_elements,
+ size_t channels,
+ const float** input,
+ size_t input_offset,
+ float* output,
+ uint32_t* index,
+ size_t input_increment,
+ size_t output_increment,
const union xnn_f32_output_params* params);
typedef void (*xnn_argmaxpool_mp_ukernel_function)(
- size_t n,
- size_t ks,
- size_t kc,
- const void** x,
- void* ab,
- uint32_t* ib,
- void* y,
- uint32_t* i,
- size_t x_increment,
- size_t y_increment,
+ size_t output_pixels,
+ size_t kernel_elements,
+ size_t channels,
+ const void** input,
+ size_t input_offset,
+ void* accumulation_buffer,
+ uint32_t* index_buffer,
+ void* output,
+ uint32_t* index,
+ size_t input_increment,
+ size_t output_increment,
const void* params);
typedef void (*xnn_f32_argmaxpool_mp_ukernel_function)(
- size_t n,
- size_t ks,
- size_t kc,
- const float** x,
- float* ab,
- uint32_t* ib,
- float* y,
- uint32_t* i,
- size_t x_increment,
- size_t y_increment,
+ size_t output_pixels,
+ size_t kernel_elements,
+ size_t channels,
+ const float** input,
+ size_t input_offset,
+ float* accumulation_buffer,
+ uint32_t* index_buffer,
+ float* output,
+ uint32_t* index,
+ size_t input_increment,
+ size_t output_increment,
const union xnn_f32_output_params* params);
typedef void (*xnn_univector_ukernel_function)(
diff --git a/test/argmaxpool-microkernel-tester.h b/test/argmaxpool-microkernel-tester.h
index e84d11a..562d176 100644
--- a/test/argmaxpool-microkernel-tester.h
+++ b/test/argmaxpool-microkernel-tester.h
@@ -21,126 +21,121 @@
#include <xnnpack/params.h>
-class ArgmaxPoolMicrokernelTester {
+class ArgMaxPoolMicrokernelTester {
public:
enum class Variant {
Native,
Scalar,
};
- inline ArgmaxPoolMicrokernelTester& n(size_t n) {
- assert(n != 0);
- this->n_ = n;
+ inline ArgMaxPoolMicrokernelTester& output_pixels(size_t output_pixels) {
+ assert(output_pixels != 0);
+ this->output_pixels_ = output_pixels;
return *this;
}
- inline size_t n() const {
- return this->n_;
+ inline size_t output_pixels() const {
+ return this->output_pixels_;
}
- inline ArgmaxPoolMicrokernelTester& s(size_t s) {
- assert(s != 0);
- this->s_ = s;
+ inline ArgMaxPoolMicrokernelTester& step(size_t step) {
+ assert(step != 0);
+ this->step_ = step;
return *this;
}
- inline size_t s() const {
- return this->s_;
+ inline size_t step() const {
+ return this->step_;
}
- inline ArgmaxPoolMicrokernelTester& kh(size_t kh) {
- assert(kh != 0);
- this->kh_ = kh;
+ inline ArgMaxPoolMicrokernelTester& input_offset(size_t input_offset) {
+ assert(input_offset != 0);
+ this->input_offset_ = input_offset;
return *this;
}
- inline size_t kh() const {
- return this->kh_;
+ inline size_t input_offset() const {
+ return this->input_offset_;
}
- inline ArgmaxPoolMicrokernelTester& kw(size_t kw) {
- assert(kw != 0);
- this->kw_ = kw;
+ inline ArgMaxPoolMicrokernelTester& pooling_elements(size_t pooling_elements) {
+ assert(pooling_elements != 0);
+ this->pooling_elements_ = pooling_elements;
return *this;
}
- inline size_t kw() const {
- return this->kw_;
+ inline size_t pooling_elements() const {
+ return this->pooling_elements_;
}
- inline size_t ks() const {
- return kh() * kw();
- }
-
- inline size_t packed_ks() const {
- if (ks() <= mr()) {
- return mr();
+ inline size_t packed_pooling_elements() const {
+ if (pooling_elements() <= primary_pooling_tile()) {
+ return primary_pooling_tile();
} else {
- return (ks() - mr()) % qr() == 0 ? ks() : ((ks() - mr()) / qr() + 1) * qr() + mr();
+ return (pooling_elements() - primary_pooling_tile()) % incremental_pooling_tile() == 0 ? pooling_elements() : ((pooling_elements() - primary_pooling_tile()) / incremental_pooling_tile() + 1) * incremental_pooling_tile() + primary_pooling_tile();
}
}
- inline ArgmaxPoolMicrokernelTester& mr(size_t mr) {
- assert(mr != 0);
- this->mr_ = mr;
+ inline ArgMaxPoolMicrokernelTester& pooling_tile(size_t primary_tile) {
+ assert(primary_tile != 0);
+ this->primary_pooling_tile_ = primary_tile;
+ this->incremental_pooling_tile_ = 0;
return *this;
}
- inline size_t mr() const {
- return this->mr_;
- }
-
- inline ArgmaxPoolMicrokernelTester& qr(size_t qr) {
- assert(qr != 0);
- this->qr_ = qr;
+ inline ArgMaxPoolMicrokernelTester& pooling_tile(size_t primary_tile, size_t incremental_tile) {
+ assert(primary_tile != 0);
+ this->primary_pooling_tile_ = primary_tile;
+ this->incremental_pooling_tile_ = incremental_tile;
return *this;
}
- inline size_t qr() const {
- return this->qr_;
- }
-
- inline ArgmaxPoolMicrokernelTester& kc(size_t kc) {
- assert(kc != 0);
- this->kc_ = kc;
+ inline ArgMaxPoolMicrokernelTester& primary_pooling_tile(size_t primary_pooling_tile) {
+ assert(primary_pooling_tile != 0);
+ this->primary_pooling_tile_ = primary_pooling_tile;
return *this;
}
- inline size_t kc() const {
- return this->kc_;
+ inline size_t primary_pooling_tile() const {
+ return this->primary_pooling_tile_;
}
- inline ArgmaxPoolMicrokernelTester& x_stride(size_t x_stride) {
- assert(x_stride != 0);
- this->x_stride_ = x_stride;
+ inline ArgMaxPoolMicrokernelTester& incremental_pooling_tile(size_t incremental_pooling_tile) {
+ assert(incremental_pooling_tile != 0);
+ this->incremental_pooling_tile_ = incremental_pooling_tile;
return *this;
}
- inline size_t x_stride() const {
- if (this->x_stride_ == 0) {
- return kc();
+ inline size_t incremental_pooling_tile() const {
+ return this->incremental_pooling_tile_;
+ }
+
+ inline ArgMaxPoolMicrokernelTester& channels(size_t channels) {
+ assert(channels != 0);
+ this->channels_ = channels;
+ return *this;
+ }
+
+ inline size_t channels() const {
+ return this->channels_;
+ }
+
+ inline ArgMaxPoolMicrokernelTester& output_stride(size_t output_stride) {
+ assert(output_stride != 0);
+ this->output_stride_ = output_stride;
+ return *this;
+ }
+
+ inline size_t output_stride() const {
+ if (this->output_stride_ == 0) {
+ return channels();
} else {
- assert(this->x_stride_ >= kc());
- return this->x_stride_;
+ assert(this->output_stride_ >= channels());
+ return this->output_stride_;
}
}
- inline ArgmaxPoolMicrokernelTester& y_stride(size_t y_stride) {
- assert(y_stride != 0);
- this->y_stride_ = y_stride;
- return *this;
- }
-
- inline size_t y_stride() const {
- if (this->y_stride_ == 0) {
- return kc();
- } else {
- assert(this->y_stride_ >= kc());
- return this->y_stride_;
- }
- }
-
- inline ArgmaxPoolMicrokernelTester& qmin(uint8_t qmin) {
+ inline ArgMaxPoolMicrokernelTester& qmin(uint8_t qmin) {
this->qmin_ = qmin;
return *this;
}
@@ -149,7 +144,7 @@
return this->qmin_;
}
- inline ArgmaxPoolMicrokernelTester& qmax(uint8_t qmax) {
+ inline ArgMaxPoolMicrokernelTester& qmax(uint8_t qmax) {
this->qmax_ = qmax;
return *this;
}
@@ -158,7 +153,7 @@
return this->qmax_;
}
- inline ArgmaxPoolMicrokernelTester& iterations(size_t iterations) {
+ inline ArgMaxPoolMicrokernelTester& iterations(size_t iterations) {
this->iterations_ = iterations;
return *this;
}
@@ -172,87 +167,95 @@
auto rng = std::mt19937(random_device());
auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
- std::vector<const float*> indirect_x(packed_ks() + (n() * s() - 1) * kh());
- std::vector<float> x((indirect_x.size() - 1) * x_stride() + kc() + XNN_EXTRA_BYTES / sizeof(float));
-
- std::vector<float> y((n() - 1) * y_stride() + kc());
- std::vector<uint32_t> i(n() * kc());
- std::vector<float> y_ref(n() * kc());
- std::vector<uint32_t> i_ref(n() * kc());
+ std::vector<const float*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
+ std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
+ ((output_pixels() - 1) * step() + pooling_elements()) * channels());
+ std::vector<float> output((output_pixels() - 1) * output_stride() + channels());
+ std::vector<uint32_t> index(output_pixels() * channels());
+ std::vector<float> output_ref(output_pixels() * channels());
+ std::vector<uint32_t> index_ref(output_pixels() * channels());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
- std::generate(x.begin(), x.end(), std::ref(f32rng));
- std::fill(y.begin(), y.end(), nanf(""));
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
+ std::fill(output.begin(), output.end(), nanf(""));
- for (size_t p = 0; p < indirect_x.size(); p++) {
- indirect_x[p] = x.data() + p * x_stride();
+ for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
+ indirect_input[i] = input.data() + i * channels() - input_offset();
}
- std::shuffle(indirect_x.begin(), indirect_x.end(), rng);
+ std::shuffle(indirect_input.begin(),
+ indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
// Compute reference results, without clamping.
- for (size_t p = 0; p < n(); p++) {
- for (size_t k = 0; k < kc(); k++) {
- float max_value = indirect_x[p * s() * kh()][k];
+ for (size_t x = 0; x < output_pixels(); x++) {
+ for (size_t c = 0; c < channels(); c++) {
+ float max_value = indirect_input[x * step()][c + input_offset()];
uint32_t max_index = 0;
- for (size_t j = 1; j < ks(); j++) {
- const float value = indirect_x[p * s() * kh() + j][k];
+ for (size_t p = 0; p < pooling_elements(); p++) {
+ const float value = indirect_input[x * step() + p][c + input_offset()];
if (value > max_value) {
max_value = value;
- max_index = j;
+ max_index = p;
}
}
- y_ref[p * kc() + k] = max_value;
- i_ref[p * kc() + k] = max_index;
+ output_ref[x * channels() + c] = max_value;
+ index_ref[x * channels() + c] = max_index;
}
}
// Compute clamping parameters.
- const float accumulated_min = *std::min_element(y_ref.cbegin(), y_ref.cend());
- const float accumulated_max = *std::max_element(y_ref.cbegin(), y_ref.cend());
+ const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
+ const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
const float accumulated_range = accumulated_max - accumulated_min;
- const float y_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
- const float y_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
+ const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
+ const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
// Prepare output parameters.
xnn_f32_output_params output_params = { };
switch (variant) {
case Variant::Native:
- output_params = xnn_init_f32_output_params(y_min, y_max);
+ output_params = xnn_init_f32_output_params(output_min, output_max);
break;
case Variant::Scalar:
- output_params = xnn_init_scalar_f32_output_params(y_min, y_max);
+ output_params = xnn_init_scalar_f32_output_params(output_min, output_max);
break;
}
// Clamp reference results.
- for (float& y_value : y_ref) {
- y_value = std::max(std::min(y_value, y_max), y_min);
+ for (float& output_value : output_ref) {
+ output_value = std::max(std::min(output_value, output_max), output_min);
}
// Call optimized micro-kernel.
- argmaxpool(n(), ks(), kc(),
- indirect_x.data(), y.data(), i.data(),
- kh() * s() * sizeof(void*),
- (y_stride() - kc()) * sizeof(float),
+ argmaxpool(output_pixels(), pooling_elements(), channels(),
+ indirect_input.data(), input_offset() * sizeof(float), output.data(), index.data(),
+ step() * sizeof(void*),
+ (output_stride() - channels()) * sizeof(float),
&output_params);
// Verify results.
- for (size_t p = 0; p < n(); p++) {
- for (size_t k = 0; k < kc(); k++) {
- ASSERT_GE(y[p * y_stride() + k], y_min)
- << "at pixel " << p << ", channel " << k << ", n = " << n()
- << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
- ASSERT_LE(y[p * y_stride() + k], y_max)
- << "at pixel " << p << ", channel " << k << ", n = " << n()
- << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
- ASSERT_EQ(y_ref[p * kc() + k], y[p * y_stride() + k])
- << "at pixel " << p << ", channel " << k << ", n = " << n()
- << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
- ASSERT_EQ(indirect_x[p * s() * kh() + i_ref[p * kc() + k]][k], indirect_x[p * s() * kh() + i[p * kc() + k]][k])
- << "at pixel " << p << ", channel " << k << ", n = " << n()
- << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
- ASSERT_EQ(i_ref[p * kc() + k], i[p * kc() + k])
- << "at pixel " << p << ", channel " << k << ", n = " << n()
- << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
+ for (size_t x = 0; x < output_pixels(); x++) {
+ for (size_t c = 0; c < channels(); c++) {
+ ASSERT_GE(output[x * output_stride() + c], output_min)
+ << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+ << ", pooling elements = " << pooling_elements() << ", step = " << step()
+ << ", input offset = " << input_offset();
+ ASSERT_LE(output[x * output_stride() + c], output_max)
+ << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+ << ", pooling elements = " << pooling_elements() << ", step = " << step()
+ << ", input offset = " << input_offset();
+ ASSERT_EQ(output_ref[x * channels() + c], output[x * output_stride() + c])
+ << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+ << ", pooling elements = " << pooling_elements() << ", step = " << step()
+ << ", input offset = " << input_offset();
+ ASSERT_EQ(
+ indirect_input[x * step() + index_ref[x * channels() + c]][c + input_offset()],
+ indirect_input[x * step() + index[x * channels() + c]][c + input_offset()])
+ << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+ << ", pooling elements = " << pooling_elements() << ", step = " << step()
+ << ", input offset = " << input_offset();
+ ASSERT_EQ(index_ref[x * channels() + c], index[x * channels() + c])
+ << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+ << ", pooling elements = " << pooling_elements() << ", step = " << step()
+ << ", input offset = " << input_offset();
}
}
}
@@ -263,105 +266,116 @@
auto rng = std::mt19937(random_device());
auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
- std::vector<const float*> indirect_x(packed_ks() + (n() * s() - 1) * kh());
- std::vector<float> x((indirect_x.size() - 1) * x_stride() + kc() + XNN_EXTRA_BYTES / sizeof(float));
-
- std::vector<float> y((n() - 1) * y_stride() + kc());
- std::vector<uint32_t> i(n() * kc());
- std::vector<uint32_t, AlignedAllocator<uint32_t, XNN_EXTRA_BYTES>> ib(kc() + XNN_EXTRA_BYTES / sizeof(uint32_t));
- std::vector<float, AlignedAllocator<float, XNN_EXTRA_BYTES>> yb(kc() + XNN_EXTRA_BYTES / sizeof(float));
- std::vector<float> y_ref(n() * kc());
- std::vector<uint32_t> i_ref(n() * kc());
+ std::vector<const float*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
+ std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
+ ((output_pixels() - 1) * step() + pooling_elements()) * channels());
+ std::vector<float> output((output_pixels() - 1) * output_stride() + channels());
+ std::vector<uint32_t> index(output_pixels() * channels());
+ std::vector<uint32_t, AlignedAllocator<uint32_t, XNN_EXTRA_BYTES>> index_buffer(
+ channels() + XNN_EXTRA_BYTES / sizeof(uint32_t));
+ std::vector<float, AlignedAllocator<float, XNN_EXTRA_BYTES>> output_buffer(
+ channels() + XNN_EXTRA_BYTES / sizeof(float));
+ std::vector<float> output_ref(output_pixels() * channels());
+ std::vector<uint32_t> index_ref(output_pixels() * channels());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
- std::generate(x.begin(), x.end(), std::ref(f32rng));
- std::fill(y.begin(), y.end(), nanf(""));
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
+ std::fill(output.begin(), output.end(), nanf(""));
- for (size_t p = 0; p < indirect_x.size(); p++) {
- indirect_x[p] = x.data() + p * x_stride();
+ for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
+ indirect_input[i] = input.data() + i * channels() - input_offset();
}
- std::shuffle(indirect_x.begin(), indirect_x.end(), rng);
+ std::shuffle(indirect_input.begin(),
+ indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
// Compute reference results, without clamping.
- for (size_t p = 0; p < n(); p++) {
- for (size_t k = 0; k < kc(); k++) {
- float max_value = indirect_x[p * s() * kh()][k];
+ for (size_t x = 0; x < output_pixels(); x++) {
+ for (size_t c = 0; c < channels(); c++) {
+ float max_value = indirect_input[x * step()][c + input_offset()];
uint32_t max_index = 0;
- for (size_t j = 1; j < ks(); j++) {
- const float value = indirect_x[p * s() * kh() + j][k];
+ for (size_t p = 0; p < pooling_elements(); p++) {
+ const float value = indirect_input[x * step() + p][c + input_offset()];
if (value > max_value) {
max_value = value;
- max_index = j;
+ max_index = p;
}
}
- y_ref[p * kc() + k] = max_value;
- i_ref[p * kc() + k] = max_index;
+ output_ref[x * channels() + c] = max_value;
+ index_ref[x * channels() + c] = max_index;
}
}
// Compute clamping parameters.
- const float accumulated_min = *std::min_element(y_ref.cbegin(), y_ref.cend());
- const float accumulated_max = *std::max_element(y_ref.cbegin(), y_ref.cend());
+ const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
+ const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
const float accumulated_range = accumulated_max - accumulated_min;
- const float y_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
- const float y_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
+ const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
+ const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
// Prepare output parameters.
xnn_f32_output_params output_params = { };
switch (variant) {
case Variant::Native:
- output_params = xnn_init_f32_output_params(y_min, y_max);
+ output_params = xnn_init_f32_output_params(output_min, output_max);
break;
case Variant::Scalar:
- output_params = xnn_init_scalar_f32_output_params(y_min, y_max);
+ output_params = xnn_init_scalar_f32_output_params(output_min, output_max);
break;
}
// Clamp reference results.
- for (float& y_value : y_ref) {
- y_value = std::max(std::min(y_value, y_max), y_min);
+ for (float& output_value : output_ref) {
+ output_value = std::max(std::min(output_value, output_max), output_min);
}
// Call optimized micro-kernel.
- argmaxpool(n(), ks(), kc(),
- indirect_x.data(), yb.data(), ib.data(), y.data(), i.data(),
- (kh() * s() - (packed_ks() - qr())) * sizeof(void*),
- (y_stride() - kc()) * sizeof(float),
+ argmaxpool(output_pixels(), pooling_elements(), channels(),
+ indirect_input.data(), input_offset() * sizeof(float),
+ output_buffer.data(), index_buffer.data(),
+ output.data(), index.data(),
+ (step() - (packed_pooling_elements() - incremental_pooling_tile())) * sizeof(void*),
+ (output_stride() - channels()) * sizeof(float),
&output_params);
// Verify results.
- for (size_t p = 0; p < n(); p++) {
- for (size_t k = 0; k < kc(); k++) {
- ASSERT_GE(y[p * y_stride() + k], y_min)
- << "at pixel " << p << ", channel " << k << ", n = " << n()
- << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
- ASSERT_LE(y[p * y_stride() + k], y_max)
- << "at pixel " << p << ", channel " << k << ", n = " << n()
- << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
- ASSERT_EQ(y_ref[p * kc() + k], y[p * y_stride() + k])
- << "at pixel " << p << ", channel " << k << ", n = " << n()
- << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
- ASSERT_EQ(indirect_x[p * s() * kh() + i_ref[p * kc() + k]][k], indirect_x[p * s() * kh() + i[p * kc() + k]][k])
- << "at pixel " << p << ", channel " << k << ", n = " << n()
- << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
- ASSERT_EQ(i_ref[p * kc() + k], i[p * kc() + k])
- << "at pixel " << p << ", channel " << k << ", n = " << n()
- << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
+ for (size_t x = 0; x < output_pixels(); x++) {
+ for (size_t c = 0; c < channels(); c++) {
+ ASSERT_GE(output[x * output_stride() + c], output_min)
+ << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+ << ", pooling elements = " << pooling_elements() << ", step = " << step()
+ << ", input offset = " << input_offset();
+ ASSERT_LE(output[x * output_stride() + c], output_max)
+ << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+ << ", pooling elements = " << pooling_elements() << ", step = " << step()
+ << ", input offset = " << input_offset();
+ ASSERT_EQ(output_ref[x * channels() + c], output[x * output_stride() + c])
+ << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+ << ", pooling elements = " << pooling_elements() << ", step = " << step()
+ << ", input offset = " << input_offset();
+ ASSERT_EQ(
+ indirect_input[x * step() + index_ref[x * channels() + c]][c + input_offset()],
+ indirect_input[x * step() + index[x * channels() + c]][c + input_offset()])
+ << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+ << ", pooling elements = " << pooling_elements() << ", step = " << step()
+ << ", input offset = " << input_offset();
+ ASSERT_EQ(index_ref[x * channels() + c], index[x * channels() + c])
+ << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+ << ", pooling elements = " << pooling_elements() << ", step = " << step()
+ << ", input offset = " << input_offset();
}
}
}
}
private:
- size_t n_{1};
- size_t s_{1};
- size_t kh_{1};
- size_t kw_{1};
- size_t mr_{1};
- size_t qr_{1};
- size_t kc_{1};
- size_t x_stride_{0};
- size_t y_stride_{0};
+ size_t output_pixels_{1};
+ size_t pooling_elements_{1};
+ size_t channels_{1};
+ size_t input_offset_{0};
+ size_t step_{1};
+ size_t primary_pooling_tile_{1};
+ size_t incremental_pooling_tile_{1};
+ size_t output_stride_{0};
uint8_t qmin_{0};
uint8_t qmax_{255};
- size_t iterations_{15};
+ size_t iterations_{3};
};
diff --git a/test/f32-argmaxpool.cc b/test/f32-argmaxpool.cc
index c1a8a91..1ec2610 100644
--- a/test/f32-argmaxpool.cc
+++ b/test/f32-argmaxpool.cc
@@ -2,6 +2,11 @@
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Specification: test/f32-argmaxpool.yaml
+// Generator: tools/generate-argmaxpool-test.py
+
#include <gtest/gtest.h>
@@ -13,1309 +18,388 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_eq_4_fulltile) {
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_eq_4_unipass_fulltile) {
TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .kc(4);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(4)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_eq_4_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_eq_4_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(4)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_eq_4_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(4)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_eq_4_unipass_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(4)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_eq_4_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_div_4_unipass_fulltile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_div_4_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_div_4_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_div_4_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_div_4_unipass_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_div_4_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_lt_4_unipass_fulltile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_lt_4_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_lt_4_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_lt_4_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_lt_4_unipass_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_lt_4_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_gt_4_unipass_fulltile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_gt_4_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_gt_4_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_gt_4_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_gt_4_unipass_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, channels_gt_4_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, few_output_pixels) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
}
}
}
}
- TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_eq_4_subtile) {
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, few_output_pixels_with_input_offset) {
TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .kc(4);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- }
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .input_offset(23)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
}
}
}
}
- TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_div_4_fulltile) {
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, few_output_pixels_with_qmin) {
TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- }
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
}
}
}
}
- TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_div_4_subtile) {
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, few_output_pixels_with_qmax) {
TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- }
- }
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
}
}
}
}
- TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_div_4_fulltile_with_x_stride) {
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, few_output_pixels_with_output_stride) {
TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(131)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- }
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .output_stride(23)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
}
}
}
}
- TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_lt_4_fulltile) {
+ TEST(F32_ARGMAXPOOL_4X__SSE2_C4, few_output_pixels_with_step) {
TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_lt_4_subtile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_lt_4_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(23)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_gt_4_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_gt_4_subtile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_gt_4_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(23)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_div_4_with_y_max) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 1; n <= 5; n += 2) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(2)
- .kw(2)
- .kc(kc)
- .qmax(128)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__SSE2, kc_div_4_with_y_min) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 1; n <= 5; n += 2) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(2)
- .kw(2)
- .kc(kc)
- .qmin(128)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__SSE2, small_n) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 2; n < 5; n++) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(2)
- .kw(2)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(1)
- .kw(3)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(3)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__SSE2, small_n_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 2; n < 5; n++) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(2)
- .kw(2)
- .kc(kc)
- .x_stride(29)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(1)
- .kw(3)
- .kc(kc)
- .x_stride(29)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(3)
- .kw(1)
- .kc(kc)
- .x_stride(29)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__SSE2, small_n_with_y_stride) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 2; n < 5; n++) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(2)
- .kw(2)
- .kc(kc)
- .y_stride(31)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(1)
- .kw(3)
- .kc(kc)
- .y_stride(31)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(3)
- .kw(1)
- .kc(kc)
- .y_stride(31)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__SSE2, small_n_with_s) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 2; n < 5; n++) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(2)
- .kw(2)
- .kc(kc)
- .s(2)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(1)
- .kw(3)
- .kc(kc)
- .s(2)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(3)
- .kw(1)
- .kc(kc)
- .s(2)
- .Test(xnn_f32_argmaxpool_ukernel_up4__sse2);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_eq_4_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .kc(4);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_eq_4_subtile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .kc(4);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_div_4_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_div_4_subtile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_div_4_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(131)
- .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_lt_4_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_lt_4_subtile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_lt_4_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(23)
- .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_gt_4_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_gt_4_subtile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_gt_4_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(23)
- .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_div_4_with_y_max) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 1; n <= 5; n += 2) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .n(n)
- .kh(3)
- .kw(3)
- .kc(kc)
- .qmax(128)
- .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__SSE2, kc_div_4_with_y_min) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 1; n <= 5; n += 2) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .n(n)
- .kh(3)
- .kw(3)
- .kc(kc)
- .qmin(128)
- .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__SSE2, small_n) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3}}) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__SSE2, small_n_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3}}) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .x_stride(29)
- .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__SSE2, small_n_with_y_stride) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3}}) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .y_stride(31)
- .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__SSE2, small_n_with_s) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3}}) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- for (size_t s = 2; s <= ks; s++) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .s(s)
- .Test(xnn_f32_argmaxpool_ukernel_up9__sse2);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_eq_4_twopass_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- const size_t ks = tester.mr() + tester.qr();
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_eq_4_twopass_subtile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_eq_4_multipass_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t ks : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_eq_4_multipass_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_div_4_twopass_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- const size_t ks = 17;
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kc(kc)
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- tester
- .kc(kc)
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_div_4_twopass_subtile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kc(kc)
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- tester
- .kc(kc)
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_div_4_twopass_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- const size_t ks = tester.mr() + tester.qr();
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(131)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_div_4_multipass_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t ks : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_div_4_multipass_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kc(kc)
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- tester
- .kc(kc)
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_div_4_multipass_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t ks : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(131)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_lt_4_twopass_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- const size_t ks = tester.mr() + tester.qr();
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_lt_4_twopass_subtile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kc(kc)
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- tester
- .kc(kc)
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_lt_4_twopass_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- const size_t ks = tester.mr() + tester.qr();
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(23)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_lt_4_multipass_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t ks : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_lt_4_multipass_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kc(kc)
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- tester
- .kc(kc)
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_lt_4_multipass_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t ks : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(23)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_gt_4_twopass_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- const size_t ks = tester.mr() + tester.qr();
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_gt_4_twopass_subtile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kc(kc)
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- tester
- .kc(kc)
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_gt_4_twopass_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- const size_t ks = tester.mr() + tester.qr();
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(23)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_gt_4_multipass_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t ks : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_gt_4_multipass_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kc(kc)
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- tester
- .kc(kc)
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_gt_4_multipass_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t ks : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(23)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_div_4_with_y_max) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 1; n <= 5; n += 2) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(5)
- .kw(5)
- .kc(kc)
- .qmax(128)
- .iterations(3)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, kc_div_4_with_y_min) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 1; n <= 5; n += 2) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(5)
- .kw(5)
- .kc(kc)
- .qmin(128)
- .iterations(3)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, small_n) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{5, 7}}) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, small_n_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{5, 7}}) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .x_stride(29)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, small_n_with_y_stride) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{5, 7}}) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .y_stride(31)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__SSE2, small_n_with_s) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{5, 7}}) {
- for (size_t s = 2; s <= 5; s++) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .s(s)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__sse2);
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ for (size_t step = 2; step <= pooling_elements; step++) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .step(step)
+ .channels(channels)
+ .output_stride(23)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__sse2_c4);
}
}
}
@@ -1324,2061 +408,3144 @@
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
- TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_eq_4_fulltile) {
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_eq_4_unipass_fulltile) {
TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .kc(4);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(4)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_eq_4_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_eq_4_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_PSIMD;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(4)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_eq_4_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_PSIMD;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(4)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_eq_4_unipass_subtile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(4)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_eq_4_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_div_4_unipass_fulltile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_div_4_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_div_4_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_div_4_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_div_4_unipass_subtile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_div_4_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_lt_4_unipass_fulltile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_lt_4_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_lt_4_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_lt_4_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_lt_4_unipass_subtile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_lt_4_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_gt_4_unipass_fulltile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_gt_4_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_gt_4_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_gt_4_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_gt_4_unipass_subtile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, channels_gt_4_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, few_output_pixels) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
- TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_eq_4_subtile) {
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, few_output_pixels_with_input_offset) {
TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .kc(4);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .input_offset(23)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, few_output_pixels_with_qmin) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, few_output_pixels_with_qmax) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, few_output_pixels_with_output_stride) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .output_stride(23)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_4X__PSIMD_C4, few_output_pixels_with_step) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ for (size_t step = 2; step <= pooling_elements; step++) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .step(step)
+ .channels(channels)
+ .output_stride(23)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
}
+#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
- TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_div_4_fulltile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_eq_1_unipass_fulltile) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(1)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_input_offset) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(1)
+ .input_offset(3)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_qmin) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(1)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_qmax) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(1)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_eq_1_unipass_subtile) {
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(1)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_eq_1_unipass_subtile_with_input_offset) {
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(1)
+ .input_offset(3)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_gt_1_unipass_fulltile) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_input_offset) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .input_offset(3)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_qmin) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_qmax) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(4)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_gt_1_unipass_subtile) {
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
+}
- TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_div_4_subtile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, channels_gt_1_unipass_subtile_with_input_offset) {
+ for (size_t pooling_elements = 2; pooling_elements < 4; pooling_elements++) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .input_offset(3)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
+}
- TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_div_4_fulltile_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(131)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_lt_4_fulltile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_lt_4_subtile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_lt_4_fulltile_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(23)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_gt_4_fulltile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_gt_4_subtile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_gt_4_fulltile_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(23)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_div_4_with_y_max) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 1; n <= 5; n += 2) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(2)
- .kw(2)
- .kc(kc)
- .qmax(128)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__PSIMD, kc_div_4_with_y_min) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 1; n <= 5; n += 2) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(2)
- .kw(2)
- .kc(kc)
- .qmin(128)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__PSIMD, small_n) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 2; n < 5; n++) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(2)
- .kw(2)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(1)
- .kw(3)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(3)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__PSIMD, small_n_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 2; n < 5; n++) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(2)
- .kw(2)
- .kc(kc)
- .x_stride(29)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(1)
- .kw(3)
- .kc(kc)
- .x_stride(29)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(3)
- .kw(1)
- .kc(kc)
- .x_stride(29)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__PSIMD, small_n_with_y_stride) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 2; n < 5; n++) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(2)
- .kw(2)
- .kc(kc)
- .y_stride(31)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(1)
- .kw(3)
- .kc(kc)
- .y_stride(31)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(3)
- .kw(1)
- .kc(kc)
- .y_stride(31)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP4__PSIMD, small_n_with_s) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 2; n < 5; n++) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(2)
- .kw(2)
- .kc(kc)
- .s(2)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(1)
- .kw(3)
- .kc(kc)
- .s(2)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(3)
- .kw(1)
- .kc(kc)
- .s(2)
- .Test(xnn_f32_argmaxpool_ukernel_up4__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_eq_4_fulltile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .kc(4);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_eq_4_subtile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .kc(4);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_div_4_fulltile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_div_4_subtile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_div_4_fulltile_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(131)
- .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_lt_4_fulltile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_lt_4_subtile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_lt_4_fulltile_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(23)
- .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_gt_4_fulltile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_gt_4_subtile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_gt_4_fulltile_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(23)
- .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_div_4_with_y_max) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 1; n <= 5; n += 2) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .n(n)
- .kh(3)
- .kw(3)
- .kc(kc)
- .qmax(128)
- .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__PSIMD, kc_div_4_with_y_min) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 1; n <= 5; n += 2) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .n(n)
- .kh(3)
- .kw(3)
- .kc(kc)
- .qmin(128)
- .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__PSIMD, small_n) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3}}) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__PSIMD, small_n_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3}}) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .x_stride(29)
- .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__PSIMD, small_n_with_y_stride) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3}}) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .y_stride(31)
- .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_UP9__PSIMD, small_n_with_s) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3}}) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- for (size_t s = 2; s <= ks; s++) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .s(s)
- .Test(xnn_f32_argmaxpool_ukernel_up9__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_eq_4_twopass_fulltile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- const size_t ks = tester.mr() + tester.qr();
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_eq_4_twopass_subtile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_eq_4_multipass_fulltile) {
- TEST_REQUIRES_PSIMD;
- for (size_t ks : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_eq_4_multipass_subtile) {
- TEST_REQUIRES_PSIMD;
- for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_div_4_twopass_fulltile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- const size_t ks = 17;
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kc(kc)
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kc(kc)
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_div_4_twopass_subtile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kc(kc)
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kc(kc)
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_div_4_twopass_fulltile_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- const size_t ks = tester.mr() + tester.qr();
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(131)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_div_4_multipass_fulltile) {
- TEST_REQUIRES_PSIMD;
- for (size_t ks : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_div_4_multipass_subtile) {
- TEST_REQUIRES_PSIMD;
- for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kc(kc)
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kc(kc)
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_div_4_multipass_fulltile_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- for (size_t ks : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(131)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_lt_4_twopass_fulltile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- const size_t ks = tester.mr() + tester.qr();
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_lt_4_twopass_subtile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kc(kc)
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kc(kc)
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_lt_4_twopass_fulltile_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- const size_t ks = tester.mr() + tester.qr();
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(23)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_lt_4_multipass_fulltile) {
- TEST_REQUIRES_PSIMD;
- for (size_t ks : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_lt_4_multipass_subtile) {
- TEST_REQUIRES_PSIMD;
- for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kc(kc)
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kc(kc)
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_lt_4_multipass_fulltile_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- for (size_t ks : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(23)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_gt_4_twopass_fulltile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- const size_t ks = tester.mr() + tester.qr();
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_gt_4_twopass_subtile) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kc(kc)
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kc(kc)
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_gt_4_twopass_fulltile_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- const size_t ks = tester.mr() + tester.qr();
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(23)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_gt_4_multipass_fulltile) {
- TEST_REQUIRES_PSIMD;
- for (size_t ks : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_gt_4_multipass_subtile) {
- TEST_REQUIRES_PSIMD;
- for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kc(kc)
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kc(kc)
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_gt_4_multipass_fulltile_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- for (size_t ks : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(23)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_div_4_with_y_max) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 1; n <= 5; n += 2) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(5)
- .kw(5)
- .kc(kc)
- .qmax(128)
- .iterations(3)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, kc_div_4_with_y_min) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 1; n <= 5; n += 2) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(5)
- .kw(5)
- .kc(kc)
- .qmin(128)
- .iterations(3)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, small_n) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{5, 7}}) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, small_n_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{5, 7}}) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .x_stride(29)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, small_n_with_y_stride) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{5, 7}}) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .y_stride(31)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-
- TEST(F32_ARGMAXPOOL_MP9P8Q__PSIMD, small_n_with_s) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{5, 7}}) {
- for (size_t s = 2; s <= 5; s++) {
- for (size_t kc = 8; kc < 25; kc += 5) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .s(s)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__psimd, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-#endif // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
-
-
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, kc_eq_1_fulltile) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .kc(1);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, few_output_pixels) {  // Baseline sweep of the 4x scalar c1 kernel over 2..5 output pixels.
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {  // from 2 up to the pooling tile size of 4
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, kc_eq_1_subtile) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .kc(1);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, few_output_pixels_with_input_offset) {  // Same sweep as few_output_pixels, with input_offset set.
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {  // from 2 up to the pooling tile size of 4
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .input_offset(7)  // the only setting that differs from the baseline test
+ .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, few_output_pixels_with_qmin) {  // Same sweep as few_output_pixels, with qmin set.
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {  // from 2 up to the pooling tile size of 4
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmin(192)  // NOTE(review): presumably a quantized lower clamp bound - confirm in the tester
+ .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, few_output_pixels_with_qmax) {  // Same sweep as few_output_pixels, with qmax set.
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {  // from 2 up to the pooling tile size of 4
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .qmax(192)  // NOTE(review): presumably a quantized upper clamp bound - confirm in the tester
+ .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, few_output_pixels_with_output_stride) {  // Same sweep as few_output_pixels, with output_stride set.
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {  // from 2 up to the pooling tile size of 4
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .channels(channels)
+ .output_stride(7)  // 7 exceeds the largest channel count (5) exercised by this loop
+ .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+}
+
+TEST(F32_ARGMAXPOOL_4X__SCALAR_C1, few_output_pixels_with_step) {  // Adds a sweep over step on top of the few_output_pixels loops.
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 4; pooling_elements++) {  // from 2 up to the pooling tile size of 4
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ for (size_t step = 2; step <= pooling_elements; step++) {  // step never exceeds the number of pooling elements
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(4)
+ .step(step)
+ .channels(channels)
+ .output_stride(7)  // this test also sets output_stride, not only step
+ .Test(xnn_f32_argmaxpool_ukernel_4x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
}
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, kc_gt_1_fulltile) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 2; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_eq_4_unipass_fulltile) {  // 4 channels, pooling elements equal to the tile of 9.
+ TEST_REQUIRES_X86_SSE2;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(4)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);  // no Variant argument passed here, unlike the scalar tests
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_eq_4_unipass_fulltile_with_input_offset) {  // Full 9-element tile, 4 channels, input_offset set.
+ TEST_REQUIRES_X86_SSE2;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(4)
+ .input_offset(7)  // the only setting that differs from channels_eq_4_unipass_fulltile
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_eq_4_unipass_fulltile_with_qmin) {  // Full 9-element tile, 4 channels, qmin set.
+ TEST_REQUIRES_X86_SSE2;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(4)
+ .qmin(192)  // NOTE(review): presumably a quantized lower clamp bound - confirm in the tester
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_eq_4_unipass_fulltile_with_qmax) {  // Full 9-element tile, 4 channels, qmax set.
+ TEST_REQUIRES_X86_SSE2;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(4)
+ .qmax(192)  // NOTE(review): presumably a quantized upper clamp bound - confirm in the tester
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_eq_4_unipass_subtile) {  // 4 channels, fewer pooling elements than the tile of 9.
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {  // 2..8, always below the tile size
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(4)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_eq_4_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_div_4_unipass_fulltile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_div_4_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_div_4_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_div_4_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_div_4_unipass_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_div_4_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_lt_4_unipass_fulltile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_lt_4_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_lt_4_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_lt_4_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_lt_4_unipass_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_lt_4_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_gt_4_unipass_fulltile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_gt_4_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_gt_4_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_gt_4_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_gt_4_unipass_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, channels_gt_4_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, few_output_pixels) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
}
}
}
}
-}
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, kc_gt_1_subtile) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 2; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, few_output_pixels_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .input_offset(23)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, few_output_pixels_with_qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, few_output_pixels_with_qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, few_output_pixels_with_output_stride) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .output_stride(23)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
+ }
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__SSE2_C4, few_output_pixels_with_step) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ for (size_t step = 2; step <= pooling_elements; step++) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .step(step)
+ .channels(channels)
+ .output_stride(23)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__sse2_c4);
}
}
}
}
}
-}
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, kc_gt_1_fulltile_with_x_stride) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(4)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 2; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(131)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_eq_4_unipass_fulltile) {
+ TEST_REQUIRES_PSIMD;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(4)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_eq_4_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_eq_4_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_PSIMD;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(4)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_eq_4_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_PSIMD;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(4)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_eq_4_unipass_subtile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(4)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_eq_4_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_div_4_unipass_fulltile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_div_4_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_div_4_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_div_4_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_div_4_unipass_subtile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_div_4_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_lt_4_unipass_fulltile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_lt_4_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_lt_4_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_lt_4_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_lt_4_unipass_subtile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_lt_4_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_gt_4_unipass_fulltile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_gt_4_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_gt_4_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_gt_4_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_gt_4_unipass_subtile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, channels_gt_4_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, few_output_pixels) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
-}
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, y_max) {
- for (size_t n = 1; n <= 5; n += 2) {
- for (size_t kc = 1; kc < 16; kc++) {
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(2)
- .kw(2)
- .kc(kc)
- .qmax(128)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
-}
-
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, y_min) {
- for (size_t n = 1; n <= 5; n += 2) {
- for (size_t kc = 1; kc < 16; kc++) {
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(2)
- .kw(2)
- .kc(kc)
- .qmin(128)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
-}
-
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, small_n) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t kc = 1; kc < 15; kc += 3) {
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(2)
- .kw(2)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(1)
- .kw(3)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(3)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
-}
-
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, small_n_with_x_stride) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t kc = 1; kc < 15; kc += 3) {
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(2)
- .kw(2)
- .kc(kc)
- .x_stride(29)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(1)
- .kw(3)
- .kc(kc)
- .x_stride(29)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(3)
- .kw(1)
- .kc(kc)
- .x_stride(29)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
-}
-
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, small_n_with_y_stride) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t kc = 1; kc < 15; kc += 3) {
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(2)
- .kw(2)
- .kc(kc)
- .y_stride(31)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(1)
- .kw(3)
- .kc(kc)
- .y_stride(31)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(3)
- .kw(1)
- .kc(kc)
- .y_stride(31)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
-}
-
-TEST(F32_ARGMAXPOOL_UP4__SCALAR, small_n_with_s) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t kc = 1; kc < 15; kc += 3) {
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(2)
- .kw(2)
- .kc(kc)
- .s(2)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(1)
- .kw(3)
- .kc(kc)
- .s(2)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- ArgmaxPoolMicrokernelTester()
- .mr(4)
- .n(n)
- .kh(3)
- .kw(1)
- .kc(kc)
- .s(2)
- .Test(xnn_f32_argmaxpool_ukernel_up4__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
-}
-
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, kc_eq_1_fulltile) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .kc(1);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-}
-
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, kc_eq_1_subtile) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .kc(1);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, few_output_pixels_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .input_offset(23)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
-}
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, kc_gt_1_fulltile) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 2; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, few_output_pixels_with_qmin) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
-}
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, kc_gt_1_subtile) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 2; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, few_output_pixels_with_qmax) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, few_output_pixels_with_output_stride) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .output_stride(23)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9X__PSIMD_C4, few_output_pixels_with_step) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ for (size_t step = 2; step <= pooling_elements; step++) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .step(step)
+ .channels(channels)
+ .output_stride(23)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
}
+#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_eq_1_unipass_fulltile) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(1)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, kc_gt_1_fulltile_with_x_stride) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 2; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(131)
- .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_input_offset) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(1)
+ .input_offset(3)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_qmin) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(1)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_qmax) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(1)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_eq_1_unipass_subtile) {
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(1)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_eq_1_unipass_subtile_with_input_offset) {
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(1)
+ .input_offset(3)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_gt_1_unipass_fulltile) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_input_offset) {  // 9 pooling elements on a tile of 9, with input_offset(3)
+ for (size_t channels = 2; channels < 10; channels++) {  // channels 2..9
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .input_offset(3)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_qmin) {  // 9 pooling elements on a tile of 9, with qmin(192)
+ for (size_t channels = 2; channels < 10; channels++) {  // channels 2..9
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_qmax) {  // 9 pooling elements on a tile of 9, with qmax(192)
+ for (size_t channels = 2; channels < 10; channels++) {  // channels 2..9
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_gt_1_unipass_subtile) {  // cross product of pooling sizes below the tile of 9 and channel counts
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {  // sizes 2..8
+ for (size_t channels = 2; channels < 10; channels++) {  // channels 2..9
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, channels_gt_1_unipass_subtile_with_input_offset) {  // same sweep as the subtile test, with input_offset(3)
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {  // sizes 2..8
+ for (size_t channels = 2; channels < 10; channels++) {  // channels 2..9
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .input_offset(3)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, few_output_pixels) {  // sweeps output pixels x pooling sizes x channels
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // 2..5 output pixels
+ for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {  // sizes 2..9, including the full tile
+ for (size_t channels = 1; channels <= 5; channels += 1) {  // channels 1..5
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, few_output_pixels_with_input_offset) {  // few_output_pixels sweep with input_offset(7)
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // 2..5 output pixels
+ for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {  // sizes 2..9
+ for (size_t channels = 1; channels <= 5; channels += 1) {  // channels 1..5
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .input_offset(7)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, few_output_pixels_with_qmin) {  // few_output_pixels sweep with qmin(192)
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // 2..5 output pixels
+ for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {  // sizes 2..9
+ for (size_t channels = 1; channels <= 5; channels += 1) {  // channels 1..5
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, few_output_pixels_with_qmax) {  // few_output_pixels sweep with qmax(192)
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // 2..5 output pixels
+ for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {  // sizes 2..9
+ for (size_t channels = 1; channels <= 5; channels += 1) {  // channels 1..5
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, few_output_pixels_with_output_stride) {  // few_output_pixels sweep with output_stride(7)
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // 2..5 output pixels
+ for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {  // sizes 2..9
+ for (size_t channels = 1; channels <= 5; channels += 1) {  // channels 1..5, always below the stride of 7
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .channels(channels)
+ .output_stride(7)
+ .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9X__SCALAR_C1, few_output_pixels_with_step) {  // few_output_pixels sweep with an extra step loop
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // 2..5 output pixels
+ for (size_t pooling_elements = 2; pooling_elements <= 9; pooling_elements++) {  // sizes 2..9
+ for (size_t channels = 1; channels <= 5; channels += 1) {  // channels 1..5
+ for (size_t step = 2; step <= pooling_elements; step++) {  // step never exceeds the current pooling size
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9)
+ .step(step)
+ .channels(channels)
+ .output_stride(7)  // NOTE(review): the step test also sets output_stride(7) — confirm this is intended
+ .Test(xnn_f32_argmaxpool_ukernel_9x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
}
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, y_max) {
- for (size_t n = 1; n <= 5; n += 2) {
- for (size_t kc = 1; kc < 16; kc++) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .n(n)
- .kh(3)
- .kw(3)
- .kc(kc)
- .qmax(128)
- .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_twopass_fulltile) {  // 4 channels, 17 pooling elements with pooling_tile(9, 8)
+ TEST_REQUIRES_X86_SSE2;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_twopass_fulltile_with_input_offset) {  // 4 channels, 17 pooling elements, with input_offset(7)
+ TEST_REQUIRES_X86_SSE2;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_twopass_fulltile_with_qmin) {  // 4 channels, 17 pooling elements, with qmin(192)
+ TEST_REQUIRES_X86_SSE2;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_twopass_fulltile_with_qmax) {  // 4 channels, 17 pooling elements, with qmax(192)
+ TEST_REQUIRES_X86_SSE2;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_twopass_subtile) {  // 4 channels, pooling sizes between 9 and 17 exclusive
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // sizes 10..16
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
}
}
-}
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, y_min) {
- for (size_t n = 1; n <= 5; n += 2) {
- for (size_t kc = 1; kc < 16; kc++) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .n(n)
- .kh(3)
- .kw(3)
- .kc(kc)
- .qmin(128)
- .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_twopass_subtile_with_input_offset) {  // 4 channels, sizes 10..16, with input_offset(7)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // sizes 10..16
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
}
}
-}
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, small_n) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3}}) {
- for (size_t kc = 1; kc < 15; kc += 3) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_twopass_fulltile) {  // 17 pooling elements, channel counts that are multiples of 4
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 8; channels < 32; channels += 4) {  // channels 8, 12, ..., 28
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_twopass_fulltile_with_input_offset) {  // 17 pooling elements, multiples of 4 channels, input_offset(23)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 8; channels < 32; channels += 4) {  // channels 8, 12, ..., 28
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(23)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_twopass_fulltile_with_qmin) {  // 17 pooling elements, multiples of 4 channels, qmin(192)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 8; channels < 32; channels += 4) {  // channels 8, 12, ..., 28
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_twopass_fulltile_with_qmax) {  // 17 pooling elements, multiples of 4 channels, qmax(192)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 8; channels < 32; channels += 4) {  // channels 8, 12, ..., 28
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_twopass_subtile) {  // sizes 10..16 crossed with multiples of 4 channels
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // sizes 10..16
+ for (size_t channels = 8; channels < 32; channels += 4) {  // channels 8, 12, ..., 28
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 would only repeat the fulltile case
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
}
}
}
-}
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, small_n_with_x_stride) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3}}) {
- for (size_t kc = 1; kc < 15; kc += 3) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .x_stride(29)
- .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_twopass_subtile_with_input_offset) {  // sizes 10..16, multiples of 4 channels, input_offset(37)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // sizes 10..16
+ for (size_t channels = 8; channels < 32; channels += 4) {  // channels 8, 12, ..., 28
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 would only repeat the fulltile case
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
}
}
}
-}
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, small_n_with_y_stride) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3}}) {
- for (size_t kc = 1; kc < 15; kc += 3) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .y_stride(31)
- .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_twopass_fulltile) {  // 17 pooling elements, fewer than 4 channels
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 4; channels++) {  // channels 1..3
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_twopass_fulltile_with_input_offset) {  // 17 pooling elements, channels 1..3, input_offset(5)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 4; channels++) {  // channels 1..3
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_twopass_fulltile_with_qmin) {  // 17 pooling elements, channels 1..3, qmin(192)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 4; channels++) {  // channels 1..3
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_twopass_fulltile_with_qmax) {  // 17 pooling elements, channels 1..3, qmax(192)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 4; channels++) {  // channels 1..3
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_twopass_subtile) {  // sizes 10..16 crossed with channels 1..3
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // sizes 10..16
+ for (size_t channels = 1; channels < 4; channels++) {  // channels 1..3
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 would only repeat the fulltile case
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
}
}
}
-}
-TEST(F32_ARGMAXPOOL_UP9__SCALAR, small_n_with_s) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3}}) {
- for (size_t kc = 1; kc < 15; kc += 3) {
- for (size_t s = 2; s <= ks; s++) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .s(s)
- .Test(xnn_f32_argmaxpool_ukernel_up9__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_twopass_subtile_with_input_offset) {  // sizes 10..16, channels 1..3, input_offset(5)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // sizes 10..16
+ for (size_t channels = 1; channels < 4; channels++) {  // channels 1..3
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 would only repeat the fulltile case
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_twopass_fulltile) {  // 17 pooling elements, channel counts between 4 and 8 exclusive
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 5; channels < 8; channels++) {  // channels 5..7
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_twopass_fulltile_with_input_offset) {  // 17 pooling elements, channels 5..7, input_offset(11)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 5; channels < 8; channels++) {  // channels 5..7
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_twopass_fulltile_with_qmin) {  // 17 pooling elements, channels 5..7, qmin(192)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 5; channels < 8; channels++) {  // channels 5..7
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_twopass_fulltile_with_qmax) {  // 17 pooling elements, channels 5..7, qmax(192)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 5; channels < 8; channels++) {  // channels 5..7
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_twopass_subtile) {  // sizes 10..16 crossed with channels 5..7
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // sizes 10..16
+ for (size_t channels = 5; channels < 8; channels++) {  // channels 5..7
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 would only repeat the fulltile case
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_twopass_subtile_with_input_offset) {  // sizes 10..16, channels 5..7, input_offset(11)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // sizes 10..16
+ for (size_t channels = 5; channels < 8; channels++) {  // channels 5..7
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 would only repeat the fulltile case
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_multipass) {  // 4 channels, pooling sizes above 17
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // sizes 18, 21, ..., 33
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 never exercises the sizes this loop declares
+ .pooling_tile(9, 8)
+ .channels(4)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_multipass_with_input_offset) {  // 4 channels, sizes above 17, input_offset(7)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // sizes 18, 21, ..., 33
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 never exercises the sizes this loop declares
+ .pooling_tile(9, 8)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_multipass_with_qmin) {  // 4 channels, sizes above 17, qmin(192)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // sizes 18, 21, ..., 33
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 never exercises the sizes this loop declares
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_eq_4_multipass_with_qmax) {  // 4 channels, sizes above 17, qmax(192)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // sizes 18, 21, ..., 33
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 never exercises the sizes this loop declares
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_multipass) {  // sizes above 17 crossed with multiples of 4 channels
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // sizes 18, 21, ..., 33
+ for (size_t channels = 8; channels < 32; channels += 4) {  // channels 8, 12, ..., 28
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 never exercises the sizes this loop declares
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_multipass_with_input_offset) {  // sizes above 17, multiples of 4 channels, input_offset(37)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // sizes 18, 21, ..., 33
+ for (size_t channels = 8; channels < 32; channels += 4) {  // channels 8, 12, ..., 28
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 never exercises the sizes this loop declares
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_multipass_with_qmin) {  // sizes above 17, multiples of 4 channels, qmin(192)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // sizes 18, 21, ..., 33
+ for (size_t channels = 8; channels < 32; channels += 4) {  // channels 8, 12, ..., 28
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 never exercises the sizes this loop declares
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_div_4_multipass_with_qmax) {  // sizes above 17, multiples of 4 channels, qmax(192)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // sizes 18, 21, ..., 33
+ for (size_t channels = 8; channels < 32; channels += 4) {  // channels 8, 12, ..., 28
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 never exercises the sizes this loop declares
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_multipass) {  // sizes above 17 crossed with channels 1..3
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // sizes 18, 21, ..., 33
+ for (size_t channels = 1; channels < 4; channels++) {  // channels 1..3
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 never exercises the sizes this loop declares
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_multipass_with_input_offset) {  // sizes above 17, channels 1..3, input_offset(4)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // sizes 18, 21, ..., 33
+ for (size_t channels = 1; channels < 4; channels++) {  // channels 1..3
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 never exercises the sizes this loop declares
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(4)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_multipass_with_qmin) {  // sizes above 17, channels 1..3, qmin(192)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // sizes 18, 21, ..., 33
+ for (size_t channels = 1; channels < 4; channels++) {  // channels 1..3
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 never exercises the sizes this loop declares
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_lt_4_multipass_with_qmax) {  // sizes above 17, channels 1..3, qmax(192)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // sizes 18, 21, ..., 33
+ for (size_t channels = 1; channels < 4; channels++) {  // channels 1..3
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 never exercises the sizes this loop declares
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_multipass) {  // sizes above 17 crossed with channels 5..7
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // sizes 18, 21, ..., 33
+ for (size_t channels = 5; channels < 8; channels++) {  // channels 5..7
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 never exercises the sizes this loop declares
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_multipass_with_input_offset) {  // sizes above 17, channels 5..7, input_offset(11)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // sizes 18, 21, ..., 33
+ for (size_t channels = 5; channels < 8; channels++) {  // channels 5..7
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 never exercises the sizes this loop declares
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_multipass_with_qmin) {  // sizes above 17, channels 5..7, qmin(192)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // sizes 18, 21, ..., 33
+ for (size_t channels = 5; channels < 8; channels++) {  // channels 5..7
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 never exercises the sizes this loop declares
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, channels_gt_4_multipass_with_qmax) {  // sizes above 17, channels 5..7, qmax(192)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {  // sizes 18, 21, ..., 33
+ for (size_t channels = 5; channels < 8; channels++) {  // channels 5..7
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the swept size; a constant 17 never exercises the sizes this loop declares
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, few_output_pixels) {  // sweeps output pixels x pooling sizes x channels
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // 2..5 output pixels
+ for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {  // sizes 10..17
+ for (size_t channels = 1; channels <= 20; channels += 3) {  // channels 1, 4, ..., 19
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
}
}
}
}
-}
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_eq_1_twopass_fulltile) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- const size_t ks = tester.mr() + tester.qr();
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-}
-
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_eq_1_twopass_subtile) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
-}
-
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_eq_1_multipass_fulltile) {
- for (size_t ks : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, few_output_pixels_with_input_offset) {  // few_output_pixels sweep with input_offset(23)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // 2..5 output pixels
+ for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {  // sizes 10..17
+ for (size_t channels = 1; channels <= 20; channels += 3) {  // channels 1, 4, ..., 19
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(23)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
}
}
}
}
-}
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_eq_1_multipass_subtile) {
- for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
-}
-
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_gt_1_twopass_fulltile) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- const size_t ks = 17;
- for (size_t kc = 2; kc < 16; kc++) {
- tester
- .kc(kc)
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kc(kc)
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
-}
-
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_gt_1_twopass_subtile) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 10; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 2; kc < 16; kc++) {
- tester
- .kc(kc)
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kc(kc)
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
-}
-
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_gt_1_twopass_fulltile_with_x_stride) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- const size_t ks = tester.mr() + tester.qr();
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 2; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(131)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, few_output_pixels_with_qmin) {  // few_output_pixels sweep with qmin(192)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // 2..5 output pixels
+ for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {  // sizes 10..17
+ for (size_t channels = 1; channels <= 20; channels += 3) {  // channels 1, 4, ..., 19
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
}
}
}
}
-}
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_gt_1_multipass_fulltile) {
- for (size_t ks : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 2; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, few_output_pixels_with_qmax) {  // few_output_pixels sweep with qmax(192)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // 2..5 output pixels
+ for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {  // sizes 10..17
+ for (size_t channels = 1; channels <= 20; channels += 3) {  // channels 1, 4, ..., 19
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, few_output_pixels_with_output_stride) {  // few_output_pixels sweep with output_stride(23)
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // 2..5 output pixels
+ for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {  // sizes 10..17
+ for (size_t channels = 1; channels <= 20; channels += 3) {  // channels 1, 4, ..., 19, always below the stride of 23
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .output_stride(23)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
+ }
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__SSE2_C4, few_output_pixels_with_step) {  // few_output_pixels sweep with an extra step loop
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {  // 2..5 output pixels
+ for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {  // sizes 10..17
+ for (size_t channels = 1; channels <= 20; channels += 3) {  // channels 1, 4, ..., 19
+ for (size_t step = 2; step <= pooling_elements; step++) {  // step never exceeds the current pooling size
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .step(step)
+ .channels(channels)
+ .output_stride(23)  // NOTE(review): the step test also sets output_stride(23) — confirm this is intended
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4);
}
}
}
}
}
-}
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_gt_1_multipass_subtile) {
- for (size_t ks_max : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = ks_max - tester.qr() + 1; ks < ks_max; ks++) {
- for (size_t kc = 2; kc < 16; kc++) {
- tester
- .kc(kc)
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kc(kc)
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_fulltile) {  // 4 channels, 17 pooling elements with pooling_tile(9, 8)
+ TEST_REQUIRES_PSIMD;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);  // NOTE(review): Variant::Scalar with a psimd kernel — presumably intentional; confirm against the tester
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_fulltile_with_input_offset) {  // 4 channels, 17 pooling elements, input_offset(7)
+ TEST_REQUIRES_PSIMD;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_fulltile_with_qmin) {  // 4 channels, 17 pooling elements, qmin(192)
+ TEST_REQUIRES_PSIMD;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_fulltile_with_qmax) {  // 4 channels, 17 pooling elements, qmax(192)
+ TEST_REQUIRES_PSIMD;
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_subtile) {  // 4 channels, pooling sizes between 9 and 17 exclusive
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // sizes 10..16
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_subtile_with_input_offset) {  // 4 channels, sizes 10..16, input_offset(7)
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {  // sizes 10..16
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_fulltile) {  // 17 pooling elements, channel counts that are multiples of 4
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 8; channels < 32; channels += 4) {  // channels 8, 12, ..., 28
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_fulltile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(23)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_fulltile_with_qmin) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_fulltile_with_qmax) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_subtile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so each sub-tile size 10..16 is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
-}
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, kc_gt_1_multipass_fulltile_with_x_stride) {
- for (size_t ks : std::vector<size_t>{{25, 49}}) {
- auto tester = ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= ks; kh++) {
- for (size_t kw = 1; kw <= ks; kw++) {
- if (kh * kw == ks) {
- for (size_t kc = 2; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(131)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_subtile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so each sub-tile size 10..16 is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_fulltile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_fulltile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_fulltile_with_qmin) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_fulltile_with_qmax) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_subtile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so each sub-tile size 10..16 is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_subtile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so each sub-tile size 10..16 is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_fulltile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_fulltile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_fulltile_with_qmin) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_fulltile_with_qmax) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_subtile) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so each sub-tile size 10..16 is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_subtile_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so each sub-tile size 10..16 is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_multipass) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(4)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_multipass_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_multipass_with_qmin) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_eq_4_multipass_with_qmax) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_multipass) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_multipass_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_multipass_with_qmin) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_div_4_multipass_with_qmax) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_multipass) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_multipass_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(4)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_multipass_with_qmin) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_lt_4_multipass_with_qmax) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_multipass) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_multipass_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_multipass_with_qmin) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, channels_gt_4_multipass_with_qmax) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, few_output_pixels) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_input_offset) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(23)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_qmin) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_qmax) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_output_stride) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .output_stride(23)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_ARGMAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_step) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ for (size_t step = 2; step <= pooling_elements; step++) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .step(step)
+ .channels(channels)
+ .output_stride(23)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
}
+#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+
+
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, y_max) {
- for (size_t n = 1; n <= 5; n += 2) {
- for (size_t kc = 1; kc < 15; kc += 3) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(5)
- .kw(5)
- .kc(kc)
- .qmax(128)
- .iterations(3)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile_with_input_offset) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .input_offset(3)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile_with_qmin) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile_with_qmax) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_subtile) {
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_subtile_with_input_offset) {
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .input_offset(3)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile_with_input_offset) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(3)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile_with_qmin) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile_with_qmax) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_subtile) {
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so each sub-tile size 10..16 is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, y_min) {
- for (size_t n = 1; n <= 5; n += 2) {
- for (size_t kc = 1; kc < 15; kc += 3) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(5)
- .kw(5)
- .kc(kc)
- .qmin(128)
- .iterations(3)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_subtile_with_input_offset) {
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so each sub-tile size 10..16 is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(3)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, small_n) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{5, 7}}) {
- for (size_t kc = 1; kc < 15; kc += 3) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass) {
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(1)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass_with_input_offset) {
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(1)
+ .input_offset(3)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass_with_qmin) {
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(1)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass_with_qmax) {
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(1)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass) {
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass_with_input_offset) {
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(3)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass_with_qmin) {
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass_with_qmax) {
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // use the loop variable so sizes beyond the 9+8 tile are exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, few_output_pixels) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, small_n_with_x_stride) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{5, 7}}) {
- for (size_t kc = 1; kc < 15; kc += 3) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .x_stride(29)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_input_offset) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(7)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, small_n_with_y_stride) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{5, 7}}) {
- for (size_t kc = 1; kc < 15; kc += 3) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .y_stride(31)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_qmin) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
-TEST(F32_ARGMAXPOOL_MP9P8Q__SCALAR, small_n_with_s) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{5, 7}}) {
- for (size_t s = 2; s <= 5; s++) {
- for (size_t kc = 1; kc < 15; kc += 3) {
- ArgmaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .s(s)
- .Test(xnn_f32_argmaxpool_ukernel_mp9p8q__scalar, ArgmaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_qmax) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_output_stride) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .output_stride(7)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+}
+
+TEST(F32_ARGMAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_step) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = 10; pooling_elements <= 17; pooling_elements++) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ for (size_t step = 2; step <= pooling_elements; step++) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .step(step)
+ .channels(channels)
+ .output_stride(7)
+ .Test(xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1, ArgMaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
-}
+}
\ No newline at end of file
diff --git a/test/f32-argmaxpool.yaml b/test/f32-argmaxpool.yaml
new file mode 100644
index 0000000..7903bc7
--- /dev/null
+++ b/test/f32-argmaxpool.yaml
@@ -0,0 +1,13 @@
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+- name: xnn_f32_argmaxpool_ukernel_4x__sse2_c4
+- name: xnn_f32_argmaxpool_ukernel_4x__psimd_c4
+- name: xnn_f32_argmaxpool_ukernel_4x__scalar_c1
+- name: xnn_f32_argmaxpool_ukernel_9x__sse2_c4
+- name: xnn_f32_argmaxpool_ukernel_9x__psimd_c4
+- name: xnn_f32_argmaxpool_ukernel_9x__scalar_c1
+- name: xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4
+- name: xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4
+- name: xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1
diff --git a/test/f32-maxpool.cc b/test/f32-maxpool.cc
index 159a907..a8e453e 100644
--- a/test/f32-maxpool.cc
+++ b/test/f32-maxpool.cc
@@ -1,7 +1,15 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Specification: test/f32-maxpool.yaml
+// Generator: tools/generate-maxpool-test.py
+
#include <gtest/gtest.h>
@@ -13,1202 +21,884 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_unipass_fulltile) {
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_unipass_fulltile) {
TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_unipass_subtile) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
}
}
- TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_unipass_fulltile_with_qmin) {
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_unipass_subtile_with_input_offset) {
TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
}
}
- TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_unipass_fulltile_with_qmax) {
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_unipass_fulltile) {
TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
}
}
- TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_unipass_subtile) {
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_unipass_fulltile_with_input_offset) {
TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
}
}
- TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_unipass_fulltile) {
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_unipass_fulltile_with_qmin) {
TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_unipass_fulltile_with_qmin) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_unipass_fulltile_with_qmax) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_unipass_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_unipass_subtile) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_unipass_fulltile) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_unipass_fulltile_with_qmin) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_unipass_fulltile_with_qmax) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_unipass_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_unipass_subtile) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_unipass_fulltile) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_unipass_fulltile_with_qmin) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_unipass_fulltile_with_qmax) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_unipass_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_unipass_subtile) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_twopass_fulltile) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_twopass_fulltile_with_qmin) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_twopass_fulltile_with_qmax) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_twopass_subtile) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_twopass_fulltile) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_twopass_fulltile_with_qmin) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_twopass_fulltile_with_qmax) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_twopass_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_twopass_subtile) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_twopass_fulltile) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_twopass_fulltile_with_qmin) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_twopass_fulltile_with_qmax) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_twopass_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_twopass_subtile) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_twopass_fulltile) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_twopass_fulltile_with_qmin) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_twopass_fulltile_with_qmax) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_twopass_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_twopass_subtile) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_multipass) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_multipass_with_qmin) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- tester
- .kh(ks)
- .kw(1)
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_unipass_subtile) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_unipass_fulltile) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
}
}
- TEST(SMAXPOOL_9P8Q__SSE, kc_eq_4_multipass_with_qmax) {
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_unipass_fulltile_with_qmax) {
TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- tester
- .kh(ks)
- .kw(1)
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_unipass_subtile) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_unipass_fulltile) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
}
}
- TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_multipass) {
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_unipass_subtile) {
TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
}
}
}
- TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_multipass_with_qmin) {
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_unipass_subtile_with_input_offset) {
TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_twopass_fulltile) {
+ TEST_REQUIRES_X86_SSE;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_twopass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_twopass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_twopass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_twopass_subtile) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_twopass_subtile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_twopass_fulltile) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_twopass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(23)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_twopass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_twopass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_twopass_subtile) {  // pooling sizes strictly between 9 and 9 + 8, per pooling_tile(9, 8)
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 8; channels < 32; channels += 4) {  // multiples of 4 above a single 4-channel group
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // must follow the loop variable; a constant 17 would only repeat the fulltile case
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+  TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_twopass_subtile_with_input_offset) {  // pooling sizes strictly between 9 and 9 + 8, with a non-zero input offset
+    TEST_REQUIRES_X86_SSE;
+    for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+      for (size_t channels = 8; channels < 32; channels += 4) {  // multiples of 4 above a single 4-channel group
+        MaxPoolMicrokernelTester()
+          .pooling_elements(pooling_elements)  // must follow the loop variable; a constant 17 would only repeat the fulltile case
+          .pooling_tile(9, 8)
+          .channels(channels)
+          .input_offset(37)
+          .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+      }
+    }
+  }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_twopass_fulltile) {  // 17 pooling elements (9 + 8, the two pooling_tile arguments) with fewer than 4 channels.
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_twopass_fulltile_with_input_offset) {  // 17 pooling elements (9 + 8) with fewer than 4 channels and input_offset(5).
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_twopass_fulltile_with_qmin) {  // 17 pooling elements (9 + 8) with fewer than 4 channels and qmin(192).
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_twopass_fulltile_with_qmax) {  // 17 pooling elements (9 + 8) with fewer than 4 channels and qmax(192).
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 1; channels < 4; channels++) {  // channels = 1, 2, 3
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_twopass_subtile) {  // Sweeps pooling sizes 10..16 (between 9 and 9 + 8) for channels = 1, 2, 3.
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_twopass_subtile_with_input_offset) {  // Sweeps pooling sizes 10..16 for channels = 1, 2, 3 with input_offset(5).
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_twopass_fulltile) {  // 17 pooling elements (9 + 8, the two pooling_tile arguments) with 5 to 7 channels.
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_twopass_fulltile_with_input_offset) {  // 17 pooling elements (9 + 8) with 5 to 7 channels and input_offset(11).
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_twopass_fulltile_with_qmin) {  // 17 pooling elements (9 + 8) with 5 to 7 channels and qmin(192).
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_twopass_fulltile_with_qmax) {  // 17 pooling elements (9 + 8) with 5 to 7 channels and qmax(192).
+ TEST_REQUIRES_X86_SSE;
+ for (size_t channels = 5; channels < 8; channels++) {  // channels = 5, 6, 7
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_twopass_subtile) {  // Sweeps pooling sizes 10..16 (between 9 and 9 + 8) for channels = 5, 6, 7.
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_twopass_subtile_with_input_offset) {  // Sweeps pooling sizes 10..16 for channels = 5, 6, 7 with input_offset(11).
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_multipass) {  // Sweeps pooling sizes 18, 21, ..., 33 (all above 9 + 8) with 4 channels.
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(4)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_multipass_with_input_offset) {  // Sweeps pooling sizes 18, 21, ..., 33 with 4 channels and input_offset(7).
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_multipass_with_qmin) {  // Sweeps pooling sizes 18, 21, ..., 33 with 4 channels and qmin(192).
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_eq_4_multipass_with_qmax) {  // Sweeps pooling sizes 18, 21, ..., 33 with 4 channels and qmax(192).
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_multipass) {  // Sweeps pooling sizes 18, 21, ..., 33 (all above 9 + 8) for channels = 8, 12, ..., 28.
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_multipass_with_input_offset) {  // Sweeps pooling sizes 18, 21, ..., 33 for channels = 8, 12, ..., 28 with input_offset(37).
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_multipass_with_qmin) {  // Sweeps pooling sizes 18, 21, ..., 33 for channels = 8, 12, ..., 28 with qmin(192).
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_div_4_multipass_with_qmax) {  // Sweeps pooling sizes 18, 21, ..., 33 for channels = 8, 12, ..., 28 with qmax(192).
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_multipass) {  // Sweeps pooling sizes 18, 21, ..., 33 (all above 9 + 8) for channels = 1, 2, 3.
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_multipass_with_input_offset) {  // Sweeps pooling sizes 18, 21, ..., 33 for channels = 1, 2, 3 with input_offset(4).
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(4)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_multipass_with_qmin) {  // Sweeps pooling sizes 18, 21, ..., 33 for channels = 1, 2, 3 with qmin(192).
+ TEST_REQUIRES_X86_SSE;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
}
}
}
- TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_multipass_with_qmax) {
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_lt_4_multipass_with_qmax) {  // Sweeps pooling sizes 18, 21, ..., 33 for channels = 1, 2, 3 with qmax(192).
TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
}
}
}
- TEST(SMAXPOOL_9P8Q__SSE, kc_div_4_multipass_with_x_stride) {
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_multipass) {  // Sweeps pooling sizes 18, 21, ..., 33 (all above 9 + 8) for channels = 5, 6, 7.
TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
}
}
}
- TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_multipass) {
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_multipass_with_input_offset) {  // Sweeps pooling sizes 18, 21, ..., 33 for channels = 5, 6, 7 with input_offset(11).
TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
}
}
}
- TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_multipass_with_qmin) {
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_multipass_with_qmin) {  // Sweeps pooling sizes 18, 21, ..., 33 for channels = 5, 6, 7 with qmin(192).
TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
}
}
}
- TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_multipass_with_qmax) {
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, channels_gt_4_multipass_with_qmax) {  // Sweeps pooling sizes 18, 21, ..., 33 for channels = 5, 6, 7 with qmax(192).
TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
}
}
}
- TEST(SMAXPOOL_9P8Q__SSE, kc_lt_4_multipass_with_x_stride) {
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, few_output_pixels) {  // Runs the SSE kernel for 2..5 output pixels across pooling sizes {2, 9, 16}.
TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_multipass) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_multipass_with_qmin) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_multipass_with_qmax) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, kc_gt_4_multipass_with_x_stride) {
- TEST_REQUIRES_X86_SSE;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__SSE, small_n) {
- TEST_REQUIRES_X86_SSE;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
- for (size_t kc = 1; kc < 51; kc += 5) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {  // channels = 1, 4, ..., 19
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .iterations(3)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
}
}
}
}
- TEST(SMAXPOOL_9P8Q__SSE, small_n_with_x_stride) {
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, few_output_pixels_with_input_offset) {  // 2..5 output pixels across pooling sizes {2, 9, 16} with input_offset(23).
TEST_REQUIRES_X86_SSE;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
- for (size_t kc = 1; kc < 51; kc += 5) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {  // channels = 1, 4, ..., 19
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .x_stride(101)
- .iterations(1)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(23)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
}
}
}
}
- TEST(SMAXPOOL_9P8Q__SSE, small_n_with_y_stride) {
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, few_output_pixels_with_qmin) {  // 2..5 output pixels across pooling sizes {2, 9, 16} with qmin(192).
TEST_REQUIRES_X86_SSE;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
- for (size_t kc = 1; kc < 51; kc += 5) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {  // channels = 1, 4, ..., 19
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .y_stride(103)
- .iterations(1)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
}
}
}
}
- TEST(SMAXPOOL_9P8Q__SSE, small_n_with_s) {
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, few_output_pixels_with_qmax) {  // 2..5 output pixels across pooling sizes {2, 9, 16} with qmax(192).
TEST_REQUIRES_X86_SSE;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5}}) {
- for (size_t kc = 1; kc < 51; kc += 5) {
- for (size_t s = 2; s <= ks; s++) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {  // channels = 1, 4, ..., 19
+ MaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, few_output_pixels_with_output_stride) {  // 2..5 output pixels across pooling sizes {2, 9, 16} with output_stride(23).
+ TEST_REQUIRES_X86_SSE;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {  // channels = 1, 4, ..., 19, all below the stride of 23
+ MaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .output_stride(23)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
+ }
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__SSE_C4, few_output_pixels_with_step) {
+ TEST_REQUIRES_X86_SSE;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ for (size_t step = 2; step <= pooling_elements; step++) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .s(s)
- .iterations(1)
- .Test(xnn_f32_maxpool_ukernel_9p8q__sse);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .step(step)
+ .channels(channels)
+ .output_stride(23)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__sse_c4);
}
}
}
@@ -1217,1768 +907,1329 @@
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_unipass_fulltile) {
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_unipass_fulltile) {  // 9 pooling elements (the first pooling_tile argument) with 4 channels.
TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);  // Variant::Scalar is passed as the second Test argument
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_unipass_fulltile_with_input_offset) {  // 9 pooling elements with 4 channels and input_offset(7).
+ TEST_REQUIRES_PSIMD;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);  // Variant::Scalar is passed as the second Test argument
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_unipass_fulltile_with_qmin) {  // 9 pooling elements with 4 channels and qmin(192).
+ TEST_REQUIRES_PSIMD;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);  // Variant::Scalar is passed as the second Test argument
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_unipass_fulltile_with_qmax) {  // 9 pooling elements with 4 channels and qmax(192).
+ TEST_REQUIRES_PSIMD;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);  // Variant::Scalar is passed as the second Test argument
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_unipass_subtile) {  // Sweeps pooling sizes 2..8 (below the first pooling_tile argument, 9) with 4 channels.
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);  // Variant::Scalar is passed as the second Test argument
}
}
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_unipass_fulltile_with_qmin) {
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_unipass_subtile_with_input_offset) {  // Sweeps pooling sizes 2..8 with 4 channels and input_offset(7).
TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);  // Variant::Scalar is passed as the second Test argument
}
}
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_unipass_fulltile_with_qmax) {
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_unipass_fulltile) {  // 9 pooling elements (the first pooling_tile argument) for channels = 8, 12, ..., 28.
TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);  // Variant::Scalar is passed as the second Test argument
}
}
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_unipass_subtile) {
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_unipass_fulltile_with_input_offset) {  // 9 pooling elements for channels = 8, 12, ..., 28 with input_offset(37).
TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);  // Variant::Scalar is passed as the second Test argument
}
}
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_unipass_fulltile) {
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_unipass_fulltile_with_qmin) {
TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_unipass_fulltile_with_qmin) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_unipass_fulltile_with_qmax) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_unipass_fulltile_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_unipass_subtile) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_unipass_fulltile) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_unipass_fulltile_with_qmin) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_unipass_fulltile_with_qmax) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_unipass_fulltile_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_unipass_subtile) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_unipass_fulltile) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_unipass_fulltile_with_qmin) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_unipass_fulltile_with_qmax) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_unipass_fulltile_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_unipass_subtile) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_twopass_fulltile) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_twopass_fulltile_with_qmin) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_twopass_fulltile_with_qmax) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_twopass_subtile) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_twopass_fulltile) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_twopass_fulltile_with_qmin) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_twopass_fulltile_with_qmax) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_twopass_fulltile_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_twopass_subtile) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_twopass_fulltile) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_twopass_fulltile_with_qmin) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_twopass_fulltile_with_qmax) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_twopass_fulltile_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_twopass_subtile) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_twopass_fulltile) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_twopass_fulltile_with_qmin) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_twopass_fulltile_with_qmax) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_twopass_fulltile_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_twopass_subtile) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_multipass) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_multipass_with_qmin) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- tester
- .kh(ks)
- .kw(1)
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_unipass_fulltile_with_qmax) {  // 9 pooling elements (== first pooling_tile argument), qmax(192), channels 8, 12, ..., 28
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_unipass_subtile) {  // 2..8 pooling elements (fewer than the 9 in pooling_tile), channels 8, 12, ..., 28
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_unipass_subtile_with_input_offset) {  // 2..8 pooling elements, input_offset(37), channels 8, 12, ..., 28
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_unipass_fulltile) {  // 9 pooling elements, channels 1..3
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_unipass_fulltile_with_input_offset) {  // 9 pooling elements, input_offset(5), channels 1..3
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_unipass_fulltile_with_qmin) {  // 9 pooling elements, qmin(192), channels 1..3
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_eq_4_multipass_with_qmax) {
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_unipass_fulltile_with_qmax) {  // 9 pooling elements, qmax(192), channels 1..3
TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(4);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- tester
- .kh(ks)
- .kw(1)
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_unipass_subtile) {  // 2..8 pooling elements, channels 1..3
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_unipass_subtile_with_input_offset) {  // 2..8 pooling elements, input_offset(5), channels 1..3
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_unipass_fulltile) {  // 9 pooling elements, channels 5..7
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_unipass_fulltile_with_input_offset) {  // 9 pooling elements, input_offset(11), channels 5..7
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_unipass_fulltile_with_qmin) {  // 9 pooling elements, qmin(192), channels 5..7
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_unipass_fulltile_with_qmax) {  // 9 pooling elements, qmax(192), channels 5..7
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_multipass) {
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_unipass_subtile) {  // 2..8 pooling elements, channels 5..7
TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_multipass_with_qmin) {
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_unipass_subtile_with_input_offset) {  // 2..8 pooling elements, input_offset(11), channels 5..7
TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_fulltile) {  // 17 pooling elements (== 9 + 8 from pooling_tile), 4 channels
+ TEST_REQUIRES_PSIMD;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_fulltile_with_input_offset) {  // 17 pooling elements, input_offset(7), 4 channels
+ TEST_REQUIRES_PSIMD;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_fulltile_with_qmin) {  // 17 pooling elements, qmin(192), 4 channels
+ TEST_REQUIRES_PSIMD;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_fulltile_with_qmax) {  // 17 pooling elements, qmax(192), 4 channels
+ TEST_REQUIRES_PSIMD;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_subtile) {  // 10..16 pooling elements (more than 9, fewer than 9 + 8), 4 channels
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_twopass_subtile_with_input_offset) {  // 10..16 pooling elements, input_offset(7), 4 channels
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_fulltile) {  // 17 pooling elements, channels 8, 12, ..., 28
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_fulltile_with_input_offset) {  // 17 pooling elements, input_offset(23), channels 8, 12, ..., 28
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(23)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_fulltile_with_qmin) {  // 17 pooling elements, qmin(192), channels 8, 12, ..., 28
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_fulltile_with_qmax) {  // 17 pooling elements, qmax(192), channels 8, 12, ..., 28
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_subtile) {  // 10..16 pooling elements (more than 9, fewer than 9 + 8), channels 8, 12, ..., 28
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_twopass_subtile_with_input_offset) {  // 10..16 pooling elements, input_offset(37), channels 8, 12, ..., 28
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_fulltile) {  // 17 pooling elements, channels 1..3
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_fulltile_with_input_offset) {  // 17 pooling elements, input_offset(5), channels 1..3
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_fulltile_with_qmin) {  // 17 pooling elements, qmin(192), channels 1..3
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_fulltile_with_qmax) {  // 17 pooling elements, qmax(192), channels 1..3
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_subtile) {  // 10..16 pooling elements (more than 9, fewer than 9 + 8), channels 1..3
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_twopass_subtile_with_input_offset) {  // 10..16 pooling elements, input_offset(5), channels 1..3
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(5)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_fulltile) {  // 17 pooling elements, channels 5..7
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_fulltile_with_input_offset) {  // 17 pooling elements, input_offset(11), channels 5..7
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_fulltile_with_qmin) {  // 17 pooling elements, qmin(192), channels 5..7
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_fulltile_with_qmax) {  // 17 pooling elements, qmax(192), channels 5..7
+ TEST_REQUIRES_PSIMD;
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_subtile) {  // 10..16 pooling elements (more than 9, fewer than 9 + 8), channels 5..7
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_twopass_subtile_with_input_offset) {  // 10..16 pooling elements, input_offset(11), channels 5..7
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_multipass) {  // 18, 21, ..., 33 pooling elements (more than 9 + 8), 4 channels
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(4)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_multipass_with_input_offset) {  // 18, 21, ..., 33 pooling elements, input_offset(7), 4 channels
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(4)
+ .input_offset(7)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_multipass_with_qmin) {  // 18, 21, ..., 33 pooling elements, qmin(192), 4 channels
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_eq_4_multipass_with_qmax) {  // 18, 21, ..., 33 pooling elements, qmax(192), 4 channels
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(4)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_multipass) {  // 18, 21, ..., 33 pooling elements (more than 9 + 8), channels 8, 12, ..., 28
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_multipass_with_input_offset) {  // 18, 21, ..., 33 pooling elements, input_offset(37), channels 8, 12, ..., 28
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_multipass_with_qmin) {  // 18, 21, ..., 33 pooling elements, qmin(192), channels 8, 12, ..., 28
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_div_4_multipass_with_qmax) {  // 18, 21, ..., 33 pooling elements, qmax(192), channels 8, 12, ..., 28
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 8; channels < 32; channels += 4) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_multipass) {  // 18, 21, ..., 33 pooling elements (more than 9 + 8), channels 1..3
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_multipass_with_input_offset) {  // 18, 21, ..., 33 pooling elements, input_offset(4), channels 1..3
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(4)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_multipass_with_qmin) {  // 18, 21, ..., 33 pooling elements, qmin(192), channels 1..3
+ TEST_REQUIRES_PSIMD;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_multipass_with_qmax) {
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_lt_4_multipass_with_qmax) {  // 18, 21, ..., 33 pooling elements, qmax(192), channels 1..3
TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 4; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pass the loop variable so every size in the sweep is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_div_4_multipass_with_x_stride) {
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_multipass) {
TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 4; kc < 64; kc += 12) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // loop-driven so every swept size (18, 21, ..., 33) is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_multipass) {
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_multipass_with_input_offset) {
TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // loop-driven so every swept size (18, 21, ..., 33) is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(11)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_multipass_with_qmin) {
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_multipass_with_qmin) {
TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // loop-driven so every swept size (18, 21, ..., 33) is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_multipass_with_qmax) {
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, channels_gt_4_multipass_with_qmax) {
TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 5; channels < 8; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // loop-driven so every swept size (18, 21, ..., 33) is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_lt_4_multipass_with_x_stride) {
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, few_output_pixels) {
TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 1; kc < 4; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_multipass) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_multipass_with_qmin) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_multipass_with_qmax) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, kc_gt_4_multipass_with_x_stride) {
- TEST_REQUIRES_PSIMD;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 5; kc < 8; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-
- TEST(SMAXPOOL_9P8Q__PSIMD, small_n) {
- TEST_REQUIRES_PSIMD;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
- for (size_t kc = 1; kc < 51; kc += 5) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .iterations(3)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
- TEST(SMAXPOOL_9P8Q__PSIMD, small_n_with_x_stride) {
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_input_offset) {
TEST_REQUIRES_PSIMD;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
- for (size_t kc = 1; kc < 51; kc += 5) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .x_stride(101)
- .iterations(1)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(23)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
- TEST(SMAXPOOL_9P8Q__PSIMD, small_n_with_y_stride) {
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_qmin) {
TEST_REQUIRES_PSIMD;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
- for (size_t kc = 1; kc < 51; kc += 5) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .y_stride(103)
- .iterations(1)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
- TEST(SMAXPOOL_9P8Q__PSIMD, small_n_with_s) {
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_qmax) {
TEST_REQUIRES_PSIMD;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5}}) {
- for (size_t kc = 1; kc < 51; kc += 5) {
- for (size_t s = 2; s <= ks; s++) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ MaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_output_stride) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ MaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .output_stride(23)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_MAXPOOL_9P8X__PSIMD_C4, few_output_pixels_with_step) {
+ TEST_REQUIRES_PSIMD;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 20; channels += 3) {
+ for (size_t step = 2; step <= pooling_elements; step++) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .s(s)
- .iterations(1)
- .Test(xnn_f32_maxpool_ukernel_9p8q__psimd, MaxPoolMicrokernelTester::Variant::Scalar);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .step(step)
+ .channels(channels)
+ .output_stride(23)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__psimd_c4, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
}
-#endif // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
+#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_unipass_fulltile) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_fulltile) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_input_offset) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .input_offset(3)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_qmin) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_qmax) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_subtile) {
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_unipass_fulltile_with_qmin) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_subtile_with_input_offset) {
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .input_offset(3)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_unipass_fulltile_with_qmax) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_fulltile) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_unipass_subtile) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_input_offset) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(3)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_fulltile) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 2; kc < 5; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_fulltile_with_qmin) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 2; kc < 5; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_fulltile_with_qmax) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 2; kc < 5; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_fulltile_with_x_stride) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 2; kc < 5; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_subtile) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kc = 2; kc < 5; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_twopass_fulltile) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_twopass_fulltile_with_qmin) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_twopass_fulltile_with_qmax) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_twopass_subtile) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_fulltile) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 2; kc < 5; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_fulltile_with_qmin) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 2; kc < 5; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_fulltile_with_qmax) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 2; kc < 5; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_fulltile_with_x_stride) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 2; kc < 5; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_subtile) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 2; kc < 5; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_multipass) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_multipass_with_qmin) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- tester
- .kh(ks)
- .kw(1)
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_qmin) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_qmax) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_subtile) {
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_subtile_with_input_offset) {
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(3)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile_with_input_offset) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .input_offset(3)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile_with_qmin) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile_with_qmax) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_subtile) {
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_subtile_with_input_offset) {
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .input_offset(3)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile_with_input_offset) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(3)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile_with_qmin) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_eq_1_multipass_with_qmax) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- tester
- .kh(ks)
- .kw(1)
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile_with_qmax) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_subtile) {
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // loop-driven so every subtile size in 10..16 is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_subtile_with_input_offset) {
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // loop-driven so every subtile size in 10..16 is exercised
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(3)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass) {
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // loop-driven so every swept size (18, 21, ..., 33) is exercised
+ .pooling_tile(9, 8)
+ .channels(1)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass_with_input_offset) {
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // loop-driven so every swept size (18, 21, ..., 33) is exercised
+ .pooling_tile(9, 8)
+ .channels(1)
+ .input_offset(3)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass_with_qmin) {
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // loop-driven so every swept size (18, 21, ..., 33) is exercised
+ .pooling_tile(9, 8)
+ .channels(1)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass_with_qmax) {
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // loop-driven so every swept size (18, 21, ..., 33) is exercised
+ .pooling_tile(9, 8)
+ .channels(1)
.qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_multipass) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 2; kc < 5; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass) {
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // sweep 18..33, i.e. sizes beyond the 9 + 8 tile
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_multipass_with_qmin) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 2; kc < 5; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass_with_input_offset) {
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // sweep 18..33, i.e. sizes beyond the 9 + 8 tile
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(3)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass_with_qmin) {
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // sweep 18..33, i.e. sizes beyond the 9 + 8 tile
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmin(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_multipass_with_qmax) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 2; kc < 5; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass_with_qmax) {
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // sweep 18..33, i.e. sizes beyond the 9 + 8 tile
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmax(192)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
-TEST(SMAXPOOL_9P8Q__SCALAR, kc_gt_1_multipass_with_x_stride) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 2; kc < 5; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
-}
-
-TEST(SMAXPOOL_9P8Q__SCALAR, small_n) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
- for (size_t kc = 1; kc < 5; kc++) {
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .iterations(3)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
-TEST(SMAXPOOL_9P8Q__SCALAR, small_n_with_x_stride) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
- for (size_t kc = 1; kc < 5; kc++) {
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_input_offset) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .x_stride(101)
- .iterations(1)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(7)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
-TEST(SMAXPOOL_9P8Q__SCALAR, small_n_with_y_stride) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
- for (size_t kc = 1; kc < 5; kc++) {
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_qmin) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .y_stride(103)
- .iterations(1)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
-TEST(SMAXPOOL_9P8Q__SCALAR, small_n_with_s) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5}}) {
- for (size_t kc = 1; kc < 5; kc++) {
- for (size_t s = 2; s <= ks; s++) {
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_qmax) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ MaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_output_stride) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ MaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .output_stride(7)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+}
+
+TEST(F32_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_step) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ for (size_t step = 2; step <= pooling_elements; step++) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .s(s)
- .iterations(1)
- .Test(xnn_f32_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .step(step)
+ .channels(channels)
+ .output_stride(7)
+ .Test(xnn_f32_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
-}
+}
\ No newline at end of file
diff --git a/test/f32-maxpool.yaml b/test/f32-maxpool.yaml
new file mode 100644
index 0000000..e6db8b5
--- /dev/null
+++ b/test/f32-maxpool.yaml
@@ -0,0 +1,7 @@
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+- name: xnn_f32_maxpool_ukernel_9p8x__sse_c4
+- name: xnn_f32_maxpool_ukernel_9p8x__psimd_c4
+- name: xnn_f32_maxpool_ukernel_9p8x__scalar_c1
diff --git a/test/maxpool-microkernel-tester.h b/test/maxpool-microkernel-tester.h
index 94132ed..68d57d5 100644
--- a/test/maxpool-microkernel-tester.h
+++ b/test/maxpool-microkernel-tester.h
@@ -30,115 +30,103 @@
Scalar,
};
- inline MaxPoolMicrokernelTester& n(size_t n) {
- assert(n != 0);
- this->n_ = n;
+ inline MaxPoolMicrokernelTester& output_pixels(size_t output_pixels) {
+ assert(output_pixels != 0);
+ this->output_pixels_ = output_pixels;
return *this;
}
- inline size_t n() const {
- return this->n_;
+ inline size_t output_pixels() const {
+ return this->output_pixels_;
}
- inline MaxPoolMicrokernelTester& s(size_t s) {
- assert(s != 0);
- this->s_ = s;
+ inline MaxPoolMicrokernelTester& step(size_t step) {
+ assert(step != 0);
+ this->step_ = step;
return *this;
}
- inline size_t s() const {
- return this->s_;
+ inline size_t step() const {
+ return this->step_;
}
- inline MaxPoolMicrokernelTester& kh(size_t kh) {
- assert(kh != 0);
- this->kh_ = kh;
+ inline MaxPoolMicrokernelTester& input_offset(size_t input_offset) {
+ assert(input_offset != 0);
+ this->input_offset_ = input_offset;
return *this;
}
- inline size_t kh() const {
- return this->kh_;
+ inline size_t input_offset() const {
+ return this->input_offset_;
}
- inline MaxPoolMicrokernelTester& kw(size_t kw) {
- assert(kw != 0);
- this->kw_ = kw;
+ inline MaxPoolMicrokernelTester& pooling_elements(size_t pooling_elements) {
+ assert(pooling_elements != 0);
+ this->pooling_elements_ = pooling_elements;
return *this;
}
- inline size_t kw() const {
- return this->kw_;
+ inline size_t pooling_elements() const {
+ return this->pooling_elements_;
}
- inline size_t ks() const {
- return kh() * kw();
- }
-
- inline size_t packed_ks() const {
- if (ks() <= mr()) {
- return mr();
+ inline size_t packed_pooling_elements() const {
+ if (pooling_elements() <= primary_pooling_tile()) {
+ return primary_pooling_tile();
} else {
- return (ks() - mr()) % qr() == 0 ? ks() : ((ks() - mr()) / qr() + 1) * qr() + mr();
+ return (pooling_elements() - primary_pooling_tile()) % incremental_pooling_tile() == 0 ? pooling_elements() : ((pooling_elements() - primary_pooling_tile()) / incremental_pooling_tile() + 1) * incremental_pooling_tile() + primary_pooling_tile();
}
}
- inline MaxPoolMicrokernelTester& mr(size_t mr) {
- assert(mr != 0);
- this->mr_ = mr;
+ inline MaxPoolMicrokernelTester& pooling_tile(size_t primary_tile, size_t incremental_tile) {
+ assert(primary_tile != 0 && incremental_tile != 0);  // incremental tile is a divisor in packed_pooling_elements()
+ this->primary_pooling_tile_ = primary_tile;
+ this->incremental_pooling_tile_ = incremental_tile;
return *this;
}
- inline size_t mr() const {
- return this->mr_;
- }
-
- inline MaxPoolMicrokernelTester& qr(size_t qr) {
- assert(qr != 0);
- this->qr_ = qr;
+ inline MaxPoolMicrokernelTester& primary_pooling_tile(size_t primary_pooling_tile) {
+ assert(primary_pooling_tile != 0);
+ this->primary_pooling_tile_ = primary_pooling_tile;
return *this;
}
- inline size_t qr() const {
- return this->qr_;
+ inline size_t primary_pooling_tile() const {
+ return this->primary_pooling_tile_;
}
- inline MaxPoolMicrokernelTester& kc(size_t kc) {
- assert(kc != 0);
- this->kc_ = kc;
+ inline MaxPoolMicrokernelTester& incremental_pooling_tile(size_t incremental_pooling_tile) {
+ assert(incremental_pooling_tile != 0);
+ this->incremental_pooling_tile_ = incremental_pooling_tile;
return *this;
}
- inline size_t kc() const {
- return this->kc_;
+ inline size_t incremental_pooling_tile() const {
+ return this->incremental_pooling_tile_;
}
- inline MaxPoolMicrokernelTester& x_stride(size_t x_stride) {
- assert(x_stride != 0);
- this->x_stride_ = x_stride;
+ inline MaxPoolMicrokernelTester& channels(size_t channels) {
+ assert(channels != 0);
+ this->channels_ = channels;
return *this;
}
- inline size_t x_stride() const {
- if (this->x_stride_ == 0) {
- return kc();
+ inline size_t channels() const {
+ return this->channels_;
+ }
+
+ inline MaxPoolMicrokernelTester& output_stride(size_t output_stride) {
+ assert(output_stride != 0);
+ this->output_stride_ = output_stride;
+ return *this;
+ }
+
+ inline size_t output_stride() const {
+ if (this->output_stride_ == 0) {
+ return channels();
} else {
- assert(this->x_stride_ >= kc());
- return this->x_stride_;
- }
- }
-
- inline MaxPoolMicrokernelTester& y_stride(size_t y_stride) {
- assert(y_stride != 0);
- this->y_stride_ = y_stride;
- return *this;
- }
-
- inline size_t y_stride() const {
- if (this->y_stride_ == 0) {
- return kc();
- } else {
- assert(this->y_stride_ >= kc());
- return this->y_stride_;
+ assert(this->output_stride_ >= channels());
+ return this->output_stride_;
}
}
@@ -174,19 +162,23 @@
auto rng = std::mt19937(random_device());
auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng);
- std::vector<const uint8_t*> indirect_x(packed_ks() + (n() * s() - 1) * kh());
- std::vector<uint8_t> x((indirect_x.size() - 1) * x_stride() + kc() + XNN_EXTRA_BYTES / sizeof(uint8_t));
-
- std::vector<uint8_t> y((n() - 1) * y_stride() + kc() + XNN_EXTRA_BYTES / sizeof(uint8_t));
- std::vector<uint8_t> y_ref(n() * kc());
+ std::vector<const uint8_t*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
+ std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
+ indirect_input.size() * channels());
+ std::vector<uint8_t> output(XNN_EXTRA_BYTES / sizeof(uint8_t) +
+ (output_pixels() - 1) * output_stride() + channels());
+ std::vector<uint8_t> output_ref(output_pixels() * channels());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
- std::generate(x.begin(), x.end(), std::ref(u8rng));
- std::fill(y.begin(), y.end(), 0xA5);
+ do {
+ std::generate(input.begin(), input.end(), std::ref(u8rng));
+ } while (input.size() > 1 && *std::max_element(input.cbegin(), input.cend()) == *std::min_element(input.cbegin(), input.cend()));
+ std::fill(output.begin(), output.end(), 0xA5);
- for (size_t i = 0; i < indirect_x.size(); i++) {
- indirect_x[i] = x.data() + i * x_stride();
+ for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
+ indirect_input[i] = input.data() + i * channels() - input_offset();
}
- std::shuffle(indirect_x.begin(), indirect_x.end(), rng);
+ std::shuffle(indirect_input.begin(),
+ indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
// Prepare output parameters.
xnn_u8_output_params output_params = { };
@@ -200,32 +192,40 @@
}
// Compute reference results.
- for (size_t i = 0; i < n(); i++) {
- for (size_t k = 0; k < kc(); k++) {
+ for (size_t x = 0; x < output_pixels(); x++) {
+ for (size_t c = 0; c < channels(); c++) {
uint8_t max_value = 0;
- for (size_t j = 0; j < ks(); j++) {
- max_value = std::max(max_value,
- indirect_x[i * s() * kh() + j][k]);
+ for (size_t p = 0; p < pooling_elements(); p++) {
+ max_value = std::max(max_value, indirect_input[x * step() + p][c + input_offset()]);
}
max_value = std::min(max_value, qmax());
max_value = std::max(max_value, qmin());
- y_ref[i * kc() + k] = max_value;
+ output_ref[x * channels() + c] = max_value;
}
}
// Call optimized micro-kernel.
- maxpool(n(), ks(), kc(),
- indirect_x.data(), y.data(),
- (kh() * s() - packed_ks()) * sizeof(void*),
- (y_stride() - kc()) * sizeof(uint8_t),
+ maxpool(output_pixels(), pooling_elements(), channels(),
+ indirect_input.data(), input_offset() * sizeof(uint8_t), output.data(),
+ (step() - packed_pooling_elements()) * sizeof(void*),
+ (output_stride() - channels()) * sizeof(uint8_t),
&output_params);
// Verify results.
- for (size_t i = 0; i < n(); i++) {
- for (size_t k = 0; k < kc(); k++) {
- ASSERT_EQ(uint32_t(y_ref[i * kc() + k]), uint32_t(y[i * y_stride() + k]))
- << "at pixel " << i << ", channel " << k << ", n = " << n()
- << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
+ for (size_t x = 0; x < output_pixels(); x++) {
+ for (size_t c = 0; c < channels(); c++) {
+ ASSERT_GE(uint32_t(output[x * output_stride() + c]), uint32_t(qmin()))
+ << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+ << ", pooling elements = " << pooling_elements() << ", step = " << step()
+ << ", input offset = " << input_offset();
+ ASSERT_LE(uint32_t(output[x * output_stride() + c]), uint32_t(qmax()))
+ << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+ << ", pooling elements = " << pooling_elements() << ", step = " << step()
+ << ", input offset = " << input_offset();
+ ASSERT_EQ(uint32_t(output_ref[x * channels() + c]), uint32_t(output[x * output_stride() + c]))
+ << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+ << ", pooling elements = " << pooling_elements() << ", step = " << step()
+ << ", input offset = " << input_offset();
}
}
}
@@ -236,87 +236,94 @@
auto rng = std::mt19937(random_device());
auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
- std::vector<const float*> indirect_x(packed_ks() + (n() * s() - 1) * kh());
- std::vector<float> x((indirect_x.size() - 1) * x_stride() + kc() + XNN_EXTRA_BYTES / sizeof(float));
-
- std::vector<float> y((n() - 1) * y_stride() + kc() + XNN_EXTRA_BYTES / sizeof(float));
- std::vector<float> y_ref(n() * kc());
+ std::vector<const float*> indirect_input((output_pixels() - 1) * step() + packed_pooling_elements());
+ std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
+ ((output_pixels() - 1) * step() + pooling_elements()) * channels());
+ std::vector<float> output(XNN_EXTRA_BYTES / sizeof(float) +
+ (output_pixels() - 1) * output_stride() + channels());
+ std::vector<float> output_ref(output_pixels() * channels());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
- std::generate(x.begin(), x.end(), std::ref(f32rng));
- std::fill(y.begin(), y.end(), nanf(""));
+ std::generate(input.begin(), input.end(), std::ref(f32rng));
+ std::fill(output.begin(), output.end(), nanf(""));
- for (size_t i = 0; i < indirect_x.size(); i++) {
- indirect_x[i] = x.data() + i * x_stride();
+ for (size_t i = 0; i < (output_pixels() - 1) * step() + pooling_elements(); i++) {
+ indirect_input[i] = input.data() + i * channels() - input_offset();
}
- std::shuffle(indirect_x.begin(), indirect_x.end(), rng);
+ std::shuffle(indirect_input.begin(),
+ indirect_input.begin() + (output_pixels() - 1) * step() + pooling_elements(), rng);
// Compute reference results, without clamping.
- for (size_t i = 0; i < n(); i++) {
- for (size_t k = 0; k < kc(); k++) {
+ for (size_t x = 0; x < output_pixels(); x++) {
+ for (size_t c = 0; c < channels(); c++) {
float max_value = -std::numeric_limits<float>::infinity();
- for (size_t j = 0; j < ks(); j++) {
- max_value = std::max(max_value,
- indirect_x[i * s() * kh() + j][k]);
+ for (size_t p = 0; p < pooling_elements(); p++) {
+ max_value = std::max(max_value, indirect_input[x * step() + p][c + input_offset()]);
}
- y_ref[i * kc() + k] = max_value;
+ output_ref[x * channels() + c] = max_value;
}
}
// Compute clamping parameters.
- const float accumulated_min = *std::min_element(y_ref.cbegin(), y_ref.cend());
- const float accumulated_max = *std::max_element(y_ref.cbegin(), y_ref.cend());
+ const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
+ const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
const float accumulated_range = accumulated_max - accumulated_min;
- const float y_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
- const float y_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
+ const float output_min = accumulated_min + float(qmin()) / 255.0f * accumulated_range;
+ const float output_max = accumulated_max - float(255 - qmax()) / 255.0f * accumulated_range;
// Prepare output parameters.
xnn_f32_output_params output_params = { };
switch (variant) {
case Variant::Native:
- output_params = xnn_init_f32_output_params(y_min, y_max);
+ output_params = xnn_init_f32_output_params(output_min, output_max);
break;
case Variant::Scalar:
- output_params = xnn_init_scalar_f32_output_params(y_min, y_max);
+ output_params = xnn_init_scalar_f32_output_params(output_min, output_max);
break;
}
// Clamp reference results.
- for (size_t i = 0; i < n(); i++) {
- for (size_t k = 0; k < kc(); k++) {
- y_ref[i * kc() + k] = std::max(std::min(y_ref[i * kc() + k], y_max), y_min);
- }
+ for (float& output_value : output_ref) {
+ output_value = std::max(std::min(output_value, output_max), output_min);
}
// Call optimized micro-kernel.
- maxpool(n(), ks(), kc(),
- indirect_x.data(), y.data(),
- (kh() * s() - packed_ks()) * sizeof(void*),
- (y_stride() - kc()) * sizeof(float),
+ maxpool(output_pixels(), pooling_elements(), channels(),
+ indirect_input.data(), input_offset() * sizeof(float), output.data(),
+ (step() - packed_pooling_elements()) * sizeof(void*),
+ (output_stride() - channels()) * sizeof(float),
&output_params);
// Verify results.
- for (size_t i = 0; i < n(); i++) {
- for (size_t k = 0; k < kc(); k++) {
- ASSERT_EQ(y_ref[i * kc() + k], y[i * y_stride() + k])
- << "at pixel " << i << ", channel " << k << ", n = " << n()
- << ", ks = " << kh() << "x" << kw() << " (" << ks() << "), kc = " << kc();
+ for (size_t x = 0; x < output_pixels(); x++) {
+ for (size_t c = 0; c < channels(); c++) {
+ ASSERT_GE(output[x * output_stride() + c], output_min)
+ << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+ << ", pooling elements = " << pooling_elements() << ", step = " << step()
+ << ", input offset = " << input_offset();
+ ASSERT_LE(output[x * output_stride() + c], output_max)
+ << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+ << ", pooling elements = " << pooling_elements() << ", step = " << step()
+ << ", input offset = " << input_offset();
+ ASSERT_EQ(output_ref[x * channels() + c], output[x * output_stride() + c])
+ << "at pixel " << x << " / " << output_pixels() << ", channel " << c << " / " << channels()
+ << ", pooling elements = " << pooling_elements() << ", step = " << step()
+ << ", input offset = " << input_offset();
}
}
}
}
private:
- size_t n_{1};
- size_t s_{1};
- size_t kh_{1};
- size_t kw_{1};
- size_t mr_{1};
- size_t qr_{1};
- size_t kc_{1};
- size_t x_stride_{0};
- size_t y_stride_{0};
+ size_t output_pixels_{1};
+ size_t pooling_elements_{1};
+ size_t channels_{1};
+ size_t input_offset_{0};
+ size_t step_{1};
+ size_t primary_pooling_tile_{1};
+ size_t incremental_pooling_tile_{1};
+ size_t output_stride_{0};
uint8_t qmin_{0};
uint8_t qmax_{255};
- size_t iterations_{15};
+ size_t iterations_{3};
};
diff --git a/test/u8-maxpool.cc b/test/u8-maxpool.cc
index b5832dc..a089944 100644
--- a/test/u8-maxpool.cc
+++ b/test/u8-maxpool.cc
@@ -5,6 +5,11 @@
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Specification: test/u8-maxpool.yaml
+// Generator: tools/generate-maxpool-test.py
+
#include <gtest/gtest.h>
@@ -16,1202 +21,884 @@
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_unipass_fulltile) {
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_unipass_fulltile) {
TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .input_offset(19)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_unipass_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
}
}
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_unipass_fulltile_with_qmin) {
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_unipass_subtile_with_input_offset) {
TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .input_offset(19)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
}
}
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_unipass_fulltile_with_qmax) {
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_unipass_fulltile) {
TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
}
}
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_unipass_subtile) {
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_unipass_fulltile_with_input_offset) {
TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(131)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
}
}
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_unipass_fulltile) {
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_unipass_fulltile_with_qmin) {
TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_unipass_fulltile_with_qmin) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_unipass_fulltile_with_qmax) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_unipass_fulltile_with_x_stride) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_unipass_subtile) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_unipass_fulltile) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_unipass_fulltile_with_qmin) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_unipass_fulltile_with_qmax) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_unipass_fulltile_with_x_stride) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_unipass_subtile) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_unipass_fulltile) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_unipass_fulltile_with_qmin) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_unipass_fulltile_with_qmax) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_unipass_fulltile_with_x_stride) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_unipass_subtile) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_twopass_fulltile) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_twopass_fulltile_with_qmin) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_twopass_fulltile_with_qmax) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_twopass_subtile) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_twopass_fulltile) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_twopass_fulltile_with_qmin) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_twopass_fulltile_with_qmax) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_twopass_fulltile_with_x_stride) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_twopass_subtile) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_twopass_fulltile) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_twopass_fulltile_with_qmin) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_twopass_fulltile_with_qmax) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_twopass_fulltile_with_x_stride) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_twopass_subtile) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_twopass_fulltile) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_twopass_fulltile_with_qmin) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_twopass_fulltile_with_qmax) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_twopass_fulltile_with_x_stride) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_twopass_subtile) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_multipass) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_multipass_with_qmin) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- tester
- .kh(ks)
- .kw(1)
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_unipass_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(131)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_unipass_fulltile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(17)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
}
}
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_eq_16_multipass_with_qmax) {
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_unipass_fulltile_with_qmax) {
TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- tester
- .kh(ks)
- .kw(1)
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_unipass_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(17)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_unipass_fulltile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
}
}
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_multipass) {
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_unipass_subtile) {
TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
}
}
}
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_multipass_with_qmin) {
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_unipass_subtile_with_input_offset) {
TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_twopass_fulltile) {
+ TEST_REQUIRES_ARM_NEON;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_twopass_fulltile_with_input_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .input_offset(19)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_twopass_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_twopass_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_twopass_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_twopass_subtile_with_input_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .input_offset(19)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_twopass_fulltile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_twopass_fulltile_with_input_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(83)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_twopass_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_twopass_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_twopass_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_twopass_subtile_with_input_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(131)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_twopass_fulltile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_twopass_fulltile_with_input_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(17)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_twopass_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_twopass_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_twopass_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_twopass_subtile_with_input_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(17)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_twopass_fulltile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_twopass_fulltile_with_input_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_twopass_fulltile_with_qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_twopass_fulltile_with_qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_twopass_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_twopass_subtile_with_input_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_multipass) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_multipass_with_input_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .input_offset(19)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_multipass_with_qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_eq_16_multipass_with_qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_multipass) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_multipass_with_input_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(131)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_multipass_with_qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_div_16_multipass_with_qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_multipass) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_multipass_with_input_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(16)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_multipass_with_qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
}
}
}
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_multipass_with_qmax) {
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_lt_16_multipass_with_qmax) {
TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
}
}
}
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_div_16_multipass_with_x_stride) {
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_multipass) {
TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // was hard-coded to 17, which left the loop variable unused
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
}
}
}
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_multipass) {
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_multipass_with_input_offset) {
TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // was hard-coded to 17, which left the loop variable unused
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
}
}
}
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_multipass_with_qmin) {
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_multipass_with_qmin) {
TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // was hard-coded to 17, which left the loop variable unused
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
}
}
}
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_multipass_with_qmax) {
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, channels_gt_16_multipass_with_qmax) {
TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // was hard-coded to 17, which left the loop variable unused
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
}
}
}
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_lt_16_multipass_with_x_stride) {
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, few_output_pixels) {
TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_multipass) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_multipass_with_qmin) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_multipass_with_qmax) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, kc_gt_16_multipass_with_x_stride) {
- TEST_REQUIRES_ARM_NEON;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__NEON, small_n) {
- TEST_REQUIRES_ARM_NEON;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
- for (size_t kc = 1; kc < 51; kc += 5) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 80; channels += 15) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .iterations(3)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
}
}
}
}
- TEST(U8_MAXPOOL_9P8Q__NEON, small_n_with_x_stride) {
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, few_output_pixels_with_input_offset) {
TEST_REQUIRES_ARM_NEON;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
- for (size_t kc = 1; kc < 51; kc += 5) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 80; channels += 15) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .x_stride(101)
- .iterations(1)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(83)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
}
}
}
}
- TEST(U8_MAXPOOL_9P8Q__NEON, small_n_with_y_stride) {
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, few_output_pixels_with_qmin) {
TEST_REQUIRES_ARM_NEON;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
- for (size_t kc = 1; kc < 51; kc += 5) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 80; channels += 15) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .y_stride(103)
- .iterations(1)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
}
}
}
}
- TEST(U8_MAXPOOL_9P8Q__NEON, small_n_with_s) {
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, few_output_pixels_with_qmax) {
TEST_REQUIRES_ARM_NEON;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5}}) {
- for (size_t kc = 1; kc < 51; kc += 5) {
- for (size_t s = 2; s <= ks; s++) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ MaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, few_output_pixels_with_output_stride) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ MaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .output_stride(83)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
+ }
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__NEON_C16, few_output_pixels_with_step) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ for (size_t step = 2; step <= pooling_elements; step++) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .s(s)
- .iterations(1)
- .Test(xnn_u8_maxpool_ukernel_9p8q__neon);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .step(step)
+ .channels(channels)
+ .output_stride(83)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__neon_c16);
}
}
}
@@ -1219,1203 +906,886 @@
}
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_unipass_fulltile) {
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_unipass_fulltile) {
TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .input_offset(19)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_unipass_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
}
}
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_unipass_fulltile_with_qmin) {
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_unipass_subtile_with_input_offset) {
TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .input_offset(19)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
}
}
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_unipass_fulltile_with_qmax) {
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_unipass_fulltile) {
TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
}
}
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_unipass_subtile) {
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_unipass_fulltile_with_input_offset) {
TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(131)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
}
}
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_unipass_fulltile) {
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_unipass_fulltile_with_qmin) {
TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_unipass_fulltile_with_qmin) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_unipass_fulltile_with_qmax) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_unipass_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_unipass_subtile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_unipass_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_unipass_fulltile_with_qmin) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_unipass_fulltile_with_qmax) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_unipass_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_unipass_subtile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_unipass_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_unipass_fulltile_with_qmin) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_unipass_fulltile_with_qmax) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_unipass_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_unipass_subtile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_twopass_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_twopass_fulltile_with_qmin) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_twopass_fulltile_with_qmax) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_twopass_subtile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_twopass_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_twopass_fulltile_with_qmin) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_twopass_fulltile_with_qmax) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_twopass_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_twopass_subtile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_twopass_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_twopass_fulltile_with_qmin) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_twopass_fulltile_with_qmax) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_twopass_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_twopass_subtile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_twopass_fulltile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_twopass_fulltile_with_qmin) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_twopass_fulltile_with_qmax) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_twopass_fulltile_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_twopass_subtile) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_multipass) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_multipass_with_qmin) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- tester
- .kh(ks)
- .kw(1)
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_unipass_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(131)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_unipass_fulltile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(17)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
}
}
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_eq_16_multipass_with_qmax) {
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_unipass_fulltile_with_qmax) {
TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(16);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- tester
- .kh(ks)
- .kw(1)
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_unipass_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_unipass_subtile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(17)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_unipass_fulltile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_unipass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_unipass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_unipass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
}
}
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_multipass) {
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_unipass_subtile) {
TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
}
}
}
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_multipass_with_qmin) {
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_unipass_subtile_with_input_offset) {
TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_twopass_fulltile) {
+ TEST_REQUIRES_X86_SSE2;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_twopass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .input_offset(19)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_twopass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_twopass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_twopass_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_twopass_subtile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(16)
+ .input_offset(19)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_twopass_fulltile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_twopass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(83)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_twopass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_twopass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_twopass_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_twopass_subtile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(131)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_twopass_fulltile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_twopass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(17)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_twopass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_twopass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_twopass_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_twopass_subtile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(17)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_twopass_fulltile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_twopass_fulltile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_twopass_fulltile_with_qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_twopass_fulltile_with_qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_twopass_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_twopass_subtile_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_multipass) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(16)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_multipass_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(16)
+ .input_offset(19)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_multipass_with_qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(16)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_eq_16_multipass_with_qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(16)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_multipass) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_multipass_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(131)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_multipass_with_qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_div_16_multipass_with_qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 32; channels < 128; channels += 16) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_multipass) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_multipass_with_input_offset) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(16)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_multipass_with_qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
}
}
}
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_multipass_with_qmax) {
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_lt_16_multipass_with_qmax) {
TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 1; channels < 16; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
}
}
}
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_div_16_multipass_with_x_stride) {
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_multipass) {
TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 16; kc < 256; kc += 48) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
}
}
}
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_multipass) {
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_multipass_with_input_offset) {
TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(37)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
}
}
}
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_multipass_with_qmin) {
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_multipass_with_qmin) {
TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
}
}
}
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_multipass_with_qmax) {
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, channels_gt_16_multipass_with_qmax) {
TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+ for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+ for (size_t channels = 17; channels < 32; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)  // pooling size driven by the enclosing loop
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
}
}
}
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_lt_16_multipass_with_x_stride) {
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, few_output_pixels) {
TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 1; kc < 16; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_multipass) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_multipass_with_qmin) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_multipass_with_qmax) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, kc_gt_16_multipass_with_x_stride) {
- TEST_REQUIRES_X86_SSE2;
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 17; kc < 32; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
- }
- }
- }
-
- TEST(U8_MAXPOOL_9P8Q__SSE2, small_n) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
- for (size_t kc = 1; kc < 51; kc += 5) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 80; channels += 15) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .iterations(3)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
}
}
}
}
- TEST(U8_MAXPOOL_9P8Q__SSE2, small_n_with_x_stride) {
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, few_output_pixels_with_input_offset) {
TEST_REQUIRES_X86_SSE2;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
- for (size_t kc = 1; kc < 51; kc += 5) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 80; channels += 15) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .x_stride(101)
- .iterations(1)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(83)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
}
}
}
}
- TEST(U8_MAXPOOL_9P8Q__SSE2, small_n_with_y_stride) {
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, few_output_pixels_with_qmin) {
TEST_REQUIRES_X86_SSE2;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
- for (size_t kc = 1; kc < 51; kc += 5) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 80; channels += 15) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .y_stride(103)
- .iterations(1)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
}
}
}
}
- TEST(U8_MAXPOOL_9P8Q__SSE2, small_n_with_s) {
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, few_output_pixels_with_qmax) {
TEST_REQUIRES_X86_SSE2;
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5}}) {
- for (size_t kc = 1; kc < 51; kc += 5) {
- for (size_t s = 2; s <= ks; s++) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ MaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, few_output_pixels_with_output_stride) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ MaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .output_stride(83)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
+ }
+ }
+ }
+ }
+
+ TEST(U8_MAXPOOL_9P8X__SSE2_C16, few_output_pixels_with_step) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 80; channels += 15) {
+ for (size_t step = 2; step <= pooling_elements; step++) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .s(s)
- .iterations(1)
- .Test(xnn_u8_maxpool_ukernel_9p8q__sse2);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .step(step)
+ .channels(channels)
+ .output_stride(83)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__sse2_c16);
}
}
}
@@ -2423,566 +1793,443 @@
}
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_unipass_fulltile) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_fulltile) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_input_offset) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .input_offset(3)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_qmin) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_fulltile_with_qmax) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_subtile) {
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_unipass_fulltile_with_qmin) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_unipass_subtile_with_input_offset) {
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .input_offset(3)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_unipass_fulltile_with_qmax) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_fulltile) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_unipass_subtile) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_input_offset) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(3)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_fulltile) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 2; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_fulltile_with_qmin) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 2; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_fulltile_with_qmax) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 2; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_fulltile_with_x_stride) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr(); kw++) {
- if (kh * kw == tester.mr()) {
- for (size_t kc = 2; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_unipass_subtile) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = 2; ks < tester.mr(); ks++) {
- for (size_t kc = 2; kc < 8; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_twopass_fulltile) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- tester
- .kh(kh)
- .kw(kw)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_twopass_fulltile_with_qmin) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_twopass_fulltile_with_qmax) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- tester
- .kh(kh)
- .kw(kw)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_twopass_subtile) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_fulltile) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 2; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_fulltile_with_qmin) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 2; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_fulltile_with_qmax) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 2; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_fulltile_with_x_stride) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t kh = 1; kh <= tester.mr() + tester.qr(); kh++) {
- for (size_t kw = 1; kw <= tester.mr() + tester.qr(); kw++) {
- if (kh * kw == tester.mr() + tester.qr()) {
- for (size_t kc = 2; kc < 8; kc++) {
- tester
- .kh(kh)
- .kw(kw)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
- }
- }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_twopass_subtile) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + 1; ks < tester.mr() + tester.qr(); ks++) {
- for (size_t kc = 2; kc < 8; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_multipass) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- tester
- .kh(ks)
- .kw(1)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_multipass_with_qmin) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- tester
- .kh(ks)
- .kw(1)
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_qmin) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_fulltile_with_qmax) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(9)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_subtile) {
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_unipass_subtile_with_input_offset) {
+ for (size_t pooling_elements = 2; pooling_elements < 9; pooling_elements++) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(3)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile_with_input_offset) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .input_offset(3)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile_with_qmin) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_fulltile_with_qmax) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_subtile) {
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_twopass_subtile_with_input_offset) {
+ for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(1)
+ .input_offset(3)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile_with_input_offset) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(3)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile_with_qmin) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmin(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_eq_1_multipass_with_qmax) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .kc(1);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- tester
- .kh(ks)
- .kw(1)
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_fulltile_with_qmax) {
+ for (size_t channels = 2; channels < 10; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(17)
+ .pooling_tile(9, 8)
+ .channels(channels)
.qmax(192)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_subtile) {
+  for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // use the swept size; a fixed 17 left the loop variable unused
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_twopass_subtile_with_input_offset) {
+  for (size_t pooling_elements = 10; pooling_elements < 17; pooling_elements++) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // use the swept size; a fixed 17 left the loop variable unused
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(3)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)  // use the swept size; a fixed 17 never exceeded the 9 + 8 tile
+      .pooling_tile(9, 8)
+      .channels(1)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass_with_input_offset) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)  // use the swept size; a fixed 17 never exceeded the 9 + 8 tile
+      .pooling_tile(9, 8)
+      .channels(1)
+      .input_offset(3)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass_with_qmin) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)  // use the swept size; a fixed 17 never exceeded the 9 + 8 tile
+      .pooling_tile(9, 8)
+      .channels(1)
+      .qmin(192)
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_eq_1_multipass_with_qmax) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    MaxPoolMicrokernelTester()
+      .pooling_elements(pooling_elements)  // use the swept size; a fixed 17 never exceeded the 9 + 8 tile
+      .pooling_tile(9, 8)
+      .channels(1)
       .qmax(192)
-      .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+      .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
   }
 }
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_multipass) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 2; kc < 8; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // use the swept size; a fixed 17 never exceeded the 9 + 8 tile
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
 }
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_multipass_with_qmin) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 2; kc < 8; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass_with_input_offset) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // use the swept size; a fixed 17 never exceeded the 9 + 8 tile
+        .pooling_tile(9, 8)
+        .channels(channels)
+        .input_offset(3)
+        .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass_with_qmin) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // use the swept size; a fixed 17 never exceeded the 9 + 8 tile
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmin(192)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .kc(kc)
-        .qmin(192)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+        .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
 }
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_multipass_with_qmax) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 2; kc < 8; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, channels_gt_1_multipass_with_qmax) {
+  for (size_t pooling_elements = 18; pooling_elements <= 33; pooling_elements += 3) {
+    for (size_t channels = 2; channels < 10; channels++) {
+      MaxPoolMicrokernelTester()
+        .pooling_elements(pooling_elements)  // use the swept size; a fixed 17 never exceeded the 9 + 8 tile
+        .pooling_tile(9, 8)
+        .channels(channels)
         .qmax(192)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
-      tester
-        .kh(1)
-        .kw(ks)
-        .kc(kc)
-        .qmax(192)
-        .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+        .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
     }
   }
 }
-TEST(U8_MAXPOOL_9P8Q__SCALAR, kc_gt_1_multipass_with_x_stride) {
- auto tester = MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .iterations(3);
- for (size_t ks = tester.mr() + tester.qr() + 1; ks < tester.mr() + 3 * tester.qr(); ks += 3) {
- for (size_t kc = 2; kc < 8; kc++) {
- tester
- .kh(ks)
- .kw(1)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- tester
- .kh(1)
- .kw(ks)
- .kc(kc)
- .x_stride(257)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
- }
- }
-}
-
-TEST(U8_MAXPOOL_9P8Q__SCALAR, small_n) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
- for (size_t kc = 1; kc < 16; kc += 5) {
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .iterations(3)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
-TEST(U8_MAXPOOL_9P8Q__SCALAR, small_n_with_x_stride) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
- for (size_t kc = 1; kc < 16; kc += 5) {
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_input_offset) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .x_stride(101)
- .iterations(1)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .input_offset(7)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
-TEST(U8_MAXPOOL_9P8Q__SCALAR, small_n_with_y_stride) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5, 10}}) {
- for (size_t kc = 1; kc < 16; kc += 5) {
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_qmin) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .y_stride(103)
- .iterations(1)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmin(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
-TEST(U8_MAXPOOL_9P8Q__SCALAR, small_n_with_s) {
- for (size_t n = 2; n < 5; n++) {
- for (size_t ks : std::vector<size_t>{{2, 3, 5}}) {
- for (size_t kc = 1; kc < 16; kc += 5) {
- for (size_t s = 2; s <= ks; s++) {
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_qmax) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ MaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .qmax(192)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_output_stride) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ MaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .channels(channels)
+ .output_stride(7)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+}
+
+TEST(U8_MAXPOOL_9P8X__SCALAR_C1, few_output_pixels_with_step) {
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, 9, 16}}) {
+ for (size_t channels = 1; channels <= 5; channels += 1) {
+ for (size_t step = 2; step <= pooling_elements; step++) {
MaxPoolMicrokernelTester()
- .mr(9)
- .qr(8)
- .n(n)
- .kh(ks)
- .kw(ks)
- .kc(kc)
- .s(s)
- .iterations(1)
- .Test(xnn_u8_maxpool_ukernel_9p8q__scalar, MaxPoolMicrokernelTester::Variant::Scalar);
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(9, 8)
+ .step(step)
+ .channels(channels)
+ .output_stride(7)
+ .Test(xnn_u8_maxpool_ukernel_9p8x__scalar_c1, MaxPoolMicrokernelTester::Variant::Scalar);
}
}
}
}
-}
+}
\ No newline at end of file
diff --git a/test/u8-maxpool.yaml b/test/u8-maxpool.yaml
new file mode 100644
index 0000000..d9c894b
--- /dev/null
+++ b/test/u8-maxpool.yaml
@@ -0,0 +1,7 @@
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+- name: xnn_u8_maxpool_ukernel_9p8x__neon_c16
+- name: xnn_u8_maxpool_ukernel_9p8x__sse2_c16
+- name: xnn_u8_maxpool_ukernel_9p8x__scalar_c1
diff --git a/tools/generate-argmaxpool-test.py b/tools/generate-argmaxpool-test.py
new file mode 100755
index 0000000..3160b27
--- /dev/null
+++ b/tools/generate-argmaxpool-test.py
@@ -0,0 +1,1095 @@
+#!/usr/bin/env python
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import codecs
+import math
+import os
+import re
+import sys
+import yaml
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from primes import next_prime
+import xngen
+import xnncommon
+
+
+parser = argparse.ArgumentParser(
+ description='ArgMaxPool microkernel test generator')
+parser.add_argument("-s", "--spec", metavar="FILE", required=True,
+ help="Specification (YAML) file")
+parser.add_argument("-o", "--output", metavar="FILE", required=True,
+ help='Output (C++ source) file')
+parser.set_defaults(defines=list())
+
+
+def split_ukernel_name(name):
+ match = re.match(r"^xnn_(f16|f32)_argmaxpool_ukernel_((\d+)p)?(\d+)x__(.+)_c(\d+)$", name)
+ if match is None:
+ raise ValueError("Unexpected microkernel name: " + name)
+
+ if match.group(2):
+ primary_tile = int(match.group(3))
+ incremental_tile = int(match.group(4))
+ else:
+ primary_tile = int(match.group(4))
+ incremental_tile = 0
+
+ channel_tile = int(match.group(6))
+
+ arch, isa = xnncommon.parse_target_name(target_name=match.group(5))
+ return primary_tile, incremental_tile, channel_tile, arch, isa
+
+
+ARGMAXPOOL_TEST_TEMPLATE = """\
+$if INCREMENTAL_TILE == 0:
+ TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_fulltile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(${CHANNEL_TILE})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+
+ TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_fulltile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(${CHANNEL_TILE})
+ .input_offset(${next_prime(CHANNEL_TILE+1)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+
+ TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_fulltile_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(${CHANNEL_TILE})
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+
+ TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_fulltile_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(${CHANNEL_TILE})
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+
+ TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_subtile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(${CHANNEL_TILE})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_subtile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(${CHANNEL_TILE})
+ .input_offset(${next_prime(CHANNEL_TILE+1)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ $if CHANNEL_TILE > 1:
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_fulltile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_fulltile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*8)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_fulltile_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_fulltile_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_subtile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_subtile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*8)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_fulltile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_fulltile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_fulltile_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_fulltile_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_subtile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_subtile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_fulltile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_fulltile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*2)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_fulltile_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_fulltile_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_subtile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_subtile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*2)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+$if INCREMENTAL_TILE != 0:
+ TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_fulltile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(${CHANNEL_TILE})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+
+ TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_fulltile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(${CHANNEL_TILE})
+ .input_offset(${next_prime(CHANNEL_TILE+1)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+
+ TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_fulltile_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(${CHANNEL_TILE})
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+
+ TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_fulltile_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(${CHANNEL_TILE})
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+
+ TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_subtile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(${CHANNEL_TILE})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_subtile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(${CHANNEL_TILE})
+ .input_offset(${next_prime(CHANNEL_TILE+1)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ $if CHANNEL_TILE > 1:
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_fulltile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_fulltile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*5)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_fulltile_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_fulltile_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_subtile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_subtile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*8)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_fulltile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_fulltile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_fulltile_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_fulltile_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_subtile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_subtile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_fulltile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_fulltile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*2)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_fulltile_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_fulltile_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_subtile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_subtile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*2)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_multipass) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(${CHANNEL_TILE})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_multipass_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(${CHANNEL_TILE})
+ .input_offset(${next_prime(CHANNEL_TILE+1)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_multipass_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(${CHANNEL_TILE})
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_multipass_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(${CHANNEL_TILE})
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ $if CHANNEL_TILE > 1:
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_multipass) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_multipass_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*8)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_multipass_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_multipass_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_multipass) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_multipass_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .input_offset(${CHANNEL_TILE})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_multipass_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_multipass_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_multipass) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_multipass_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*2)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_multipass_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_multipass_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ ArgMaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+$if INCREMENTAL_TILE == 0:
+ $MIN_POOLING, MAX_POOLING = 2, PRIMARY_TILE
+$else:
+ $MIN_POOLING, MAX_POOLING = PRIMARY_TILE + 1, PRIMARY_TILE + INCREMENTAL_TILE
+
+TEST(${TEST_NAME}, few_output_pixels) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = ${MIN_POOLING}; pooling_elements <= ${MAX_POOLING}; pooling_elements++) {
+ for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = ${MIN_POOLING}; pooling_elements <= ${MAX_POOLING}; pooling_elements++) {
+ for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*5+1)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = ${MIN_POOLING}; pooling_elements <= ${MAX_POOLING}; pooling_elements++) {
+ for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = ${MIN_POOLING}; pooling_elements <= ${MAX_POOLING}; pooling_elements++) {
+ for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_output_stride) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = ${MIN_POOLING}; pooling_elements <= ${MAX_POOLING}; pooling_elements++) {
+ for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .channels(channels)
+ .output_stride(${next_prime(CHANNEL_TILE*5+1)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_step) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements = ${MIN_POOLING}; pooling_elements <= ${MAX_POOLING}; pooling_elements++) {
+ for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+ for (size_t step = 2; step <= pooling_elements; step++) {
+ ArgMaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${", ".join(map(str, filter(bool, [PRIMARY_TILE, INCREMENTAL_TILE])))})
+ .step(step)
+ .channels(channels)
+ .output_stride(${next_prime(CHANNEL_TILE*5+1)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+ }
+}
+"""
+
+
+def generate_test_cases(ukernel, primary_tile, incremental_tile, channel_tile,
+                        isa):
+  """Renders the unit-test source for one ARGMAXPOOL micro-kernel.
+
+  Args:
+    ukernel: C name of the micro-kernel function under test.
+    primary_tile: Number of rows (pixels) the micro-kernel consumes per
+      iteration of its primary outer loop.
+    incremental_tile: Number of rows (pixels) the micro-kernel consumes per
+      iteration of its incremental outer loop.
+    channel_tile: Number of channels the micro-kernel consumes per iteration
+      of its inner loops.
+    isa: Instruction set the micro-kernel needs. The generated tests skip
+      execution when the host processor lacks this ISA.
+
+  Returns:
+    The test-case code produced by expanding ARGMAXPOOL_TEST_TEMPLATE.
+
+  Raises:
+    ValueError: if the name has fewer "_"-separated parts than are unpacked
+      below.
+  """
+  # Everything after the first "_" names the test suite.
+  _prefix, suite_name = ukernel.split("_", 1)
+  # Second "_"-separated token is the datatype; the rest is not needed here.
+  _prefix, datatype, _kind, _rest = ukernel.split("_", 3)
+
+  # Kernels without an ISA, and psimd kernels, get the Scalar tester variant
+  # as an extra Test() argument.
+  wants_scalar_variant = isa == "psimd" or not isa
+  test_args = [ukernel] + (
+      ["ArgMaxPoolMicrokernelTester::Variant::Scalar"]
+      if wants_scalar_variant else [])
+
+  template_vars = {
+      "TEST_NAME": suite_name.upper().replace("UKERNEL_", ""),
+      "TEST_ARGS": test_args,
+      "DATATYPE": datatype,
+      "PRIMARY_TILE": primary_tile,
+      "INCREMENTAL_TILE": incremental_tile,
+      "CHANNEL_TILE": channel_tile,
+      "ISA_CHECK": xnncommon.generate_isa_check_macro(isa),
+      "next_prime": next_prime,
+  }
+  return xngen.preprocess(ARGMAXPOOL_TEST_TEMPLATE, template_vars)
+
+
+def main(args):
+  """Generates the ArgMaxPool test source file from a YAML specification.
+
+  Args:
+    args: Command-line arguments (without the program name), parsed with the
+      module-level argparse parser; must supply the spec and output paths.
+
+  Raises:
+    ValueError: if the top-level YAML value in the spec is not a list.
+  """
+  options = parser.parse_args(args)
+
+  with codecs.open(options.spec, "r", encoding="utf-8") as spec_file:
+    kernel_specs = yaml.safe_load(spec_file)
+  if not isinstance(kernel_specs, list):
+    raise ValueError("expected a list of micro-kernels in the spec")
+
+  # File preamble: license, provenance of the generated file, and includes.
+  header = """\
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Specification: {specification}
+// Generator: {generator}
+
+
+#include <gtest/gtest.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
+#include <xnnpack/argmaxpool.h>
+#include "argmaxpool-microkernel-tester.h"
+""".format(specification=options.spec, generator=sys.argv[0])
+
+  # One chunk per micro-kernel, each preceded by two newlines; joined once at
+  # the end instead of growing a string in the loop.
+  chunks = [header]
+  for kernel_spec in kernel_specs:
+    name = kernel_spec["name"]
+    primary_tile, incremental_tile, channel_tile, arch, isa = \
+      split_ukernel_name(name)
+
+    # An explicit "arch" entry in the spec wins over the one parsed from the
+    # micro-kernel name.
+    arch = kernel_spec.get("arch", arch)
+
+    test_case = generate_test_cases(name, primary_tile, incremental_tile,
+                                    channel_tile, isa)
+    chunks.append("\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa))
+
+  with codecs.open(options.output, "w", encoding="utf-8") as output_file:
+    output_file.write("".join(chunks))
+
+
+# Script entry point: forward the command-line arguments (minus the program
+# name) to main() only when this file is executed directly, not imported.
+if __name__ == "__main__":
+ main(sys.argv[1:])
diff --git a/tools/generate-maxpool-test.py b/tools/generate-maxpool-test.py
new file mode 100755
index 0000000..83edaf2
--- /dev/null
+++ b/tools/generate-maxpool-test.py
@@ -0,0 +1,1085 @@
+#!/usr/bin/env python
+# Copyright 2019 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import codecs
+import math
+import os
+import re
+import sys
+import yaml
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from primes import next_prime
+import xngen
+import xnncommon
+
+
+# Command-line interface of the generator: both the YAML specification path
+# (-s/--spec) and the generated C++ output path (-o/--output) are required.
+parser = argparse.ArgumentParser(description='MaxPool microkernel test generator')
+parser.add_argument("-s", "--spec", metavar="FILE", required=True,
+ help="Specification (YAML) file")
+parser.add_argument("-o", "--output", metavar="FILE", required=True,
+ help='Output (C++ source) file')
+# NOTE(review): no argument declared above writes "defines"; this only seeds
+# an empty-list attribute on the parsed namespace -- confirm whether any code
+# actually reads it.
+parser.set_defaults(defines=list())
+
+
+def split_ukernel_name(name):
+  """Breaks a MaxPool micro-kernel name into its tile sizes and target.
+
+  The name must look like
+  xnn_<datatype>_maxpool_ukernel_<P>p<I>x__<target>_c<C>, where <P>, <I> and
+  <C> are decimal numbers.
+
+  Args:
+    name: C name of the micro-kernel function.
+
+  Returns:
+    A (primary_tile, incremental_tile, channel_tile, arch, isa) tuple. The
+    three tile values are ints taken from <P>, <I> and <C>; arch and isa are
+    whatever xnncommon.parse_target_name returns for <target>.
+
+  Raises:
+    ValueError: if name does not follow the expected pattern.
+  """
+  parsed = re.match(r"^xnn_(s8|u8|s16|f16|f32)_maxpool_ukernel_(\d+)p(\d+)x__(.+)_c(\d+)$", name)
+  if parsed is None:
+    raise ValueError("Unexpected microkernel name: " + name)
+
+  # Capture groups, in order: datatype, primary tile, incremental tile,
+  # target, channel tile. The datatype is not part of the result.
+  _datatype, primary, incremental, target, channels = parsed.groups()
+
+  arch, isa = xnncommon.parse_target_name(target_name=target)
+  return int(primary), int(incremental), int(channels), arch, isa
+
+
+MAXPOOL_TEST_TEMPLATE = """\
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_fulltile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(${CHANNEL_TILE})
+ .Test(${", ".join(TEST_ARGS)});
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_fulltile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(${CHANNEL_TILE})
+ .input_offset(${next_prime(CHANNEL_TILE+1)})
+ .Test(${", ".join(TEST_ARGS)});
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_fulltile_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(${CHANNEL_TILE})
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_fulltile_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(${CHANNEL_TILE})
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_subtile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(${CHANNEL_TILE})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_unipass_subtile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(${CHANNEL_TILE})
+ .input_offset(${next_prime(CHANNEL_TILE+1)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+}
+
+$if CHANNEL_TILE > 1:
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_fulltile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_fulltile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*8)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_fulltile_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_fulltile_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_subtile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_unipass_subtile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*8)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_fulltile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_fulltile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_fulltile_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_fulltile_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_subtile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_unipass_subtile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_fulltile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_fulltile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*2)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_fulltile_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_fulltile_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_subtile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_unipass_subtile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = 2; pooling_elements < ${PRIMARY_TILE}; pooling_elements++) {
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*2)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_fulltile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(${CHANNEL_TILE})
+ .Test(${", ".join(TEST_ARGS)});
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_fulltile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(${CHANNEL_TILE})
+ .input_offset(${next_prime(CHANNEL_TILE+1)})
+ .Test(${", ".join(TEST_ARGS)});
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_fulltile_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(${CHANNEL_TILE})
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_fulltile_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(${CHANNEL_TILE})
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_subtile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(${CHANNEL_TILE})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_twopass_subtile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(${CHANNEL_TILE})
+ .input_offset(${next_prime(CHANNEL_TILE+1)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+}
+
+$if CHANNEL_TILE > 1:
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_fulltile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_fulltile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*5)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_fulltile_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_fulltile_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_subtile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_twopass_subtile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*8)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_fulltile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_fulltile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_fulltile_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_fulltile_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_subtile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_twopass_subtile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_fulltile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_fulltile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*2)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_fulltile_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_fulltile_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_subtile) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_twopass_subtile_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+1}; pooling_elements < ${PRIMARY_TILE+INCREMENTAL_TILE}; pooling_elements++) {
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*2)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_multipass) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(${CHANNEL_TILE})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_multipass_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(${CHANNEL_TILE})
+ .input_offset(${next_prime(CHANNEL_TILE+1)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_multipass_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(${CHANNEL_TILE})
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+}
+
+TEST(${TEST_NAME}, channels_eq_${CHANNEL_TILE}_multipass_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(${CHANNEL_TILE})
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+}
+
+$if CHANNEL_TILE > 1:
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_multipass) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_multipass_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*8)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_multipass_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_div_${CHANNEL_TILE}_multipass_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = ${CHANNEL_TILE*2}; channels < ${CHANNEL_TILE*8}; channels += ${CHANNEL_TILE}) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_multipass) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_multipass_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .input_offset(${CHANNEL_TILE})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_multipass_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+ TEST(${TEST_NAME}, channels_lt_${CHANNEL_TILE}_multipass_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = 1; channels < ${CHANNEL_TILE}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_multipass) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_multipass_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*2)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_multipass_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+}
+
+TEST(${TEST_NAME}, channels_gt_${CHANNEL_TILE}_multipass_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t pooling_elements = ${PRIMARY_TILE+INCREMENTAL_TILE+1}; pooling_elements <= ${PRIMARY_TILE+INCREMENTAL_TILE*3}; pooling_elements += 3) {
+ for (size_t channels = ${CHANNEL_TILE+1}; channels < ${10 if CHANNEL_TILE == 1 else CHANNEL_TILE*2}; channels++) {
+ MaxPoolMicrokernelTester()
+ .pooling_elements(${PRIMARY_TILE+INCREMENTAL_TILE})
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+}
+
+TEST(${TEST_NAME}, few_output_pixels) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, ${PRIMARY_TILE}, ${PRIMARY_TILE+INCREMENTAL_TILE-1}}}) {
+ for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+ MaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_input_offset) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, ${PRIMARY_TILE}, ${PRIMARY_TILE+INCREMENTAL_TILE-1}}}) {
+ for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+ MaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .input_offset(${next_prime(CHANNEL_TILE*5+1)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_qmin) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, ${PRIMARY_TILE}, ${PRIMARY_TILE+INCREMENTAL_TILE-1}}}) {
+ for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+ MaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmin(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_qmax) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, ${PRIMARY_TILE}, ${PRIMARY_TILE+INCREMENTAL_TILE-1}}}) {
+ for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+ MaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .qmax(192)
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_output_stride) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, ${PRIMARY_TILE}, ${PRIMARY_TILE+INCREMENTAL_TILE-1}}}) {
+ for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+ MaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .channels(channels)
+ .output_stride(${next_prime(CHANNEL_TILE*5+1)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+}
+
+TEST(${TEST_NAME}, few_output_pixels_with_step) {
+ $if ISA_CHECK:
+ ${ISA_CHECK};
+ for (size_t output_pixels = 2; output_pixels <= 5; output_pixels++) {
+ for (size_t pooling_elements : std::vector<size_t>{{2, ${PRIMARY_TILE}, ${PRIMARY_TILE+INCREMENTAL_TILE-1}}}) {
+ for (size_t channels = 1; channels <= ${CHANNEL_TILE*5}; channels += ${max(1, CHANNEL_TILE-1)}) {
+ for (size_t step = 2; step <= pooling_elements; step++) {
+ MaxPoolMicrokernelTester()
+ .output_pixels(output_pixels)
+ .pooling_elements(pooling_elements)
+ .pooling_tile(${PRIMARY_TILE}, ${INCREMENTAL_TILE})
+ .step(step)
+ .channels(channels)
+ .output_stride(${next_prime(CHANNEL_TILE*5+1)})
+ .Test(${", ".join(TEST_ARGS)});
+ }
+ }
+ }
+ }
+}
+"""
+
+
+def generate_test_cases(ukernel, primary_tile, incremental_tile, channel_tile,
+                        isa):
+  """Renders the MAXPOOL test template for one micro-kernel.
+
+  Args:
+    ukernel: C name of the micro-kernel function. It must contain at least
+      three underscores; otherwise the unpacking below raises ValueError.
+    primary_tile: Number of rows (pixels) processed per one iteration of the
+      primary outer loop of the micro-kernel.
+    incremental_tile: Number of rows (pixels) processed per one iteration of
+      the incremental outer loop of the micro-kernel.
+    channel_tile: Number of channels processed per one iteration of the inner
+      loops of the micro-kernel.
+    isa: instruction set required to run the micro-kernel. Generated unit test
+      will skip execution if the host processor doesn't support this ISA.
+
+  Returns:
+    Code for the test case, as produced by xngen.preprocess.
+  """
+  # Everything after the first underscore becomes the basis of the test name.
+  _prefix, name_suffix = ukernel.split("_", 1)
+  # The second underscore-separated field is passed to the template as
+  # DATATYPE; the remaining fields are not used here.
+  _prefix, datatype, _ukernel_type, _remainder = ukernel.split("_", 3)
+
+  tester_args = [ukernel]
+  # Kernels without an ISA requirement, and psimd kernels, are tested with the
+  # tester's Scalar variant passed as an extra argument.
+  uses_scalar_variant = (not isa) or isa == "psimd"
+  if uses_scalar_variant:
+    tester_args.append("MaxPoolMicrokernelTester::Variant::Scalar")
+
+  template_vars = {
+    "TEST_NAME": name_suffix.upper().replace("UKERNEL_", ""),
+    "TEST_ARGS": tester_args,
+    "DATATYPE": datatype,
+    "PRIMARY_TILE": primary_tile,
+    "INCREMENTAL_TILE": incremental_tile,
+    "CHANNEL_TILE": channel_tile,
+    "ISA_CHECK": xnncommon.generate_isa_check_macro(isa),
+    "next_prime": next_prime,
+  }
+  return xngen.preprocess(MAXPOOL_TEST_TEMPLATE, template_vars)
+
+
+def main(args):
+  """Generates the MAXPOOL unit-test source file from a YAML specification.
+
+  Args:
+    args: Command-line arguments (without the program name), handed to the
+      module-level argument parser; the parsed options supply the `spec`
+      input path and the `output` path.
+
+  Raises:
+    ValueError: If the top level of the YAML specification is not a list.
+  """
+  options = parser.parse_args(args)
+
+  with codecs.open(options.spec, "r", encoding="utf-8") as spec_file:
+    kernel_specs = yaml.safe_load(spec_file)
+    if not isinstance(kernel_specs, list):
+      raise ValueError("expected a list of micro-kernels in the spec")
+
+    # License banner and includes that open the generated file.
+    header = """\
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Specification: {specification}
+// Generator: {generator}
+
+
+#include <gtest/gtest.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
+#include <xnnpack/maxpool.h>
+#include "maxpool-microkernel-tester.h"
+""".format(specification=options.spec, generator=sys.argv[0])
+
+    # Collect the output pieces and join them once at the end.
+    pieces = [header]
+    for entry in kernel_specs:
+      ukernel_name = entry["name"]
+      parsed_name = split_ukernel_name(ukernel_name)
+      primary_tile, incremental_tile, channel_tile, name_arch, isa = \
+        parsed_name
+
+      # specification can override architecture
+      arch = entry.get("arch", name_arch)
+
+      rendered = generate_test_cases(ukernel_name, primary_tile,
+                                     incremental_tile, channel_tile, isa)
+      pieces.append("\n\n")
+      pieces.append(xnncommon.postprocess_test_case(rendered, arch, isa))
+
+    with codecs.open(options.output, "w", encoding="utf-8") as output_file:
+      output_file.write("".join(pieces))
+
+
+# Script entry point: forward the command-line arguments (minus the program
+# name) to main() when this file is executed directly.
+if __name__ == "__main__":
+  main(sys.argv[1:])