Refactor MaxPool and ArgMaxPool micro-kernels

- Support input_offset argument in MaxPool and ArgMaxPool micro-kernels
- Use input_offset to make indirection buffer independent of batch size
- Simplify and auto-generate unit tests
- Use more descriptive names for micro-kernel parameters

PiperOrigin-RevId: 281447682
diff --git a/BUILD.bazel b/BUILD.bazel
index 463b36a..31f569d 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -74,9 +74,9 @@
 ]
 
 SCALAR_UKERNELS = [
-    "src/f32-argmaxpool/mp9p8q-scalar.c",
-    "src/f32-argmaxpool/up4-scalar.c",
-    "src/f32-argmaxpool/up9-scalar.c",
+    "src/f32-argmaxpool/9p8x-scalar-c1.c",
+    "src/f32-argmaxpool/4x-scalar-c1.c",
+    "src/f32-argmaxpool/9x-scalar-c1.c",
     "src/f32-avgpool/mp9p8q-scalar.c",
     "src/f32-avgpool/up9-scalar.c",
     "src/f32-bilinear/scalar-c1.c",
@@ -136,7 +136,7 @@
     "src/f32-gemminc/2x4-scalar.c",
     "src/f32-gemminc/4x4-scalar.c",
     "src/f32-hswish/scalar.c",
-    "src/f32-maxpool/9p8q-scalar.c",
+    "src/f32-maxpool/9p8x-scalar-c1.c",
     "src/f32-pavgpool/mp9p8q-scalar.c",
     "src/f32-pavgpool/up9-scalar.c",
     "src/f32-ppmm/2x4-scalar.c",
@@ -169,7 +169,7 @@
     "src/q8-vadd/scalar.c",
     "src/u8-clamp/scalar.c",
     "src/u8-lut32norm/scalar.c",
-    "src/u8-maxpool/9p8q-scalar.c",
+    "src/u8-maxpool/9p8x-scalar-c1.c",
     "src/u8-rmax/scalar.c",
     "src/x32-packx/x2-scalar.c",
     "src/x32-packx/x3-scalar.c",
@@ -188,9 +188,9 @@
 ]
 
 PSIMD_UKERNELS = [
-    "src/f32-argmaxpool/mp9p8q-psimd.c",
-    "src/f32-argmaxpool/up4-psimd.c",
-    "src/f32-argmaxpool/up9-psimd.c",
+    "src/f32-argmaxpool/9p8x-psimd-c4.c",
+    "src/f32-argmaxpool/4x-psimd-c4.c",
+    "src/f32-argmaxpool/9x-psimd-c4.c",
     "src/f32-avgpool/mp9p8q-psimd.c",
     "src/f32-avgpool/up9-psimd.c",
     "src/f32-bilinear/psimd-c4.c",
@@ -253,7 +253,7 @@
     "src/f32-gemminc/6x8-psimd-splat.c",
     "src/f32-gemminc/6x8s4-psimd.c",
     "src/f32-hswish/psimd.c",
-    "src/f32-maxpool/9p8q-psimd.c",
+    "src/f32-maxpool/9p8x-psimd-c4.c",
     "src/f32-pavgpool/mp9p8q-psimd.c",
     "src/f32-pavgpool/up9-psimd.c",
     "src/f32-ppmm/4x8-psimd.c",
@@ -337,7 +337,7 @@
     "src/q8-igemm/8x8-neon.c",
     "src/q8-vadd/neon.c",
     "src/u8-clamp/neon.c",
-    "src/u8-maxpool/9p8q-neon.c",
+    "src/u8-maxpool/9p8x-neon-c16.c",
     "src/u8-rmax/neon.c",
     "src/x32-packx/x4-neon-st4.c",
     "src/x32-pad/x2-neon.c",
@@ -476,7 +476,7 @@
     "src/f32-gemminc/4x8-sse-load1.c",
     "src/f32-gemminc/4x8s4-sse.c",
     "src/f32-hswish/sse.c",
-    "src/f32-maxpool/9p8q-sse.c",
+    "src/f32-maxpool/9p8x-sse-c4.c",
     "src/f32-pavgpool/mp9p8q-sse.c",
     "src/f32-pavgpool/up9-sse.c",
     "src/f32-dwconv-spchw/3x3p1-sse.c",
@@ -491,9 +491,9 @@
 ]
 
 SSE2_UKERNELS = [
-    "src/f32-argmaxpool/mp9p8q-sse2.c",
-    "src/f32-argmaxpool/up4-sse2.c",
-    "src/f32-argmaxpool/up9-sse2.c",
+    "src/f32-argmaxpool/9p8x-sse2-c4.c",
+    "src/f32-argmaxpool/4x-sse2-c4.c",
+    "src/f32-argmaxpool/9x-sse2-c4.c",
     "src/f32-prelu/sse2-2x4.c",
     "src/f32-prelu/sse2-2x8.c",
     "src/f32-sigmoid/sse2-p5-div-x8.c",
@@ -508,7 +508,7 @@
     "src/q8-gemm/4x4c2-sse2.c",
     "src/q8-vadd/sse2.c",
     "src/u8-clamp/sse2.c",
-    "src/u8-maxpool/9p8q-sse2.c",
+    "src/u8-maxpool/9p8x-sse2-c16.c",
     "src/u8-rmax/sse2.c",
     "src/x32-pad/x2-sse2.c",
     "src/x32-zip/x2-sse2.c",