Refactor CHW micro-kernels

Rename SpCHW -> CHW (global average pooling micro-kernels: SpCHW -> CW)

PiperOrigin-RevId: 311861144
diff --git a/BUILD.bazel b/BUILD.bazel
index 5598d11..11984cc 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -89,11 +89,11 @@
     "src/f32-clamp/gen/scalar-x4.c",
     "src/f32-conv-hwc/3x3s2p0p1c3x4-scalar-1x1.c",
     "src/f32-conv-hwc/3x3s2p1c3x4-scalar-1x1.c",
-    "src/f32-conv-hwc2spchw/3x3s2p1c3x4-scalar-1x1.c",
-    "src/f32-dwconv-spchw/3x3p1-scalar.c",
-    "src/f32-dwconv-spchw/3x3s2p1-scalar.c",
-    "src/f32-dwconv-spchw/5x5p2-scalar.c",
-    "src/f32-dwconv-spchw/5x5s2p2-scalar.c",
+    "src/f32-conv-hwc2chw/3x3s2p1c3x4-scalar-1x1.c",
+    "src/f32-dwconv-chw/3x3p1-scalar.c",
+    "src/f32-dwconv-chw/3x3s2p1-scalar.c",
+    "src/f32-dwconv-chw/5x5p2-scalar.c",
+    "src/f32-dwconv-chw/5x5s2p2-scalar.c",
     "src/f32-dwconv/gen/up1x4-scalar-acc2.c",
     "src/f32-dwconv/gen/up1x4-scalar.c",
     "src/f32-dwconv/gen/up1x9-scalar-acc2.c",
@@ -118,7 +118,7 @@
     "src/f32-dwconv/gen/up2x9-minmax-scalar.c",
     "src/f32-dwconv/gen/up2x25-minmax-scalar-acc2.c",
     "src/f32-dwconv/gen/up2x25-minmax-scalar.c",
-    "src/f32-gavgpool-spchw/scalar-x1.c",
+    "src/f32-gavgpool-cw/scalar-x1.c",
     "src/f32-gavgpool/7p7x-minmax-scalar-c1.c",
     "src/f32-gavgpool/7x-minmax-scalar-c1.c",
     "src/f32-gemm/gen-inc/1x4inc-minmax-scalar.c",
@@ -533,7 +533,7 @@
     "src/f32-dwconv/gen/up4x25-minmax-neon-acc2.c",
     "src/f32-dwconv/gen/up8x25-minmax-neon.c",
     "src/f32-dwconv/gen/up8x25-minmax-neon-acc2.c",
-    "src/f32-gavgpool-spchw/neon-x4.c",
+    "src/f32-gavgpool-cw/neon-x4.c",
     "src/f32-gavgpool/7p7x-minmax-neon-c4.c",
     "src/f32-gavgpool/7x-minmax-neon-c4.c",
     "src/f32-gemm/gen/1x8-minmax-neon-lane-ld64.c",
@@ -887,11 +887,11 @@
     "src/f32-conv-hwc/3x3s2p0p1c3x4-neonfma-2x2.c",
     "src/f32-conv-hwc/3x3s2p1c3x4-neonfma-2x2.c",
     "src/f32-conv-hwc/3x3s2p1c3x8-neonfma-2x2.c",
-    "src/f32-conv-hwc2spchw/3x3s2p1c3x4-neonfma-2x2.c",
-    "src/f32-dwconv-spchw/3x3p1-neonfma.c",
-    "src/f32-dwconv-spchw/5x5p2-neonfma.c",
-    "src/f32-dwconv-spchw/3x3s2p1-neonfma.c",
-    "src/f32-dwconv-spchw/5x5s2p2-neonfma.c",
+    "src/f32-conv-hwc2chw/3x3s2p1c3x4-neonfma-2x2.c",
+    "src/f32-dwconv-chw/3x3p1-neonfma.c",
+    "src/f32-dwconv-chw/5x5p2-neonfma.c",
+    "src/f32-dwconv-chw/3x3s2p1-neonfma.c",
+    "src/f32-dwconv-chw/5x5s2p2-neonfma.c",
     "src/f32-sigmoid/gen/neonfma-rr1-p5-div-x4.c",
     "src/f32-sigmoid/gen/neonfma-rr1-p5-div-x8.c",
     "src/f32-sigmoid/gen/neonfma-rr1-p5-div-x12.c",
@@ -1017,8 +1017,8 @@
     "src/f32-avgpool/9x-minmax-sse-c4.c",
     "src/f32-clamp/gen/sse-x4.c",
     "src/f32-clamp/gen/sse-x8.c",
-    "src/f32-dwconv-spchw/3x3p1-sse.c",
-    "src/f32-dwconv-spchw/3x3s2p1-sse.c",
+    "src/f32-dwconv-chw/3x3p1-sse.c",
+    "src/f32-dwconv-chw/3x3s2p1-sse.c",
     "src/f32-dwconv/gen/up4x25-minmax-sse-acc2.c",
     "src/f32-dwconv/gen/up4x25-minmax-sse.c",
     "src/f32-dwconv/gen/up4x4-minmax-sse-acc2.c",
@@ -1031,7 +1031,7 @@
     "src/f32-dwconv/gen/up8x4-minmax-sse.c",
     "src/f32-dwconv/gen/up8x9-minmax-sse-acc2.c",
     "src/f32-dwconv/gen/up8x9-minmax-sse.c",
-    "src/f32-gavgpool-spchw/sse-x4.c",
+    "src/f32-gavgpool-cw/sse-x4.c",
     "src/f32-gavgpool/7p7x-minmax-sse-c4.c",
     "src/f32-gavgpool/7x-minmax-sse-c4.c",
     "src/f32-gemm/gen/1x8-minmax-sse-dup.c",
@@ -2304,9 +2304,9 @@
 )
 
 xnnpack_benchmark(
-    name = "f32_conv_hwc2spchw_bench",
+    name = "f32_conv_hwc2chw_bench",
     srcs = [
-        "bench/f32-conv-hwc2spchw.cc",
+        "bench/f32-conv-hwc2chw.cc",
         "bench/dconv.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_BENCHMARK_HDRS,
@@ -2324,9 +2324,9 @@
 )
 
 xnnpack_benchmark(
-    name = "f32_dwconv_spchw_bench",
+    name = "f32_dwconv_chw_bench",
     srcs = [
-        "bench/f32-dwconv-spchw.cc",
+        "bench/f32-dwconv-chw.cc",
         "bench/dwconv.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_BENCHMARK_HDRS,
@@ -2820,10 +2820,10 @@
 )
 
 xnnpack_unit_test(
-    name = "f32_conv_hwc2spchw_test",
+    name = "f32_conv_hwc2chw_test",
     srcs = [
-        "test/f32-conv-hwc2spchw.cc",
-        "test/conv-hwc2spchw-microkernel-tester.h",
+        "test/f32-conv-hwc2chw.cc",
+        "test/conv-hwc2chw-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
     deps = MICROKERNEL_TEST_DEPS,
@@ -2850,10 +2850,10 @@
 )
 
 xnnpack_unit_test(
-    name = "f32_dwconv_spchw_test",
+    name = "f32_dwconv_chw_test",
     srcs = [
-        "test/f32-dwconv-spchw.cc",
-        "test/dwconv-spchw-microkernel-tester.h",
+        "test/f32-dwconv-chw.cc",
+        "test/dwconv-chw-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + WEIGHTS_PACK_HDRS + MICROKERNEL_TEST_HDRS,
     deps = MICROKERNEL_TEST_DEPS,
@@ -2870,10 +2870,10 @@
 )
 
 xnnpack_unit_test(
-    name = "f32_gavgpool_spchw_test",
+    name = "f32_gavgpool_cw_test",
     srcs = [
-        "test/f32-gavgpool-spchw.cc",
-        "test/gavgpool-spchw-microkernel-tester.h",
+        "test/f32-gavgpool-cw.cc",
+        "test/gavgpool-cw-microkernel-tester.h",
         "src/xnnpack/AlignedAllocator.h",
     ] + MICROKERNEL_TEST_HDRS,
     deps = MICROKERNEL_TEST_DEPS,
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 15088a5..4ef5181 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -209,11 +209,11 @@
   src/f32-clamp/gen/scalar-x4.c
   src/f32-conv-hwc/3x3s2p1c3x4-scalar-1x1.c
   src/f32-conv-hwc/3x3s2p0p1c3x4-scalar-1x1.c
-  src/f32-conv-hwc2spchw/3x3s2p1c3x4-scalar-1x1.c
-  src/f32-dwconv-spchw/3x3p1-scalar.c
-  src/f32-dwconv-spchw/3x3s2p1-scalar.c
-  src/f32-dwconv-spchw/5x5p2-scalar.c
-  src/f32-dwconv-spchw/5x5s2p2-scalar.c
+  src/f32-conv-hwc2chw/3x3s2p1c3x4-scalar-1x1.c
+  src/f32-dwconv-chw/3x3p1-scalar.c
+  src/f32-dwconv-chw/3x3s2p1-scalar.c
+  src/f32-dwconv-chw/5x5p2-scalar.c
+  src/f32-dwconv-chw/5x5s2p2-scalar.c
   src/f32-dwconv/gen/up1x4-scalar.c
   src/f32-dwconv/gen/up1x4-scalar-acc2.c
   src/f32-dwconv/gen/up1x9-scalar.c
@@ -238,7 +238,7 @@
   src/f32-dwconv/gen/up2x9-minmax-scalar-acc2.c
   src/f32-dwconv/gen/up2x25-minmax-scalar.c
   src/f32-dwconv/gen/up2x25-minmax-scalar-acc2.c
-  src/f32-gavgpool-spchw/scalar-x1.c
+  src/f32-gavgpool-cw/scalar-x1.c
   src/f32-gavgpool/7p7x-minmax-scalar-c1.c
   src/f32-gavgpool/7x-minmax-scalar-c1.c
   src/f32-gemm/gen/1x4-scalar.c
@@ -545,7 +545,7 @@
   src/f32-dwconv/gen/up4x25-minmax-neon-acc2.c
   src/f32-dwconv/gen/up8x25-minmax-neon.c
   src/f32-dwconv/gen/up8x25-minmax-neon-acc2.c
-  src/f32-gavgpool-spchw/neon-x4.c
+  src/f32-gavgpool-cw/neon-x4.c
   src/f32-gavgpool/7p7x-minmax-neon-c4.c
   src/f32-gavgpool/7x-minmax-neon-c4.c
   src/f32-gemm/gen/1x8-minmax-neon-lane-ld64.c
@@ -903,11 +903,11 @@
   src/f32-conv-hwc/3x3s2p0p1c3x4-neonfma-2x2.c
   src/f32-conv-hwc/3x3s2p1c3x4-neonfma-2x2.c
   src/f32-conv-hwc/3x3s2p1c3x8-neonfma-2x2.c
-  src/f32-conv-hwc2spchw/3x3s2p1c3x4-neonfma-2x2.c
-  src/f32-dwconv-spchw/3x3p1-neonfma.c
-  src/f32-dwconv-spchw/5x5p2-neonfma.c
-  src/f32-dwconv-spchw/3x3s2p1-neonfma.c
-  src/f32-dwconv-spchw/5x5s2p2-neonfma.c
+  src/f32-conv-hwc2chw/3x3s2p1c3x4-neonfma-2x2.c
+  src/f32-dwconv-chw/3x3p1-neonfma.c
+  src/f32-dwconv-chw/5x5p2-neonfma.c
+  src/f32-dwconv-chw/3x3s2p1-neonfma.c
+  src/f32-dwconv-chw/5x5s2p2-neonfma.c
   src/f32-sigmoid/gen/neonfma-rr1-p5-div-x4.c
   src/f32-sigmoid/gen/neonfma-rr1-p5-div-x8.c
   src/f32-sigmoid/gen/neonfma-rr1-p5-div-x12.c
@@ -1024,8 +1024,8 @@
   src/f32-avgpool/9x-minmax-sse-c4.c
   src/f32-clamp/gen/sse-x4.c
   src/f32-clamp/gen/sse-x8.c
-  src/f32-dwconv-spchw/3x3p1-sse.c
-  src/f32-dwconv-spchw/3x3s2p1-sse.c
+  src/f32-dwconv-chw/3x3p1-sse.c
+  src/f32-dwconv-chw/3x3s2p1-sse.c
   src/f32-dwconv/gen/up4x25-minmax-sse-acc2.c
   src/f32-dwconv/gen/up4x25-minmax-sse.c
   src/f32-dwconv/gen/up4x4-minmax-sse-acc2.c
@@ -1038,7 +1038,7 @@
   src/f32-dwconv/gen/up8x4-minmax-sse.c
   src/f32-dwconv/gen/up8x9-minmax-sse-acc2.c
   src/f32-dwconv/gen/up8x9-minmax-sse.c
-  src/f32-gavgpool-spchw/sse-x4.c
+  src/f32-gavgpool-cw/sse-x4.c
   src/f32-gavgpool/7p7x-minmax-sse-c4.c
   src/f32-gavgpool/7x-minmax-sse-c4.c
   src/f32-gemm/gen/1x8-minmax-sse-dup.c
@@ -2214,23 +2214,23 @@
   TARGET_LINK_LIBRARIES(f32-conv-hwc-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
   ADD_TEST(f32-conv-hwc-test f32-conv-hwc-test)
 
-  ADD_EXECUTABLE(f32-conv-hwc2spchw-test test/f32-conv-hwc2spchw.cc)
-  SET_TARGET_PROPERTIES(f32-conv-hwc2spchw-test PROPERTIES
+  ADD_EXECUTABLE(f32-conv-hwc2chw-test test/f32-conv-hwc2chw.cc)
+  SET_TARGET_PROPERTIES(f32-conv-hwc2chw-test PROPERTIES
     CXX_STANDARD 11
     CXX_STANDARD_REQUIRED YES
     CXX_EXTENSIONS YES)
-  TARGET_INCLUDE_DIRECTORIES(f32-conv-hwc2spchw-test PRIVATE src test)
-  TARGET_LINK_LIBRARIES(f32-conv-hwc2spchw-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
-  ADD_TEST(f32-conv-hwc2spchw-test f32-conv-hwc2spchw-test)
+  TARGET_INCLUDE_DIRECTORIES(f32-conv-hwc2chw-test PRIVATE src test)
+  TARGET_LINK_LIBRARIES(f32-conv-hwc2chw-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
+  ADD_TEST(f32-conv-hwc2chw-test f32-conv-hwc2chw-test)
 
-  ADD_EXECUTABLE(f32-dwconv-spchw-test test/f32-dwconv-spchw.cc)
-  SET_TARGET_PROPERTIES(f32-dwconv-spchw-test PROPERTIES
+  ADD_EXECUTABLE(f32-dwconv-chw-test test/f32-dwconv-chw.cc)
+  SET_TARGET_PROPERTIES(f32-dwconv-chw-test PROPERTIES
     CXX_STANDARD 11
     CXX_STANDARD_REQUIRED YES
     CXX_EXTENSIONS YES)
-  TARGET_INCLUDE_DIRECTORIES(f32-dwconv-spchw-test PRIVATE src test)
-  TARGET_LINK_LIBRARIES(f32-dwconv-spchw-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
-  ADD_TEST(f32-dwconv-spchw-test f32-dwconv-spchw-test)
+  TARGET_INCLUDE_DIRECTORIES(f32-dwconv-chw-test PRIVATE src test)
+  TARGET_LINK_LIBRARIES(f32-dwconv-chw-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
+  ADD_TEST(f32-dwconv-chw-test f32-dwconv-chw-test)
 
   ADD_EXECUTABLE(f32-dwconv-test test/f32-dwconv.cc)
   SET_TARGET_PROPERTIES(f32-dwconv-test PROPERTIES
@@ -2250,14 +2250,14 @@
   TARGET_LINK_LIBRARIES(f32-dwconv-minmax-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
   ADD_TEST(f32-dwconv-minmax-test f32-dwconv-minmax-test)
 
-  ADD_EXECUTABLE(f32-gavgpool-spchw-test test/f32-gavgpool-spchw.cc)
-  SET_TARGET_PROPERTIES(f32-gavgpool-spchw-test PROPERTIES
+  ADD_EXECUTABLE(f32-gavgpool-cw-test test/f32-gavgpool-cw.cc)
+  SET_TARGET_PROPERTIES(f32-gavgpool-cw-test PROPERTIES
     CXX_STANDARD 11
     CXX_STANDARD_REQUIRED YES
     CXX_EXTENSIONS YES)
-  TARGET_INCLUDE_DIRECTORIES(f32-gavgpool-spchw-test PRIVATE src test)
-  TARGET_LINK_LIBRARIES(f32-gavgpool-spchw-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
-  ADD_TEST(f32-gavgpool-spchw-test f32-gavgpool-spchw-test)
+  TARGET_INCLUDE_DIRECTORIES(f32-gavgpool-cw-test PRIVATE src test)
+  TARGET_LINK_LIBRARIES(f32-gavgpool-cw-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
+  ADD_TEST(f32-gavgpool-cw-test f32-gavgpool-cw-test)
 
   ADD_EXECUTABLE(f32-gavgpool-minmax-test test/f32-gavgpool-minmax.cc)
   SET_TARGET_PROPERTIES(f32-gavgpool-minmax-test PROPERTIES
@@ -3114,14 +3114,14 @@
   TARGET_INCLUDE_DIRECTORIES(f32-conv-hwc-bench PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
   TARGET_LINK_LIBRARIES(f32-conv-hwc-bench PRIVATE XNNPACK cpuinfo fp16 benchmark bench-utils)
 
-  ADD_EXECUTABLE(f32-dwconv-spchw-bench bench/f32-dwconv-spchw.cc)
-  SET_TARGET_PROPERTIES(f32-dwconv-spchw-bench PROPERTIES
+  ADD_EXECUTABLE(f32-dwconv-chw-bench bench/f32-dwconv-chw.cc)
+  SET_TARGET_PROPERTIES(f32-dwconv-chw-bench PROPERTIES
     CXX_STANDARD 11
     CXX_STANDARD_REQUIRED YES
     CXX_EXTENSIONS YES)
-  TARGET_INCLUDE_DIRECTORIES(f32-dwconv-spchw-bench PRIVATE src)
-  TARGET_INCLUDE_DIRECTORIES(f32-dwconv-spchw-bench PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
-  TARGET_LINK_LIBRARIES(f32-dwconv-spchw-bench PRIVATE XNNPACK cpuinfo fp16 benchmark bench-utils)
+  TARGET_INCLUDE_DIRECTORIES(f32-dwconv-chw-bench PRIVATE src)
+  TARGET_INCLUDE_DIRECTORIES(f32-dwconv-chw-bench PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
+  TARGET_LINK_LIBRARIES(f32-dwconv-chw-bench PRIVATE XNNPACK cpuinfo fp16 benchmark bench-utils)
 
   ADD_EXECUTABLE(f32-dwconv-bench bench/f32-dwconv.cc)
   SET_TARGET_PROPERTIES(f32-dwconv-bench PROPERTIES
diff --git a/bench/f32-conv-hwc2spchw.cc b/bench/f32-conv-hwc2chw.cc
similarity index 86%
rename from bench/f32-conv-hwc2spchw.cc
rename to bench/f32-conv-hwc2chw.cc
index 242808f..7063144 100644
--- a/bench/f32-conv-hwc2spchw.cc
+++ b/bench/f32-conv-hwc2chw.cc
@@ -23,8 +23,8 @@
 #include <xnnpack/params.h>
 
 
-static void DConvHWC2SpCHW3X3S2P1Benchmark(benchmark::State& state,
-  xnn_f32_conv_hwc2spchw_ukernel_function conv,
+static void DConvHWC2CHW3X3S2P1Benchmark(benchmark::State& state,
+  xnn_f32_conv_hwc2chw_ukernel_function conv,
   uint32_t output_channels_tile)
 {
   if (!cpuinfo_initialize()) {
@@ -110,17 +110,17 @@
 }
 
 #if XNN_ARCH_ARM64
-  static void f32_conv_hwc2spchw_3x3s2p1c3x4__neonfma_2x2(benchmark::State& state, const char* net) {
-    DConvHWC2SpCHW3X3S2P1Benchmark(state, xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2, 4);
+  static void f32_conv_hwc2chw_3x3s2p1c3x4__neonfma_2x2(benchmark::State& state, const char* net) {
+    DConvHWC2CHW3X3S2P1Benchmark(state, xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2, 4);
   }
 
-  BENCHMARK_DCONV(f32_conv_hwc2spchw_3x3s2p1c3x4__neonfma_2x2);
+  BENCHMARK_DCONV(f32_conv_hwc2chw_3x3s2p1c3x4__neonfma_2x2);
 #endif
-  static void f32_conv_hwc2spchw_3x3s2p1c3x4__scalar_1x1(benchmark::State& state, const char* net) {
-    DConvHWC2SpCHW3X3S2P1Benchmark(state, xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1, 4);
+  static void f32_conv_hwc2chw_3x3s2p1c3x4__scalar_1x1(benchmark::State& state, const char* net) {
+    DConvHWC2CHW3X3S2P1Benchmark(state, xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1, 4);
   }
 
-  BENCHMARK_DCONV(f32_conv_hwc2spchw_3x3s2p1c3x4__scalar_1x1);
+  BENCHMARK_DCONV(f32_conv_hwc2chw_3x3s2p1c3x4__scalar_1x1);
 
 #ifndef XNNPACK_BENCHMARK_NO_MAIN
 BENCHMARK_MAIN();
diff --git a/bench/f32-dwconv-spchw.cc b/bench/f32-dwconv-chw.cc
similarity index 83%
rename from bench/f32-dwconv-spchw.cc
rename to bench/f32-dwconv-chw.cc
index 8178ee7..f12195f 100644
--- a/bench/f32-dwconv-spchw.cc
+++ b/bench/f32-dwconv-chw.cc
@@ -26,7 +26,7 @@
 
 
 static void DWConvCHWBenchmark(benchmark::State& state,
-  xnn_f32_dwconv_spchw_ukernel_function dwconv,
+  xnn_f32_dwconv_chw_ukernel_function dwconv,
   uint32_t it, uint32_t ot, uint32_t kh, uint32_t kw, uint32_t pw, uint32_t s)
 {
   if (!cpuinfo_initialize()) {
@@ -111,8 +111,8 @@
   std::vector<float> output(o_elements * num_buffers);
   std::fill(output.begin(), output.end(), std::nanf(""));
 
-  xnn_f32_spchw_params spchw_params =
-    xnn_init_f32_spchw_params(input_width, -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
+  xnn_f32_chw_params chw_params =
+    xnn_init_f32_chw_params(input_width, -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
 
   size_t buffer_index = 0;
   for (auto _ : state) {
@@ -131,7 +131,7 @@
         padding_height / 2,  // padding_top
         it * sizeof(float), ot * sizeof(float),
         input_width * sizeof(float), output_width * sizeof(float),
-        &spchw_params);
+        &chw_params);
     }
   }
 
@@ -146,7 +146,7 @@
 }
 
 static void DWConvHWoTCTBenchmark(benchmark::State& state,
-  xnn_f32_dwconv_spchw_ukernel_function dwconv,
+  xnn_f32_dwconv_chw_ukernel_function dwconv,
   uint32_t it, uint32_t ot, uint32_t kh, uint32_t kw, uint32_t pw, uint32_t s)
 {
   if (!cpuinfo_initialize()) {
@@ -231,8 +231,8 @@
   std::vector<float> output(o_elements * num_buffers);
   std::fill(output.begin(), output.end(), std::nanf(""));
 
-  xnn_f32_spchw_params spchw_params =
-    xnn_init_f32_spchw_params(input_width, -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
+  xnn_f32_chw_params chw_params =
+    xnn_init_f32_chw_params(input_width, -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
 
   size_t buffer_index = 0;
   for (auto _ : state) {
@@ -252,7 +252,7 @@
         it * channels * sizeof(float), ot * channels * sizeof(float),
         benchmark::utils::RoundUp<size_t>(input_width, it) * channels * sizeof(float),
         benchmark::utils::RoundUp<size_t>(output_width, ot) * channels * sizeof(float),
-        &spchw_params);
+        &chw_params);
     }
   }
 
@@ -268,35 +268,35 @@
 
 #if XNN_ARCH_ARM64
   static void CHW_3x3p1__neonfma(benchmark::State& state, const char* net) {
-    DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma, 4, 4, 3, 3, 1, 1);
+    DWConvCHWBenchmark(state, xnn_f32_dwconv_chw_ukernel_3x3p1__neonfma, 4, 4, 3, 3, 1, 1);
   }
 
   static void CHW_5x5p2__neonfma(benchmark::State& state, const char* net) {
-    DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma, 4, 4, 5, 5, 2, 1);
+    DWConvCHWBenchmark(state, xnn_f32_dwconv_chw_ukernel_5x5p2__neonfma, 4, 4, 5, 5, 2, 1);
   }
 
   static void CHW_3x3s2p1__neonfma(benchmark::State& state, const char* net) {
-    DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma, 4, 4, 3, 3, 1, 2);
+    DWConvCHWBenchmark(state, xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma, 4, 4, 3, 3, 1, 2);
   }
 
   static void CHW_5x5s2p2__neonfma(benchmark::State& state, const char* net) {
-    DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma, 4, 4, 5, 5, 2, 2);
+    DWConvCHWBenchmark(state, xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma, 4, 4, 5, 5, 2, 2);
   }
 
   static void HWo4C4_3x3p1__neonfma(benchmark::State& state, const char* net) {
-    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma, 4, 4, 3, 3, 1, 1);
+    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_chw_ukernel_3x3p1__neonfma, 4, 4, 3, 3, 1, 1);
   }
 
   static void HWo4C4_5x5p2__neonfma(benchmark::State& state, const char* net) {
-    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma, 4, 4, 5, 5, 2, 1);
+    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_chw_ukernel_5x5p2__neonfma, 4, 4, 5, 5, 2, 1);
   }
 
   static void HWo4C4_3x3s2p1__neonfma(benchmark::State& state, const char* net) {
-    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma, 4, 4, 3, 3, 1, 2);
+    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma, 4, 4, 3, 3, 1, 2);
   }
 
   static void HWo4C4_5x5s2p2__neonfma(benchmark::State& state, const char* net) {
-    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma, 4, 4, 5, 5, 2, 2);
+    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma, 4, 4, 5, 5, 2, 2);
   }
 
   BENCHMARK_DWCONV(CHW_3x3p1__neonfma)
@@ -312,19 +312,19 @@
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   static void CHW_3x3p1__sse(benchmark::State& state, const char* net) {
-    DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3p1__sse, 4, 4, 3, 3, 1, 1);
+    DWConvCHWBenchmark(state, xnn_f32_dwconv_chw_ukernel_3x3p1__sse, 4, 4, 3, 3, 1, 1);
   }
 
   static void CHW_3x3s2p1__sse(benchmark::State& state, const char* net) {
-    DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse, 4, 4, 3, 3, 1, 2);
+    DWConvCHWBenchmark(state, xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse, 4, 4, 3, 3, 1, 2);
   }
 
   static void HWo4C4_3x3p1__sse(benchmark::State& state, const char* net) {
-    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3p1__sse, 4, 4, 3, 3, 1, 1);
+    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_chw_ukernel_3x3p1__sse, 4, 4, 3, 3, 1, 1);
   }
 
   static void HWo4C4_3x3s2p1__sse(benchmark::State& state, const char* net) {
-    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse, 4, 4, 3, 3, 1, 2);
+    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse, 4, 4, 3, 3, 1, 2);
   }
 
   BENCHMARK_DWCONV(CHW_3x3p1__sse)
@@ -334,35 +334,35 @@
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
   static void CHW_3x3p1__scalar(benchmark::State& state, const char* net) {
-    DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3p1__scalar, 1, 1, 3, 3, 1, 1);
+    DWConvCHWBenchmark(state, xnn_f32_dwconv_chw_ukernel_3x3p1__scalar, 1, 1, 3, 3, 1, 1);
   }
 
   static void CHW_5x5p2__scalar(benchmark::State& state, const char* net) {
-    DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_5x5p2__scalar, 1, 1, 5, 5, 2, 1);
+    DWConvCHWBenchmark(state, xnn_f32_dwconv_chw_ukernel_5x5p2__scalar, 1, 1, 5, 5, 2, 1);
   }
 
   static void CHW_3x3s2p1__scalar(benchmark::State& state, const char* net) {
-    DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, 1, 1, 3, 3, 1, 2);
+    DWConvCHWBenchmark(state, xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar, 1, 1, 3, 3, 1, 2);
   }
 
   static void CHW_5x5s2p2__scalar(benchmark::State& state, const char* net) {
-    DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, 1, 1, 5, 5, 2, 2);
+    DWConvCHWBenchmark(state, xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar, 1, 1, 5, 5, 2, 2);
   }
 
   static void HWC_3x3p1__scalar(benchmark::State& state, const char* net) {
-    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3p1__scalar, 1, 1, 3, 3, 1, 1);
+    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_chw_ukernel_3x3p1__scalar, 1, 1, 3, 3, 1, 1);
   }
 
   static void HWC_5x5p2__scalar(benchmark::State& state, const char* net) {
-    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_5x5p2__scalar, 1, 1, 5, 5, 2, 1);
+    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_chw_ukernel_5x5p2__scalar, 1, 1, 5, 5, 2, 1);
   }
 
   static void HWC_3x3s2p1__scalar(benchmark::State& state, const char* net) {
-    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, 1, 1, 3, 3, 1, 2);
+    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar, 1, 1, 3, 3, 1, 2);
   }
 
   static void HWC_5x5s2p2__scalar(benchmark::State& state, const char* net) {
-    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, 1, 1, 5, 5, 2, 2);
+    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar, 1, 1, 5, 5, 2, 2);
   }
 
 
diff --git a/src/f32-conv-hwc2spchw/3x3s2p1c3x4-neonfma-2x2.c b/src/f32-conv-hwc2chw/3x3s2p1c3x4-neonfma-2x2.c
similarity index 99%
rename from src/f32-conv-hwc2spchw/3x3s2p1c3x4-neonfma-2x2.c
rename to src/f32-conv-hwc2chw/3x3s2p1c3x4-neonfma-2x2.c
index e01d9ab..50961ab 100644
--- a/src/f32-conv-hwc2spchw/3x3s2p1c3x4-neonfma-2x2.c
+++ b/src/f32-conv-hwc2chw/3x3s2p1c3x4-neonfma-2x2.c
@@ -11,7 +11,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2(
+void xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2(
     size_t input_height,
     size_t input_width,
     size_t output_y_start,
diff --git a/src/f32-conv-hwc2spchw/3x3s2p1c3x4-scalar-1x1.c b/src/f32-conv-hwc2chw/3x3s2p1c3x4-scalar-1x1.c
similarity index 99%
rename from src/f32-conv-hwc2spchw/3x3s2p1c3x4-scalar-1x1.c
rename to src/f32-conv-hwc2chw/3x3s2p1c3x4-scalar-1x1.c
index 9e26884..d1aca6d 100644
--- a/src/f32-conv-hwc2spchw/3x3s2p1c3x4-scalar-1x1.c
+++ b/src/f32-conv-hwc2chw/3x3s2p1c3x4-scalar-1x1.c
@@ -9,7 +9,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1(
+void xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1(
     size_t input_height,
     size_t input_width,
     size_t output_y_start,
diff --git a/src/f32-dwconv-spchw/3x3p1-neonfma.c b/src/f32-dwconv-chw/3x3p1-neonfma.c
similarity index 98%
rename from src/f32-dwconv-spchw/3x3p1-neonfma.c
rename to src/f32-dwconv-chw/3x3p1-neonfma.c
index 34ea960..822a231 100644
--- a/src/f32-dwconv-spchw/3x3p1-neonfma.c
+++ b/src/f32-dwconv-chw/3x3p1-neonfma.c
@@ -11,7 +11,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma(
+void xnn_f32_dwconv_chw_ukernel_3x3p1__neonfma(
     size_t input_height,
     size_t input_width,
     const float* input,
@@ -23,7 +23,7 @@
     size_t output_tuple_stride,
     size_t input_width_stride,
     size_t output_width_stride,
-    const union xnn_f32_spchw_params params[restrict XNN_MIN_ELEMENTS(1)])
+    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
 {
   assert(input_width != 0);
   assert(input_height != 0);
diff --git a/src/f32-dwconv-spchw/3x3p1-scalar.c b/src/f32-dwconv-chw/3x3p1-scalar.c
similarity index 96%
rename from src/f32-dwconv-spchw/3x3p1-scalar.c
rename to src/f32-dwconv-chw/3x3p1-scalar.c
index 607cbf7..17ebf1b 100644
--- a/src/f32-dwconv-spchw/3x3p1-scalar.c
+++ b/src/f32-dwconv-chw/3x3p1-scalar.c
@@ -9,7 +9,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_dwconv_spchw_ukernel_3x3p1__scalar(
+void xnn_f32_dwconv_chw_ukernel_3x3p1__scalar(
     size_t input_height,
     size_t input_width,
     const float* input,
@@ -21,7 +21,7 @@
     size_t output_tuple_stride,
     size_t input_width_stride,
     size_t output_width_stride,
-    const union xnn_f32_spchw_params params[restrict XNN_MIN_ELEMENTS(1)])
+    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
 {
   assert(input_width != 0);
   assert(input_height != 0);
diff --git a/src/f32-dwconv-spchw/3x3p1-sse.c b/src/f32-dwconv-chw/3x3p1-sse.c
similarity index 98%
rename from src/f32-dwconv-spchw/3x3p1-sse.c
rename to src/f32-dwconv-chw/3x3p1-sse.c
index d8a5d03..1e89228 100644
--- a/src/f32-dwconv-spchw/3x3p1-sse.c
+++ b/src/f32-dwconv-chw/3x3p1-sse.c
@@ -11,7 +11,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_dwconv_spchw_ukernel_3x3p1__sse(
+void xnn_f32_dwconv_chw_ukernel_3x3p1__sse(
     size_t input_height,
     size_t input_width,
     const float* input,
@@ -23,7 +23,7 @@
     size_t output_tuple_stride,
     size_t input_width_stride,
     size_t output_width_stride,
-    const union xnn_f32_spchw_params params[restrict XNN_MIN_ELEMENTS(1)])
+    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
 {
   assert(input_width != 0);
   assert(input_height != 0);
diff --git a/src/f32-dwconv-spchw/3x3s2p1-neonfma.c b/src/f32-dwconv-chw/3x3s2p1-neonfma.c
similarity index 98%
rename from src/f32-dwconv-spchw/3x3s2p1-neonfma.c
rename to src/f32-dwconv-chw/3x3s2p1-neonfma.c
index f7b7395..1f936f6 100644
--- a/src/f32-dwconv-spchw/3x3s2p1-neonfma.c
+++ b/src/f32-dwconv-chw/3x3s2p1-neonfma.c
@@ -11,7 +11,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma(
+void xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma(
     size_t input_height,
     size_t input_width,
     const float* input,
@@ -23,7 +23,7 @@
     size_t output_tuple_stride,
     size_t input_width_stride,
     size_t output_width_stride,
-    const union xnn_f32_spchw_params params[restrict XNN_MIN_ELEMENTS(1)])
+    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
 {
   assert(input_height!= 0);
   assert(input_width != 0);
diff --git a/src/f32-dwconv-spchw/3x3s2p1-scalar.c b/src/f32-dwconv-chw/3x3s2p1-scalar.c
similarity index 97%
rename from src/f32-dwconv-spchw/3x3s2p1-scalar.c
rename to src/f32-dwconv-chw/3x3s2p1-scalar.c
index 31398c7..44f91e0 100644
--- a/src/f32-dwconv-spchw/3x3s2p1-scalar.c
+++ b/src/f32-dwconv-chw/3x3s2p1-scalar.c
@@ -11,7 +11,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar(
+void xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar(
     size_t input_height,
     size_t input_width,
     const float* input,
@@ -23,7 +23,7 @@
     size_t output_tuple_stride,
     size_t input_width_stride,
     size_t output_width_stride,
-    const union xnn_f32_spchw_params params[restrict XNN_MIN_ELEMENTS(1)])
+    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
 {
   assert(input_height!= 0);
   assert(input_width != 0);
diff --git a/src/f32-dwconv-spchw/3x3s2p1-sse.c b/src/f32-dwconv-chw/3x3s2p1-sse.c
similarity index 98%
rename from src/f32-dwconv-spchw/3x3s2p1-sse.c
rename to src/f32-dwconv-chw/3x3s2p1-sse.c
index 0bb4446..74167fd 100644
--- a/src/f32-dwconv-spchw/3x3s2p1-sse.c
+++ b/src/f32-dwconv-chw/3x3s2p1-sse.c
@@ -11,7 +11,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse(
+void xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse(
     size_t input_height,
     size_t input_width,
     const float* input,
@@ -23,7 +23,7 @@
     size_t output_tuple_stride,
     size_t input_width_stride,
     size_t output_width_stride,
-    const union xnn_f32_spchw_params params[restrict XNN_MIN_ELEMENTS(1)])
+    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
 {
   assert(input_height!= 0);
   assert(input_width != 0);
diff --git a/src/f32-dwconv-spchw/5x5p2-neonfma.c b/src/f32-dwconv-chw/5x5p2-neonfma.c
similarity index 99%
rename from src/f32-dwconv-spchw/5x5p2-neonfma.c
rename to src/f32-dwconv-chw/5x5p2-neonfma.c
index 215bf77..37a1914 100644
--- a/src/f32-dwconv-spchw/5x5p2-neonfma.c
+++ b/src/f32-dwconv-chw/5x5p2-neonfma.c
@@ -11,7 +11,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma(
+void xnn_f32_dwconv_chw_ukernel_5x5p2__neonfma(
     size_t input_height,
     size_t input_width,
     const float* input,
@@ -23,7 +23,7 @@
     size_t output_tuple_stride,
     size_t input_width_stride,
     size_t output_width_stride,
-    const union xnn_f32_spchw_params params[restrict XNN_MIN_ELEMENTS(1)])
+    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
 {
   assert(input_width != 0);
   assert(input_height != 0);
diff --git a/src/f32-dwconv-spchw/5x5p2-scalar.c b/src/f32-dwconv-chw/5x5p2-scalar.c
similarity index 98%
rename from src/f32-dwconv-spchw/5x5p2-scalar.c
rename to src/f32-dwconv-chw/5x5p2-scalar.c
index 7a705fc..b41011b 100644
--- a/src/f32-dwconv-spchw/5x5p2-scalar.c
+++ b/src/f32-dwconv-chw/5x5p2-scalar.c
@@ -9,7 +9,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_dwconv_spchw_ukernel_5x5p2__scalar(
+void xnn_f32_dwconv_chw_ukernel_5x5p2__scalar(
     size_t input_height,
     size_t input_width,
     const float* input,
@@ -21,7 +21,7 @@
     size_t output_tuple_stride,
     size_t input_width_stride,
     size_t output_width_stride,
-    const union xnn_f32_spchw_params params[restrict XNN_MIN_ELEMENTS(1)])
+    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
 {
   assert(input_width != 0);
   assert(input_height != 0);
diff --git a/src/f32-dwconv-spchw/5x5s2p2-neonfma.c b/src/f32-dwconv-chw/5x5s2p2-neonfma.c
similarity index 99%
rename from src/f32-dwconv-spchw/5x5s2p2-neonfma.c
rename to src/f32-dwconv-chw/5x5s2p2-neonfma.c
index 9f21352..acfe8fe 100644
--- a/src/f32-dwconv-spchw/5x5s2p2-neonfma.c
+++ b/src/f32-dwconv-chw/5x5s2p2-neonfma.c
@@ -11,7 +11,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma(
+void xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma(
     size_t input_height,
     size_t input_width,
     const float* input,
@@ -23,7 +23,7 @@
     size_t output_tuple_stride,
     size_t input_width_stride,
     size_t output_width_stride,
-    const union xnn_f32_spchw_params params[restrict XNN_MIN_ELEMENTS(1)])
+    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
 {
   assert(input_width != 0);
   assert(input_height != 0);
diff --git a/src/f32-dwconv-spchw/5x5s2p2-scalar.c b/src/f32-dwconv-chw/5x5s2p2-scalar.c
similarity index 98%
rename from src/f32-dwconv-spchw/5x5s2p2-scalar.c
rename to src/f32-dwconv-chw/5x5s2p2-scalar.c
index 2b16c0a..e4cc53c 100644
--- a/src/f32-dwconv-spchw/5x5s2p2-scalar.c
+++ b/src/f32-dwconv-chw/5x5s2p2-scalar.c
@@ -9,7 +9,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar(
+void xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar(
     size_t input_height,
     size_t input_width,
     const float* input,
@@ -21,7 +21,7 @@
     size_t output_tuple_stride,
     size_t input_width_stride,
     size_t output_width_stride,
-    const union xnn_f32_spchw_params params[restrict XNN_MIN_ELEMENTS(1)])
+    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
 {
   assert(input_width != 0);
   assert(input_height != 0);
diff --git a/src/f32-gavgpool-spchw/neon-x4.c b/src/f32-gavgpool-cw/neon-x4.c
similarity index 98%
rename from src/f32-gavgpool-spchw/neon-x4.c
rename to src/f32-gavgpool-cw/neon-x4.c
index 849ca22..f0daeca 100644
--- a/src/f32-gavgpool-spchw/neon-x4.c
+++ b/src/f32-gavgpool-cw/neon-x4.c
@@ -11,7 +11,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_gavgpool_spchw_ukernel__neon_x4(
+void xnn_f32_gavgpool_cw_ukernel__neon_x4(
     size_t elements,
     size_t channels,
     const float* input,
diff --git a/src/f32-gavgpool-spchw/scalar-x1.c b/src/f32-gavgpool-cw/scalar-x1.c
similarity index 96%
rename from src/f32-gavgpool-spchw/scalar-x1.c
rename to src/f32-gavgpool-cw/scalar-x1.c
index 96e7977..6805f56 100644
--- a/src/f32-gavgpool-spchw/scalar-x1.c
+++ b/src/f32-gavgpool-cw/scalar-x1.c
@@ -9,7 +9,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_gavgpool_spchw_ukernel__scalar_x1(
+void xnn_f32_gavgpool_cw_ukernel__scalar_x1(
     size_t elements,
     size_t channels,
     const float* input,
diff --git a/src/f32-gavgpool-spchw/sse-x4.c b/src/f32-gavgpool-cw/sse-x4.c
similarity index 98%
rename from src/f32-gavgpool-spchw/sse-x4.c
rename to src/f32-gavgpool-cw/sse-x4.c
index 8a100b4..52ba111 100644
--- a/src/f32-gavgpool-spchw/sse-x4.c
+++ b/src/f32-gavgpool-cw/sse-x4.c
@@ -11,7 +11,7 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_gavgpool_spchw_ukernel__sse_x4(
+void xnn_f32_gavgpool_cw_ukernel__sse_x4(
     size_t elements,
     size_t channels,
     const float* input,
diff --git a/src/init.c b/src/init.c
index 3194357..1d3bba1 100644
--- a/src/init.c
+++ b/src/init.c
@@ -551,39 +551,39 @@
           .mr = 8,
           .nr = 4,
         };
-        xnn_params.f32.hwc2spchw_dconv3x3c3s2 = (struct hwc2spchw_dconv_parameters) {
+        xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
           .ukernel_with_symm_padding =
-            (xnn_conv_hwc2spchw_ukernel_function) xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1,
+            (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
           .output_channel_tile = 4,
           .output_height_tile = 1,
           .output_width_tile = 1,
         };
-        xnn_params.f32.spchw_dwconv3x3 = (struct spchw_dwconv_parameters) {
-          .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_3x3p1__scalar,
+        xnn_params.f32.dwconv_chw_3x3 = (struct dwconv_chw_parameters) {
+          .ukernel = (xnn_dwconv_chw_ukernel_function) xnn_f32_dwconv_chw_ukernel_3x3p1__scalar,
           .input_width_tile = 1,
           .output_width_tile = 1,
           .output_height_tile = 1,
         };
-        xnn_params.f32.spchw_dwconv3x3s2 = (struct spchw_dwconv_parameters) {
-          .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar,
+        xnn_params.f32.dwconv_chw_3x3s2 = (struct dwconv_chw_parameters) {
+          .ukernel = (xnn_dwconv_chw_ukernel_function) xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar,
           .input_width_tile = 1,
           .output_width_tile = 1,
           .output_height_tile = 1,
         };
-        xnn_params.f32.spchw_dwconv5x5 = (struct spchw_dwconv_parameters) {
-          .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_5x5p2__scalar,
+        xnn_params.f32.dwconv_chw_5x5 = (struct dwconv_chw_parameters) {
+          .ukernel = (xnn_dwconv_chw_ukernel_function) xnn_f32_dwconv_chw_ukernel_5x5p2__scalar,
           .input_width_tile = 1,
           .output_width_tile = 1,
           .output_height_tile = 1,
         };
-        xnn_params.f32.spchw_dwconv5x5s2 = (struct spchw_dwconv_parameters) {
-          .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar,
+        xnn_params.f32.dwconv_chw_5x5s2 = (struct dwconv_chw_parameters) {
+          .ukernel = (xnn_dwconv_chw_ukernel_function) xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar,
           .input_width_tile = 1,
           .output_width_tile = 1,
           .output_height_tile = 1,
         };
-        xnn_params.f32.spchw_gavgpool = (struct spchw_gavgpool_parameters) {
-          .ukernel = (xnn_gavgpool_spchw_ukernel_function) xnn_f32_gavgpool_spchw_ukernel__scalar_x1,
+        xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
+          .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
           .channel_tile = 1,
         };
       #endif  // XNN_NO_NCHW_OPERATORS
@@ -954,39 +954,39 @@
         .mr = 16,
         .nr = 4,
       };
-      xnn_params.f32.hwc2spchw_dconv3x3c3s2 = (struct hwc2spchw_dconv_parameters) {
+      xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
         .ukernel_with_symm_padding =
-          (xnn_conv_hwc2spchw_ukernel_function) xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2,
+          (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2,
         .output_channel_tile = 4,
         .output_height_tile = 2,
         .output_width_tile = 2,
       };
-      xnn_params.f32.spchw_dwconv3x3 = (struct spchw_dwconv_parameters) {
-        .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma,
+      xnn_params.f32.dwconv_chw_3x3 = (struct dwconv_chw_parameters) {
+        .ukernel = (xnn_dwconv_chw_ukernel_function) xnn_f32_dwconv_chw_ukernel_3x3p1__neonfma,
         .input_width_tile = 4,
         .output_width_tile = 4,
         .output_height_tile = 3,
       };
-      xnn_params.f32.spchw_dwconv3x3s2 = (struct spchw_dwconv_parameters) {
-        .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma,
+      xnn_params.f32.dwconv_chw_3x3s2 = (struct dwconv_chw_parameters) {
+        .ukernel = (xnn_dwconv_chw_ukernel_function) xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma,
         .input_width_tile = 4,
         .output_width_tile = 4,
         .output_height_tile = 1,
       };
-      xnn_params.f32.spchw_dwconv5x5 = (struct spchw_dwconv_parameters) {
-        .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma,
+      xnn_params.f32.dwconv_chw_5x5 = (struct dwconv_chw_parameters) {
+        .ukernel = (xnn_dwconv_chw_ukernel_function) xnn_f32_dwconv_chw_ukernel_5x5p2__neonfma,
         .input_width_tile = 4,
         .output_width_tile = 4,
         .output_height_tile = 3,
       };
-      xnn_params.f32.spchw_dwconv5x5s2 = (struct spchw_dwconv_parameters) {
-        .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma,
+      xnn_params.f32.dwconv_chw_5x5s2 = (struct dwconv_chw_parameters) {
+        .ukernel = (xnn_dwconv_chw_ukernel_function) xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma,
         .input_width_tile = 4,
         .output_width_tile = 4,
         .output_height_tile = 1,
       };
-      xnn_params.f32.spchw_gavgpool = (struct spchw_gavgpool_parameters) {
-        .ukernel = (xnn_gavgpool_spchw_ukernel_function) xnn_f32_gavgpool_spchw_ukernel__neon_x4,
+      xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
+        .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
         .channel_tile = 4,
       };
     #endif  // XNN_NO_NCHW_OPERATORS
@@ -1367,20 +1367,20 @@
         .mr = 4,
         .nr = 1,
       };
-      xnn_params.f32.spchw_dwconv3x3 = (struct spchw_dwconv_parameters) {
-        .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_3x3p1__sse,
+      xnn_params.f32.dwconv_chw_3x3 = (struct dwconv_chw_parameters) {
+        .ukernel = (xnn_dwconv_chw_ukernel_function) xnn_f32_dwconv_chw_ukernel_3x3p1__sse,
         .input_width_tile = 4,
         .output_width_tile = 4,
         .output_height_tile = 1,
       };
-      xnn_params.f32.spchw_dwconv3x3s2 = (struct spchw_dwconv_parameters) {
-        .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse,
+      xnn_params.f32.dwconv_chw_3x3s2 = (struct dwconv_chw_parameters) {
+        .ukernel = (xnn_dwconv_chw_ukernel_function) xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse,
         .input_width_tile = 4,
         .output_width_tile = 4,
         .output_height_tile = 1,
       };
-      xnn_params.f32.spchw_gavgpool = (struct spchw_gavgpool_parameters) {
-        .ukernel = (xnn_gavgpool_spchw_ukernel_function) xnn_f32_gavgpool_spchw_ukernel__sse_x4,
+      xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
+        .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__sse_x4,
         .channel_tile = 4,
       };
     #endif  // XNN_NO_NCHW_OPERATORS
@@ -1797,39 +1797,39 @@
         .mr = 8,
         .nr = 4,
       };
-      xnn_params.f32.hwc2spchw_dconv3x3c3s2 = (struct hwc2spchw_dconv_parameters) {
+      xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
         .ukernel_with_symm_padding =
-          (xnn_conv_hwc2spchw_ukernel_function) xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1,
+          (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
         .output_channel_tile = 4,
         .output_height_tile = 1,
         .output_width_tile = 1,
       };
-      xnn_params.f32.spchw_dwconv3x3 = (struct spchw_dwconv_parameters) {
-        .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_3x3p1__scalar,
+      xnn_params.f32.dwconv_chw_3x3 = (struct dwconv_chw_parameters) {
+        .ukernel = (xnn_dwconv_chw_ukernel_function) xnn_f32_dwconv_chw_ukernel_3x3p1__scalar,
         .input_width_tile = 1,
         .output_width_tile = 1,
         .output_height_tile = 1,
       };
-      xnn_params.f32.spchw_dwconv3x3s2 = (struct spchw_dwconv_parameters) {
-        .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar,
+      xnn_params.f32.dwconv_chw_3x3s2 = (struct dwconv_chw_parameters) {
+        .ukernel = (xnn_dwconv_chw_ukernel_function) xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar,
         .input_width_tile = 1,
         .output_width_tile = 1,
         .output_height_tile = 1,
       };
-      xnn_params.f32.spchw_dwconv5x5 = (struct spchw_dwconv_parameters) {
-        .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_5x5p2__scalar,
+      xnn_params.f32.dwconv_chw_5x5 = (struct dwconv_chw_parameters) {
+        .ukernel = (xnn_dwconv_chw_ukernel_function) xnn_f32_dwconv_chw_ukernel_5x5p2__scalar,
         .input_width_tile = 1,
         .output_width_tile = 1,
         .output_height_tile = 1,
       };
-      xnn_params.f32.spchw_dwconv5x5s2 = (struct spchw_dwconv_parameters) {
-        .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar,
+      xnn_params.f32.dwconv_chw_5x5s2 = (struct dwconv_chw_parameters) {
+        .ukernel = (xnn_dwconv_chw_ukernel_function) xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar,
         .input_width_tile = 1,
         .output_width_tile = 1,
         .output_height_tile = 1,
       };
-      xnn_params.f32.spchw_gavgpool = (struct spchw_gavgpool_parameters) {
-        .ukernel = (xnn_gavgpool_spchw_ukernel_function) xnn_f32_gavgpool_spchw_ukernel__scalar_x1,
+      xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
+        .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
         .channel_tile = 1,
       };
     #endif  // XNN_NO_NCHW_OPERATORS
diff --git a/src/operator-run.c b/src/operator-run.c
index 45b5e84..fe05f8d 100644
--- a/src/operator-run.c
+++ b/src/operator-run.c
@@ -291,13 +291,13 @@
       &context->params);
 }
 
-void xnn_compute_dconv2d_hwc2spchw(
-      const struct dconv2d_context context[restrict XNN_MIN_ELEMENTS(1)],
+void xnn_compute_conv2d_hwc2chw(
+      const struct conv2d_context context[restrict XNN_MIN_ELEMENTS(1)],
       size_t batch_index,
       size_t output_y_start,
       size_t output_y_slice)
 {
-  context->hwc2spchw_ukernel(
+  context->hwc2chw_ukernel(
       context->input_height,
       context->input_width,
       output_y_start,
@@ -328,12 +328,12 @@
     &context->params);
 }
 
-void xnn_compute_dwconv2d_spchw(
+void xnn_compute_dwconv2d_chw(
     const struct dwconv2d_context context[restrict XNN_MIN_ELEMENTS(1)],
     size_t batch_index,
     size_t channel)
 {
-  context->spchw_ukernel(
+  context->chw_ukernel(
     context->input_height,
     context->input_width,
     (const void*) ((uintptr_t) context->input + channel * context->input_channel_stride + batch_index * context->input_batch_stride),
diff --git a/src/operators/convolution-nchw.c b/src/operators/convolution-nchw.c
index d959ff4..f7b47bb 100644
--- a/src/operators/convolution-nchw.c
+++ b/src/operators/convolution-nchw.c
@@ -141,7 +141,7 @@
   status = xnn_status_unsupported_parameter;
 
   enum xnn_ukernel_type ukernel_type;
-  struct spchw_dwconv_parameters* dwconv_parameters = NULL;
+  struct dwconv_chw_parameters* dwconv_parameters = NULL;
   // Supported cases:
   // + 1x1 convolution (no groups)
   // + 3x3 stride-2 with 3 input channels and NHWC input layout
@@ -158,33 +158,33 @@
     ukernel_type = xnn_ukernel_type_spmm;
   } else if (is_3x3 && subsampling_height == 2 && subsampling_width == 2 &&
     input_padding_top == 1 && input_padding_left == 1 && input_padding_bottom == 1 && input_padding_right == 1 &&
-    nhwc_input && groups == 1 && xnn_params.f32.hwc2spchw_dconv3x3c3s2.ukernel_with_symm_padding != NULL)
+    nhwc_input && groups == 1 && xnn_params.f32.conv_hwc2chw_3x3c3s2.ukernel_with_symm_padding != NULL)
   {
-    ukernel_type = xnn_ukernel_type_dconv2d_hwc2spchw;
+    ukernel_type = xnn_ukernel_type_conv2d_hwc2chw;
   } else if (is_3x3 && subsampling_height == 1 && subsampling_width == 1 &&
     input_padding_top == 1 && input_padding_left == 1 && input_padding_bottom == 1 && input_padding_right == 1 &&
-    !nhwc_input && group_input_channels == 1 && group_output_channels == 1 && xnn_params.f32.spchw_dwconv3x3.ukernel != NULL)
+    !nhwc_input && group_input_channels == 1 && group_output_channels == 1 && xnn_params.f32.dwconv_chw_3x3.ukernel != NULL)
   {
     ukernel_type = xnn_ukernel_type_dwconv;
-    dwconv_parameters = &xnn_params.f32.spchw_dwconv3x3;
+    dwconv_parameters = &xnn_params.f32.dwconv_chw_3x3;
   } else if (is_3x3 && subsampling_height == 2 && subsampling_width == 2 &&
     (input_padding_top == 0 || input_padding_top == 1) && input_padding_left == 1 && input_padding_bottom == 1 && input_padding_right == 1 &&
-    !nhwc_input && group_input_channels == 1 && group_output_channels == 1 && xnn_params.f32.spchw_dwconv3x3s2.ukernel != NULL)
+    !nhwc_input && group_input_channels == 1 && group_output_channels == 1 && xnn_params.f32.dwconv_chw_3x3s2.ukernel != NULL)
   {
     ukernel_type = xnn_ukernel_type_dwconv;
-    dwconv_parameters = &xnn_params.f32.spchw_dwconv3x3s2;
+    dwconv_parameters = &xnn_params.f32.dwconv_chw_3x3s2;
   } else if (is_5x5 && subsampling_height == 1 && subsampling_width == 1 &&
     input_padding_top == 2 && input_padding_left == 2 && input_padding_bottom == 2 && input_padding_right == 2 &&
-    !nhwc_input && group_input_channels == 1 && group_output_channels == 1 && xnn_params.f32.spchw_dwconv5x5.ukernel != NULL)
+    !nhwc_input && group_input_channels == 1 && group_output_channels == 1 && xnn_params.f32.dwconv_chw_5x5.ukernel != NULL)
   {
     ukernel_type = xnn_ukernel_type_dwconv;
-    dwconv_parameters = &xnn_params.f32.spchw_dwconv5x5;
+    dwconv_parameters = &xnn_params.f32.dwconv_chw_5x5;
   } else if (is_5x5 && subsampling_height == 2 && subsampling_width == 2 &&
     (input_padding_top == 1 || input_padding_top == 2) && input_padding_left == 2 && input_padding_bottom == 2 && input_padding_right == 2 &&
-    !nhwc_input && group_input_channels == 1 && group_output_channels == 1 && xnn_params.f32.spchw_dwconv5x5s2.ukernel != NULL)
+    !nhwc_input && group_input_channels == 1 && group_output_channels == 1 && xnn_params.f32.dwconv_chw_5x5s2.ukernel != NULL)
   {
     ukernel_type = xnn_ukernel_type_dwconv;
-    dwconv_parameters = &xnn_params.f32.spchw_dwconv5x5s2;
+    dwconv_parameters = &xnn_params.f32.dwconv_chw_5x5s2;
   } else {
     xnn_log_error(
       "failed to create Convolution operator: only selected Convolution parameters are supported");
@@ -376,12 +376,12 @@
 
       break;
     }
-    case xnn_ukernel_type_dconv2d_hwc2spchw:
+    case xnn_ukernel_type_conv2d_hwc2chw:
     {
       assert(groups == 1);
 
       const size_t packed_group_output_channels =
-        round_up(group_output_channels, xnn_params.f32.hwc2spchw_dconv3x3c3s2.output_channel_tile);
+        round_up(group_output_channels, xnn_params.f32.conv_hwc2chw_3x3c3s2.output_channel_tile);
       const size_t packed_weights_size = groups * packed_group_output_channels *
         (group_input_channels * kernel_height * kernel_width + 1 /* bias */) * sizeof(float);
       convolution_op->packed_weights = xnn_allocate_simd_memory(packed_weights_size);
@@ -393,14 +393,14 @@
       xnn_pack_f32_dconv_oki_w(
         group_output_channels,
         group_input_channels,
-        xnn_params.f32.hwc2spchw_dconv3x3c3s2.output_channel_tile,
+        xnn_params.f32.conv_hwc2chw_3x3c3s2.output_channel_tile,
         kernel_height, kernel_width,
         kernel, bias, convolution_op->packed_weights);
 
-      convolution_op->ukernel.dconv2d = (struct xnn_ukernel_dconv2d) {
-        .hwc2spchw_function = xnn_params.f32.hwc2spchw_dconv3x3c3s2.ukernel_with_symm_padding,
-        .output_height_tile = xnn_params.f32.hwc2spchw_dconv3x3c3s2.output_height_tile,
-        .output_channel_tile = xnn_params.f32.hwc2spchw_dconv3x3c3s2.output_channel_tile,
+      convolution_op->ukernel.conv2d = (struct xnn_ukernel_conv2d) {
+        .hwc2chw_function = xnn_params.f32.conv_hwc2chw_3x3c3s2.ukernel_with_symm_padding,
+        .output_height_tile = xnn_params.f32.conv_hwc2chw_3x3c3s2.output_height_tile,
+        .output_channel_tile = xnn_params.f32.conv_hwc2chw_3x3c3s2.output_channel_tile,
       };
 
       break;
@@ -418,12 +418,12 @@
         goto error;
       }
 
-      xnn_pack_f32_spchw_dwconv_ghw_w(
+      xnn_pack_f32_chw_dwconv_ghw_w(
         kernel_height * kernel_width, groups,
         kernel, bias, convolution_op->packed_weights);
 
       convolution_op->ukernel.dwconv2d = (struct xnn_ukernel_dwconv2d) {
-        .spchw_function = dwconv_parameters->ukernel,
+        .chw_function = dwconv_parameters->ukernel,
         .input_width_tile = dwconv_parameters->input_width_tile,
         .output_width_tile = dwconv_parameters->output_width_tile,
       };
@@ -450,7 +450,7 @@
   convolution_op->group_output_channels = group_output_channels;
 
   if (ukernel_type == xnn_ukernel_type_dwconv) {
-    convolution_op->f32_spchw_params = xnn_init_f32_spchw_params(0, output_min, output_max);
+    convolution_op->f32_chw_params = xnn_init_f32_chw_params(0, output_min, output_max);
   } else {
     convolution_op->f32_minmax_params = xnn_init_f32_minmax_params(output_min, output_max);
   }
@@ -482,7 +482,7 @@
   uint32_t bias_element_size,
   uint32_t log2_output_element_size,
   const void* params,
-  const void* spchw_params,
+  const void* chw_params,
   size_t num_threads)
 {
   convolution_op->state = xnn_run_state_invalid;
@@ -601,7 +601,7 @@
 
       return xnn_status_success;
     }
-    case xnn_ukernel_type_dconv2d_hwc2spchw:
+    case xnn_ukernel_type_conv2d_hwc2chw:
     {
       const size_t zero_size = (input_width * convolution_op->group_input_channels << log2_input_element_size) + XNN_EXTRA_BYTES;
       void* zero_buffer = xnn_reallocate_memory(convolution_op->zero_buffer, zero_size);
@@ -612,7 +612,7 @@
       memset(zero_buffer, 0, zero_size);
       convolution_op->zero_buffer = zero_buffer;
 
-      convolution_op->context.dconv2d = (struct dconv2d_context) {
+      convolution_op->context.conv2d = (struct conv2d_context) {
         .input_height = input_height,
         .input_width = input_width,
         .input = input,
@@ -625,12 +625,12 @@
         .output_channels = convolution_op->group_output_channels,
         .output_height_stride = output_width << log2_output_element_size,
         .output_channel_stride = output_height * output_width << log2_output_element_size,
-        .hwc2spchw_ukernel = convolution_op->ukernel.dconv2d.hwc2spchw_function,
+        .hwc2chw_ukernel = convolution_op->ukernel.conv2d.hwc2chw_function,
       };
-      memcpy(&convolution_op->context.dconv2d.params, params, sizeof(convolution_op->context.dconv2d.params));
+      memcpy(&convolution_op->context.conv2d.params, params, sizeof(convolution_op->context.conv2d.params));
 
       size_t output_height_slice = output_height;
-      const size_t output_height_tile = convolution_op->ukernel.dconv2d.output_height_tile;
+      const size_t output_height_tile = convolution_op->ukernel.conv2d.output_height_tile;
       if (num_threads > 1) {
         const size_t target_tiles_per_thread = 5;
         const size_t max_output_height_slice = divide_round_up(output_height, num_threads * target_tiles_per_thread);
@@ -640,7 +640,7 @@
         }
       }
       convolution_op->compute.type = xnn_parallelization_type_2d_tile_1d;
-      convolution_op->compute.task_2d_tile_1d = (pthreadpool_task_2d_tile_1d_t) xnn_compute_dconv2d_hwc2spchw;
+      convolution_op->compute.task_2d_tile_1d = (pthreadpool_task_2d_tile_1d_t) xnn_compute_conv2d_hwc2chw;
       convolution_op->compute.range[0] = batch_size;
       convolution_op->compute.range[1] = output_height;
       convolution_op->compute.tile[0] = output_height_slice;
@@ -659,7 +659,7 @@
       memset(zero_buffer, 0, zero_size);
       convolution_op->zero_buffer = zero_buffer;
 
-      xnn_update_f32_spchw_params((union xnn_f32_spchw_params*) spchw_params, (uint32_t) input_width);
+      xnn_update_f32_chw_params((union xnn_f32_chw_params*) chw_params, (uint32_t) input_width);
       convolution_op->context.dwconv2d = (struct dwconv2d_context) {
         .input_height = input_height,
         .input_width = input_width,
@@ -678,12 +678,12 @@
         .output_tuple_stride = convolution_op->ukernel.dwconv2d.output_width_tile << log2_output_element_size,
         .input_pixel_stride = input_width << log2_input_element_size,
         .output_pixel_stride = output_width << log2_output_element_size,
-        .spchw_ukernel = convolution_op->ukernel.dwconv2d.spchw_function,
+        .chw_ukernel = convolution_op->ukernel.dwconv2d.chw_function,
       };
-      memcpy(&convolution_op->context.dwconv2d.params, spchw_params, sizeof(convolution_op->context.dwconv2d.params));
+      memcpy(&convolution_op->context.dwconv2d.params, chw_params, sizeof(convolution_op->context.dwconv2d.params));
 
       convolution_op->compute.type = xnn_parallelization_type_2d;
-      convolution_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_dwconv2d_spchw;
+      convolution_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_dwconv2d_chw;
       convolution_op->compute.range[0] = batch_size;
       convolution_op->compute.range[1] = groups;
       convolution_op->state = xnn_run_state_ready;
@@ -721,6 +721,6 @@
     sizeof(float) /* sizeof(bias element) */,
     2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
     &convolution_op->f32_minmax_params,
-    &convolution_op->f32_spchw_params,
+    &convolution_op->f32_chw_params,
     pthreadpool_get_threads_count(threadpool));
 }
diff --git a/src/operators/global-average-pooling-ncw.c b/src/operators/global-average-pooling-ncw.c
index 2f6db07..a18f4ac 100644
--- a/src/operators/global-average-pooling-ncw.c
+++ b/src/operators/global-average-pooling-ncw.c
@@ -62,7 +62,7 @@
   }
 
   status = xnn_status_unsupported_parameter;
-  if (xnn_params.f32.spchw_gavgpool.ukernel == NULL) {
+  if (xnn_params.f32.gavgpool_cw.ukernel == NULL) {
     xnn_log_error(
       "failed to create Global Average Pooling operator: "
       "only selected configurations parameters are supported");
@@ -133,7 +133,7 @@
     .output = output,
     .output_channel_stride = sizeof(float),
     .output_batch_stride = global_average_pooling_op->channels * sizeof(float),
-    .ukernel = xnn_params.f32.spchw_gavgpool.ukernel,
+    .ukernel = xnn_params.f32.gavgpool_cw.ukernel,
     .params.f32 = global_average_pooling_op->f32_gavgpool_params,
   };
 
@@ -142,7 +142,7 @@
     (pthreadpool_task_2d_tile_1d_t) xnn_compute_global_average_pooling_ncw;
   global_average_pooling_op->compute.range[0] = batch_size;
   global_average_pooling_op->compute.range[1] = global_average_pooling_op->channels;
-  global_average_pooling_op->compute.tile[0] = global_average_pooling_op->channels; //xnn_params.f32.spchw_gavgpool.channel_tile;
+  global_average_pooling_op->compute.tile[0] = global_average_pooling_op->channels; //xnn_params.f32.gavgpool_cw.channel_tile;
 
   global_average_pooling_op->state = xnn_run_state_ready;
 
diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h
index 540fd29..a5d53a2 100644
--- a/src/xnnpack/compute.h
+++ b/src/xnnpack/compute.h
@@ -301,7 +301,7 @@
       size_t nr_block_size);
 #endif
 
-struct dconv2d_context {
+struct conv2d_context {
   size_t input_height;
   size_t input_width;
   const void* input;
@@ -315,7 +315,7 @@
   size_t output_height_stride;
   size_t output_channel_stride;
   union {
-    xnn_conv_hwc2spchw_ukernel_function hwc2spchw_ukernel;
+    xnn_conv_hwc2chw_ukernel_function hwc2chw_ukernel;
   };
   union {
     union xnn_f32_minmax_params f32;
@@ -323,8 +323,8 @@
 };
 
 #ifndef __cplusplus
-  XNN_PRIVATE void xnn_compute_dconv2d_hwc2spchw(
-      const struct dconv2d_context context[restrict XNN_MIN_ELEMENTS(1)],
+  XNN_PRIVATE void xnn_compute_conv2d_hwc2chw(
+      const struct conv2d_context context[restrict XNN_MIN_ELEMENTS(1)],
       size_t batch_index,
       size_t output_y_start,
       size_t output_y_slice);
@@ -373,15 +373,15 @@
   size_t input_pixel_stride;
   size_t output_pixel_stride;
   union {
-    union xnn_f32_spchw_params f32;
+    union xnn_f32_chw_params f32;
   } params;
   union {
-    xnn_dwconv_spchw_ukernel_function spchw_ukernel;
+    xnn_dwconv_chw_ukernel_function chw_ukernel;
   };
 };
 
 #ifndef __cplusplus
-  XNN_PRIVATE void xnn_compute_dwconv2d_spchw(
+  XNN_PRIVATE void xnn_compute_dwconv2d_chw(
       const struct dwconv2d_context context[restrict XNN_MIN_ELEMENTS(1)],
       size_t batch_index,
       size_t channel);
@@ -585,7 +585,7 @@
   void* output;
   size_t output_channel_stride;
   size_t output_batch_stride;
-  xnn_gavgpool_spchw_ukernel_function ukernel;
+  xnn_gavgpool_cw_ukernel_function ukernel;
   union {
     union xnn_f32_gavgpool_params f32;
   } params;
diff --git a/src/xnnpack/conv.h b/src/xnnpack/conv.h
index 892e12c..43b0cf9 100644
--- a/src/xnnpack/conv.h
+++ b/src/xnnpack/conv.h
@@ -43,7 +43,7 @@
 DECLARE_F32_CONV_HWC_UKERNEL_FUNCTION(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1)
 
 
-#define DECLARE_F32_CONV_HWC2SPCHW_UKERNEL_FUNCTION(fn_name) \
+#define DECLARE_F32_CONV_HWC2CHW_UKERNEL_FUNCTION(fn_name) \
   XNN_INTERNAL void fn_name(                                 \
       size_t input_height,                                   \
       size_t input_width,                                    \
@@ -59,8 +59,8 @@
       size_t output_channel_stride,                          \
       const union xnn_f32_minmax_params* params);
 
-DECLARE_F32_CONV_HWC2SPCHW_UKERNEL_FUNCTION(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2)
-DECLARE_F32_CONV_HWC2SPCHW_UKERNEL_FUNCTION(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1)
+DECLARE_F32_CONV_HWC2CHW_UKERNEL_FUNCTION(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2)
+DECLARE_F32_CONV_HWC2CHW_UKERNEL_FUNCTION(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1)
 
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/dwconv.h b/src/xnnpack/dwconv.h
index 0ec0e30..fef788b 100644
--- a/src/xnnpack/dwconv.h
+++ b/src/xnnpack/dwconv.h
@@ -206,31 +206,31 @@
 DECLARE_Q8_DWCONV_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_q8_dwconv_minmax_ukernel_up8x9__sse2)
 
 
-#define DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(fn_name) \
-  XNN_INTERNAL void fn_name(                               \
-    size_t input_height,                                   \
-    size_t input_width,                                    \
-    const float* input,                                    \
-    const float* weights,                                  \
-    const float* zero,                                     \
-    float* output,                                         \
-    uint32_t padding_top,                                  \
-    size_t input_tuple_stride,                             \
-    size_t output_tuple_stride,                            \
-    size_t input_height_stride,                            \
-    size_t output_height_stride,                           \
-    const union xnn_f32_spchw_params* params);
+#define DECLARE_F32_DWCONV_CHW_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                             \
+    size_t input_height,                                 \
+    size_t input_width,                                  \
+    const float* input,                                  \
+    const float* weights,                                \
+    const float* zero,                                   \
+    float* output,                                       \
+    uint32_t padding_top,                                \
+    size_t input_tuple_stride,                           \
+    size_t output_tuple_stride,                          \
+    size_t input_height_stride,                          \
+    size_t output_height_stride,                         \
+    const union xnn_f32_chw_params* params);
 
-DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_3x3p1__scalar)
-DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_5x5p2__scalar)
-DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar)
-DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar)
-DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma)
-DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma)
-DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_3x3p1__sse)
-DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma)
-DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma)
-DECLARE_F32_DWCONV_SPCHW_UKERNEL_FUNCTION(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse)
+DECLARE_F32_DWCONV_CHW_UKERNEL_FUNCTION(xnn_f32_dwconv_chw_ukernel_3x3p1__scalar)
+DECLARE_F32_DWCONV_CHW_UKERNEL_FUNCTION(xnn_f32_dwconv_chw_ukernel_5x5p2__scalar)
+DECLARE_F32_DWCONV_CHW_UKERNEL_FUNCTION(xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar)
+DECLARE_F32_DWCONV_CHW_UKERNEL_FUNCTION(xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar)
+DECLARE_F32_DWCONV_CHW_UKERNEL_FUNCTION(xnn_f32_dwconv_chw_ukernel_3x3p1__neonfma)
+DECLARE_F32_DWCONV_CHW_UKERNEL_FUNCTION(xnn_f32_dwconv_chw_ukernel_5x5p2__neonfma)
+DECLARE_F32_DWCONV_CHW_UKERNEL_FUNCTION(xnn_f32_dwconv_chw_ukernel_3x3p1__sse)
+DECLARE_F32_DWCONV_CHW_UKERNEL_FUNCTION(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma)
+DECLARE_F32_DWCONV_CHW_UKERNEL_FUNCTION(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma)
+DECLARE_F32_DWCONV_CHW_UKERNEL_FUNCTION(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse)
 
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/gavgpool.h b/src/xnnpack/gavgpool.h
index 41bda93..7a904b6 100644
--- a/src/xnnpack/gavgpool.h
+++ b/src/xnnpack/gavgpool.h
@@ -84,17 +84,17 @@
 DECLARE_Q8_GAVGPOOL_MINMAX_UNIPASS_UKERNEL_FUNCTION(xnn_q8_gavgpool_minmax_ukernel_7x__scalar_c1)
 
 
-#define DECLARE_F32_GAVGPOOL_SPCHW_UKERNEL_FUNCTION(fn_name) \
-  XNN_INTERNAL void fn_name(                                 \
-      size_t elements,                                       \
-      size_t channels,                                       \
-      const float* input,                                    \
-      float* output,                                         \
+#define DECLARE_F32_GAVGPOOL_CW_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                               \
+      size_t elements,                                     \
+      size_t channels,                                     \
+      const float* input,                                  \
+      float* output,                                       \
       const union xnn_f32_gavgpool_params* params);
 
-DECLARE_F32_GAVGPOOL_SPCHW_UKERNEL_FUNCTION(xnn_f32_gavgpool_spchw_ukernel__neon_x4)
-DECLARE_F32_GAVGPOOL_SPCHW_UKERNEL_FUNCTION(xnn_f32_gavgpool_spchw_ukernel__sse_x4)
-DECLARE_F32_GAVGPOOL_SPCHW_UKERNEL_FUNCTION(xnn_f32_gavgpool_spchw_ukernel__scalar_x1)
+DECLARE_F32_GAVGPOOL_CW_UKERNEL_FUNCTION(xnn_f32_gavgpool_cw_ukernel__neon_x4)
+DECLARE_F32_GAVGPOOL_CW_UKERNEL_FUNCTION(xnn_f32_gavgpool_cw_ukernel__sse_x4)
+DECLARE_F32_GAVGPOOL_CW_UKERNEL_FUNCTION(xnn_f32_gavgpool_cw_ukernel__scalar_x1)
 
 
 #ifdef __cplusplus
diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h
index c9002ea..ded7c73 100644
--- a/src/xnnpack/operator.h
+++ b/src/xnnpack/operator.h
@@ -25,7 +25,7 @@
   xnn_ukernel_type_binary_elementwise,
   xnn_ukernel_type_channel_shuffle,
   xnn_ukernel_type_clamp,
-  xnn_ukernel_type_dconv2d_hwc2spchw,
+  xnn_ukernel_type_conv2d_hwc2chw,
   xnn_ukernel_type_dwconv,
   xnn_ukernel_type_gemm,
   xnn_ukernel_type_global_average_pooling,
@@ -85,9 +85,9 @@
   xnn_operator_type_unpooling_nhwc_x32,
 };
 
-struct xnn_ukernel_dconv2d {
+struct xnn_ukernel_conv2d {
   union {
-    xnn_conv_hwc2spchw_ukernel_function hwc2spchw_function;
+    xnn_conv_hwc2chw_ukernel_function hwc2chw_function;
     xnn_conv_hwc_ukernel_function hwc_function;
   };
   uint8_t output_height_tile;
@@ -106,7 +106,7 @@
 // Direct 2D Depthwise Convolution
 struct xnn_ukernel_dwconv2d {
   union {
-    xnn_dwconv_spchw_ukernel_function spchw_function;
+    xnn_dwconv_chw_ukernel_function chw_function;
   };
   uint8_t input_width_tile;
   uint8_t output_width_tile;
@@ -142,7 +142,7 @@
 struct xnn_ukernel {
   enum xnn_ukernel_type type;
   union {
-    struct xnn_ukernel_dconv2d dconv2d;
+    struct xnn_ukernel_conv2d conv2d;
     struct xnn_ukernel_dwconv dwconv;
     struct xnn_ukernel_dwconv2d dwconv2d;
     struct xnn_ukernel_gemm gemm;
@@ -249,7 +249,7 @@
       union xnn_f32_scaleminmax_params f32_scaleminmax_params;
       union xnn_f32_minmax_params f32_minmax_params;
     };
-    union xnn_f32_spchw_params f32_spchw_params;
+    union xnn_f32_chw_params f32_chw_params;
     union xnn_q8_add_params q8_add_params;
     union xnn_q8_gemm_params q8_gemm_params;
     // Average Pooling normally use q8_avgpool_params, but also initialize q8_gavgpool_params in case it needs to switch
@@ -272,7 +272,7 @@
     struct average_pooling_context average_pooling;
     struct channel_pad_context channel_pad;
     struct channel_shuffle_context channel_shuffle;
-    struct dconv2d_context dconv2d;
+    struct conv2d_context conv2d;
     struct dwconv2d_context dwconv2d;
     struct dwconv_context dwconv;
     struct elementwise_binary_context elementwise_binary;
diff --git a/src/xnnpack/pack.h b/src/xnnpack/pack.h
index 64d17b3..0535435 100644
--- a/src/xnnpack/pack.h
+++ b/src/xnnpack/pack.h
@@ -778,7 +778,7 @@
   }
 }
 
-static inline void xnn_pack_f16_spchw_dwconv_ghw_w(
+static inline void xnn_pack_f16_chw_dwconv_ghw_w(
   size_t kernel_size,
   size_t groups,
   const uint16_t* kernel,
@@ -1205,7 +1205,7 @@
   }
 }
 
-static inline void xnn_pack_f32_spchw_dwconv_ghw_w(
+static inline void xnn_pack_f32_chw_dwconv_ghw_w(
   size_t kernel_size,
   size_t groups,
   const float* kernel,
diff --git a/src/xnnpack/params-init.h b/src/xnnpack/params-init.h
index 50ab491..0916e93 100644
--- a/src/xnnpack/params-init.h
+++ b/src/xnnpack/params-init.h
@@ -455,12 +455,12 @@
   return params;
 }
 
-static inline union xnn_f32_spchw_params xnn_init_f32_spchw_params(
+static inline union xnn_f32_chw_params xnn_init_f32_chw_params(
   uint32_t width,
   float output_min,
   float output_max)
 {
-  union xnn_f32_spchw_params params;
+  union xnn_f32_chw_params params;
   #if XNN_ARCH_X86 || XNN_ARCH_X86_64
     for (uint32_t i = 0; i < 4; i++) {
       params.sse.min[i] = output_min;
@@ -508,8 +508,8 @@
   return params;
 }
 
-static inline void xnn_update_f32_spchw_params(
-  union xnn_f32_spchw_params* params,
+static inline void xnn_update_f32_chw_params(
+  union xnn_f32_chw_params* params,
   uint32_t width)
 {
   #if XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -547,12 +547,12 @@
   #endif
 }
 
-static inline union xnn_f32_spchw_params xnn_init_scalar_f32_spchw_params(
+static inline union xnn_f32_chw_params xnn_init_scalar_f32_chw_params(
   uint32_t width,
   float output_min,
   float output_max)
 {
-  union xnn_f32_spchw_params params;
+  union xnn_f32_chw_params params;
   params.scalar.min = output_min;
   params.scalar.max = output_max;
   return params;
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 804d7d3..bd0ebe4 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -50,7 +50,7 @@
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 };
 
-union xnn_f32_spchw_params {
+union xnn_f32_chw_params {
   struct {
     float min;
     float max;
@@ -577,7 +577,7 @@
     size_t output_width_stride,
     const union xnn_f32_minmax_params* params);
 
-typedef void (*xnn_conv_hwc2spchw_ukernel_function)(
+typedef void (*xnn_conv_hwc2chw_ukernel_function)(
     size_t input_height,
     size_t input_width,
     size_t output_y_start,
@@ -592,7 +592,7 @@
     size_t output_channel_stride,
     const void* params);
 
-typedef void (*xnn_f32_conv_hwc2spchw_ukernel_function)(
+typedef void (*xnn_f32_conv_hwc2chw_ukernel_function)(
     size_t input_height,
     size_t input_width,
     size_t output_y_start,
@@ -717,7 +717,7 @@
     const uint8_t* t,
     uint8_t* y);
 
-typedef void (*xnn_dwconv_spchw_ukernel_function)(
+typedef void (*xnn_dwconv_chw_ukernel_function)(
     size_t input_height,
     size_t input_width,
     const void* input,
@@ -731,7 +731,7 @@
     size_t output_height_stride,
     const void* params);
 
-typedef void (*xnn_f32_dwconv_spchw_ukernel_function)(
+typedef void (*xnn_f32_dwconv_chw_ukernel_function)(
     size_t input_height,
     size_t input_width,
     const float* input,
@@ -743,7 +743,7 @@
     size_t output_tuple_stride,
     size_t input_height_stride,
     size_t output_height_stride,
-    const union xnn_f32_spchw_params* params);
+    const union xnn_f32_chw_params* params);
 
 typedef void (*xnn_dwconv_unipass_ukernel_function)(
     size_t channels,
@@ -871,14 +871,14 @@
     uint8_t* output,
     const union xnn_q8_avgpool_params* params);
 
-typedef void (*xnn_gavgpool_spchw_ukernel_function)(
+typedef void (*xnn_gavgpool_cw_ukernel_function)(
     size_t elements,
     size_t channels,
     const float* input,
     float* output,
     const void* params);
 
-typedef void (*xnn_f32_gavgpool_spchw_ukernel_function)(
+typedef void (*xnn_f32_gavgpool_cw_ukernel_function)(
     size_t elements,
     size_t channels,
     const float* input,
@@ -1390,8 +1390,8 @@
   uint8_t nr;
 };
 
-struct hwc2spchw_dconv_parameters {
-  xnn_conv_hwc2spchw_ukernel_function ukernel_with_symm_padding;
+struct conv_hwc2chw_parameters {
+  xnn_conv_hwc2chw_ukernel_function ukernel_with_symm_padding;
   // Number of output channels in a tile.
   // This parameter must be passed as is to weight packing function.
   uint8_t output_channel_tile;
@@ -1402,8 +1402,8 @@
   uint8_t output_width_tile;
 };
 
-struct spchw_dwconv_parameters {
-  xnn_dwconv_spchw_ukernel_function ukernel;
+struct dwconv_chw_parameters {
+  xnn_dwconv_chw_ukernel_function ukernel;
   // Number of input width pixels in a tile.
   uint8_t input_width_tile;
   // Number of output width pixels in a tile.
@@ -1413,8 +1413,8 @@
   uint8_t output_height_tile;
 };
 
-struct spchw_gavgpool_parameters {
-  xnn_gavgpool_spchw_ukernel_function ukernel;
+struct gavgpool_cw_parameters {
+  xnn_gavgpool_cw_ukernel_function ukernel;
   // Number of channels in a tile.
   // For best efficiency, micro-kernel must process a multiple of this number of channels in each call.
   uint8_t channel_tile;
@@ -1556,18 +1556,18 @@
     struct spmm_parameters spmm2;
     // Sparse Matrix-Dense Matrix Multiplication (NR=4 block).
     struct spmm_parameters spmm4;
-    // Direct 3x3 stride-2 Convolution with 3 input channels and HWC->SpCHW layout conversion.
-    struct hwc2spchw_dconv_parameters hwc2spchw_dconv3x3c3s2;
-    // Direct 3x3 stride-1 Convolution with padding 1 on left and right in SpCHW layout.
-    struct spchw_dwconv_parameters spchw_dwconv3x3;
-    // Direct 3x3 stride-2 Convolution with padding 1 on left and right in SpCHW layout.
-    struct spchw_dwconv_parameters spchw_dwconv3x3s2;
-    // Direct 5x5 stride-1 Convolution with padding 2 on left and right in SpCHW layout.
-    struct spchw_dwconv_parameters spchw_dwconv5x5;
-    // Direct 5x5 stride-2 Convolution with padding 2 on left and right in SpCHW layout.
-    struct spchw_dwconv_parameters spchw_dwconv5x5s2;
-    // Global Average Pooling in SpCHW layout.
-    struct spchw_gavgpool_parameters spchw_gavgpool;
+    // Direct 3x3 stride-2 Convolution with 3 input channels and HWC->CHW layout conversion.
+    struct conv_hwc2chw_parameters conv_hwc2chw_3x3c3s2;
+    // Direct 3x3 stride-1 Convolution with padding 1 on left and right in CHW layout.
+    struct dwconv_chw_parameters dwconv_chw_3x3;
+    // Direct 3x3 stride-2 Convolution with padding 1 on left and right in CHW layout.
+    struct dwconv_chw_parameters dwconv_chw_3x3s2;
+    // Direct 5x5 stride-1 Convolution with padding 2 on left and right in CHW layout.
+    struct dwconv_chw_parameters dwconv_chw_5x5;
+    // Direct 5x5 stride-2 Convolution with padding 2 on left and right in CHW layout.
+    struct dwconv_chw_parameters dwconv_chw_5x5s2;
+    // Global Average Pooling in CW layout.
+    struct gavgpool_cw_parameters gavgpool_cw;
   } f32;
   struct {
     struct pad_parameters pad;
diff --git a/test/conv-hwc2spchw-microkernel-tester.h b/test/conv-hwc2chw-microkernel-tester.h
similarity index 84%
rename from test/conv-hwc2spchw-microkernel-tester.h
rename to test/conv-hwc2chw-microkernel-tester.h
index 5b2e505..c680c35 100644
--- a/test/conv-hwc2spchw-microkernel-tester.h
+++ b/test/conv-hwc2chw-microkernel-tester.h
@@ -24,14 +24,14 @@
 #include <xnnpack.h>
 
 
-class ConvHWC2SpCHWMicrokernelTester {
+class ConvHWC2CHWMicrokernelTester {
 public:
   enum class Variant {
     Native,
     Scalar,
   };
 
-  inline ConvHWC2SpCHWMicrokernelTester& output_channels_tile(uint32_t output_channels_tile) {
+  inline ConvHWC2CHWMicrokernelTester& output_channels_tile(uint32_t output_channels_tile) {
     this->output_channels_tile_ = output_channels_tile;
     return *this;
   }
@@ -40,7 +40,7 @@
     return this->output_channels_tile_;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& padding(uint32_t padding) {
+  inline ConvHWC2CHWMicrokernelTester& padding(uint32_t padding) {
     this->padding_top_ = padding;
     this->padding_right_ = padding;
     this->padding_bottom_ = padding;
@@ -48,19 +48,19 @@
     return *this;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& padding_height(uint32_t padding_height) {
+  inline ConvHWC2CHWMicrokernelTester& padding_height(uint32_t padding_height) {
     this->padding_top_ = padding_height;
     this->padding_bottom_ = padding_height;
     return *this;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& padding_width(uint32_t padding_width) {
+  inline ConvHWC2CHWMicrokernelTester& padding_width(uint32_t padding_width) {
     this->padding_right_ = padding_width;
     this->padding_left_ = padding_width;
     return *this;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& padding_top(uint32_t padding_top) {
+  inline ConvHWC2CHWMicrokernelTester& padding_top(uint32_t padding_top) {
     this->padding_top_ = padding_top;
     return *this;
   }
@@ -69,7 +69,7 @@
     return this->padding_top_;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& padding_right(uint32_t padding_right) {
+  inline ConvHWC2CHWMicrokernelTester& padding_right(uint32_t padding_right) {
     this->padding_right_ = padding_right;
     return *this;
   }
@@ -78,7 +78,7 @@
     return this->padding_right_;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& padding_bottom(uint32_t padding_bottom) {
+  inline ConvHWC2CHWMicrokernelTester& padding_bottom(uint32_t padding_bottom) {
     this->padding_bottom_ = padding_bottom;
     return *this;
   }
@@ -87,7 +87,7 @@
     return this->padding_bottom_;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& padding_left(uint32_t padding_left) {
+  inline ConvHWC2CHWMicrokernelTester& padding_left(uint32_t padding_left) {
     this->padding_left_ = padding_left;
     return *this;
   }
@@ -96,7 +96,7 @@
     return this->padding_left_;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& input_size(uint32_t input_height, uint32_t input_width) {
+  inline ConvHWC2CHWMicrokernelTester& input_size(uint32_t input_height, uint32_t input_width) {
     assert(input_height >= 1);
     assert(input_width >= 1);
     this->input_height_ = input_height;
@@ -104,7 +104,7 @@
     return *this;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& input_height(uint32_t input_height) {
+  inline ConvHWC2CHWMicrokernelTester& input_height(uint32_t input_height) {
     assert(input_height >= 1);
     this->input_height_ = input_height;
     return *this;
@@ -114,7 +114,7 @@
     return this->input_height_;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& input_width(uint32_t input_width) {
+  inline ConvHWC2CHWMicrokernelTester& input_width(uint32_t input_width) {
     assert(input_width >= 1);
     this->input_width_ = input_width;
     return *this;
@@ -124,7 +124,7 @@
     return this->input_width_;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& input_channels(size_t input_channels) {
+  inline ConvHWC2CHWMicrokernelTester& input_channels(size_t input_channels) {
     assert(input_channels >= 1);
     this->input_channels_ = input_channels;
     return *this;
@@ -134,7 +134,7 @@
     return this->input_channels_;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& output_channels(size_t output_channels) {
+  inline ConvHWC2CHWMicrokernelTester& output_channels(size_t output_channels) {
     assert(output_channels >= 1);
     this->output_channels_ = output_channels;
     return *this;
@@ -148,7 +148,7 @@
     return output_channels() % output_channels_tile() == 0 ? output_channels() : output_channels() / output_channels_tile() * output_channels_tile() + output_channels_tile();
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& batch_size(size_t batch_size) {
+  inline ConvHWC2CHWMicrokernelTester& batch_size(size_t batch_size) {
     assert(batch_size >= 1);
     this->batch_size_ = batch_size;
     return *this;
@@ -158,14 +158,14 @@
     return this->batch_size_;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& kernel_size(uint32_t kernel_size) {
+  inline ConvHWC2CHWMicrokernelTester& kernel_size(uint32_t kernel_size) {
     assert(kernel_size >= 1);
     this->kernel_height_ = kernel_size;
     this->kernel_width_ = kernel_size;
     return *this;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& kernel_height(uint32_t kernel_height) {
+  inline ConvHWC2CHWMicrokernelTester& kernel_height(uint32_t kernel_height) {
     assert(kernel_height >= 1);
     this->kernel_height_ = kernel_height;
     return *this;
@@ -175,7 +175,7 @@
     return this->kernel_height_;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& kernel_width(uint32_t kernel_width) {
+  inline ConvHWC2CHWMicrokernelTester& kernel_width(uint32_t kernel_width) {
     assert(kernel_width >= 1);
     this->kernel_width_ = kernel_width;
     return *this;
@@ -185,14 +185,14 @@
     return this->kernel_width_;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& subsampling(uint32_t subsampling) {
+  inline ConvHWC2CHWMicrokernelTester& subsampling(uint32_t subsampling) {
     assert(subsampling >= 1);
     this->subsampling_height_ = subsampling;
     this->subsampling_width_ = subsampling;
     return *this;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& subsampling_height(uint32_t subsampling_height) {
+  inline ConvHWC2CHWMicrokernelTester& subsampling_height(uint32_t subsampling_height) {
     assert(subsampling_height >= 1);
     this->subsampling_height_ = subsampling_height;
     return *this;
@@ -202,7 +202,7 @@
     return this->subsampling_height_;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& subsampling_width(uint32_t subsampling_width) {
+  inline ConvHWC2CHWMicrokernelTester& subsampling_width(uint32_t subsampling_width) {
     assert(subsampling_width >= 1);
     this->subsampling_width_ = subsampling_width;
     return *this;
@@ -212,7 +212,7 @@
     return this->subsampling_width_;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& output_y_start(uint32_t output_y_start) {
+  inline ConvHWC2CHWMicrokernelTester& output_y_start(uint32_t output_y_start) {
     this->output_y_start_ = output_y_start;
     return *this;
   }
@@ -221,7 +221,7 @@
     return this->output_y_start_;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& output_y_end(uint32_t output_y_end) {
+  inline ConvHWC2CHWMicrokernelTester& output_y_end(uint32_t output_y_end) {
     this->output_y_end_ = output_y_end;
     return *this;
   }
@@ -260,7 +260,7 @@
     }
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& qmin(uint8_t qmin) {
+  inline ConvHWC2CHWMicrokernelTester& qmin(uint8_t qmin) {
     this->qmin_ = qmin;
     return *this;
   }
@@ -269,7 +269,7 @@
     return this->qmin_;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& qmax(uint8_t qmax) {
+  inline ConvHWC2CHWMicrokernelTester& qmax(uint8_t qmax) {
     this->qmax_ = qmax;
     return *this;
   }
@@ -278,7 +278,7 @@
     return this->qmax_;
   }
 
-  inline ConvHWC2SpCHWMicrokernelTester& iterations(size_t iterations) {
+  inline ConvHWC2CHWMicrokernelTester& iterations(size_t iterations) {
     this->iterations_ = iterations;
     return *this;
   }
@@ -287,7 +287,7 @@
     return this->iterations_;
   }
 
-  void Test(xnn_f32_conv_hwc2spchw_ukernel_function conv, Variant variant = Variant::Native) const {
+  void Test(xnn_f32_conv_hwc2chw_ukernel_function conv, Variant variant = Variant::Native) const {
     ASSERT_LT(output_y_start(), output_height());
     ASSERT_LE(output_y_end(), output_height());
     ASSERT_GT(output_y_end(), output_y_start());
diff --git a/test/convolution-nchw.cc b/test/convolution-nchw.cc
index fb1fe68..97315be 100644
--- a/test/convolution-nchw.cc
+++ b/test/convolution-nchw.cc
@@ -385,7 +385,7 @@
     .TestNCHWxF32();
 }
 
-/**************************** DConv 3x3c3s2 HWC->SpCHW path, batched ****************************/
+/**************************** DConv 3x3c3s2 HWC->CHW path, batched ****************************/
 
 TEST(CONVOLUTION_NHWC2NCHW_OP_F32, batched_3x3c3s2) {
   ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
diff --git a/test/dwconv-spchw-microkernel-tester.h b/test/dwconv-chw-microkernel-tester.h
similarity index 83%
rename from test/dwconv-spchw-microkernel-tester.h
rename to test/dwconv-chw-microkernel-tester.h
index a5b4354..aea6ddf 100644
--- a/test/dwconv-spchw-microkernel-tester.h
+++ b/test/dwconv-chw-microkernel-tester.h
@@ -27,14 +27,14 @@
 #include <xnnpack/params.h>
 
 
-class DWConvSpCHWMicrokernelTester {
+class DWConvCHWMicrokernelTester {
  public:
   enum class Variant {
     Native,
     Scalar,
   };
 
-  inline DWConvSpCHWMicrokernelTester& input_tuple_size(uint32_t input_tuple_size) {
+  inline DWConvCHWMicrokernelTester& input_tuple_size(uint32_t input_tuple_size) {
     this->input_tuple_size_ = input_tuple_size;
     return *this;
   }
@@ -43,7 +43,7 @@
     return this->input_tuple_size_;
   }
 
-  inline DWConvSpCHWMicrokernelTester& output_tuple_size(uint32_t output_tuple_size) {
+  inline DWConvCHWMicrokernelTester& output_tuple_size(uint32_t output_tuple_size) {
     this->output_tuple_size_ = output_tuple_size;
     return *this;
   }
@@ -52,7 +52,7 @@
     return this->output_tuple_size_;
   }
 
-  inline DWConvSpCHWMicrokernelTester& padding_left(uint32_t padding_left) {
+  inline DWConvCHWMicrokernelTester& padding_left(uint32_t padding_left) {
     this->padding_left_ = padding_left;
     return *this;
   }
@@ -61,7 +61,7 @@
     return this->padding_left_;
   }
 
-  inline DWConvSpCHWMicrokernelTester& padding_right(uint32_t padding_right) {
+  inline DWConvCHWMicrokernelTester& padding_right(uint32_t padding_right) {
     this->padding_right_ = padding_right;
     return *this;
   }
@@ -70,7 +70,7 @@
     return this->padding_right_;
   }
 
-  inline DWConvSpCHWMicrokernelTester& padding_top(uint32_t padding_top) {
+  inline DWConvCHWMicrokernelTester& padding_top(uint32_t padding_top) {
     this->padding_top_ = padding_top;
     return *this;
   }
@@ -80,7 +80,7 @@
   }
 
 
-  inline DWConvSpCHWMicrokernelTester& padding_bottom(uint32_t padding_bottom) {
+  inline DWConvCHWMicrokernelTester& padding_bottom(uint32_t padding_bottom) {
     this->padding_bottom_ = padding_bottom;
     return *this;
   }
@@ -92,7 +92,7 @@
     return (output_height() - 1) * subsampling() + kernel_height() - padding_top() - padding_bottom();
   }
 
-  inline DWConvSpCHWMicrokernelTester& input_width(uint32_t input_width) {
+  inline DWConvCHWMicrokernelTester& input_width(uint32_t input_width) {
     assert(input_width >= 1);
     this->input_width_ = input_width;
     return *this;
@@ -102,7 +102,7 @@
     return this->input_width_;
   }
 
-  inline DWConvSpCHWMicrokernelTester& subsampling(uint32_t subsampling) {
+  inline DWConvCHWMicrokernelTester& subsampling(uint32_t subsampling) {
     assert(subsampling >= 1);
     this->subsampling_ = subsampling;
     return *this;
@@ -112,7 +112,7 @@
     return this->subsampling_;
   }
 
-  inline DWConvSpCHWMicrokernelTester& kernel_height(uint32_t kernel_height) {
+  inline DWConvCHWMicrokernelTester& kernel_height(uint32_t kernel_height) {
     assert(kernel_height != 0);
     this->kernel_height_ = kernel_height;
     return *this;
@@ -122,7 +122,7 @@
     return this->kernel_height_;
   }
 
-  inline DWConvSpCHWMicrokernelTester& kernel_width(uint32_t kernel_width) {
+  inline DWConvCHWMicrokernelTester& kernel_width(uint32_t kernel_width) {
     assert(kernel_width != 0);
     this->kernel_width_ = kernel_width;
     return *this;
@@ -136,7 +136,7 @@
     return kernel_height() * kernel_width();
   }
 
-  inline DWConvSpCHWMicrokernelTester& output_height(uint32_t output_height) {
+  inline DWConvCHWMicrokernelTester& output_height(uint32_t output_height) {
     assert(output_height >= 1);
     this->output_height_ = output_height;
     return *this;
@@ -155,7 +155,7 @@
     }
   }
 
-  inline DWConvSpCHWMicrokernelTester& input_tuple_stride(uint32_t input_tuple_stride) {
+  inline DWConvCHWMicrokernelTester& input_tuple_stride(uint32_t input_tuple_stride) {
     assert(input_tuple_stride != 0);
     this->input_tuple_stride_ = input_tuple_stride;
     return *this;
@@ -169,7 +169,7 @@
     }
   }
 
-  inline DWConvSpCHWMicrokernelTester& output_tuple_stride(uint32_t output_tuple_stride) {
+  inline DWConvCHWMicrokernelTester& output_tuple_stride(uint32_t output_tuple_stride) {
     assert(output_tuple_stride != 0);
     this->output_tuple_stride_ = output_tuple_stride;
     return *this;
@@ -183,7 +183,7 @@
     }
   }
 
-  inline DWConvSpCHWMicrokernelTester& input_width_stride(uint32_t input_width_stride) {
+  inline DWConvCHWMicrokernelTester& input_width_stride(uint32_t input_width_stride) {
     assert(input_width_stride != 0);
     this->input_width_stride_ = input_width_stride;
     return *this;
@@ -197,7 +197,7 @@
     }
   }
 
-  inline DWConvSpCHWMicrokernelTester& output_width_stride(uint32_t output_width_stride) {
+  inline DWConvCHWMicrokernelTester& output_width_stride(uint32_t output_width_stride) {
     assert(output_width_stride != 0);
     this->output_width_stride_ = output_width_stride;
     return *this;
@@ -211,7 +211,7 @@
     }
   }
 
-  inline DWConvSpCHWMicrokernelTester& qmin(uint8_t qmin) {
+  inline DWConvCHWMicrokernelTester& qmin(uint8_t qmin) {
     this->qmin_ = qmin;
     return *this;
   }
@@ -220,7 +220,7 @@
     return this->qmin_;
   }
 
-  inline DWConvSpCHWMicrokernelTester& qmax(uint8_t qmax) {
+  inline DWConvCHWMicrokernelTester& qmax(uint8_t qmax) {
     this->qmax_ = qmax;
     return *this;
   }
@@ -229,7 +229,7 @@
     return this->qmax_;
   }
 
-  inline DWConvSpCHWMicrokernelTester& iterations(size_t iterations) {
+  inline DWConvCHWMicrokernelTester& iterations(size_t iterations) {
     this->iterations_ = iterations;
     return *this;
   }
@@ -238,7 +238,7 @@
     return this->iterations_;
   }
 
-  void Test(xnn_f32_dwconv_spchw_ukernel_function dwconv, Variant variant = Variant::Native) const {
+  void Test(xnn_f32_dwconv_chw_ukernel_function dwconv, Variant variant = Variant::Native) const {
     ASSERT_EQ(0, input_tuple_stride() % input_tuple_size());
     ASSERT_EQ(0, output_tuple_stride() % output_tuple_size());
 
@@ -285,13 +285,13 @@
       const float output_max = accumulated_max - accumulated_range / 255.0f * float(255 - qmax());
 
       // Prepare output parameters.
-      xnn_f32_spchw_params spchw_params = { };
+      xnn_f32_chw_params chw_params = { };
       switch (variant) {
         case Variant::Native:
-          spchw_params = xnn_init_f32_spchw_params(input_width(), output_min, output_max);
+          chw_params = xnn_init_f32_chw_params(input_width(), output_min, output_max);
           break;
         case Variant::Scalar:
-          spchw_params = xnn_init_scalar_f32_spchw_params(input_width(), output_min, output_max);
+          chw_params = xnn_init_scalar_f32_chw_params(input_width(), output_min, output_max);
           break;
       }
 
@@ -307,7 +307,7 @@
         padding_top(),
         input_tuple_stride() * sizeof(float), output_tuple_stride() * sizeof(float),
         input_width_stride() * sizeof(float), output_width_stride() * sizeof(float),
-        &spchw_params);
+        &chw_params);
 
       // Verify results.
       for (size_t y = 0; y < output_height(); y++) {
diff --git a/test/f32-conv-hwc2spchw.cc b/test/f32-conv-hwc2chw.cc
similarity index 69%
rename from test/f32-conv-hwc2spchw.cc
rename to test/f32-conv-hwc2chw.cc
index 7227ac3..5cba25f 100644
--- a/test/f32-conv-hwc2spchw.cc
+++ b/test/f32-conv-hwc2chw.cc
@@ -9,13 +9,13 @@
 #include <xnnpack/isa-checks.h>
 
 #include <xnnpack/conv.h>
-#include "conv-hwc2spchw-microkernel-tester.h"
+#include "conv-hwc2chw-microkernel-tester.h"
 
 
 #if XNN_ARCH_ARM64
-  TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__NEONFMA_2X2, input_width_eq_4) {
+  TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__NEONFMA_2X2, input_width_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    ConvHWC2SpCHWMicrokernelTester()
+    ConvHWC2CHWMicrokernelTester()
       .kernel_size(3)
       .subsampling(2)
       .padding_width(1)
@@ -24,13 +24,13 @@
       .output_channels(4)
       .input_width(4)
       .input_height(3)
-      .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2);
+      .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2);
   }
 
-  TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__NEONFMA_2X2, input_width_div_4) {
+  TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__NEONFMA_2X2, input_width_div_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 8; input_width <= 32; input_width += 12) {
-      ConvHWC2SpCHWMicrokernelTester()
+      ConvHWC2CHWMicrokernelTester()
         .kernel_size(3)
         .subsampling(2)
         .padding_width(1)
@@ -39,14 +39,14 @@
         .output_channels(4)
         .input_width(input_width)
         .input_height(3)
-        .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2);
+        .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2);
     }
   }
 
-  TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__NEONFMA_2X2, input_width_lt_4) {
+  TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__NEONFMA_2X2, input_width_lt_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 4; input_width++) {
-      ConvHWC2SpCHWMicrokernelTester()
+      ConvHWC2CHWMicrokernelTester()
         .kernel_size(3)
         .subsampling(2)
         .padding_width(1)
@@ -55,14 +55,14 @@
         .output_channels(4)
         .input_width(input_width)
         .input_height(3)
-        .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2);
+        .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2);
     }
   }
 
-  TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__NEONFMA_2X2, input_width_gt_4) {
+  TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__NEONFMA_2X2, input_width_gt_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 5; input_width < 8; input_width++) {
-      ConvHWC2SpCHWMicrokernelTester()
+      ConvHWC2CHWMicrokernelTester()
         .kernel_size(3)
         .subsampling(2)
         .padding_width(1)
@@ -71,15 +71,15 @@
         .output_channels(4)
         .input_width(input_width)
         .input_height(3)
-        .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2);
+        .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2);
     }
   }
 
-  TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__NEONFMA_2X2, output_channels_lt_4) {
+  TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__NEONFMA_2X2, output_channels_lt_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t output_channels = 1; output_channels < 4; output_channels++) {
       for (size_t input_width = 1; input_width < 32; input_width += 7) {
-        ConvHWC2SpCHWMicrokernelTester()
+        ConvHWC2CHWMicrokernelTester()
           .kernel_size(3)
           .subsampling(2)
           .padding_width(1)
@@ -88,16 +88,16 @@
           .output_channels(output_channels)
           .input_width(input_width)
           .input_height(3)
-          .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2);
+          .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2);
       }
     }
   }
 
-  TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__NEONFMA_2X2, output_channels_div_4) {
+  TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__NEONFMA_2X2, output_channels_div_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t output_channels = 8; output_channels <= 16; output_channels += 4) {
       for (size_t input_width = 1; input_width < 32; input_width += 7) {
-        ConvHWC2SpCHWMicrokernelTester()
+        ConvHWC2CHWMicrokernelTester()
           .kernel_size(3)
           .subsampling(2)
           .padding_width(1)
@@ -106,16 +106,16 @@
           .output_channels(output_channels)
           .input_width(input_width)
           .input_height(3)
-          .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2);
+          .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2);
       }
     }
   }
 
-  TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__NEONFMA_2X2, output_channels_gt_4) {
+  TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__NEONFMA_2X2, output_channels_gt_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t output_channels = 5; output_channels < 8; output_channels++) {
       for (size_t input_width = 1; input_width < 32; input_width += 7) {
-        ConvHWC2SpCHWMicrokernelTester()
+        ConvHWC2CHWMicrokernelTester()
           .kernel_size(3)
           .subsampling(2)
           .padding_width(1)
@@ -124,17 +124,17 @@
           .output_channels(output_channels)
           .input_width(input_width)
           .input_height(3)
-          .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2);
+          .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2);
       }
     }
   }
 
-  TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__NEONFMA_2X2, input_height_lt_3) {
+  TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__NEONFMA_2X2, input_height_lt_3) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_height = 1; input_height < 3; input_height++) {
       for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
         for (size_t input_width = 1; input_width < 32; input_width += 7) {
-          ConvHWC2SpCHWMicrokernelTester()
+          ConvHWC2CHWMicrokernelTester()
             .kernel_size(3)
             .subsampling(2)
             .padding(1)
@@ -143,18 +143,18 @@
             .output_channels(output_channels)
             .input_width(input_width)
             .input_height(input_height)
-            .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2);
+            .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2);
         }
       }
     }
   }
 
-  TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__NEONFMA_2X2, input_height_gt_3) {
+  TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__NEONFMA_2X2, input_height_gt_3) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_height = 4; input_height <= 9; input_height++) {
       for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
         for (size_t input_width = 1; input_width < 32; input_width += 7) {
-          ConvHWC2SpCHWMicrokernelTester()
+          ConvHWC2CHWMicrokernelTester()
             .kernel_size(3)
             .subsampling(2)
             .padding_width(1)
@@ -163,18 +163,18 @@
             .output_channels(output_channels)
             .input_width(input_width)
             .input_height(input_height)
-            .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2);
+            .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2);
         }
       }
     }
   }
 
-  TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__NEONFMA_2X2, padding_top) {
+  TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__NEONFMA_2X2, padding_top) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t padding_top = 0; padding_top <= 1; padding_top++) {
       for (size_t output_channels = 1; output_channels < 16; output_channels += 7) {
         for (size_t input_width = 1; input_width < 32; input_width += 7) {
-          ConvHWC2SpCHWMicrokernelTester()
+          ConvHWC2CHWMicrokernelTester()
             .kernel_size(3)
             .subsampling(2)
             .padding_width(1)
@@ -184,18 +184,18 @@
             .output_channels(output_channels)
             .input_width(input_width)
             .input_height(9)
-            .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2);
+            .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2);
         }
       }
     }
   }
 
-  TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__NEONFMA_2X2, padding_bottom) {
+  TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__NEONFMA_2X2, padding_bottom) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t padding_bottom = 0; padding_bottom <= 1; padding_bottom++) {
       for (size_t output_channels = 1; output_channels < 16; output_channels += 7) {
         for (size_t input_width = 1; input_width < 32; input_width += 7) {
-          ConvHWC2SpCHWMicrokernelTester()
+          ConvHWC2CHWMicrokernelTester()
             .kernel_size(3)
             .subsampling(2)
             .padding_width(1)
@@ -205,18 +205,18 @@
             .output_channels(output_channels)
             .input_width(input_width)
             .input_height(9)
-            .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2);
+            .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2);
         }
       }
     }
   }
 
-  TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__NEONFMA_2X2, output_y_start) {
+  TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__NEONFMA_2X2, output_y_start) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t output_y_start = 1; output_y_start <= 3; output_y_start++) {
       for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
         for (size_t input_width = 1; input_width < 32; input_width += 7) {
-          ConvHWC2SpCHWMicrokernelTester()
+          ConvHWC2CHWMicrokernelTester()
             .kernel_size(3)
             .subsampling(2)
             .padding_width(1)
@@ -226,18 +226,18 @@
             .input_width(input_width)
             .input_height(9)
             .output_y_start(output_y_start)
-            .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2);
+            .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2);
         }
       }
     }
   }
 
-  TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__NEONFMA_2X2, output_y_end) {
+  TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__NEONFMA_2X2, output_y_end) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t output_y_end = 2; output_y_end < 5; output_y_end++) {
       for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
         for (size_t input_width = 1; input_width < 32; input_width += 7) {
-          ConvHWC2SpCHWMicrokernelTester()
+          ConvHWC2CHWMicrokernelTester()
             .kernel_size(3)
             .subsampling(2)
             .padding_width(1)
@@ -247,17 +247,17 @@
             .input_width(input_width)
             .input_height(9)
             .output_y_end(output_y_end)
-            .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2);
+            .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2);
         }
       }
     }
   }
 
-  TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__NEONFMA_2X2, qmin) {
+  TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__NEONFMA_2X2, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
       for (size_t input_width = 1; input_width < 32; input_width += 7) {
-        ConvHWC2SpCHWMicrokernelTester()
+        ConvHWC2CHWMicrokernelTester()
           .kernel_size(3)
           .subsampling(2)
           .padding_width(1)
@@ -267,16 +267,16 @@
           .input_width(input_width)
           .input_height(6)
           .qmin(128)
-          .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2);
+          .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2);
       }
     }
   }
 
-  TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__NEONFMA_2X2, qmax) {
+  TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__NEONFMA_2X2, qmax) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
       for (size_t input_width = 1; input_width < 32; input_width += 7) {
-        ConvHWC2SpCHWMicrokernelTester()
+        ConvHWC2CHWMicrokernelTester()
           .kernel_size(3)
           .subsampling(2)
           .padding_width(1)
@@ -286,14 +286,14 @@
           .input_width(input_width)
           .input_height(6)
           .qmax(128)
-          .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2);
+          .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2);
       }
     }
   }
 #endif  // XNN_ARCH_ARM64
 
-TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__SCALAR_1X1, input_width_eq_1) {
-  ConvHWC2SpCHWMicrokernelTester()
+TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__SCALAR_1X1, input_width_eq_1) {
+  ConvHWC2CHWMicrokernelTester()
     .kernel_size(3)
     .subsampling(2)
     .padding_width(1)
@@ -302,13 +302,13 @@
     .output_channels(4)
     .input_width(4)
     .input_height(3)
-    .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2SpCHWMicrokernelTester::Variant::Scalar);
+    .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2CHWMicrokernelTester::Variant::Scalar);
 }
 
 
-TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__SCALAR_1X1, input_width_gt_1) {
+TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__SCALAR_1X1, input_width_gt_1) {
   for (size_t input_width = 2; input_width < 33; input_width++) {
-    ConvHWC2SpCHWMicrokernelTester()
+    ConvHWC2CHWMicrokernelTester()
       .kernel_size(3)
       .subsampling(2)
       .padding_width(1)
@@ -317,14 +317,14 @@
       .output_channels(4)
       .input_width(input_width)
       .input_height(3)
-      .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2SpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2CHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__SCALAR_1X1, output_channels_lt_4) {
+TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__SCALAR_1X1, output_channels_lt_4) {
   for (size_t output_channels = 1; output_channels < 4; output_channels++) {
     for (size_t input_width = 1; input_width < 32; input_width += 7) {
-      ConvHWC2SpCHWMicrokernelTester()
+      ConvHWC2CHWMicrokernelTester()
         .kernel_size(3)
         .subsampling(2)
         .padding_width(1)
@@ -333,15 +333,15 @@
         .output_channels(output_channels)
         .input_width(input_width)
         .input_height(3)
-        .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2SpCHWMicrokernelTester::Variant::Scalar);
+        .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2CHWMicrokernelTester::Variant::Scalar);
     }
   }
 }
 
-TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__SCALAR_1X1, output_channels_div_4) {
+TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__SCALAR_1X1, output_channels_div_4) {
   for (size_t output_channels = 8; output_channels <= 16; output_channels += 4) {
     for (size_t input_width = 1; input_width < 32; input_width += 7) {
-      ConvHWC2SpCHWMicrokernelTester()
+      ConvHWC2CHWMicrokernelTester()
         .kernel_size(3)
         .subsampling(2)
         .padding_width(1)
@@ -350,15 +350,15 @@
         .output_channels(output_channels)
         .input_width(input_width)
         .input_height(3)
-        .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2SpCHWMicrokernelTester::Variant::Scalar);
+        .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2CHWMicrokernelTester::Variant::Scalar);
     }
   }
 }
 
-TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__SCALAR_1X1, output_channels_gt_4) {
+TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__SCALAR_1X1, output_channels_gt_4) {
   for (size_t output_channels = 5; output_channels < 8; output_channels++) {
     for (size_t input_width = 1; input_width < 32; input_width += 7) {
-      ConvHWC2SpCHWMicrokernelTester()
+      ConvHWC2CHWMicrokernelTester()
         .kernel_size(3)
         .subsampling(2)
         .padding_width(1)
@@ -367,16 +367,16 @@
         .output_channels(output_channels)
         .input_width(input_width)
         .input_height(3)
-        .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2SpCHWMicrokernelTester::Variant::Scalar);
+        .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2CHWMicrokernelTester::Variant::Scalar);
     }
   }
 }
 
-TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__SCALAR_1X1, input_height_lt_3) {
+TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__SCALAR_1X1, input_height_lt_3) {
   for (size_t input_height = 1; input_height < 3; input_height++) {
     for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
       for (size_t input_width = 1; input_width < 32; input_width += 7) {
-        ConvHWC2SpCHWMicrokernelTester()
+        ConvHWC2CHWMicrokernelTester()
           .kernel_size(3)
           .subsampling(2)
           .padding(1)
@@ -385,17 +385,17 @@
           .output_channels(output_channels)
           .input_width(input_width)
           .input_height(input_height)
-          .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2SpCHWMicrokernelTester::Variant::Scalar);
+          .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2CHWMicrokernelTester::Variant::Scalar);
       }
     }
   }
 }
 
-TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__SCALAR_1X1, input_height_gt_3) {
+TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__SCALAR_1X1, input_height_gt_3) {
   for (size_t input_height = 4; input_height <= 9; input_height++) {
     for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
       for (size_t input_width = 1; input_width < 32; input_width += 7) {
-        ConvHWC2SpCHWMicrokernelTester()
+        ConvHWC2CHWMicrokernelTester()
           .kernel_size(3)
           .subsampling(2)
           .padding_width(1)
@@ -404,17 +404,17 @@
           .output_channels(output_channels)
           .input_width(input_width)
           .input_height(input_height)
-          .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2SpCHWMicrokernelTester::Variant::Scalar);
+          .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2CHWMicrokernelTester::Variant::Scalar);
       }
     }
   }
 }
 
-TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__SCALAR_1X1, padding_top) {
+TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__SCALAR_1X1, padding_top) {
   for (size_t padding_top = 0; padding_top <= 1; padding_top++) {
     for (size_t output_channels = 1; output_channels < 16; output_channels += 7) {
       for (size_t input_width = 1; input_width < 32; input_width += 7) {
-        ConvHWC2SpCHWMicrokernelTester()
+        ConvHWC2CHWMicrokernelTester()
           .kernel_size(3)
           .subsampling(2)
           .padding_width(1)
@@ -424,17 +424,17 @@
           .output_channels(output_channels)
           .input_width(input_width)
           .input_height(9)
-          .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2SpCHWMicrokernelTester::Variant::Scalar);
+          .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2CHWMicrokernelTester::Variant::Scalar);
       }
     }
   }
 }
 
-TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__SCALAR_1X1, padding_bottom) {
+TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__SCALAR_1X1, padding_bottom) {
   for (size_t padding_bottom = 0; padding_bottom <= 1; padding_bottom++) {
     for (size_t output_channels = 1; output_channels < 16; output_channels += 7) {
       for (size_t input_width = 1; input_width < 32; input_width += 7) {
-        ConvHWC2SpCHWMicrokernelTester()
+        ConvHWC2CHWMicrokernelTester()
           .kernel_size(3)
           .subsampling(2)
           .padding_width(1)
@@ -444,17 +444,17 @@
           .output_channels(output_channels)
           .input_width(input_width)
           .input_height(9)
-          .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2SpCHWMicrokernelTester::Variant::Scalar);
+          .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2CHWMicrokernelTester::Variant::Scalar);
       }
     }
   }
 }
 
-TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__SCALAR_1X1, output_y_start) {
+TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__SCALAR_1X1, output_y_start) {
   for (size_t output_y_start = 1; output_y_start <= 3; output_y_start++) {
     for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
       for (size_t input_width = 1; input_width < 32; input_width += 7) {
-        ConvHWC2SpCHWMicrokernelTester()
+        ConvHWC2CHWMicrokernelTester()
           .kernel_size(3)
           .subsampling(2)
           .padding_width(1)
@@ -464,17 +464,17 @@
           .input_width(input_width)
           .input_height(9)
           .output_y_start(output_y_start)
-          .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2SpCHWMicrokernelTester::Variant::Scalar);
+          .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2CHWMicrokernelTester::Variant::Scalar);
       }
     }
   }
 }
 
-TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__SCALAR_1X1, output_y_end) {
+TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__SCALAR_1X1, output_y_end) {
   for (size_t output_y_end = 2; output_y_end < 5; output_y_end++) {
     for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
       for (size_t input_width = 1; input_width < 32; input_width += 7) {
-        ConvHWC2SpCHWMicrokernelTester()
+        ConvHWC2CHWMicrokernelTester()
           .kernel_size(3)
           .subsampling(2)
           .padding_width(1)
@@ -484,16 +484,16 @@
           .input_width(input_width)
           .input_height(9)
           .output_y_end(output_y_end)
-          .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2SpCHWMicrokernelTester::Variant::Scalar);
+          .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2CHWMicrokernelTester::Variant::Scalar);
       }
     }
   }
 }
 
-TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__SCALAR_1X1, qmin) {
+TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__SCALAR_1X1, qmin) {
   for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
     for (size_t input_width = 1; input_width < 32; input_width += 7) {
-      ConvHWC2SpCHWMicrokernelTester()
+      ConvHWC2CHWMicrokernelTester()
         .kernel_size(3)
         .subsampling(2)
         .padding_width(1)
@@ -503,15 +503,15 @@
         .input_width(input_width)
         .input_height(6)
         .qmin(128)
-        .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2SpCHWMicrokernelTester::Variant::Scalar);
+        .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2CHWMicrokernelTester::Variant::Scalar);
     }
   }
 }
 
-TEST(F32_CONV_HWC2SPCHW_3X3S2P1C3X4__SCALAR_1X1, qmax) {
+TEST(F32_CONV_HWC2CHW_3X3S2P1C3X4__SCALAR_1X1, qmax) {
   for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
     for (size_t input_width = 1; input_width < 32; input_width += 7) {
-      ConvHWC2SpCHWMicrokernelTester()
+      ConvHWC2CHWMicrokernelTester()
         .kernel_size(3)
         .subsampling(2)
         .padding_width(1)
@@ -521,7 +521,7 @@
         .input_width(input_width)
         .input_height(6)
         .qmax(128)
-        .Test(xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2SpCHWMicrokernelTester::Variant::Scalar);
+        .Test(xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1, ConvHWC2CHWMicrokernelTester::Variant::Scalar);
     }
   }
 }
diff --git a/test/f32-dwconv-spchw.cc b/test/f32-dwconv-chw.cc
similarity index 70%
rename from test/f32-dwconv-spchw.cc
rename to test/f32-dwconv-chw.cc
index 173c5e0..c7434d4 100644
--- a/test/f32-dwconv-spchw.cc
+++ b/test/f32-dwconv-chw.cc
@@ -9,12 +9,12 @@
 #include <xnnpack/isa-checks.h>
 
 #include <xnnpack/dwconv.h>
-#include "dwconv-spchw-microkernel-tester.h"
+#include "dwconv-chw-microkernel-tester.h"
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(F32_DWCONV_SPCHW_3X3P1__SSE, input_width_eq_4) {
+  TEST(F32_DWCONV_CHW_3X3P1__SSE, input_width_eq_4) {
     TEST_REQUIRES_X86_SSE;
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(4)
       .output_tuple_size(4)
       .input_width(4)
@@ -25,13 +25,13 @@
       .kernel_height(3)
       .kernel_width(3)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__sse);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__sse);
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__SSE, input_width_lt_4) {
+  TEST(F32_DWCONV_CHW_3X3P1__SSE, input_width_lt_4) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 4; input_width++) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -42,14 +42,14 @@
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__SSE, input_width_gt_4) {
+  TEST(F32_DWCONV_CHW_3X3P1__SSE, input_width_gt_4) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 5; input_width < 8; input_width++) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -60,14 +60,14 @@
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__SSE, input_width_div_4) {
+  TEST(F32_DWCONV_CHW_3X3P1__SSE, input_width_div_4) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 8; input_width < 32; input_width += 4) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -78,14 +78,14 @@
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__SSE, input_width_stride) {
+  TEST(F32_DWCONV_CHW_3X3P1__SSE, input_width_stride) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -97,14 +97,14 @@
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__SSE, input_tuple_stride) {
+  TEST(F32_DWCONV_CHW_3X3P1__SSE, input_tuple_stride) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 5) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -117,15 +117,15 @@
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__SSE, output_height_gt_1) {
+  TEST(F32_DWCONV_CHW_3X3P1__SSE, output_height_gt_1) {
     TEST_REQUIRES_X86_SSE;
     for (size_t output_height = 2; output_height < 5; output_height++) {
       for (size_t input_width = 1; input_width < 32; input_width += 3) {
-        DWConvSpCHWMicrokernelTester()
+        DWConvCHWMicrokernelTester()
           .input_tuple_size(4)
           .output_tuple_size(4)
           .input_width(input_width)
@@ -136,15 +136,15 @@
           .kernel_height(3)
           .kernel_width(3)
           .output_height(output_height)
-          .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__sse);
+          .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__sse);
       }
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__SSE, output_width_stride) {
+  TEST(F32_DWCONV_CHW_3X3P1__SSE, output_width_stride) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -156,14 +156,14 @@
         .kernel_width(3)
         .output_height(5)
         .output_width_stride(36)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__SSE, output_tuple_stride) {
+  TEST(F32_DWCONV_CHW_3X3P1__SSE, output_tuple_stride) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -176,14 +176,14 @@
         .output_height(5)
         .output_width_stride(4)
         .output_tuple_stride(5 * 4)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__SSE, chw_layout) {
+  TEST(F32_DWCONV_CHW_3X3P1__SSE, chw_layout) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -196,15 +196,15 @@
         .kernel_width(3)
         .output_height(5)
         .output_width_stride(input_width)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__sse);
     }
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_eq_4_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, input_width_eq_4_pad0) {
     TEST_REQUIRES_X86_SSE;
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(4)
       .output_tuple_size(4)
       .input_width(4)
@@ -216,13 +216,13 @@
       .kernel_width(3)
       .subsampling(2)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_lt_4_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, input_width_lt_4_pad0) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 4; input_width++) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -234,14 +234,14 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_gt_4_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, input_width_gt_4_pad0) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 5; input_width < 8; input_width++) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -253,14 +253,14 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_div_4_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, input_width_div_4_pad0) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 8; input_width < 32; input_width += 4) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -272,14 +272,14 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_stride_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, input_width_stride_pad0) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -292,14 +292,14 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_tuple_stride_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, input_tuple_stride_pad0) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 5) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -313,15 +313,15 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, output_height_gt_1_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, output_height_gt_1_pad0) {
     TEST_REQUIRES_X86_SSE;
     for (size_t output_height = 2; output_height < 5; output_height++) {
       for (size_t input_width = 1; input_width < 32; input_width += 3) {
-        DWConvSpCHWMicrokernelTester()
+        DWConvCHWMicrokernelTester()
           .input_tuple_size(4)
           .output_tuple_size(4)
           .input_width(input_width)
@@ -333,15 +333,15 @@
           .kernel_width(3)
           .subsampling(2)
           .output_height(output_height)
-          .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+          .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
       }
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, output_width_stride_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, output_width_stride_pad0) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -354,14 +354,14 @@
         .subsampling(2)
         .output_height(5)
         .output_width_stride(36)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, output_tuple_stride_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, output_tuple_stride_pad0) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -375,14 +375,14 @@
         .output_height(5)
         .output_width_stride(4)
         .output_tuple_stride(5 * 4)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, chw_layout_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, chw_layout_pad0) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -396,13 +396,13 @@
         .subsampling(2)
         .output_height(5)
         .output_width_stride((input_width - 1) / 2 + 1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_eq_4_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, input_width_eq_4_pad1) {
     TEST_REQUIRES_X86_SSE;
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(4)
       .output_tuple_size(4)
       .input_width(4)
@@ -414,13 +414,13 @@
       .kernel_width(3)
       .subsampling(2)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_lt_4_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, input_width_lt_4_pad1) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 4; input_width++) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -432,14 +432,14 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_gt_4_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, input_width_gt_4_pad1) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 5; input_width < 8; input_width++) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -451,14 +451,14 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_div_4_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, input_width_div_4_pad1) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 8; input_width < 32; input_width += 4) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -470,14 +470,14 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_stride_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, input_width_stride_pad1) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -490,14 +490,14 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_tuple_stride_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, input_tuple_stride_pad1) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 5) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -511,15 +511,15 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, output_height_gt_1_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, output_height_gt_1_pad1) {
     TEST_REQUIRES_X86_SSE;
     for (size_t output_height = 2; output_height < 5; output_height++) {
       for (size_t input_width = 1; input_width < 32; input_width += 3) {
-        DWConvSpCHWMicrokernelTester()
+        DWConvCHWMicrokernelTester()
           .input_tuple_size(4)
           .output_tuple_size(4)
           .input_width(input_width)
@@ -531,15 +531,15 @@
           .kernel_width(3)
           .subsampling(2)
           .output_height(output_height)
-          .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+          .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
       }
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, output_width_stride_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, output_width_stride_pad1) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -552,14 +552,14 @@
         .subsampling(2)
         .output_height(5)
         .output_width_stride(36)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, output_tuple_stride_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, output_tuple_stride_pad1) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -573,14 +573,14 @@
         .output_height(5)
         .output_width_stride(4)
         .output_tuple_stride(5 * 4)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, chw_layout_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__SSE, chw_layout_pad1) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -594,16 +594,16 @@
         .subsampling(2)
         .output_height(5)
         .output_width_stride((input_width - 1) / 2 + 1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__sse);
     }
   }
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
 #if XNN_ARCH_ARM64
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_width_eq_4_pad1) {
+  TEST(F32_DWCONV_CHW_3X3P1__NEONFMA, input_width_eq_4_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(4)
       .output_tuple_size(4)
       .input_width(4)
@@ -614,13 +614,13 @@
       .kernel_height(3)
       .kernel_width(3)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__neonfma);
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_width_lt_4_pad1) {
+  TEST(F32_DWCONV_CHW_3X3P1__NEONFMA, input_width_lt_4_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 4; input_width++) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -631,14 +631,14 @@
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_width_gt_4_pad1) {
+  TEST(F32_DWCONV_CHW_3X3P1__NEONFMA, input_width_gt_4_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 5; input_width < 8; input_width++) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -649,14 +649,14 @@
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_width_div_4_pad1) {
+  TEST(F32_DWCONV_CHW_3X3P1__NEONFMA, input_width_div_4_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 8; input_width < 32; input_width += 4) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -667,14 +667,14 @@
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_width_stride_pad1) {
+  TEST(F32_DWCONV_CHW_3X3P1__NEONFMA, input_width_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -686,14 +686,14 @@
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_tuple_stride_pad1) {
+  TEST(F32_DWCONV_CHW_3X3P1__NEONFMA, input_tuple_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 5) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -706,15 +706,15 @@
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, output_height_gt_1_pad1) {
+  TEST(F32_DWCONV_CHW_3X3P1__NEONFMA, output_height_gt_1_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t output_height = 2; output_height <= 5; output_height++) {
       for (size_t input_width = 8; input_width < 9; input_width += 3) {
-        DWConvSpCHWMicrokernelTester()
+        DWConvCHWMicrokernelTester()
           .input_tuple_size(4)
           .output_tuple_size(4)
           .input_width(input_width)
@@ -725,15 +725,15 @@
           .kernel_height(3)
           .kernel_width(3)
           .output_height(output_height)
-          .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma);
+          .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__neonfma);
       }
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, output_width_stride_pad1) {
+  TEST(F32_DWCONV_CHW_3X3P1__NEONFMA, output_width_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -745,14 +745,14 @@
         .kernel_width(3)
         .output_height(5)
         .output_width_stride(36)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, output_tuple_stride_pad1) {
+  TEST(F32_DWCONV_CHW_3X3P1__NEONFMA, output_tuple_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -765,14 +765,14 @@
         .output_height(5)
         .output_width_stride(4)
         .output_tuple_stride(5 * 4)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, chw_layout_pad1) {
+  TEST(F32_DWCONV_CHW_3X3P1__NEONFMA, chw_layout_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -785,16 +785,16 @@
         .kernel_width(3)
         .output_height(5)
         .output_width_stride(input_width)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__neonfma);
     }
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_eq_4_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, input_width_eq_4_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(4)
       .output_tuple_size(4)
       .input_width(4)
@@ -806,12 +806,12 @@
       .kernel_width(3)
       .subsampling(2)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_eq_4_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, input_width_eq_4_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(4)
       .output_tuple_size(4)
       .input_width(4)
@@ -823,13 +823,13 @@
       .kernel_width(3)
       .subsampling(2)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_lt_4_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, input_width_lt_4_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 4; input_width++) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -841,14 +841,14 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_lt_4_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, input_width_lt_4_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 4; input_width++) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -860,14 +860,14 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_gt_4_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, input_width_gt_4_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 5; input_width < 8; input_width++) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -879,14 +879,14 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_gt_4_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, input_width_gt_4_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 5; input_width < 8; input_width++) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -898,14 +898,14 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_div_4_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, input_width_div_4_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 8; input_width < 32; input_width += 4) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -917,14 +917,14 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_div_4_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, input_width_div_4_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 8; input_width < 32; input_width += 4) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -936,14 +936,14 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_stride_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, input_width_stride_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -956,14 +956,14 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_stride_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, input_width_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -976,15 +976,15 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
     }
   }
 
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_tuple_stride_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, input_tuple_stride_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 5) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -998,14 +998,14 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_tuple_stride_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, input_tuple_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 5) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1019,15 +1019,15 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, output_height_gt_1_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, output_height_gt_1_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t output_height = 2; output_height < 5; output_height++) {
       for (size_t input_width = 1; input_width < 32; input_width += 3) {
-        DWConvSpCHWMicrokernelTester()
+        DWConvCHWMicrokernelTester()
           .input_tuple_size(4)
           .output_tuple_size(4)
           .input_width(input_width)
@@ -1039,16 +1039,16 @@
           .kernel_width(3)
           .subsampling(2)
           .output_height(output_height)
-          .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+          .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
       }
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, output_height_gt_1_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, output_height_gt_1_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t output_height = 2; output_height < 5; output_height++) {
       for (size_t input_width = 1; input_width < 32; input_width += 3) {
-        DWConvSpCHWMicrokernelTester()
+        DWConvCHWMicrokernelTester()
           .input_tuple_size(4)
           .output_tuple_size(4)
           .input_width(input_width)
@@ -1060,15 +1060,15 @@
           .kernel_width(3)
           .subsampling(2)
           .output_height(output_height)
-          .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+          .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
       }
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, output_width_stride_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, output_width_stride_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1081,14 +1081,14 @@
         .subsampling(2)
         .output_height(5)
         .output_width_stride(36)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, output_width_stride_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, output_width_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1101,14 +1101,14 @@
         .subsampling(2)
         .output_height(5)
         .output_width_stride(36)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, output_tuple_stride_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, output_tuple_stride_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1122,14 +1122,14 @@
         .output_height(5)
         .output_width_stride(4)
         .output_tuple_stride(5 * 4)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, output_tuple_stride_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, output_tuple_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1143,14 +1143,14 @@
         .output_height(5)
         .output_width_stride(4)
         .output_tuple_stride(5 * 4)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, chw_layout_pad0) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, chw_layout_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1164,14 +1164,14 @@
         .subsampling(2)
         .output_height(5)
         .output_width_stride((input_width - 1) / 2 + 1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, chw_layout_pad1) {
+  TEST(F32_DWCONV_CHW_3X3S2P1__NEONFMA, chw_layout_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1185,16 +1185,16 @@
         .subsampling(2)
         .output_height(5)
         .output_width_stride((input_width - 1) / 2 + 1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__neonfma);
     }
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_width_eq_4_pad2) {
+  TEST(F32_DWCONV_CHW_5X5P2__NEONFMA, input_width_eq_4_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(4)
       .output_tuple_size(4)
       .input_width(4)
@@ -1205,13 +1205,13 @@
       .kernel_height(5)
       .kernel_width(5)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__neonfma);
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_width_lt_4_pad2) {
+  TEST(F32_DWCONV_CHW_5X5P2__NEONFMA, input_width_lt_4_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 4; input_width++) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1222,14 +1222,14 @@
         .kernel_height(5)
         .kernel_width(5)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_width_gt_4_pad2) {
+  TEST(F32_DWCONV_CHW_5X5P2__NEONFMA, input_width_gt_4_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 5; input_width < 8; input_width++) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1240,14 +1240,14 @@
         .kernel_height(5)
         .kernel_width(5)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_width_div_4_pad2) {
+  TEST(F32_DWCONV_CHW_5X5P2__NEONFMA, input_width_div_4_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 8; input_width < 32; input_width += 4) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1258,14 +1258,14 @@
         .kernel_height(5)
         .kernel_width(5)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_width_stride_pad2) {
+  TEST(F32_DWCONV_CHW_5X5P2__NEONFMA, input_width_stride_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1277,14 +1277,14 @@
         .kernel_height(5)
         .kernel_width(5)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_tuple_stride_pad2) {
+  TEST(F32_DWCONV_CHW_5X5P2__NEONFMA, input_tuple_stride_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 5) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1297,14 +1297,14 @@
         .kernel_height(5)
         .kernel_width(5)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, output_height_eq_2_pad2) {
+  TEST(F32_DWCONV_CHW_5X5P2__NEONFMA, output_height_eq_2_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1315,15 +1315,15 @@
         .kernel_height(5)
         .kernel_width(5)
         .output_height(2)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, output_height_gt_2_pad2) {
+  TEST(F32_DWCONV_CHW_5X5P2__NEONFMA, output_height_gt_2_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t output_height = 3; output_height < 5; output_height++) {
       for (size_t input_width = 1; input_width < 32; input_width += 3) {
-        DWConvSpCHWMicrokernelTester()
+        DWConvCHWMicrokernelTester()
           .input_tuple_size(4)
           .output_tuple_size(4)
           .input_width(input_width)
@@ -1334,15 +1334,15 @@
           .kernel_height(5)
           .kernel_width(5)
           .output_height(output_height)
-          .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma);
+          .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__neonfma);
       }
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, output_width_stride_pad2) {
+  TEST(F32_DWCONV_CHW_5X5P2__NEONFMA, output_width_stride_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1354,14 +1354,14 @@
         .kernel_width(5)
         .output_height(5)
         .output_width_stride(36)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, output_tuple_stride_pad2) {
+  TEST(F32_DWCONV_CHW_5X5P2__NEONFMA, output_tuple_stride_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1374,15 +1374,15 @@
         .output_height(5)
         .output_width_stride(4)
         .output_tuple_stride(5 * 4)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, chw_layout_pad2) {
+  TEST(F32_DWCONV_CHW_5X5P2__NEONFMA, chw_layout_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       for (size_t output_height = 1; output_height < 32; output_height += 3) {
-        DWConvSpCHWMicrokernelTester()
+        DWConvCHWMicrokernelTester()
           .input_tuple_size(4)
           .output_tuple_size(4)
           .input_width(input_width)
@@ -1395,7 +1395,7 @@
           .kernel_width(5)
           .output_height(5)
           .output_width_stride(input_width)
-          .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma);
+          .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__neonfma);
       }
     }
   }
@@ -1403,9 +1403,9 @@
 
 
 #if XNN_ARCH_ARM64
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_eq_8_pad2) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, input_width_eq_8_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(4)
       .output_tuple_size(4)
       .input_width(8)
@@ -1417,12 +1417,12 @@
       .kernel_width(5)
       .subsampling(2)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_eq_8_pad1) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, input_width_eq_8_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(4)
       .output_tuple_size(4)
       .input_width(8)
@@ -1434,13 +1434,13 @@
       .kernel_width(5)
       .subsampling(2)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_lt_8_pad1) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, input_width_lt_8_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 8; input_width++) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1452,14 +1452,14 @@
         .kernel_width(5)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_lt_8_pad2) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, input_width_lt_8_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 8; input_width++) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1471,14 +1471,14 @@
         .kernel_width(5)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_gt_8_pad1) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, input_width_gt_8_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 8; input_width < 16; input_width++) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1490,14 +1490,14 @@
         .kernel_width(5)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_gt_8_pad2) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, input_width_gt_8_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 8; input_width < 16; input_width++) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1509,14 +1509,14 @@
         .kernel_width(5)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_div_4_pad1) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, input_width_div_4_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 16; input_width < 32; input_width += 4) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1528,14 +1528,14 @@
         .kernel_width(5)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_div_4_pad2) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, input_width_div_4_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 16; input_width < 32; input_width += 4) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1547,14 +1547,14 @@
         .kernel_width(5)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_stride_pad1) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, input_width_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1567,14 +1567,14 @@
         .kernel_width(5)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_stride_pad2) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, input_width_stride_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1587,14 +1587,14 @@
         .kernel_width(5)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_tuple_stride_pad1) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, input_tuple_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 5) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1608,14 +1608,14 @@
         .kernel_width(5)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_tuple_stride_pad2) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, input_tuple_stride_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 5) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1629,15 +1629,15 @@
         .kernel_width(5)
         .subsampling(2)
         .output_height(1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, output_height_gt_1_pad1) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, output_height_gt_1_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t output_height = 3; output_height < 4; output_height++) {
       for (size_t input_width = 4; input_width < 5; input_width += 3) {
-        DWConvSpCHWMicrokernelTester()
+        DWConvCHWMicrokernelTester()
           .input_tuple_size(4)
           .output_tuple_size(4)
           .input_width(input_width)
@@ -1649,16 +1649,16 @@
           .kernel_width(5)
           .subsampling(2)
           .output_height(output_height)
-          .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+          .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
       }
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, output_height_gt_1_pad2) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, output_height_gt_1_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t output_height = 3; output_height < 4; output_height++) {
       for (size_t input_width = 4; input_width < 5; input_width += 3) {
-        DWConvSpCHWMicrokernelTester()
+        DWConvCHWMicrokernelTester()
           .input_tuple_size(4)
           .output_tuple_size(4)
           .input_width(input_width)
@@ -1670,15 +1670,15 @@
           .kernel_width(5)
           .subsampling(2)
           .output_height(output_height)
-          .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+          .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
       }
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, output_width_stride_pad1) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, output_width_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1691,14 +1691,14 @@
         .subsampling(2)
         .output_height(5)
         .output_width_stride(36)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, output_width_stride_pad2) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, output_width_stride_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1711,14 +1711,14 @@
         .subsampling(2)
         .output_height(5)
         .output_width_stride(36)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, output_tuple_stride_pad1) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, output_tuple_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1732,14 +1732,14 @@
         .output_height(5)
         .output_width_stride(4)
         .output_tuple_stride(5 * 4)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, output_tuple_stride_pad2) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, output_tuple_stride_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1753,14 +1753,14 @@
         .output_height(5)
         .output_width_stride(4)
         .output_tuple_stride(5 * 4)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, chw_layout_pad1) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, chw_layout_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 1) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1774,14 +1774,14 @@
         .subsampling(2)
         .output_height(5)
         .output_width_stride((input_width - 1) / 2 + 1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, chw_layout_pad2) {
+  TEST(F32_DWCONV_CHW_5X5S2P2__NEONFMA, chw_layout_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 1) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(4)
         .output_tuple_size(4)
         .input_width(input_width)
@@ -1795,14 +1795,14 @@
         .subsampling(2)
         .output_height(5)
         .output_width_stride((input_width - 1) / 2 + 1)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__neonfma);
     }
   }
 #endif  // XNN_ARCH_ARM64
 
 
-TEST(F32_DWCONV_SPCHW_3X3P1__SCALAR, input_width_eq_1) {
-  DWConvSpCHWMicrokernelTester()
+TEST(F32_DWCONV_CHW_3X3P1__SCALAR, input_width_eq_1) {
+  DWConvCHWMicrokernelTester()
     .input_tuple_size(1)
     .output_tuple_size(1)
     .input_width(1)
@@ -1813,12 +1813,12 @@
     .kernel_height(3)
     .kernel_width(3)
     .output_height(1)
-    .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+    .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
 }
 
-TEST(F32_DWCONV_SPCHW_3X3P1__SCALAR, input_width_gt_1) {
+TEST(F32_DWCONV_CHW_3X3P1__SCALAR, input_width_gt_1) {
   for (size_t input_width = 2; input_width < 32; input_width++) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -1829,13 +1829,13 @@
       .kernel_height(3)
       .kernel_width(3)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3P1__SCALAR, input_width_stride) {
+TEST(F32_DWCONV_CHW_3X3P1__SCALAR, input_width_stride) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -1847,13 +1847,13 @@
       .kernel_height(3)
       .kernel_width(3)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3P1__SCALAR, input_tuple_stride) {
+TEST(F32_DWCONV_CHW_3X3P1__SCALAR, input_tuple_stride) {
   for (size_t input_width = 1; input_width < 32; input_width += 5) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -1866,14 +1866,14 @@
       .kernel_height(3)
       .kernel_width(3)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3P1__SCALAR, output_height_gt_1) {
+TEST(F32_DWCONV_CHW_3X3P1__SCALAR, output_height_gt_1) {
   for (size_t output_height = 2; output_height < 5; output_height++) {
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(1)
         .output_tuple_size(1)
         .input_width(input_width)
@@ -1884,14 +1884,14 @@
         .kernel_height(3)
         .kernel_width(3)
         .output_height(output_height)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
     }
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3P1__SCALAR, output_width_stride) {
+TEST(F32_DWCONV_CHW_3X3P1__SCALAR, output_width_stride) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -1903,13 +1903,13 @@
       .kernel_width(3)
       .output_height(5)
       .output_width_stride(36)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3P1__SCALAR, output_tuple_stride) {
+TEST(F32_DWCONV_CHW_3X3P1__SCALAR, output_tuple_stride) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -1922,13 +1922,13 @@
       .output_height(5)
       .output_width_stride(4)
       .output_tuple_stride(5 * 4)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3P1__SCALAR, chw_layout) {
+TEST(F32_DWCONV_CHW_3X3P1__SCALAR, chw_layout) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -1941,12 +1941,12 @@
       .kernel_width(3)
       .output_height(5)
       .output_width_stride(input_width)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_width_eq_1_pad0) {
-  DWConvSpCHWMicrokernelTester()
+TEST(F32_DWCONV_CHW_3X3S2P1__SCALAR, input_width_eq_1_pad0) {
+  DWConvCHWMicrokernelTester()
     .input_tuple_size(1)
     .output_tuple_size(1)
     .input_width(1)
@@ -1958,12 +1958,12 @@
     .kernel_width(3)
     .subsampling(2)
     .output_height(1)
-    .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+    .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_width_gt_1_pad0) {
+TEST(F32_DWCONV_CHW_3X3S2P1__SCALAR, input_width_gt_1_pad0) {
   for (size_t input_width = 2; input_width < 32; input_width++) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -1975,13 +1975,13 @@
       .kernel_width(3)
       .subsampling(2)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_width_stride_pad0) {
+TEST(F32_DWCONV_CHW_3X3S2P1__SCALAR, input_width_stride_pad0) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -1994,13 +1994,13 @@
       .kernel_width(3)
       .subsampling(2)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_tuple_stride_pad0) {
+TEST(F32_DWCONV_CHW_3X3S2P1__SCALAR, input_tuple_stride_pad0) {
   for (size_t input_width = 1; input_width < 32; input_width += 5) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2014,14 +2014,14 @@
       .kernel_width(3)
       .subsampling(2)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, output_height_gt_1_pad0) {
+TEST(F32_DWCONV_CHW_3X3S2P1__SCALAR, output_height_gt_1_pad0) {
   for (size_t output_height = 2; output_height < 5; output_height++) {
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(1)
         .output_tuple_size(1)
         .input_width(input_width)
@@ -2033,14 +2033,14 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(output_height)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
     }
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, output_width_stride_pad0) {
+TEST(F32_DWCONV_CHW_3X3S2P1__SCALAR, output_width_stride_pad0) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2053,13 +2053,13 @@
       .subsampling(2)
       .output_height(5)
       .output_width_stride(36)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, output_tuple_stride_pad0) {
+TEST(F32_DWCONV_CHW_3X3S2P1__SCALAR, output_tuple_stride_pad0) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2073,13 +2073,13 @@
       .output_height(5)
       .output_width_stride(4)
       .output_tuple_stride(5 * 4)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, chw_layout_pad0) {
+TEST(F32_DWCONV_CHW_3X3S2P1__SCALAR, chw_layout_pad0) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2093,12 +2093,12 @@
       .subsampling(2)
       .output_height(5)
       .output_width_stride(input_width)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_width_eq_1_pad1) {
-  DWConvSpCHWMicrokernelTester()
+TEST(F32_DWCONV_CHW_3X3S2P1__SCALAR, input_width_eq_1_pad1) {
+  DWConvCHWMicrokernelTester()
     .input_tuple_size(1)
     .output_tuple_size(1)
     .input_width(1)
@@ -2110,12 +2110,12 @@
     .kernel_width(3)
     .subsampling(2)
     .output_height(1)
-    .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+    .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_width_gt_1_pad1) {
+TEST(F32_DWCONV_CHW_3X3S2P1__SCALAR, input_width_gt_1_pad1) {
   for (size_t input_width = 2; input_width < 32; input_width++) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2127,13 +2127,13 @@
       .kernel_width(3)
       .subsampling(2)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_width_stride_pad1) {
+TEST(F32_DWCONV_CHW_3X3S2P1__SCALAR, input_width_stride_pad1) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2146,13 +2146,13 @@
       .kernel_width(3)
       .subsampling(2)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_tuple_stride_pad1) {
+TEST(F32_DWCONV_CHW_3X3S2P1__SCALAR, input_tuple_stride_pad1) {
   for (size_t input_width = 1; input_width < 32; input_width += 5) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2166,14 +2166,14 @@
       .kernel_width(3)
       .subsampling(2)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, output_height_gt_1_pad1) {
+TEST(F32_DWCONV_CHW_3X3S2P1__SCALAR, output_height_gt_1_pad1) {
   for (size_t output_height = 2; output_height < 5; output_height++) {
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(1)
         .output_tuple_size(1)
         .input_width(input_width)
@@ -2185,14 +2185,14 @@
         .kernel_width(3)
         .subsampling(2)
         .output_height(output_height)
-        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+        .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
     }
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, output_width_stride_pad1) {
+TEST(F32_DWCONV_CHW_3X3S2P1__SCALAR, output_width_stride_pad1) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2205,13 +2205,13 @@
       .subsampling(2)
       .output_height(5)
       .output_width_stride(36)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, output_tuple_stride_pad1) {
+TEST(F32_DWCONV_CHW_3X3S2P1__SCALAR, output_tuple_stride_pad1) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2225,13 +2225,13 @@
       .output_height(5)
       .output_width_stride(4)
       .output_tuple_stride(5 * 4)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, chw_layout_pad1) {
+TEST(F32_DWCONV_CHW_3X3S2P1__SCALAR, chw_layout_pad1) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2245,12 +2245,12 @@
       .subsampling(2)
       .output_height(5)
       .output_width_stride(input_width)
-      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_3x3s2p1__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, input_width_eq_1_pad2) {
-  DWConvSpCHWMicrokernelTester()
+TEST(F32_DWCONV_CHW_5X5P2__SCALAR, input_width_eq_1_pad2) {
+  DWConvCHWMicrokernelTester()
     .input_tuple_size(1)
     .output_tuple_size(1)
     .input_width(1)
@@ -2261,12 +2261,12 @@
     .kernel_height(5)
     .kernel_width(5)
     .output_height(1)
-    .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+    .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
 }
 
-TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, input_width_gt_1_pad2) {
+TEST(F32_DWCONV_CHW_5X5P2__SCALAR, input_width_gt_1_pad2) {
   for (size_t input_width = 2; input_width < 32; input_width++) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2277,13 +2277,13 @@
       .kernel_height(5)
       .kernel_width(5)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, input_width_stride_pad2) {
+TEST(F32_DWCONV_CHW_5X5P2__SCALAR, input_width_stride_pad2) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2295,13 +2295,13 @@
       .kernel_height(5)
       .kernel_width(5)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, input_tuple_stride_pad2) {
+TEST(F32_DWCONV_CHW_5X5P2__SCALAR, input_tuple_stride_pad2) {
   for (size_t input_width = 1; input_width < 32; input_width += 5) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2314,14 +2314,14 @@
       .kernel_height(5)
       .kernel_width(5)
       .output_height(1)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, output_height_gt_1_pad2) {
+TEST(F32_DWCONV_CHW_5X5P2__SCALAR, output_height_gt_1_pad2) {
   for (size_t output_height = 2; output_height < 5; output_height++) {
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(1)
         .output_tuple_size(1)
         .input_width(input_width)
@@ -2332,14 +2332,14 @@
         .kernel_height(5)
         .kernel_width(5)
         .output_height(output_height)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
     }
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, output_width_stride_pad2) {
+TEST(F32_DWCONV_CHW_5X5P2__SCALAR, output_width_stride_pad2) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2351,13 +2351,13 @@
       .kernel_width(5)
       .output_height(5)
       .output_width_stride(36)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, output_tuple_stride_pad2) {
+TEST(F32_DWCONV_CHW_5X5P2__SCALAR, output_tuple_stride_pad2) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2370,13 +2370,13 @@
       .output_height(5)
       .output_width_stride(4)
       .output_tuple_stride(5 * 4)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, chw_layout_pad2) {
+TEST(F32_DWCONV_CHW_5X5P2__SCALAR, chw_layout_pad2) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2389,12 +2389,12 @@
       .kernel_width(5)
       .output_height(5)
       .output_width_stride(input_width)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_width_eq_1_pad1) {
-  DWConvSpCHWMicrokernelTester()
+TEST(F32_DWCONV_CHW_5X5S2P2__SCALAR, input_width_eq_1_pad1) {
+  DWConvCHWMicrokernelTester()
     .input_tuple_size(1)
     .output_tuple_size(1)
     .input_width(1)
@@ -2406,12 +2406,12 @@
     .kernel_width(5)
     .output_height(1)
     .subsampling(2)
-    .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+    .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_width_gt_1_pad1) {
+TEST(F32_DWCONV_CHW_5X5S2P2__SCALAR, input_width_gt_1_pad1) {
   for (size_t input_width = 2; input_width < 32; input_width++) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2423,13 +2423,13 @@
       .kernel_width(5)
       .output_height(1)
       .subsampling(2)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_width_stride_pad1) {
+TEST(F32_DWCONV_CHW_5X5S2P2__SCALAR, input_width_stride_pad1) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2442,13 +2442,13 @@
       .kernel_width(5)
       .output_height(1)
       .subsampling(2)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_tuple_stride_pad1) {
+TEST(F32_DWCONV_CHW_5X5S2P2__SCALAR, input_tuple_stride_pad1) {
   for (size_t input_width = 1; input_width < 32; input_width += 5) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2462,14 +2462,14 @@
       .kernel_width(5)
       .output_height(1)
       .subsampling(2)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, output_height_gt_1_pad1) {
+TEST(F32_DWCONV_CHW_5X5S2P2__SCALAR, output_height_gt_1_pad1) {
   for (size_t output_height = 2; output_height < 5; output_height++) {
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(1)
         .output_tuple_size(1)
         .input_width(input_width)
@@ -2481,14 +2481,14 @@
         .kernel_width(5)
         .output_height(output_height)
         .subsampling(2)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
     }
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, output_width_stride_pad1) {
+TEST(F32_DWCONV_CHW_5X5S2P2__SCALAR, output_width_stride_pad1) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2501,13 +2501,13 @@
       .output_height(5)
       .output_width_stride(36)
       .subsampling(2)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, output_tuple_stride_pad1) {
+TEST(F32_DWCONV_CHW_5X5S2P2__SCALAR, output_tuple_stride_pad1) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2521,13 +2521,13 @@
       .output_width_stride(4)
       .output_tuple_stride(5 * 4)
       .subsampling(2)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, chw_layout_pad1) {
+TEST(F32_DWCONV_CHW_5X5S2P2__SCALAR, chw_layout_pad1) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2541,12 +2541,12 @@
       .output_height(5)
       .output_width_stride(input_width)
       .subsampling(2)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_width_eq_1_pad2) {
-  DWConvSpCHWMicrokernelTester()
+TEST(F32_DWCONV_CHW_5X5S2P2__SCALAR, input_width_eq_1_pad2) {
+  DWConvCHWMicrokernelTester()
     .input_tuple_size(1)
     .output_tuple_size(1)
     .input_width(1)
@@ -2558,12 +2558,12 @@
     .kernel_width(5)
     .output_height(1)
     .subsampling(2)
-    .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+    .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_width_gt_1_pad2) {
+TEST(F32_DWCONV_CHW_5X5S2P2__SCALAR, input_width_gt_1_pad2) {
   for (size_t input_width = 2; input_width < 32; input_width++) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2575,13 +2575,13 @@
       .kernel_width(5)
       .output_height(1)
       .subsampling(2)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_width_stride_pad2) {
+TEST(F32_DWCONV_CHW_5X5S2P2__SCALAR, input_width_stride_pad2) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2594,13 +2594,13 @@
       .kernel_width(5)
       .output_height(1)
       .subsampling(2)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_tuple_stride_pad2) {
+TEST(F32_DWCONV_CHW_5X5S2P2__SCALAR, input_tuple_stride_pad2) {
   for (size_t input_width = 1; input_width < 32; input_width += 5) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2614,14 +2614,14 @@
       .kernel_width(5)
       .output_height(1)
       .subsampling(2)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, output_height_gt_1_pad2) {
+TEST(F32_DWCONV_CHW_5X5S2P2__SCALAR, output_height_gt_1_pad2) {
   for (size_t output_height = 2; output_height < 5; output_height++) {
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
-      DWConvSpCHWMicrokernelTester()
+      DWConvCHWMicrokernelTester()
         .input_tuple_size(1)
         .output_tuple_size(1)
         .input_width(input_width)
@@ -2633,14 +2633,14 @@
         .kernel_width(5)
         .output_height(output_height)
         .subsampling(2)
-        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+        .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
     }
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, output_width_stride_pad2) {
+TEST(F32_DWCONV_CHW_5X5S2P2__SCALAR, output_width_stride_pad2) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2653,13 +2653,13 @@
       .output_height(5)
       .output_width_stride(36)
       .subsampling(2)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, output_tuple_stride_pad2) {
+TEST(F32_DWCONV_CHW_5X5S2P2__SCALAR, output_tuple_stride_pad2) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2673,13 +2673,13 @@
       .output_width_stride(4)
       .output_tuple_stride(5 * 4)
       .subsampling(2)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, chw_layout_pad2) {
+TEST(F32_DWCONV_CHW_5X5S2P2__SCALAR, chw_layout_pad2) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
-    DWConvSpCHWMicrokernelTester()
+    DWConvCHWMicrokernelTester()
       .input_tuple_size(1)
       .output_tuple_size(1)
       .input_width(input_width)
@@ -2693,6 +2693,6 @@
       .output_height(5)
       .output_width_stride(input_width)
       .subsampling(2)
-      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+      .Test(xnn_f32_dwconv_chw_ukernel_5x5s2p2__scalar, DWConvCHWMicrokernelTester::Variant::Scalar);
   }
 }
diff --git a/test/f32-gavgpool-cw.cc b/test/f32-gavgpool-cw.cc
new file mode 100644
index 0000000..6f12f07
--- /dev/null
+++ b/test/f32-gavgpool-cw.cc
@@ -0,0 +1,276 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <gtest/gtest.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
+#include <xnnpack/gavgpool.h>
+#include "gavgpool-cw-microkernel-tester.h"
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(F32_GAVGPOOL_CW__NEON_X4, elements_eq_4) {
+    TEST_REQUIRES_ARM_NEON;
+    GAvgPoolCWMicrokernelTester()
+      .elements(4)
+      .channels(4)
+      .Test(xnn_f32_gavgpool_cw_ukernel__neon_x4);
+  }
+
+  TEST(F32_GAVGPOOL_CW__NEON_X4, elements_div_4) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t elements = 8; elements < 32; elements += 4) {
+      GAvgPoolCWMicrokernelTester()
+        .elements(elements)
+        .channels(4)
+        .Test(xnn_f32_gavgpool_cw_ukernel__neon_x4);
+    }
+  }
+
+  TEST(F32_GAVGPOOL_CW__NEON_X4, elements_lt_4) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t elements = 1; elements < 4; elements++) {
+      GAvgPoolCWMicrokernelTester()
+        .elements(elements)
+        .channels(4)
+        .Test(xnn_f32_gavgpool_cw_ukernel__neon_x4);
+    }
+  }
+
+  TEST(F32_GAVGPOOL_CW__NEON_X4, elements_gt_4) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t elements = 5; elements < 8; elements++) {
+      GAvgPoolCWMicrokernelTester()
+        .elements(elements)
+        .channels(4)
+        .Test(xnn_f32_gavgpool_cw_ukernel__neon_x4);
+    }
+  }
+
+  TEST(F32_GAVGPOOL_CW__NEON_X4, channels_lt_4) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels < 4; channels++) {
+      for (size_t elements = 1; elements < 16; elements += 3) {
+        GAvgPoolCWMicrokernelTester()
+          .elements(elements)
+          .channels(channels)
+          .Test(xnn_f32_gavgpool_cw_ukernel__neon_x4);
+      }
+    }
+  }
+
+  TEST(F32_GAVGPOOL_CW__NEON_X4, channels_gt_4) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 5; channels < 8; channels++) {
+      for (size_t elements = 1; elements < 16; elements += 3) {
+        GAvgPoolCWMicrokernelTester()
+          .elements(elements)
+          .channels(channels)
+          .Test(xnn_f32_gavgpool_cw_ukernel__neon_x4);
+      }
+    }
+  }
+
+  TEST(F32_GAVGPOOL_CW__NEON_X4, channels_div_4) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 8; channels <= 16; channels += 4) {
+      for (size_t elements = 1; elements < 16; elements += 3) {
+        GAvgPoolCWMicrokernelTester()
+          .elements(elements)
+          .channels(channels)
+          .Test(xnn_f32_gavgpool_cw_ukernel__neon_x4);
+      }
+    }
+  }
+
+  TEST(F32_GAVGPOOL_CW__NEON_X4, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t elements = 1; elements < 16; elements += 3) {
+      GAvgPoolCWMicrokernelTester()
+        .elements(elements)
+        .channels(4)
+        .qmin(128)
+        .Test(xnn_f32_gavgpool_cw_ukernel__neon_x4);
+    }
+  }
+
+  TEST(F32_GAVGPOOL_CW__NEON_X4, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t elements = 1; elements < 16; elements += 3) {
+      GAvgPoolCWMicrokernelTester()
+        .elements(elements)
+        .channels(4)
+        .qmax(128)
+        .Test(xnn_f32_gavgpool_cw_ukernel__neon_x4);
+    }
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(F32_GAVGPOOL_CW__SSE_X4, elements_eq_4) {
+    TEST_REQUIRES_X86_SSE;
+    GAvgPoolCWMicrokernelTester()
+      .elements(4)
+      .channels(4)
+      .Test(xnn_f32_gavgpool_cw_ukernel__sse_x4);
+  }
+
+  TEST(F32_GAVGPOOL_CW__SSE_X4, elements_div_4) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t elements = 8; elements < 32; elements += 4) {
+      GAvgPoolCWMicrokernelTester()
+        .elements(elements)
+        .channels(4)
+        .Test(xnn_f32_gavgpool_cw_ukernel__sse_x4);
+    }
+  }
+
+  TEST(F32_GAVGPOOL_CW__SSE_X4, elements_lt_4) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t elements = 1; elements < 4; elements++) {
+      GAvgPoolCWMicrokernelTester()
+        .elements(elements)
+        .channels(4)
+        .Test(xnn_f32_gavgpool_cw_ukernel__sse_x4);
+    }
+  }
+
+  TEST(F32_GAVGPOOL_CW__SSE_X4, elements_gt_4) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t elements = 5; elements < 8; elements++) {
+      GAvgPoolCWMicrokernelTester()
+        .elements(elements)
+        .channels(4)
+        .Test(xnn_f32_gavgpool_cw_ukernel__sse_x4);
+    }
+  }
+
+  TEST(F32_GAVGPOOL_CW__SSE_X4, channels_lt_4) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels < 4; channels++) {
+      for (size_t elements = 1; elements < 16; elements += 3) {
+        GAvgPoolCWMicrokernelTester()
+          .elements(elements)
+          .channels(channels)
+          .Test(xnn_f32_gavgpool_cw_ukernel__sse_x4);
+      }
+    }
+  }
+
+  TEST(F32_GAVGPOOL_CW__SSE_X4, channels_gt_4) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 5; channels < 8; channels++) {
+      for (size_t elements = 1; elements < 16; elements += 3) {
+        GAvgPoolCWMicrokernelTester()
+          .elements(elements)
+          .channels(channels)
+          .Test(xnn_f32_gavgpool_cw_ukernel__sse_x4);
+      }
+    }
+  }
+
+  TEST(F32_GAVGPOOL_CW__SSE_X4, channels_div_4) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 8; channels <= 16; channels += 4) {
+      for (size_t elements = 1; elements < 16; elements += 3) {
+        GAvgPoolCWMicrokernelTester()
+          .elements(elements)
+          .channels(channels)
+          .Test(xnn_f32_gavgpool_cw_ukernel__sse_x4);
+      }
+    }
+  }
+
+  TEST(F32_GAVGPOOL_CW__SSE_X4, qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t elements = 1; elements < 16; elements += 3) {
+      GAvgPoolCWMicrokernelTester()
+        .elements(elements)
+        .channels(4)
+        .qmin(128)
+        .Test(xnn_f32_gavgpool_cw_ukernel__sse_x4);
+    }
+  }
+
+  TEST(F32_GAVGPOOL_CW__SSE_X4, qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t elements = 1; elements < 16; elements += 3) {
+      GAvgPoolCWMicrokernelTester()
+        .elements(elements)
+        .channels(4)
+        .qmax(128)
+        .Test(xnn_f32_gavgpool_cw_ukernel__sse_x4);
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+TEST(F32_GAVGPOOL_CW__SCALAR_X1, elements_eq_4) {
+  GAvgPoolCWMicrokernelTester()
+    .elements(4)
+    .channels(1)
+    .Test(xnn_f32_gavgpool_cw_ukernel__scalar_x1, GAvgPoolCWMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_GAVGPOOL_CW__SCALAR_X1, elements_div_4) {
+  for (size_t elements = 8; elements < 32; elements += 4) {
+    GAvgPoolCWMicrokernelTester()
+      .elements(elements)
+      .channels(1)
+      .Test(xnn_f32_gavgpool_cw_ukernel__scalar_x1, GAvgPoolCWMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_GAVGPOOL_CW__SCALAR_X1, elements_lt_4) {
+  for (size_t elements = 1; elements < 4; elements++) {
+    GAvgPoolCWMicrokernelTester()
+      .elements(elements)
+      .channels(1)
+      .Test(xnn_f32_gavgpool_cw_ukernel__scalar_x1, GAvgPoolCWMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_GAVGPOOL_CW__SCALAR_X1, elements_gt_4) {
+  for (size_t elements = 5; elements < 8; elements++) {
+    GAvgPoolCWMicrokernelTester()
+      .elements(elements)
+      .channels(1)
+      .Test(xnn_f32_gavgpool_cw_ukernel__scalar_x1, GAvgPoolCWMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_GAVGPOOL_CW__SCALAR_X1, channels_gt_1) {
+  for (size_t channels = 2; channels < 5; channels++) {
+    for (size_t elements = 1; elements < 16; elements += 3) {
+      GAvgPoolCWMicrokernelTester()
+        .elements(elements)
+        .channels(channels)
+        .Test(xnn_f32_gavgpool_cw_ukernel__scalar_x1, GAvgPoolCWMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_GAVGPOOL_CW__SCALAR_X1, qmin) {
+  for (size_t elements = 1; elements < 16; elements += 3) {
+    GAvgPoolCWMicrokernelTester()
+      .elements(elements)
+      .channels(4)
+      .qmin(128)
+      .Test(xnn_f32_gavgpool_cw_ukernel__scalar_x1, GAvgPoolCWMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_GAVGPOOL_CW__SCALAR_X1, qmax) {
+  for (size_t elements = 1; elements < 16; elements += 3) {
+    GAvgPoolCWMicrokernelTester()
+      .elements(elements)
+      .channels(4)
+      .qmax(128)
+      .Test(xnn_f32_gavgpool_cw_ukernel__scalar_x1, GAvgPoolCWMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-gavgpool-spchw.cc b/test/f32-gavgpool-spchw.cc
deleted file mode 100644
index b2b2b40..0000000
--- a/test/f32-gavgpool-spchw.cc
+++ /dev/null
@@ -1,276 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <gtest/gtest.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/isa-checks.h>
-
-#include <xnnpack/gavgpool.h>
-#include "gavgpool-spchw-microkernel-tester.h"
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GAVGPOOL_SPCHW__NEON_X4, elements_eq_4) {
-    TEST_REQUIRES_ARM_NEON;
-    GAvgPoolSpCHWMicrokernelTester()
-      .elements(4)
-      .channels(4)
-      .Test(xnn_f32_gavgpool_spchw_ukernel__neon_x4);
-  }
-
-  TEST(F32_GAVGPOOL_SPCHW__NEON_X4, elements_div_4) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t elements = 8; elements < 32; elements += 4) {
-      GAvgPoolSpCHWMicrokernelTester()
-        .elements(elements)
-        .channels(4)
-        .Test(xnn_f32_gavgpool_spchw_ukernel__neon_x4);
-    }
-  }
-
-  TEST(F32_GAVGPOOL_SPCHW__NEON_X4, elements_lt_4) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t elements = 1; elements < 4; elements++) {
-      GAvgPoolSpCHWMicrokernelTester()
-        .elements(elements)
-        .channels(4)
-        .Test(xnn_f32_gavgpool_spchw_ukernel__neon_x4);
-    }
-  }
-
-  TEST(F32_GAVGPOOL_SPCHW__NEON_X4, elements_gt_4) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t elements = 5; elements < 8; elements++) {
-      GAvgPoolSpCHWMicrokernelTester()
-        .elements(elements)
-        .channels(4)
-        .Test(xnn_f32_gavgpool_spchw_ukernel__neon_x4);
-    }
-  }
-
-  TEST(F32_GAVGPOOL_SPCHW__NEON_X4, channels_lt_4) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 1; channels < 4; channels++) {
-      for (size_t elements = 1; elements < 16; elements += 3) {
-        GAvgPoolSpCHWMicrokernelTester()
-          .elements(elements)
-          .channels(channels)
-          .Test(xnn_f32_gavgpool_spchw_ukernel__neon_x4);
-      }
-    }
-  }
-
-  TEST(F32_GAVGPOOL_SPCHW__NEON_X4, channels_gt_4) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 5; channels < 8; channels++) {
-      for (size_t elements = 1; elements < 16; elements += 3) {
-        GAvgPoolSpCHWMicrokernelTester()
-          .elements(elements)
-          .channels(channels)
-          .Test(xnn_f32_gavgpool_spchw_ukernel__neon_x4);
-      }
-    }
-  }
-
-  TEST(F32_GAVGPOOL_SPCHW__NEON_X4, channels_div_4) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t channels = 8; channels <= 16; channels += 4) {
-      for (size_t elements = 1; elements < 16; elements += 3) {
-        GAvgPoolSpCHWMicrokernelTester()
-          .elements(elements)
-          .channels(channels)
-          .Test(xnn_f32_gavgpool_spchw_ukernel__neon_x4);
-      }
-    }
-  }
-
-  TEST(F32_GAVGPOOL_SPCHW__NEON_X4, qmin) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t elements = 1; elements < 16; elements += 3) {
-      GAvgPoolSpCHWMicrokernelTester()
-        .elements(elements)
-        .channels(4)
-        .qmin(128)
-        .Test(xnn_f32_gavgpool_spchw_ukernel__neon_x4);
-    }
-  }
-
-  TEST(F32_GAVGPOOL_SPCHW__NEON_X4, qmax) {
-    TEST_REQUIRES_ARM_NEON;
-    for (size_t elements = 1; elements < 16; elements += 3) {
-      GAvgPoolSpCHWMicrokernelTester()
-        .elements(elements)
-        .channels(4)
-        .qmax(128)
-        .Test(xnn_f32_gavgpool_spchw_ukernel__neon_x4);
-    }
-  }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(F32_GAVGPOOL_SPCHW__SSE_X4, elements_eq_4) {
-    TEST_REQUIRES_X86_SSE;
-    GAvgPoolSpCHWMicrokernelTester()
-      .elements(4)
-      .channels(4)
-      .Test(xnn_f32_gavgpool_spchw_ukernel__sse_x4);
-  }
-
-  TEST(F32_GAVGPOOL_SPCHW__SSE_X4, elements_div_4) {
-    TEST_REQUIRES_X86_SSE;
-    for (size_t elements = 8; elements < 32; elements += 4) {
-      GAvgPoolSpCHWMicrokernelTester()
-        .elements(elements)
-        .channels(4)
-        .Test(xnn_f32_gavgpool_spchw_ukernel__sse_x4);
-    }
-  }
-
-  TEST(F32_GAVGPOOL_SPCHW__SSE_X4, elements_lt_4) {
-    TEST_REQUIRES_X86_SSE;
-    for (size_t elements = 1; elements < 4; elements++) {
-      GAvgPoolSpCHWMicrokernelTester()
-        .elements(elements)
-        .channels(4)
-        .Test(xnn_f32_gavgpool_spchw_ukernel__sse_x4);
-    }
-  }
-
-  TEST(F32_GAVGPOOL_SPCHW__SSE_X4, elements_gt_4) {
-    TEST_REQUIRES_X86_SSE;
-    for (size_t elements = 5; elements < 8; elements++) {
-      GAvgPoolSpCHWMicrokernelTester()
-        .elements(elements)
-        .channels(4)
-        .Test(xnn_f32_gavgpool_spchw_ukernel__sse_x4);
-    }
-  }
-
-  TEST(F32_GAVGPOOL_SPCHW__SSE_X4, channels_lt_4) {
-    TEST_REQUIRES_X86_SSE;
-    for (size_t channels = 1; channels < 4; channels++) {
-      for (size_t elements = 1; elements < 16; elements += 3) {
-        GAvgPoolSpCHWMicrokernelTester()
-          .elements(elements)
-          .channels(channels)
-          .Test(xnn_f32_gavgpool_spchw_ukernel__sse_x4);
-      }
-    }
-  }
-
-  TEST(F32_GAVGPOOL_SPCHW__SSE_X4, channels_gt_4) {
-    TEST_REQUIRES_X86_SSE;
-    for (size_t channels = 5; channels < 8; channels++) {
-      for (size_t elements = 1; elements < 16; elements += 3) {
-        GAvgPoolSpCHWMicrokernelTester()
-          .elements(elements)
-          .channels(channels)
-          .Test(xnn_f32_gavgpool_spchw_ukernel__sse_x4);
-      }
-    }
-  }
-
-  TEST(F32_GAVGPOOL_SPCHW__SSE_X4, channels_div_4) {
-    TEST_REQUIRES_X86_SSE;
-    for (size_t channels = 8; channels <= 16; channels += 4) {
-      for (size_t elements = 1; elements < 16; elements += 3) {
-        GAvgPoolSpCHWMicrokernelTester()
-          .elements(elements)
-          .channels(channels)
-          .Test(xnn_f32_gavgpool_spchw_ukernel__sse_x4);
-      }
-    }
-  }
-
-  TEST(F32_GAVGPOOL_SPCHW__SSE_X4, qmin) {
-    TEST_REQUIRES_X86_SSE;
-    for (size_t elements = 1; elements < 16; elements += 3) {
-      GAvgPoolSpCHWMicrokernelTester()
-        .elements(elements)
-        .channels(4)
-        .qmin(128)
-        .Test(xnn_f32_gavgpool_spchw_ukernel__sse_x4);
-    }
-  }
-
-  TEST(F32_GAVGPOOL_SPCHW__SSE_X4, qmax) {
-    TEST_REQUIRES_X86_SSE;
-    for (size_t elements = 1; elements < 16; elements += 3) {
-      GAvgPoolSpCHWMicrokernelTester()
-        .elements(elements)
-        .channels(4)
-        .qmax(128)
-        .Test(xnn_f32_gavgpool_spchw_ukernel__sse_x4);
-    }
-  }
-#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-TEST(F32_GAVGPOOL_SPCHW__SCALAR_X1, elements_eq_4) {
-  GAvgPoolSpCHWMicrokernelTester()
-    .elements(4)
-    .channels(1)
-    .Test(xnn_f32_gavgpool_spchw_ukernel__scalar_x1, GAvgPoolSpCHWMicrokernelTester::Variant::Scalar);
-}
-
-TEST(F32_GAVGPOOL_SPCHW__SCALAR_X1, elements_div_4) {
-  for (size_t elements = 8; elements < 32; elements += 4) {
-    GAvgPoolSpCHWMicrokernelTester()
-      .elements(elements)
-      .channels(1)
-      .Test(xnn_f32_gavgpool_spchw_ukernel__scalar_x1, GAvgPoolSpCHWMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(F32_GAVGPOOL_SPCHW__SCALAR_X1, elements_lt_4) {
-  for (size_t elements = 1; elements < 4; elements++) {
-    GAvgPoolSpCHWMicrokernelTester()
-      .elements(elements)
-      .channels(1)
-      .Test(xnn_f32_gavgpool_spchw_ukernel__scalar_x1, GAvgPoolSpCHWMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(F32_GAVGPOOL_SPCHW__SCALAR_X1, elements_gt_4) {
-  for (size_t elements = 5; elements < 8; elements++) {
-    GAvgPoolSpCHWMicrokernelTester()
-      .elements(elements)
-      .channels(1)
-      .Test(xnn_f32_gavgpool_spchw_ukernel__scalar_x1, GAvgPoolSpCHWMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(F32_GAVGPOOL_SPCHW__SCALAR_X1, channels_gt_1) {
-  for (size_t channels = 2; channels < 5; channels++) {
-    for (size_t elements = 1; elements < 16; elements += 3) {
-      GAvgPoolSpCHWMicrokernelTester()
-        .elements(elements)
-        .channels(channels)
-        .Test(xnn_f32_gavgpool_spchw_ukernel__scalar_x1, GAvgPoolSpCHWMicrokernelTester::Variant::Scalar);
-    }
-  }
-}
-
-TEST(F32_GAVGPOOL_SPCHW__SCALAR_X1, qmin) {
-  for (size_t elements = 1; elements < 16; elements += 3) {
-    GAvgPoolSpCHWMicrokernelTester()
-      .elements(elements)
-      .channels(4)
-      .qmin(128)
-      .Test(xnn_f32_gavgpool_spchw_ukernel__scalar_x1, GAvgPoolSpCHWMicrokernelTester::Variant::Scalar);
-  }
-}
-
-TEST(F32_GAVGPOOL_SPCHW__SCALAR_X1, qmax) {
-  for (size_t elements = 1; elements < 16; elements += 3) {
-    GAvgPoolSpCHWMicrokernelTester()
-      .elements(elements)
-      .channels(4)
-      .qmax(128)
-      .Test(xnn_f32_gavgpool_spchw_ukernel__scalar_x1, GAvgPoolSpCHWMicrokernelTester::Variant::Scalar);
-  }
-}
diff --git a/test/gavgpool-spchw-microkernel-tester.h b/test/gavgpool-cw-microkernel-tester.h
similarity index 88%
rename from test/gavgpool-spchw-microkernel-tester.h
rename to test/gavgpool-cw-microkernel-tester.h
index 113f470..76ce050 100644
--- a/test/gavgpool-spchw-microkernel-tester.h
+++ b/test/gavgpool-cw-microkernel-tester.h
@@ -22,14 +22,14 @@
 #include <xnnpack/params.h>
 
 
-class GAvgPoolSpCHWMicrokernelTester {
+class GAvgPoolCWMicrokernelTester {
  public:
   enum class Variant {
     Native,
     Scalar,
   };
 
-  inline GAvgPoolSpCHWMicrokernelTester& elements(size_t elements) {
+  inline GAvgPoolCWMicrokernelTester& elements(size_t elements) {
     assert(elements != 0);
     this->elements_ = elements;
     return *this;
@@ -39,7 +39,7 @@
     return this->elements_;
   }
 
-  inline GAvgPoolSpCHWMicrokernelTester& channels(size_t channels) {
+  inline GAvgPoolCWMicrokernelTester& channels(size_t channels) {
     assert(channels != 0);
     this->channels_ = channels;
     return *this;
@@ -49,7 +49,7 @@
     return this->channels_;
   }
 
-  inline GAvgPoolSpCHWMicrokernelTester& qmin(uint8_t qmin) {
+  inline GAvgPoolCWMicrokernelTester& qmin(uint8_t qmin) {
     this->qmin_ = qmin;
     return *this;
   }
@@ -58,7 +58,7 @@
     return this->qmin_;
   }
 
-  inline GAvgPoolSpCHWMicrokernelTester& qmax(uint8_t qmax) {
+  inline GAvgPoolCWMicrokernelTester& qmax(uint8_t qmax) {
     this->qmax_ = qmax;
     return *this;
   }
@@ -67,7 +67,7 @@
     return this->qmax_;
   }
 
-  inline GAvgPoolSpCHWMicrokernelTester& iterations(size_t iterations) {
+  inline GAvgPoolCWMicrokernelTester& iterations(size_t iterations) {
     this->iterations_ = iterations;
     return *this;
   }
@@ -77,7 +77,7 @@
   }
 
 
-  void Test(xnn_f32_gavgpool_spchw_ukernel_function gavgpool, Variant variant = Variant::Native) const {
+  void Test(xnn_f32_gavgpool_cw_ukernel_function gavgpool, Variant variant = Variant::Native) const {
     std::random_device random_device;
     auto rng = std::mt19937(random_device());
     auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);