CHW DWCONV with implicit padding

PiperOrigin-RevId: 310369233
diff --git a/test/f32-dwconv-spchw.cc b/test/f32-dwconv-spchw.cc
index c53170f..173c5e0 100644
--- a/test/f32-dwconv-spchw.cc
+++ b/test/f32-dwconv-spchw.cc
@@ -11,7 +11,6 @@
 #include <xnnpack/dwconv.h>
 #include "dwconv-spchw-microkernel-tester.h"
 
-
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_DWCONV_SPCHW_3X3P1__SSE, input_width_eq_4) {
     TEST_REQUIRES_X86_SSE;
@@ -21,6 +20,8 @@
       .input_width(4)
       .padding_left(1)
       .padding_right(1)
+      .padding_top(1)
+      .padding_bottom(1)
       .kernel_height(3)
       .kernel_width(3)
       .output_height(1)
@@ -36,6 +37,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
@@ -52,6 +55,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
@@ -68,6 +73,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
@@ -85,6 +92,8 @@
         .input_width_stride(36)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
@@ -103,6 +112,8 @@
         .input_tuple_stride(3 * 4)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
@@ -120,6 +131,8 @@
           .input_width(input_width)
           .padding_left(1)
           .padding_right(1)
+          .padding_top(1)
+          .padding_bottom(1)
           .kernel_height(3)
           .kernel_width(3)
           .output_height(output_height)
@@ -137,6 +150,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .output_height(5)
@@ -154,6 +169,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .output_height(5)
@@ -173,6 +190,8 @@
         .input_width_stride(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .output_height(5)
@@ -183,7 +202,7 @@
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_eq_4) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_eq_4_pad0) {
     TEST_REQUIRES_X86_SSE;
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(4)
@@ -191,6 +210,8 @@
       .input_width(4)
       .padding_left(1)
       .padding_right(1)
+      .padding_top(0)
+      .padding_bottom(1)
       .kernel_height(3)
       .kernel_width(3)
       .subsampling(2)
@@ -198,7 +219,7 @@
       .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_lt_4) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_lt_4_pad0) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 4; input_width++) {
       DWConvSpCHWMicrokernelTester()
@@ -207,6 +228,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(0)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .subsampling(2)
@@ -215,7 +238,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_gt_4) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_gt_4_pad0) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 5; input_width < 8; input_width++) {
       DWConvSpCHWMicrokernelTester()
@@ -224,6 +247,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(0)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .subsampling(2)
@@ -232,7 +257,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_div_4) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_div_4_pad0) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 8; input_width < 32; input_width += 4) {
       DWConvSpCHWMicrokernelTester()
@@ -241,6 +266,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(0)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .subsampling(2)
@@ -249,7 +276,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_stride) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_stride_pad0) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -259,6 +286,8 @@
         .input_width_stride(36)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(0)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .subsampling(2)
@@ -267,7 +296,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_tuple_stride) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_tuple_stride_pad0) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 5) {
       DWConvSpCHWMicrokernelTester()
@@ -278,6 +307,8 @@
         .input_tuple_stride(3 * 4)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(0)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .subsampling(2)
@@ -286,7 +317,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, output_height_gt_1) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, output_height_gt_1_pad0) {
     TEST_REQUIRES_X86_SSE;
     for (size_t output_height = 2; output_height < 5; output_height++) {
       for (size_t input_width = 1; input_width < 32; input_width += 3) {
@@ -296,6 +327,8 @@
           .input_width(input_width)
           .padding_left(1)
           .padding_right(1)
+          .padding_top(0)
+          .padding_bottom(1)
           .kernel_height(3)
           .kernel_width(3)
           .subsampling(2)
@@ -305,7 +338,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, output_width_stride) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, output_width_stride_pad0) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -314,6 +347,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(0)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .subsampling(2)
@@ -323,7 +358,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, output_tuple_stride) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, output_tuple_stride_pad0) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -332,6 +367,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(0)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .subsampling(2)
@@ -342,7 +379,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, chw_layout) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, chw_layout_pad0) {
     TEST_REQUIRES_X86_SSE;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -352,6 +389,206 @@
         .input_width_stride(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(0)
+        .padding_bottom(1)
+        .kernel_height(3)
+        .kernel_width(3)
+        .subsampling(2)
+        .output_height(5)
+        .output_width_stride((input_width - 1) / 2 + 1)
+        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_eq_4_pad1) {
+    TEST_REQUIRES_X86_SSE;
+    DWConvSpCHWMicrokernelTester()
+      .input_tuple_size(4)
+      .output_tuple_size(4)
+      .input_width(4)
+      .padding_left(1)
+      .padding_right(1)
+      .padding_top(1)
+      .padding_bottom(1)
+      .kernel_height(3)
+      .kernel_width(3)
+      .subsampling(2)
+      .output_height(1)
+      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_lt_4_pad1) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t input_width = 1; input_width < 4; input_width++) {
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .padding_left(1)
+        .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
+        .kernel_height(3)
+        .kernel_width(3)
+        .subsampling(2)
+        .output_height(1)
+        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_gt_4_pad1) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t input_width = 5; input_width < 8; input_width++) {
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .padding_left(1)
+        .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
+        .kernel_height(3)
+        .kernel_width(3)
+        .subsampling(2)
+        .output_height(1)
+        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_div_4_pad1) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t input_width = 8; input_width < 32; input_width += 4) {
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .padding_left(1)
+        .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
+        .kernel_height(3)
+        .kernel_width(3)
+        .subsampling(2)
+        .output_height(1)
+        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_width_stride_pad1) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t input_width = 1; input_width < 32; input_width += 3) {
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .input_width_stride(36)
+        .padding_left(1)
+        .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
+        .kernel_height(3)
+        .kernel_width(3)
+        .subsampling(2)
+        .output_height(1)
+        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, input_tuple_stride_pad1) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t input_width = 1; input_width < 32; input_width += 5) {
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .input_width_stride(4)
+        .input_tuple_stride(3 * 4)
+        .padding_left(1)
+        .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
+        .kernel_height(3)
+        .kernel_width(3)
+        .subsampling(2)
+        .output_height(1)
+        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, output_height_gt_1_pad1) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t output_height = 2; output_height < 5; output_height++) {
+      for (size_t input_width = 1; input_width < 32; input_width += 3) {
+        DWConvSpCHWMicrokernelTester()
+          .input_tuple_size(4)
+          .output_tuple_size(4)
+          .input_width(input_width)
+          .padding_left(1)
+          .padding_right(1)
+          .padding_top(1)
+          .padding_bottom(1)
+          .kernel_height(3)
+          .kernel_width(3)
+          .subsampling(2)
+          .output_height(output_height)
+          .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, output_width_stride_pad1) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t input_width = 1; input_width < 32; input_width += 3) {
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .padding_left(1)
+        .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
+        .kernel_height(3)
+        .kernel_width(3)
+        .subsampling(2)
+        .output_height(5)
+        .output_width_stride(36)
+        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, output_tuple_stride_pad1) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t input_width = 1; input_width < 32; input_width += 3) {
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .padding_left(1)
+        .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
+        .kernel_height(3)
+        .kernel_width(3)
+        .subsampling(2)
+        .output_height(5)
+        .output_width_stride(4)
+        .output_tuple_stride(5 * 4)
+        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__SSE, chw_layout_pad1) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t input_width = 1; input_width < 32; input_width += 3) {
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .input_width_stride(input_width)
+        .padding_left(1)
+        .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .subsampling(2)
@@ -364,7 +601,7 @@
 
 
 #if XNN_ARCH_ARM64
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_width_eq_4) {
+  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_width_eq_4_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(4)
@@ -372,13 +609,15 @@
       .input_width(4)
       .padding_left(1)
       .padding_right(1)
+      .padding_top(1)
+      .padding_bottom(1)
       .kernel_height(3)
       .kernel_width(3)
       .output_height(1)
       .Test(xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma);
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_width_lt_4) {
+  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_width_lt_4_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 4; input_width++) {
       DWConvSpCHWMicrokernelTester()
@@ -387,6 +626,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
@@ -394,7 +635,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_width_gt_4) {
+  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_width_gt_4_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 5; input_width < 8; input_width++) {
       DWConvSpCHWMicrokernelTester()
@@ -403,6 +644,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
@@ -410,7 +653,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_width_div_4) {
+  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_width_div_4_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 8; input_width < 32; input_width += 4) {
       DWConvSpCHWMicrokernelTester()
@@ -419,6 +662,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
@@ -426,7 +671,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_width_stride) {
+  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_width_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -436,6 +681,8 @@
         .input_width_stride(36)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
@@ -443,7 +690,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_tuple_stride) {
+  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, input_tuple_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 5) {
       DWConvSpCHWMicrokernelTester()
@@ -454,6 +701,8 @@
         .input_tuple_stride(3 * 4)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .output_height(1)
@@ -461,16 +710,18 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, output_height_gt_1) {
+  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, output_height_gt_1_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t output_height = 2; output_height < 5; output_height++) {
-      for (size_t input_width = 1; input_width < 32; input_width += 3) {
+    for (size_t output_height = 2; output_height <= 5; output_height++) {
+      for (size_t input_width = 8; input_width < 9; input_width += 3) {
         DWConvSpCHWMicrokernelTester()
           .input_tuple_size(4)
           .output_tuple_size(4)
           .input_width(input_width)
           .padding_left(1)
           .padding_right(1)
+          .padding_top(1)
+          .padding_bottom(1)
           .kernel_height(3)
           .kernel_width(3)
           .output_height(output_height)
@@ -479,7 +730,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, output_width_stride) {
+  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, output_width_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -488,6 +739,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .output_height(5)
@@ -496,7 +749,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, output_tuple_stride) {
+  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, output_tuple_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -505,6 +758,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .output_height(5)
@@ -514,7 +769,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, chw_layout) {
+  TEST(F32_DWCONV_SPCHW_3X3P1__NEONFMA, chw_layout_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -524,6 +779,8 @@
         .input_width_stride(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .output_height(5)
@@ -535,7 +792,7 @@
 
 
 #if XNN_ARCH_ARM64
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_eq_4) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_eq_4_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(4)
@@ -543,6 +800,8 @@
       .input_width(4)
       .padding_left(1)
       .padding_right(1)
+      .padding_top(0)
+      .padding_bottom(1)
       .kernel_height(3)
       .kernel_width(3)
       .subsampling(2)
@@ -550,7 +809,24 @@
       .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_lt_4) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_eq_4_pad1) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    DWConvSpCHWMicrokernelTester()
+      .input_tuple_size(4)
+      .output_tuple_size(4)
+      .input_width(4)
+      .padding_left(1)
+      .padding_right(1)
+      .padding_top(1)
+      .padding_bottom(1)
+      .kernel_height(3)
+      .kernel_width(3)
+      .subsampling(2)
+      .output_height(1)
+      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_lt_4_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 4; input_width++) {
       DWConvSpCHWMicrokernelTester()
@@ -559,6 +835,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(0)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .subsampling(2)
@@ -567,7 +845,26 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_gt_4) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_lt_4_pad1) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t input_width = 1; input_width < 4; input_width++) {
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .padding_left(1)
+        .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
+        .kernel_height(3)
+        .kernel_width(3)
+        .subsampling(2)
+        .output_height(1)
+        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_gt_4_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 5; input_width < 8; input_width++) {
       DWConvSpCHWMicrokernelTester()
@@ -576,6 +873,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(0)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .subsampling(2)
@@ -584,7 +883,26 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_div_4) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_gt_4_pad1) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t input_width = 5; input_width < 8; input_width++) {
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .padding_left(1)
+        .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
+        .kernel_height(3)
+        .kernel_width(3)
+        .subsampling(2)
+        .output_height(1)
+        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_div_4_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 8; input_width < 32; input_width += 4) {
       DWConvSpCHWMicrokernelTester()
@@ -593,6 +911,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(0)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .subsampling(2)
@@ -601,7 +921,26 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_stride) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_div_4_pad1) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t input_width = 8; input_width < 32; input_width += 4) {
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .padding_left(1)
+        .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
+        .kernel_height(3)
+        .kernel_width(3)
+        .subsampling(2)
+        .output_height(1)
+        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_stride_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -611,6 +950,8 @@
         .input_width_stride(36)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(0)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .subsampling(2)
@@ -619,7 +960,28 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_tuple_stride) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_width_stride_pad1) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t input_width = 1; input_width < 32; input_width += 3) {
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .input_width_stride(36)
+        .padding_left(1)
+        .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
+        .kernel_height(3)
+        .kernel_width(3)
+        .subsampling(2)
+        .output_height(1)
+        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+    }
+  }
+
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_tuple_stride_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 5) {
       DWConvSpCHWMicrokernelTester()
@@ -630,6 +992,8 @@
         .input_tuple_stride(3 * 4)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(0)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .subsampling(2)
@@ -638,7 +1002,28 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, output_height_gt_1) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, input_tuple_stride_pad1) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t input_width = 1; input_width < 32; input_width += 5) {
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .input_width_stride(4)
+        .input_tuple_stride(3 * 4)
+        .padding_left(1)
+        .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
+        .kernel_height(3)
+        .kernel_width(3)
+        .subsampling(2)
+        .output_height(1)
+        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, output_height_gt_1_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t output_height = 2; output_height < 5; output_height++) {
       for (size_t input_width = 1; input_width < 32; input_width += 3) {
@@ -648,6 +1033,8 @@
           .input_width(input_width)
           .padding_left(1)
           .padding_right(1)
+          .padding_top(0)
+          .padding_bottom(1)
           .kernel_height(3)
           .kernel_width(3)
           .subsampling(2)
@@ -657,7 +1044,28 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, output_width_stride) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, output_height_gt_1_pad1) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t output_height = 2; output_height < 5; output_height++) {
+      for (size_t input_width = 1; input_width < 32; input_width += 3) {
+        DWConvSpCHWMicrokernelTester()
+          .input_tuple_size(4)
+          .output_tuple_size(4)
+          .input_width(input_width)
+          .padding_left(1)
+          .padding_right(1)
+          .padding_top(1)
+          .padding_bottom(1)
+          .kernel_height(3)
+          .kernel_width(3)
+          .subsampling(2)
+          .output_height(output_height)
+          .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, output_width_stride_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -666,6 +1074,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(0)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .subsampling(2)
@@ -675,7 +1085,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, output_tuple_stride) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, output_width_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -684,6 +1094,28 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
+        .kernel_height(3)
+        .kernel_width(3)
+        .subsampling(2)
+        .output_height(5)
+        .output_width_stride(36)
+        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, output_tuple_stride_pad0) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t input_width = 1; input_width < 32; input_width += 3) {
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .padding_left(1)
+        .padding_right(1)
+        .padding_top(0)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .subsampling(2)
@@ -694,7 +1126,28 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, chw_layout) {
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, output_tuple_stride_pad1) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t input_width = 1; input_width < 32; input_width += 3) {
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .padding_left(1)
+        .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
+        .kernel_height(3)
+        .kernel_width(3)
+        .subsampling(2)
+        .output_height(5)
+        .output_width_stride(4)
+        .output_tuple_stride(5 * 4)
+        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, chw_layout_pad0) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -704,6 +1157,29 @@
         .input_width_stride(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(0)
+        .padding_bottom(1)
+        .kernel_height(3)
+        .kernel_width(3)
+        .subsampling(2)
+        .output_height(5)
+        .output_width_stride((input_width - 1) / 2 + 1)
+        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_3X3S2P1__NEONFMA, chw_layout_pad1) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t input_width = 1; input_width < 32; input_width += 3) {
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .input_width_stride(input_width)
+        .padding_left(1)
+        .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .subsampling(2)
@@ -716,7 +1192,7 @@
 
 
 #if XNN_ARCH_ARM64
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_width_eq_4) {
+  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_width_eq_4_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(4)
@@ -724,13 +1200,15 @@
       .input_width(4)
       .padding_left(2)
       .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
       .kernel_height(5)
       .kernel_width(5)
       .output_height(1)
       .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma);
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_width_lt_4) {
+  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_width_lt_4_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 4; input_width++) {
       DWConvSpCHWMicrokernelTester()
@@ -739,6 +1217,8 @@
         .input_width(input_width)
         .padding_left(2)
         .padding_right(2)
+        .padding_top(2)
+        .padding_bottom(2)
         .kernel_height(5)
         .kernel_width(5)
         .output_height(1)
@@ -746,7 +1226,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_width_gt_4) {
+  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_width_gt_4_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 5; input_width < 8; input_width++) {
       DWConvSpCHWMicrokernelTester()
@@ -755,6 +1235,8 @@
         .input_width(input_width)
         .padding_left(2)
         .padding_right(2)
+        .padding_top(2)
+        .padding_bottom(2)
         .kernel_height(5)
         .kernel_width(5)
         .output_height(1)
@@ -762,7 +1244,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_width_div_4) {
+  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_width_div_4_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 8; input_width < 32; input_width += 4) {
       DWConvSpCHWMicrokernelTester()
@@ -771,6 +1253,8 @@
         .input_width(input_width)
         .padding_left(2)
         .padding_right(2)
+        .padding_top(2)
+        .padding_bottom(2)
         .kernel_height(5)
         .kernel_width(5)
         .output_height(1)
@@ -778,7 +1262,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_width_stride) {
+  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_width_stride_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -788,6 +1272,8 @@
         .input_width_stride(36)
         .padding_left(2)
         .padding_right(2)
+        .padding_top(2)
+        .padding_bottom(2)
         .kernel_height(5)
         .kernel_width(5)
         .output_height(1)
@@ -795,7 +1281,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_tuple_stride) {
+  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, input_tuple_stride_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 5) {
       DWConvSpCHWMicrokernelTester()
@@ -806,6 +1292,8 @@
         .input_tuple_stride(3 * 4)
         .padding_left(2)
         .padding_right(2)
+        .padding_top(2)
+        .padding_bottom(2)
         .kernel_height(5)
         .kernel_width(5)
         .output_height(1)
@@ -813,7 +1301,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, output_height_eq_2) {
+  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, output_height_eq_2_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -822,6 +1310,8 @@
         .input_width(input_width)
         .padding_left(2)
         .padding_right(2)
+        .padding_top(2)
+        .padding_bottom(2)
         .kernel_height(5)
         .kernel_width(5)
         .output_height(2)
@@ -829,7 +1319,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, output_height_gt_2) {
+  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, output_height_gt_2_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t output_height = 3; output_height < 5; output_height++) {
       for (size_t input_width = 1; input_width < 32; input_width += 3) {
@@ -839,6 +1329,8 @@
           .input_width(input_width)
           .padding_left(2)
           .padding_right(2)
+          .padding_top(2)  // top/bottom padding of 2, same as left/right
+          .padding_bottom(2)
           .kernel_height(5)
           .kernel_width(5)
           .output_height(output_height)
@@ -847,7 +1339,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, output_width_stride) {
+  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, output_width_stride_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -856,6 +1348,8 @@
         .input_width(input_width)
         .padding_left(2)
         .padding_right(2)
+        .padding_top(2)
+        .padding_bottom(2)
         .kernel_height(5)
         .kernel_width(5)
         .output_height(5)
@@ -864,7 +1358,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, output_tuple_stride) {
+  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, output_tuple_stride_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -873,6 +1367,8 @@
         .input_width(input_width)
         .padding_left(2)
         .padding_right(2)
+        .padding_top(2)
+        .padding_bottom(2)
         .kernel_height(5)
         .kernel_width(5)
         .output_height(5)
@@ -882,7 +1378,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, chw_layout) {
+  TEST(F32_DWCONV_SPCHW_5X5P2__NEONFMA, chw_layout_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       for (size_t output_height = 1; output_height < 32; output_height += 3) {
@@ -893,6 +1389,8 @@
           .input_width_stride(input_width)
           .padding_left(2)
           .padding_right(2)
+          .padding_top(2)
+          .padding_bottom(2)
           .kernel_height(5)
           .kernel_width(5)
           .output_height(5)
@@ -905,7 +1403,7 @@
 
 
 #if XNN_ARCH_ARM64
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_eq_8) {
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_eq_8_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(4)
@@ -913,6 +1411,8 @@
       .input_width(8)
       .padding_left(2)
       .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
       .kernel_height(5)
       .kernel_width(5)
       .subsampling(2)
@@ -920,7 +1420,24 @@
       .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_lt_8) {
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_eq_8_pad1) {  // single case: input width 8, top padding 1
+    TEST_REQUIRES_ARM_NEON_FMA;
+    DWConvSpCHWMicrokernelTester()
+      .input_tuple_size(4)
+      .output_tuple_size(4)
+      .input_width(8)
+      .padding_left(2)
+      .padding_right(2)
+      .padding_top(1)  // asymmetric vertical padding: 1 on top, 2 on the bottom
+      .padding_bottom(2)
+      .kernel_height(5)
+      .kernel_width(5)
+      .subsampling(2)
+      .output_height(1)
+      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+  }
+
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_lt_8_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 8; input_width++) {
       DWConvSpCHWMicrokernelTester()
@@ -929,6 +1446,8 @@
         .input_width(input_width)
         .padding_left(2)
         .padding_right(2)
+        .padding_top(1)
+        .padding_bottom(2)
         .kernel_height(5)
         .kernel_width(5)
         .subsampling(2)
@@ -937,7 +1456,26 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_gt_8) {
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_lt_8_pad2) {  // input widths below 8, padding 2 on all four sides
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t input_width = 1; input_width < 8; input_width++) {  // input widths 1..7
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .padding_left(2)
+        .padding_right(2)
+        .padding_top(2)  // top padding 2, matching the "_pad2" suffix
+        .padding_bottom(2)
+        .kernel_height(5)
+        .kernel_width(5)
+        .subsampling(2)
+        .output_height(1)
+        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_gt_8_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 8; input_width < 16; input_width++) {
       DWConvSpCHWMicrokernelTester()
@@ -946,6 +1484,8 @@
         .input_width(input_width)
         .padding_left(2)
         .padding_right(2)
+        .padding_top(1)
+        .padding_bottom(2)
         .kernel_height(5)
         .kernel_width(5)
         .subsampling(2)
@@ -954,7 +1494,26 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_div_4) {
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_gt_8_pad2) {  // padding 2 on all four sides
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t input_width = 8; input_width < 16; input_width++) {  // widths 8..15; NOTE(review): starts at 8 although the name says "gt_8" -- confirm
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .padding_left(2)
+        .padding_right(2)
+        .padding_top(2)  // top padding 2, matching the "_pad2" suffix
+        .padding_bottom(2)
+        .kernel_height(5)
+        .kernel_width(5)
+        .subsampling(2)
+        .output_height(1)
+        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_div_4_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 16; input_width < 32; input_width += 4) {
       DWConvSpCHWMicrokernelTester()
@@ -963,6 +1522,8 @@
         .input_width(input_width)
         .padding_left(2)
         .padding_right(2)
+        .padding_top(1)
+        .padding_bottom(2)
         .kernel_height(5)
         .kernel_width(5)
         .subsampling(2)
@@ -971,7 +1532,26 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_stride) {
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_div_4_pad2) {  // widths that are multiples of 4, padding 2 on all four sides
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t input_width = 16; input_width < 32; input_width += 4) {  // input widths 16, 20, 24, 28
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .padding_left(2)
+        .padding_right(2)
+        .padding_top(2)  // top padding 2, matching the "_pad2" suffix
+        .padding_bottom(2)
+        .kernel_height(5)
+        .kernel_width(5)
+        .subsampling(2)
+        .output_height(1)
+        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -981,6 +1561,8 @@
         .input_width_stride(36)
         .padding_left(2)
         .padding_right(2)
+        .padding_top(1)
+        .padding_bottom(2)
         .kernel_height(5)
         .kernel_width(5)
         .subsampling(2)
@@ -989,7 +1571,27 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_tuple_stride) {
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_width_stride_pad2) {  // explicit input row stride, padding 2 on all four sides
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t input_width = 1; input_width < 32; input_width += 3) {  // input widths 1, 4, ..., 31
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .input_width_stride(36)  // fixed stride of 36, above every tested width (max 31)
+        .padding_left(2)
+        .padding_right(2)
+        .padding_top(2)
+        .padding_bottom(2)
+        .kernel_height(5)
+        .kernel_width(5)
+        .subsampling(2)
+        .output_height(1)
+        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_tuple_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 5) {
       DWConvSpCHWMicrokernelTester()
@@ -1000,6 +1602,8 @@
         .input_tuple_stride(3 * 4)
         .padding_left(2)
         .padding_right(2)
+        .padding_top(1)
+        .padding_bottom(2)
         .kernel_height(5)
         .kernel_width(5)
         .subsampling(2)
@@ -1008,7 +1612,28 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, output_height_gt_1) {
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, input_tuple_stride_pad2) {  // explicit input tuple stride, padding 2 on all four sides
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t input_width = 1; input_width < 32; input_width += 5) {  // input widths 1, 6, ..., 31
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .input_width_stride(4)
+        .input_tuple_stride(3 * 4)  // tuple stride of 12, set together with input_width_stride(4)
+        .padding_left(2)
+        .padding_right(2)
+        .padding_top(2)
+        .padding_bottom(2)
+        .kernel_height(5)
+        .kernel_width(5)
+        .subsampling(2)
+        .output_height(1)
+        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, output_height_gt_1_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t output_height = 3; output_height < 4; output_height++) {
       for (size_t input_width = 4; input_width < 5; input_width += 3) {
@@ -1018,6 +1643,8 @@
           .input_width(input_width)
           .padding_left(2)
           .padding_right(2)
+          .padding_top(1)
+          .padding_bottom(2)
           .kernel_height(5)
           .kernel_width(5)
           .subsampling(2)
@@ -1027,7 +1654,28 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, output_width_stride) {
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, output_height_gt_1_pad2) {  // multi-row output, padding 2 on all four sides
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t output_height = 3; output_height < 4; output_height++) {  // NOTE(review): runs only for output_height == 3 -- confirm intended
+      for (size_t input_width = 4; input_width < 5; input_width += 3) {  // NOTE(review): runs only for input_width == 4 -- confirm intended
+        DWConvSpCHWMicrokernelTester()
+          .input_tuple_size(4)
+          .output_tuple_size(4)
+          .input_width(input_width)
+          .padding_left(2)
+          .padding_right(2)
+          .padding_top(2)
+          .padding_bottom(2)
+          .kernel_height(5)
+          .kernel_width(5)
+          .subsampling(2)
+          .output_height(output_height)
+          .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, output_width_stride_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -1036,6 +1684,8 @@
         .input_width(input_width)
         .padding_left(2)
         .padding_right(2)
+        .padding_top(1)
+        .padding_bottom(2)
         .kernel_height(5)
         .kernel_width(5)
         .subsampling(2)
@@ -1045,7 +1695,7 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, output_tuple_stride) {
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, output_width_stride_pad2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -1054,6 +1704,28 @@
         .input_width(input_width)
         .padding_left(2)
         .padding_right(2)
+        .padding_top(2)
+        .padding_bottom(2)
+        .kernel_height(5)
+        .kernel_width(5)
+        .subsampling(2)
+        .output_height(5)
+        .output_width_stride(36)
+        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, output_tuple_stride_pad1) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t input_width = 1; input_width < 32; input_width += 3) {
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .padding_left(2)
+        .padding_right(2)
+        .padding_top(1)
+        .padding_bottom(2)
         .kernel_height(5)
         .kernel_width(5)
         .subsampling(2)
@@ -1064,7 +1736,28 @@
     }
   }
 
-  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, chw_layout) {
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, output_tuple_stride_pad2) {  // explicit output strides, padding 2 on all four sides
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t input_width = 1; input_width < 32; input_width += 3) {  // input widths 1, 4, ..., 31
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .padding_left(2)
+        .padding_right(2)
+        .padding_top(2)
+        .padding_bottom(2)
+        .kernel_height(5)
+        .kernel_width(5)
+        .subsampling(2)
+        .output_height(5)
+        .output_width_stride(4)
+        .output_tuple_stride(5 * 4)  // presumably output_height (5) x output_width_stride (4) -- TODO confirm
+        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, chw_layout_pad1) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t input_width = 1; input_width < 32; input_width += 1) {
       DWConvSpCHWMicrokernelTester()
@@ -1074,6 +1767,29 @@
         .input_width_stride(input_width)
         .padding_left(2)
         .padding_right(2)
+        .padding_top(1)
+        .padding_bottom(2)
+        .kernel_height(5)
+        .kernel_width(5)
+        .subsampling(2)
+        .output_height(5)
+        .output_width_stride((input_width - 1) / 2 + 1)
+        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_SPCHW_5X5S2P2__NEONFMA, chw_layout_pad2) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t input_width = 1; input_width < 32; input_width += 1) {
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(4)
+        .output_tuple_size(4)
+        .input_width(input_width)
+        .input_width_stride(input_width)
+        .padding_left(2)
+        .padding_right(2)
+        .padding_top(2)
+        .padding_bottom(2)
         .kernel_height(5)
         .kernel_width(5)
         .subsampling(2)
@@ -1084,6 +1800,7 @@
   }
 #endif  // XNN_ARCH_ARM64
 
+
 TEST(F32_DWCONV_SPCHW_3X3P1__SCALAR, input_width_eq_1) {
   DWConvSpCHWMicrokernelTester()
     .input_tuple_size(1)
@@ -1091,6 +1808,8 @@
     .input_width(1)
     .padding_left(1)
     .padding_right(1)
+    .padding_top(1)
+    .padding_bottom(1)
     .kernel_height(3)
     .kernel_width(3)
     .output_height(1)
@@ -1105,6 +1824,8 @@
       .input_width(input_width)
       .padding_left(1)
       .padding_right(1)
+      .padding_top(1)
+      .padding_bottom(1)
       .kernel_height(3)
       .kernel_width(3)
       .output_height(1)
@@ -1121,6 +1842,8 @@
       .input_width_stride(36)
       .padding_left(1)
       .padding_right(1)
+      .padding_top(1)
+      .padding_bottom(1)
       .kernel_height(3)
       .kernel_width(3)
       .output_height(1)
@@ -1138,6 +1861,8 @@
       .input_tuple_stride(3 * 4)
       .padding_left(1)
       .padding_right(1)
+      .padding_top(1)
+      .padding_bottom(1)
       .kernel_height(3)
       .kernel_width(3)
       .output_height(1)
@@ -1154,6 +1879,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .output_height(output_height)
@@ -1170,6 +1897,8 @@
       .input_width(input_width)
       .padding_left(1)
       .padding_right(1)
+      .padding_top(1)
+      .padding_bottom(1)
       .kernel_height(3)
       .kernel_width(3)
       .output_height(5)
@@ -1186,6 +1915,8 @@
       .input_width(input_width)
       .padding_left(1)
       .padding_right(1)
+      .padding_top(1)
+      .padding_bottom(1)
       .kernel_height(3)
       .kernel_width(3)
       .output_height(5)
@@ -1204,6 +1935,8 @@
       .input_width_stride(input_width)
       .padding_left(1)
       .padding_right(1)
+      .padding_top(1)
+      .padding_bottom(1)
       .kernel_height(3)
       .kernel_width(3)
       .output_height(5)
@@ -1212,13 +1945,15 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_width_eq_1) {
+TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_width_eq_1_pad0) {
   DWConvSpCHWMicrokernelTester()
     .input_tuple_size(1)
     .output_tuple_size(1)
     .input_width(1)
     .padding_left(1)
     .padding_right(1)
+    .padding_top(0)
+    .padding_bottom(1)
     .kernel_height(3)
     .kernel_width(3)
     .subsampling(2)
@@ -1226,7 +1961,7 @@
     .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_width_gt_1) {
+TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_width_gt_1_pad0) {
   for (size_t input_width = 2; input_width < 32; input_width++) {
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(1)
@@ -1234,6 +1969,8 @@
       .input_width(input_width)
       .padding_left(1)
       .padding_right(1)
+      .padding_top(0)
+      .padding_bottom(1)
       .kernel_height(3)
       .kernel_width(3)
       .subsampling(2)
@@ -1242,7 +1979,7 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_width_stride) {
+TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_width_stride_pad0) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(1)
@@ -1251,6 +1988,8 @@
       .input_width_stride(36)
       .padding_left(1)
       .padding_right(1)
+      .padding_top(0)
+      .padding_bottom(1)
       .kernel_height(3)
       .kernel_width(3)
       .subsampling(2)
@@ -1259,7 +1998,7 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_tuple_stride) {
+TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_tuple_stride_pad0) {
   for (size_t input_width = 1; input_width < 32; input_width += 5) {
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(1)
@@ -1269,6 +2008,8 @@
       .input_tuple_stride(3 * 4)
       .padding_left(1)
       .padding_right(1)
+      .padding_top(0)
+      .padding_bottom(1)
       .kernel_height(3)
       .kernel_width(3)
       .subsampling(2)
@@ -1277,7 +2018,7 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, output_height_gt_1) {
+TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, output_height_gt_1_pad0) {
   for (size_t output_height = 2; output_height < 5; output_height++) {
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -1286,6 +2027,8 @@
         .input_width(input_width)
         .padding_left(1)
         .padding_right(1)
+        .padding_top(0)
+        .padding_bottom(1)
         .kernel_height(3)
         .kernel_width(3)
         .subsampling(2)
@@ -1295,7 +2038,7 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, output_width_stride) {
+TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, output_width_stride_pad0) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(1)
@@ -1303,6 +2046,8 @@
       .input_width(input_width)
       .padding_left(1)
       .padding_right(1)
+      .padding_top(0)
+      .padding_bottom(1)
       .kernel_height(3)
       .kernel_width(3)
       .subsampling(2)
@@ -1312,7 +2057,7 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, output_tuple_stride) {
+TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, output_tuple_stride_pad0) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(1)
@@ -1320,6 +2065,8 @@
       .input_width(input_width)
       .padding_left(1)
       .padding_right(1)
+      .padding_top(0)
+      .padding_bottom(1)
       .kernel_height(3)
       .kernel_width(3)
       .subsampling(2)
@@ -1330,7 +2077,7 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, chw_layout) {
+TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, chw_layout_pad0) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(1)
@@ -1339,6 +2086,8 @@
       .input_width_stride(input_width)
       .padding_left(1)
       .padding_right(1)
+      .padding_top(0)
+      .padding_bottom(1)
       .kernel_height(3)
       .kernel_width(3)
       .subsampling(2)
@@ -1348,20 +2097,174 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, input_width_eq_1) {
+TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_width_eq_1_pad1) {  // single case: input width 1, padding 1 on all four sides
+  DWConvSpCHWMicrokernelTester()
+    .input_tuple_size(1)
+    .output_tuple_size(1)
+    .input_width(1)
+    .padding_left(1)
+    .padding_right(1)
+    .padding_top(1)  // top padding 1, matching the "_pad1" suffix
+    .padding_bottom(1)
+    .kernel_height(3)
+    .kernel_width(3)
+    .subsampling(2)
+    .output_height(1)
+    .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_width_gt_1_pad1) {  // every input width above 1, padding 1 on all four sides
+  for (size_t input_width = 2; input_width < 32; input_width++) {  // input widths 2..31
+    DWConvSpCHWMicrokernelTester()
+      .input_tuple_size(1)
+      .output_tuple_size(1)
+      .input_width(input_width)
+      .padding_left(1)
+      .padding_right(1)
+      .padding_top(1)  // top padding 1, matching the "_pad1" suffix
+      .padding_bottom(1)
+      .kernel_height(3)
+      .kernel_width(3)
+      .subsampling(2)
+      .output_height(1)
+      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_width_stride_pad1) {  // explicit input row stride, padding 1 on all four sides
+  for (size_t input_width = 1; input_width < 32; input_width += 3) {  // input widths 1, 4, ..., 31
+    DWConvSpCHWMicrokernelTester()
+      .input_tuple_size(1)
+      .output_tuple_size(1)
+      .input_width(input_width)
+      .input_width_stride(36)  // fixed stride of 36, above every tested width (max 31)
+      .padding_left(1)
+      .padding_right(1)
+      .padding_top(1)
+      .padding_bottom(1)
+      .kernel_height(3)
+      .kernel_width(3)
+      .subsampling(2)
+      .output_height(1)
+      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, input_tuple_stride_pad1) {  // explicit input tuple stride, padding 1 on all four sides
+  for (size_t input_width = 1; input_width < 32; input_width += 5) {  // input widths 1, 6, ..., 31
+    DWConvSpCHWMicrokernelTester()
+      .input_tuple_size(1)
+      .output_tuple_size(1)
+      .input_width(input_width)
+      .input_width_stride(4)
+      .input_tuple_stride(3 * 4)  // tuple stride of 12, set together with input_width_stride(4)
+      .padding_left(1)
+      .padding_right(1)
+      .padding_top(1)
+      .padding_bottom(1)
+      .kernel_height(3)
+      .kernel_width(3)
+      .subsampling(2)
+      .output_height(1)
+      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, output_height_gt_1_pad1) {  // multi-row output, padding 1 on all four sides
+  for (size_t output_height = 2; output_height < 5; output_height++) {  // output heights 2, 3, 4
+    for (size_t input_width = 1; input_width < 32; input_width += 3) {  // input widths 1, 4, ..., 31
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(1)
+        .output_tuple_size(1)
+        .input_width(input_width)
+        .padding_left(1)
+        .padding_right(1)
+        .padding_top(1)
+        .padding_bottom(1)
+        .kernel_height(3)
+        .kernel_width(3)
+        .subsampling(2)
+        .output_height(output_height)
+        .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, output_width_stride_pad1) {  // explicit output row stride, padding 1 on all four sides
+  for (size_t input_width = 1; input_width < 32; input_width += 3) {  // input widths 1, 4, ..., 31
+    DWConvSpCHWMicrokernelTester()
+      .input_tuple_size(1)
+      .output_tuple_size(1)
+      .input_width(input_width)
+      .padding_left(1)
+      .padding_right(1)
+      .padding_top(1)
+      .padding_bottom(1)
+      .kernel_height(3)
+      .kernel_width(3)
+      .subsampling(2)
+      .output_height(5)
+      .output_width_stride(36)  // fixed output row stride of 36
+      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, output_tuple_stride_pad1) {  // explicit output strides, padding 1 on all four sides
+  for (size_t input_width = 1; input_width < 32; input_width += 3) {  // input widths 1, 4, ..., 31
+    DWConvSpCHWMicrokernelTester()
+      .input_tuple_size(1)
+      .output_tuple_size(1)
+      .input_width(input_width)
+      .padding_left(1)
+      .padding_right(1)
+      .padding_top(1)
+      .padding_bottom(1)
+      .kernel_height(3)
+      .kernel_width(3)
+      .subsampling(2)
+      .output_height(5)
+      .output_width_stride(4)
+      .output_tuple_stride(5 * 4)  // presumably output_height (5) x output_width_stride (4) -- TODO confirm
+      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_SPCHW_3X3S2P1__SCALAR, chw_layout_pad1) {  // CHW layout: row strides are derived from the row widths
+  for (size_t input_width = 1; input_width < 32; input_width += 3) {  // input widths 1, 4, ..., 31
+    DWConvSpCHWMicrokernelTester()
+      .input_tuple_size(1)
+      .output_tuple_size(1)
+      .input_width(input_width)
+      .input_width_stride(input_width)  // input rows packed back to back
+      .padding_left(1)
+      .padding_right(1)
+      .padding_top(1)
+      .padding_bottom(1)
+      .kernel_height(3)
+      .kernel_width(3)
+      .subsampling(2)
+      .output_height(5)
+      .output_width_stride((input_width - 1) / 2 + 1)  // subsampled-width stride, same formula as the NEONFMA chw_layout tests
+      .Test(xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, input_width_eq_1_pad2) {  // single case: input width 1, padding 2 on all four sides
   DWConvSpCHWMicrokernelTester()
     .input_tuple_size(1)
     .output_tuple_size(1)
     .input_width(1)
     .padding_left(2)
     .padding_right(2)
+    .padding_top(2)  // top padding 2, matching the "_pad2" suffix
+    .padding_bottom(2)
     .kernel_height(5)
     .kernel_width(5)
     .output_height(1)
     .Test(xnn_f32_dwconv_spchw_ukernel_5x5p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
 }
 
-TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, input_width_gt_1) {
+TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, input_width_gt_1_pad2) {
   for (size_t input_width = 2; input_width < 32; input_width++) {
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(1)
@@ -1369,6 +2272,8 @@
       .input_width(input_width)
       .padding_left(2)
       .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
       .kernel_height(5)
       .kernel_width(5)
       .output_height(1)
@@ -1376,7 +2281,7 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, input_width_stride) {
+TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, input_width_stride_pad2) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(1)
@@ -1385,6 +2290,8 @@
       .input_width_stride(36)
       .padding_left(2)
       .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
       .kernel_height(5)
       .kernel_width(5)
       .output_height(1)
@@ -1392,7 +2299,7 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, input_tuple_stride) {
+TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, input_tuple_stride_pad2) {
   for (size_t input_width = 1; input_width < 32; input_width += 5) {
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(1)
@@ -1402,6 +2309,8 @@
       .input_tuple_stride(3 * 4)
       .padding_left(2)
       .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
       .kernel_height(5)
       .kernel_width(5)
       .output_height(1)
@@ -1409,7 +2318,7 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, output_height_gt_1) {
+TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, output_height_gt_1_pad2) {
   for (size_t output_height = 2; output_height < 5; output_height++) {
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -1418,6 +2327,8 @@
         .input_width(input_width)
         .padding_left(2)
         .padding_right(2)
+        .padding_top(2)  // top/bottom padding of 2, same as left/right
+        .padding_bottom(2)
         .kernel_height(5)
         .kernel_width(5)
         .output_height(output_height)
@@ -1426,7 +2337,7 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, output_width_stride) {
+TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, output_width_stride_pad2) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(1)
@@ -1434,6 +2345,8 @@
       .input_width(input_width)
       .padding_left(2)
       .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
       .kernel_height(5)
       .kernel_width(5)
       .output_height(5)
@@ -1442,7 +2355,7 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, output_tuple_stride) {
+TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, output_tuple_stride_pad2) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(1)
@@ -1450,6 +2363,8 @@
       .input_width(input_width)
       .padding_left(2)
       .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
       .kernel_height(5)
       .kernel_width(5)
       .output_height(5)
@@ -1459,7 +2374,7 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, chw_layout) {
+TEST(F32_DWCONV_SPCHW_5X5P2__SCALAR, chw_layout_pad2) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(1)
@@ -1468,6 +2383,8 @@
       .input_width_stride(input_width)
       .padding_left(2)
       .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
       .kernel_height(5)
       .kernel_width(5)
       .output_height(5)
@@ -1476,13 +2393,15 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_width_eq_1) {
+TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_width_eq_1_pad1) {  // NOTE(review): named "_pad1" but padding_top/bottom below are 2, same as input_width_eq_1_pad2 -- confirm intended values
   DWConvSpCHWMicrokernelTester()
     .input_tuple_size(1)
     .output_tuple_size(1)
     .input_width(1)
     .padding_left(2)
     .padding_right(2)
+    .padding_top(2)
+    .padding_bottom(2)
     .kernel_height(5)
     .kernel_width(5)
     .output_height(1)
@@ -1490,7 +2409,7 @@
     .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_width_gt_1) {
+TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_width_gt_1_pad1) {
   for (size_t input_width = 2; input_width < 32; input_width++) {
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(1)
@@ -1498,6 +2417,8 @@
       .input_width(input_width)
       .padding_left(2)
       .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
       .kernel_height(5)
       .kernel_width(5)
       .output_height(1)
@@ -1506,7 +2427,7 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_width_stride) {
+TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_width_stride_pad1) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(1)
@@ -1515,6 +2436,8 @@
       .input_width_stride(36)
       .padding_left(2)
       .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
       .kernel_height(5)
       .kernel_width(5)
       .output_height(1)
@@ -1523,7 +2446,7 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_tuple_stride) {
+TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_tuple_stride_pad1) {
   for (size_t input_width = 1; input_width < 32; input_width += 5) {
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(1)
@@ -1533,6 +2456,8 @@
       .input_tuple_stride(3 * 4)
       .padding_left(2)
       .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
       .kernel_height(5)
       .kernel_width(5)
       .output_height(1)
@@ -1541,7 +2466,7 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, output_height_gt_1) {
+TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, output_height_gt_1_pad1) {
   for (size_t output_height = 2; output_height < 5; output_height++) {
     for (size_t input_width = 1; input_width < 32; input_width += 3) {
       DWConvSpCHWMicrokernelTester()
@@ -1550,6 +2475,8 @@
         .input_width(input_width)
         .padding_left(2)
         .padding_right(2)
+        .padding_top(2)
+        .padding_bottom(2)
         .kernel_height(5)
         .kernel_width(5)
         .output_height(output_height)
@@ -1559,7 +2486,7 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, output_width_stride) {
+TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, output_width_stride_pad1) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(1)
@@ -1567,6 +2494,8 @@
       .input_width(input_width)
       .padding_left(2)
       .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
       .kernel_height(5)
       .kernel_width(5)
       .output_height(5)
@@ -1576,7 +2505,7 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, output_tuple_stride) {
+TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, output_tuple_stride_pad1) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(1)
@@ -1584,6 +2513,8 @@
       .input_width(input_width)
       .padding_left(2)
       .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
       .kernel_height(5)
       .kernel_width(5)
       .output_height(5)
@@ -1594,7 +2525,7 @@
   }
 }
 
-TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, chw_layout) {
+TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, chw_layout_pad1) {
   for (size_t input_width = 1; input_width < 32; input_width += 3) {
     DWConvSpCHWMicrokernelTester()
       .input_tuple_size(1)
@@ -1603,6 +2534,160 @@
       .input_width_stride(input_width)
       .padding_left(2)
       .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
+      .kernel_height(5)
+      .kernel_width(5)
+      .output_height(5)
+      .output_width_stride(input_width)
+      .subsampling(2)
+      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_width_eq_1_pad2) {  // one configuration: input_width 1, padding 2 on all four sides, output_height 1
+  DWConvSpCHWMicrokernelTester()
+    .input_tuple_size(1)
+    .output_tuple_size(1)
+    .input_width(1)
+    .padding_left(2)
+    .padding_right(2)
+    .padding_top(2)
+    .padding_bottom(2)
+    .kernel_height(5)
+    .kernel_width(5)
+    .output_height(1)
+    .subsampling(2)  // presumably corresponds to the "s2" in the 5x5s2p2 kernel name -- confirm against the tester
+    .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_width_gt_1_pad2) {  // same setup as the input_width 1 case, swept over wider inputs
+  for (size_t input_width = 2; input_width < 32; input_width++) {  // every width from 2 through 31
+    DWConvSpCHWMicrokernelTester()
+      .input_tuple_size(1)
+      .output_tuple_size(1)
+      .input_width(input_width)
+      .padding_left(2)
+      .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
+      .kernel_height(5)
+      .kernel_width(5)
+      .output_height(1)
+      .subsampling(2)
+      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_width_stride_pad2) {  // runs the kernel with an explicit input_width_stride instead of the tester's default
+  for (size_t input_width = 1; input_width < 32; input_width += 3) {  // widths 1, 4, 7, ..., 31
+    DWConvSpCHWMicrokernelTester()
+      .input_tuple_size(1)
+      .output_tuple_size(1)
+      .input_width(input_width)
+      .input_width_stride(36)  // fixed at 36, which exceeds every input_width in the sweep (max 31)
+      .padding_left(2)
+      .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
+      .kernel_height(5)
+      .kernel_width(5)
+      .output_height(1)
+      .subsampling(2)
+      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, input_tuple_stride_pad2) {  // runs the kernel with explicit input_width_stride and input_tuple_stride values
+  for (size_t input_width = 1; input_width < 32; input_width += 5) {  // widths 1, 6, 11, ..., 31
+    DWConvSpCHWMicrokernelTester()
+      .input_tuple_size(1)
+      .output_tuple_size(1)
+      .input_width(input_width)
+      .input_width_stride(4)
+      .input_tuple_stride(3 * 4)  // 12, i.e. three times the input_width_stride of 4
+      .padding_left(2)
+      .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
+      .kernel_height(5)
+      .kernel_width(5)
+      .output_height(1)
+      .subsampling(2)
+      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, output_height_gt_1_pad2) {  // sweeps output_height and input_width together
+  for (size_t output_height = 2; output_height < 5; output_height++) {  // output heights 2, 3, 4
+    for (size_t input_width = 1; input_width < 32; input_width += 3) {  // widths 1, 4, 7, ..., 31
+      DWConvSpCHWMicrokernelTester()
+        .input_tuple_size(1)
+        .output_tuple_size(1)
+        .input_width(input_width)
+        .padding_left(2)
+        .padding_right(2)
+        .padding_top(2)
+        .padding_bottom(2)
+        .kernel_height(5)
+        .kernel_width(5)
+        .output_height(output_height)
+        .subsampling(2)
+        .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, output_width_stride_pad2) {  // runs the kernel with an explicit output_width_stride and output_height 5
+  for (size_t input_width = 1; input_width < 32; input_width += 3) {  // widths 1, 4, 7, ..., 31
+    DWConvSpCHWMicrokernelTester()
+      .input_tuple_size(1)
+      .output_tuple_size(1)
+      .input_width(input_width)
+      .padding_left(2)
+      .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
+      .kernel_height(5)
+      .kernel_width(5)
+      .output_height(5)
+      .output_width_stride(36)  // fixed at 36 for every input_width in the sweep
+      .subsampling(2)
+      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, output_tuple_stride_pad2) {  // runs the kernel with explicit output_width_stride and output_tuple_stride values
+  for (size_t input_width = 1; input_width < 32; input_width += 3) {  // widths 1, 4, 7, ..., 31
+    DWConvSpCHWMicrokernelTester()
+      .input_tuple_size(1)
+      .output_tuple_size(1)
+      .input_width(input_width)
+      .padding_left(2)
+      .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
+      .kernel_height(5)
+      .kernel_width(5)
+      .output_height(5)
+      .output_width_stride(4)
+      .output_tuple_stride(5 * 4)  // 20, i.e. output_height (5) times the output_width_stride of 4
+      .subsampling(2)
+      .Test(xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, DWConvSpCHWMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_SPCHW_5X5S2P2__SCALAR, chw_layout_pad2) {
+  for (size_t input_width = 1; input_width < 32; input_width += 3) {
+    DWConvSpCHWMicrokernelTester()
+      .input_tuple_size(1)
+      .output_tuple_size(1)
+      .input_width(input_width)
+      .input_width_stride(input_width)
+      .padding_left(2)
+      .padding_right(2)
+      .padding_top(2)
+      .padding_bottom(2)
       .kernel_height(5)
       .kernel_width(5)
       .output_height(5)