Refactor DWCONV micro-kernels

- Fix bugs in generation of micro-kernels with large channel tiles
- Add missing unit tests
- Generate, test, and benchmark micro-kernels with 2 accumulators, with 2X
  channel tile, and their combinations

PiperOrigin-RevId: 279137161
diff --git a/BUILD.bazel b/BUILD.bazel
index 749fc69..d47b4a3 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -84,8 +84,17 @@
     "src/f32-igemm/4x2-scalar.c",
     "src/f32-igemm/4x4-scalar.c",
     "src/f32-dwconv/up1x25-scalar.c",
+    "src/f32-dwconv/up1x25-scalar-acc2.c",
+    "src/f32-dwconv/up2x25-scalar.c",
+    "src/f32-dwconv/up2x25-scalar-acc2.c",
     "src/f32-dwconv/up1x4-scalar.c",
+    "src/f32-dwconv/up1x4-scalar-acc2.c",
+    "src/f32-dwconv/up2x4-scalar.c",
+    "src/f32-dwconv/up2x4-scalar-acc2.c",
     "src/f32-dwconv/up1x9-scalar.c",
+    "src/f32-dwconv/up1x9-scalar-acc2.c",
+    "src/f32-dwconv/up2x9-scalar.c",
+    "src/f32-dwconv/up2x9-scalar-acc2.c",
     "src/f32-dwconv-spchw/3x3p1-scalar.c",
     "src/f32-dwconv-spchw/3x3s2p1-scalar.c",
     "src/f32-gavgpool-spchw/scalar-x1.c",
@@ -171,8 +180,17 @@
     "src/f32-igemm/6x8-psimd-splat.c",
     "src/f32-igemm/6x8s4-psimd.c",
     "src/f32-dwconv/up4x25-psimd.c",
+    "src/f32-dwconv/up4x25-psimd-acc2.c",
+    "src/f32-dwconv/up8x25-psimd.c",
+    "src/f32-dwconv/up8x25-psimd-acc2.c",
     "src/f32-dwconv/up4x4-psimd.c",
+    "src/f32-dwconv/up4x4-psimd-acc2.c",
+    "src/f32-dwconv/up8x4-psimd.c",
+    "src/f32-dwconv/up8x4-psimd-acc2.c",
     "src/f32-dwconv/up4x9-psimd.c",
+    "src/f32-dwconv/up4x9-psimd-acc2.c",
+    "src/f32-dwconv/up8x9-psimd.c",
+    "src/f32-dwconv/up8x9-psimd-acc2.c",
     "src/f32-gavgpool/mp7p7q-psimd.c",
     "src/f32-gavgpool/up7-psimd.c",
     "src/f32-gemm/1x8-psimd-loadsplat.c",
@@ -220,6 +238,9 @@
     "src/f32-avgpool/up9-neon.c",
     "src/f32-clamp/neon.c",
     "src/f32-dwconv/up4x9-neon.c",
+    "src/f32-dwconv/up4x9-neon-acc2.c",
+    "src/f32-dwconv/up8x9-neon.c",
+    "src/f32-dwconv/up8x9-neon-acc2.c",
     "src/f32-gavgpool-spchw/neon-x4.c",
     "src/f32-gavgpool/mp7p7q-neon.c",
     "src/f32-gavgpool/up7-neon.c",
@@ -283,7 +304,9 @@
     "src/f32-igemm/4x8-neonfma-ld64.c",
     "src/f32-igemm/6x8-neonfma-ld64.c",
     "src/f32-dwconv/up4x9-neonfma.c",
+    "src/f32-dwconv/up4x9-neonfma-acc2.c",
     "src/f32-dwconv/up8x9-neonfma.c",
+    "src/f32-dwconv/up8x9-neonfma-acc2.c",
     "src/f32-gemm/1x8-neonfma-ld64.c",
     "src/f32-gemm/4x2-neonfma-ld64.c",
     "src/f32-gemm/4x8-neonfma-ld128.c",
@@ -350,6 +373,15 @@
     "src/f32-dwconv/up4x25-sse.c",
     "src/f32-dwconv/up4x4-sse.c",
     "src/f32-dwconv/up4x9-sse.c",
+    "src/f32-dwconv/up4x25-sse-acc2.c",
+    "src/f32-dwconv/up4x4-sse-acc2.c",
+    "src/f32-dwconv/up4x9-sse-acc2.c",
+    "src/f32-dwconv/up8x25-sse.c",
+    "src/f32-dwconv/up8x4-sse.c",
+    "src/f32-dwconv/up8x9-sse.c",
+    "src/f32-dwconv/up8x25-sse-acc2.c",
+    "src/f32-dwconv/up8x4-sse-acc2.c",
+    "src/f32-dwconv/up8x9-sse-acc2.c",
     "src/f32-gavgpool-spchw/sse-x4.c",
     "src/f32-gavgpool/mp7p7q-sse.c",
     "src/f32-gavgpool/up7-sse.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ef270c2..eb0d190 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -175,6 +175,15 @@
   src/f32-dwconv/up1x25-scalar.c
   src/f32-dwconv/up1x4-scalar.c
   src/f32-dwconv/up1x9-scalar.c
+  src/f32-dwconv/up1x25-scalar-acc2.c
+  src/f32-dwconv/up1x4-scalar-acc2.c
+  src/f32-dwconv/up1x9-scalar-acc2.c
+  src/f32-dwconv/up2x25-scalar.c
+  src/f32-dwconv/up2x4-scalar.c
+  src/f32-dwconv/up2x9-scalar.c
+  src/f32-dwconv/up2x25-scalar-acc2.c
+  src/f32-dwconv/up2x4-scalar-acc2.c
+  src/f32-dwconv/up2x9-scalar-acc2.c
   src/f32-dwconv-spchw/3x3p1-scalar.c
   src/f32-dwconv-spchw/3x3s2p1-scalar.c
   src/f32-gavgpool-spchw/scalar-x1.c
@@ -261,6 +270,15 @@
   src/f32-dwconv/up4x25-psimd.c
   src/f32-dwconv/up4x4-psimd.c
   src/f32-dwconv/up4x9-psimd.c
+  src/f32-dwconv/up4x25-psimd-acc2.c
+  src/f32-dwconv/up4x4-psimd-acc2.c
+  src/f32-dwconv/up4x9-psimd-acc2.c
+  src/f32-dwconv/up8x25-psimd.c
+  src/f32-dwconv/up8x4-psimd.c
+  src/f32-dwconv/up8x9-psimd.c
+  src/f32-dwconv/up8x25-psimd-acc2.c
+  src/f32-dwconv/up8x4-psimd-acc2.c
+  src/f32-dwconv/up8x9-psimd-acc2.c
   src/f32-gavgpool/mp7p7q-psimd.c
   src/f32-gavgpool/up7-psimd.c
   src/f32-gemm/1x8-psimd-loadsplat.c
@@ -312,6 +330,9 @@
   src/f32-igemm/4x8-neon-ld64.c
   src/f32-igemm/6x8-neon-ld64.c
   src/f32-dwconv/up4x9-neon.c
+  src/f32-dwconv/up4x9-neon-acc2.c
+  src/f32-dwconv/up8x9-neon.c
+  src/f32-dwconv/up8x9-neon-acc2.c
   src/f32-gavgpool-spchw/neon-x4.c
   src/f32-gavgpool/mp7p7q-neon.c
   src/f32-gavgpool/up7-neon.c
@@ -368,7 +389,9 @@
   src/f32-igemm/4x8-neonfma-ld64.c
   src/f32-igemm/6x8-neonfma-ld64.c
   src/f32-dwconv/up4x9-neonfma.c
+  src/f32-dwconv/up4x9-neonfma-acc2.c
   src/f32-dwconv/up8x9-neonfma.c
+  src/f32-dwconv/up8x9-neonfma-acc2.c
   src/f32-gemm/1x8-neonfma-ld64.c
   src/f32-gemm/4x2-neonfma-ld64.c
   src/f32-gemm/4x8-neonfma-ld128.c
@@ -432,6 +455,15 @@
   src/f32-dwconv/up4x25-sse.c
   src/f32-dwconv/up4x4-sse.c
   src/f32-dwconv/up4x9-sse.c
+  src/f32-dwconv/up4x25-sse-acc2.c
+  src/f32-dwconv/up4x4-sse-acc2.c
+  src/f32-dwconv/up4x9-sse-acc2.c
+  src/f32-dwconv/up8x25-sse.c
+  src/f32-dwconv/up8x4-sse.c
+  src/f32-dwconv/up8x9-sse.c
+  src/f32-dwconv/up8x25-sse-acc2.c
+  src/f32-dwconv/up8x4-sse-acc2.c
+  src/f32-dwconv/up8x9-sse-acc2.c
   src/f32-gavgpool-spchw/sse-x4.c
   src/f32-gavgpool/mp7p7q-sse.c
   src/f32-gavgpool/up7-sse.c
diff --git a/bench/f32-dwconv-e2e.cc b/bench/f32-dwconv-e2e.cc
index 7b3ca38..65ec4b6 100644
--- a/bench/f32-dwconv-e2e.cc
+++ b/bench/f32-dwconv-e2e.cc
@@ -61,21 +61,6 @@
   state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
 }
 
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x25__scalar)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x4__scalar)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x9__scalar)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x25__psimd)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x25__sse)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x4__psimd)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x4__sse)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma_cortex_a55)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__neon)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__neonfma)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__psimd)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__sse)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x9__neonfma)
-
 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
   static void f32_dwconv_up4x9__aarch64_neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
@@ -103,26 +88,71 @@
       4 /* cr */, 9 /* mr */);
   }
 
+  static void f32_dwconv_up4x9__neon_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+    DWConvEnd2EndBenchmark(state, model,
+      xnn_f32_dwconv_ukernel_up4x9__neon_acc2,
+      4 /* cr */, 9 /* mr */);
+  }
+
+  static void f32_dwconv_up8x9__neon(benchmark::State& state, models::ExecutionPlanFactory model) {
+    DWConvEnd2EndBenchmark(state, model,
+      xnn_f32_dwconv_ukernel_up8x9__neon,
+      8 /* cr */, 9 /* mr */);
+  }
+
+  static void f32_dwconv_up8x9__neon_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+    DWConvEnd2EndBenchmark(state, model,
+      xnn_f32_dwconv_ukernel_up8x9__neon_acc2,
+      8 /* cr */, 9 /* mr */);
+  }
+
   static void f32_dwconv_up4x9__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
       xnn_f32_dwconv_ukernel_up4x9__neonfma,
       4 /* cr */, 9 /* mr */);
   }
 
+  static void f32_dwconv_up4x9__neonfma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+    DWConvEnd2EndBenchmark(state, model,
+      xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2,
+      4 /* cr */, 9 /* mr */);
+  }
+
   static void f32_dwconv_up8x9__neonfma(benchmark::State& state, models::ExecutionPlanFactory model) {
     DWConvEnd2EndBenchmark(state, model,
       xnn_f32_dwconv_ukernel_up8x9__neonfma,
       8 /* cr */, 9 /* mr */);
   }
 
+  static void f32_dwconv_up8x9__neonfma_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+    DWConvEnd2EndBenchmark(state, model,
+      xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2,
+      8 /* cr */, 9 /* mr */);
+  }
+
   BENCHMARK_CAPTURE(f32_dwconv_up4x9__neon, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
   BENCHMARK_CAPTURE(f32_dwconv_up4x9__neon, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
 
+  BENCHMARK_CAPTURE(f32_dwconv_up4x9__neon_acc2, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_dwconv_up4x9__neon_acc2, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_dwconv_up8x9__neon, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_dwconv_up8x9__neon, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_dwconv_up8x9__neon_acc2, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_dwconv_up8x9__neon_acc2, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
   BENCHMARK_CAPTURE(f32_dwconv_up4x9__neonfma, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
   BENCHMARK_CAPTURE(f32_dwconv_up4x9__neonfma, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
 
+  BENCHMARK_CAPTURE(f32_dwconv_up4x9__neonfma_acc2, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_dwconv_up4x9__neonfma_acc2, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
   BENCHMARK_CAPTURE(f32_dwconv_up8x9__neonfma, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
   BENCHMARK_CAPTURE(f32_dwconv_up8x9__neonfma, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_dwconv_up8x9__neonfma_acc2, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_dwconv_up8x9__neonfma_acc2, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
@@ -133,8 +163,35 @@
       4 /* cr */, 9 /* mr */);
   }
 
+  static void f32_dwconv_up4x9__sse_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+    DWConvEnd2EndBenchmark(state, model,
+      xnn_f32_dwconv_ukernel_up4x9__sse_acc2,
+      4 /* cr */, 9 /* mr */);
+  }
+
+  static void f32_dwconv_up8x9__sse(benchmark::State& state, models::ExecutionPlanFactory model) {
+    DWConvEnd2EndBenchmark(state, model,
+      xnn_f32_dwconv_ukernel_up8x9__sse,
+      8 /* cr */, 9 /* mr */);
+  }
+
+  static void f32_dwconv_up8x9__sse_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+    DWConvEnd2EndBenchmark(state, model,
+      xnn_f32_dwconv_ukernel_up8x9__sse_acc2,
+      8 /* cr */, 9 /* mr */);
+  }
+
   BENCHMARK_CAPTURE(f32_dwconv_up4x9__sse, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
   BENCHMARK_CAPTURE(f32_dwconv_up4x9__sse, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_dwconv_up4x9__sse_acc2, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_dwconv_up4x9__sse_acc2, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_dwconv_up8x9__sse, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_dwconv_up8x9__sse, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_dwconv_up8x9__sse_acc2, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_dwconv_up8x9__sse_acc2, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 #if !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
@@ -144,8 +201,35 @@
       4 /* cr */, 9 /* mr */);
   }
 
+  static void f32_dwconv_up4x9__psimd_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+    DWConvEnd2EndBenchmark(state, model,
+      xnn_f32_dwconv_ukernel_up4x9__psimd_acc2,
+      4 /* cr */, 9 /* mr */);
+  }
+
+  static void f32_dwconv_up8x9__psimd(benchmark::State& state, models::ExecutionPlanFactory model) {
+    DWConvEnd2EndBenchmark(state, model,
+      xnn_f32_dwconv_ukernel_up8x9__psimd,
+      8 /* cr */, 9 /* mr */);
+  }
+
+  static void f32_dwconv_up8x9__psimd_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+    DWConvEnd2EndBenchmark(state, model,
+      xnn_f32_dwconv_ukernel_up8x9__psimd_acc2,
+      8 /* cr */, 9 /* mr */);
+  }
+
   BENCHMARK_CAPTURE(f32_dwconv_up4x9__psimd, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
   BENCHMARK_CAPTURE(f32_dwconv_up4x9__psimd, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_dwconv_up4x9__psimd_acc2, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_dwconv_up4x9__psimd_acc2, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_dwconv_up8x9__psimd, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_dwconv_up8x9__psimd, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_dwconv_up8x9__psimd_acc2, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_dwconv_up8x9__psimd_acc2, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
 #endif  // !XNN_ARCH_WASM && !XNN_ARCH_ASMJS
 
 static void f32_dwconv_up1x9__scalar(benchmark::State& state, models::ExecutionPlanFactory model) {
@@ -154,9 +238,36 @@
       1 /* cr */, 9 /* mr */);
 }
 
+static void f32_dwconv_up1x9__scalar_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+  DWConvEnd2EndBenchmark(state, model,
+    xnn_f32_dwconv_ukernel_up1x9__scalar_acc2,
+      1 /* cr */, 9 /* mr */);
+}
+
+static void f32_dwconv_up2x9__scalar(benchmark::State& state, models::ExecutionPlanFactory model) {
+  DWConvEnd2EndBenchmark(state, model,
+    xnn_f32_dwconv_ukernel_up2x9__scalar,
+      2 /* cr */, 9 /* mr */);
+}
+
+static void f32_dwconv_up2x9__scalar_acc2(benchmark::State& state, models::ExecutionPlanFactory model) {
+  DWConvEnd2EndBenchmark(state, model,
+    xnn_f32_dwconv_ukernel_up2x9__scalar_acc2,
+      2 /* cr */, 9 /* mr */);
+}
+
 BENCHMARK_CAPTURE(f32_dwconv_up1x9__scalar, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
 BENCHMARK_CAPTURE(f32_dwconv_up1x9__scalar, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
 
+BENCHMARK_CAPTURE(f32_dwconv_up1x9__scalar_acc2, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+BENCHMARK_CAPTURE(f32_dwconv_up1x9__scalar_acc2, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+BENCHMARK_CAPTURE(f32_dwconv_up2x9__scalar, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+BENCHMARK_CAPTURE(f32_dwconv_up2x9__scalar, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+BENCHMARK_CAPTURE(f32_dwconv_up2x9__scalar_acc2, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+BENCHMARK_CAPTURE(f32_dwconv_up2x9__scalar_acc2, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
 #ifndef XNNPACK_BENCHMARK_NO_MAIN
 BENCHMARK_MAIN();
 #endif
diff --git a/scripts/generate-f32-dwconv.sh b/scripts/generate-f32-dwconv.sh
index 7f08ef8..66c124f 100755
--- a/scripts/generate-f32-dwconv.sh
+++ b/scripts/generate-f32-dwconv.sh
@@ -5,25 +5,63 @@
 # LICENSE file in the root directory of this source tree.
 
 #################################### Scalar ###################################
-tools/xngen src/f32-dwconv/up-scalar.c.in -D CR=1 -D MR=4  -D AR=2 -o src/f32-dwconv/up1x4-scalar.c
-tools/xngen src/f32-dwconv/up-scalar.c.in -D CR=1 -D MR=9  -D AR=2 -o src/f32-dwconv/up1x9-scalar.c
-tools/xngen src/f32-dwconv/up-scalar.c.in -D CR=1 -D MR=25 -D AR=2 -o src/f32-dwconv/up1x25-scalar.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=1 -D KERNEL_TILE=4  -D ACCUMULATORS=1 -o src/f32-dwconv/up1x4-scalar.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=1 -D KERNEL_TILE=4  -D ACCUMULATORS=2 -o src/f32-dwconv/up1x4-scalar-acc2.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=2 -D KERNEL_TILE=4  -D ACCUMULATORS=1 -o src/f32-dwconv/up2x4-scalar.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=2 -D KERNEL_TILE=4  -D ACCUMULATORS=2 -o src/f32-dwconv/up2x4-scalar-acc2.c
+
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=1 -D KERNEL_TILE=9  -D ACCUMULATORS=1 -o src/f32-dwconv/up1x9-scalar.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=1 -D KERNEL_TILE=9  -D ACCUMULATORS=2 -o src/f32-dwconv/up1x9-scalar-acc2.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=2 -D KERNEL_TILE=9  -D ACCUMULATORS=1 -o src/f32-dwconv/up2x9-scalar.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=2 -D KERNEL_TILE=9  -D ACCUMULATORS=2 -o src/f32-dwconv/up2x9-scalar-acc2.c
+
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=1 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f32-dwconv/up1x25-scalar.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=1 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f32-dwconv/up1x25-scalar-acc2.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=2 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f32-dwconv/up2x25-scalar.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=2 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f32-dwconv/up2x25-scalar-acc2.c
 
 ################################### ARM NEON ##################################
-tools/xngen src/f32-dwconv/up-neon.c.in -D CR=4 -D MR=9 -D AR=1 -D FMA=0 -o src/f32-dwconv/up4x9-neon.c
-tools/xngen src/f32-dwconv/up-neon.c.in -D CR=4 -D MR=9 -D AR=1 -D FMA=1 -o src/f32-dwconv/up4x9-neonfma.c
-tools/xngen src/f32-dwconv/up-neon.c.in -D CR=8 -D MR=9 -D AR=1 -D FMA=1 -o src/f32-dwconv/up8x9-neonfma.c
+tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-dwconv/up4x9-neon.c
+tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -D FMA=0 -o src/f32-dwconv/up4x9-neon-acc2.c
+tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-dwconv/up8x9-neon.c
+tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -D FMA=0 -o src/f32-dwconv/up8x9-neon-acc2.c
+
+tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-dwconv/up4x9-neonfma.c
+tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -D FMA=1 -o src/f32-dwconv/up4x9-neonfma-acc2.c
+tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-dwconv/up8x9-neonfma.c
+tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -D FMA=1 -o src/f32-dwconv/up8x9-neonfma-acc2.c
 
 #################################### PSIMD ####################################
-tools/xngen src/f32-dwconv/up-psimd.c.in -D CR=4 -D MR=4 -D AR=2 -o src/f32-dwconv/up4x4-psimd.c
-tools/xngen src/f32-dwconv/up-psimd.c.in -D CR=4 -D MR=9 -D AR=2 -o src/f32-dwconv/up4x9-psimd.c
-tools/xngen src/f32-dwconv/up-psimd.c.in -D CR=4 -D MR=25 -D AR=2 -o src/f32-dwconv/up4x25-psimd.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=4 -D ACCUMULATORS=1 -o src/f32-dwconv/up4x4-psimd.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=4 -D ACCUMULATORS=2 -o src/f32-dwconv/up4x4-psimd-acc2.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D ACCUMULATORS=1 -o src/f32-dwconv/up8x4-psimd.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D ACCUMULATORS=2 -o src/f32-dwconv/up8x4-psimd-acc2.c
+
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -o src/f32-dwconv/up4x9-psimd.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -o src/f32-dwconv/up4x9-psimd-acc2.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -o src/f32-dwconv/up8x9-psimd.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -o src/f32-dwconv/up8x9-psimd-acc2.c
+
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f32-dwconv/up4x25-psimd.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f32-dwconv/up4x25-psimd-acc2.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f32-dwconv/up8x25-psimd.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f32-dwconv/up8x25-psimd-acc2.c
 
 ################################### x86 SSE ###################################
-tools/xngen src/f32-dwconv/up-sse.c.in -D CR=4 -D MR=4 -D AR=2 -o src/f32-dwconv/up4x4-sse.c
-tools/xngen src/f32-dwconv/up-sse.c.in -D CR=4 -D MR=9 -D AR=2 -o src/f32-dwconv/up4x9-sse.c
-tools/xngen src/f32-dwconv/up-sse.c.in -D CR=4 -D MR=25 -D AR=2 -o src/f32-dwconv/up4x25-sse.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=4 -D ACCUMULATORS=1 -o src/f32-dwconv/up4x4-sse.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=4 -D ACCUMULATORS=2 -o src/f32-dwconv/up4x4-sse-acc2.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D ACCUMULATORS=1 -o src/f32-dwconv/up8x4-sse.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D ACCUMULATORS=2 -o src/f32-dwconv/up8x4-sse-acc2.c
 
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -o src/f32-dwconv/up4x9-sse.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -o src/f32-dwconv/up4x9-sse-acc2.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -o src/f32-dwconv/up8x9-sse.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -o src/f32-dwconv/up8x9-sse-acc2.c
+
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f32-dwconv/up4x25-sse.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f32-dwconv/up4x25-sse-acc2.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f32-dwconv/up8x25-sse.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f32-dwconv/up8x25-sse-acc2.c
 
 ################################## Unit tests #################################
 tools/generate-dwconv-test.py --spec test/f32-dwconv.yaml --output test/f32-dwconv.cc
diff --git a/src/f32-dwconv/up-neon.c.in b/src/f32-dwconv/up-neon.c.in
index df1ecb7..4481427 100644
--- a/src/f32-dwconv/up-neon.c.in
+++ b/src/f32-dwconv/up-neon.c.in
@@ -3,9 +3,9 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-$assert CR % 4 == 0
-$assert MR >= 2
-$assert AR >= 1
+$assert CHANNEL_TILE % 4 == 0
+$assert KERNEL_TILE >= 2
+$assert ACCUMULATORS >= 1
 $ABC = "0123456789ABCDEF"
 $VMULADDQ_F32 = "vfmaq_f32" if FMA else "vmlaq_f32"
 #include <assert.h>
@@ -15,7 +15,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_f32_dwconv_ukernel_up${CR}x${MR}__${"neonfma" if FMA else "neon"}(
+void xnn_f32_dwconv_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__${"neonfma" if FMA else "neon"}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
     size_t channels,
     size_t output_width,
     const float** input,
@@ -26,83 +26,117 @@
     const union xnn_f32_output_params params[restrict static 1])
 {
   assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
   assert(output_width != 0);
 
   const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
   const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
   do {
-    $for M in range(MR):
-      const float* i${M} = input[${M}];
+    $for K in range(KERNEL_TILE):
+      const float* i${K} = input[${K}];
     input = (const float**) ((uintptr_t) input + input_stride);
 
     size_t c = channels;
     const float* w = weights;
-    for (; c >= ${CR}; c -= ${CR}) {
-      $for C in range(0, CR, 4):
+    for (; c >= ${CHANNEL_TILE}; c -= ${CHANNEL_TILE}) {
+      $for C in range(0, CHANNEL_TILE, 4):
         float32x4_t vacc${ABC[C:C+4]}p0 = vld1q_f32(w); w += 4;
 
-      $for M in range(MR):
+      $for K in range(KERNEL_TILE):
 
-        $for C in range(0, CR, 4):
-          const float32x4_t vi${M}x${ABC[C:C+4]} = vld1q_f32(i${M}); i${M} += 4;
-        $for C in range(0, CR, 4):
-          const float32x4_t vk${M}x${ABC[C:C+4]} = vld1q_f32(w); w += 4;
-        $for C in range(0, CR, 4):
-          $if 1 <= M < AR:
-            float32x4_t vacc${ABC[C:C+4]}p${M} = vmulq_f32(vi${M}x${ABC[C:C+4]}, vk${M}x${ABC[C:C+4]});
+        $for C in range(0, CHANNEL_TILE, 4):
+          const float32x4_t vi${K}x${ABC[C:C+4]} = vld1q_f32(i${K}); i${K} += 4;
+        $for C in range(0, CHANNEL_TILE, 4):
+          const float32x4_t vk${K}x${ABC[C:C+4]} = vld1q_f32(w); w += 4;
+        $for C in range(0, CHANNEL_TILE, 4):
+          $if 1 <= K < ACCUMULATORS:
+            float32x4_t vacc${ABC[C:C+4]}p${K} = vmulq_f32(vi${K}x${ABC[C:C+4]}, vk${K}x${ABC[C:C+4]});
           $else:
-            vacc${ABC[C:C+4]}p${M % AR} = ${VMULADDQ_F32}(vacc${ABC[C:C+4]}p${M % AR}, vi${M}x${ABC[C:C+4]}, vk${M}x${ABC[C:C+4]});
+            vacc${ABC[C:C+4]}p${K % ACCUMULATORS} = ${VMULADDQ_F32}(vacc${ABC[C:C+4]}p${K % ACCUMULATORS}, vi${K}x${ABC[C:C+4]}, vk${K}x${ABC[C:C+4]});
 
-      $STEPA = 1
-      $while STEPA < AR:
-        $for A in range(0, AR, STEPA * 2):
-          $if A + STEPA < AR:
-            for C in range(0, CR, 4):
-              vacc${ABC[C:C+4]}p${A} = vaddq_f32(vacc${ABC[C:C+4]}p${A}, vacc${ABC[C:C+4]}p${A + STEPA});
-        $STEPA *= 2
+      $if ACCUMULATORS > 1:
+        // Add up all accumulators to vacc${ABC[0:CHANNEL_TILE]}p0
+        $ACC_STEP = 1
+        $while ACC_STEP < ACCUMULATORS:
+          $for A in range(0, ACCUMULATORS, ACC_STEP * 2):
+            $if A + ACC_STEP < ACCUMULATORS:
+              $for C in range(0, CHANNEL_TILE, 4):
+                vacc${ABC[C:C+4]}p${A} = vaddq_f32(vacc${ABC[C:C+4]}p${A}, vacc${ABC[C:C+4]}p${A + ACC_STEP});
+          $ACC_STEP *= 2
 
-      $for C in range(0, CR, 4):
+      $for C in range(0, CHANNEL_TILE, 4):
         float32x4_t vacc${ABC[C:C+4]} = vmaxq_f32(vacc${ABC[C:C+4]}p0, vmin);
-      $for C in range(0, CR, 4):
+      $for C in range(0, CHANNEL_TILE, 4):
         vacc${ABC[C:C+4]} = vminq_f32(vacc${ABC[C:C+4]}, vmax);
 
-      $for C in range(0, CR, 4):
+      $for C in range(0, CHANNEL_TILE, 4):
         vst1q_f32(output, vacc${ABC[C:C+4]}); output += 4;
     }
+    $if CHANNEL_TILE > 4:
+      for (; c >= 4; c -= 4) {
+        float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
+
+        $for K in range(KERNEL_TILE):
+
+          const float32x4_t vi${K}x0123 = vld1q_f32(i${K}); i${K} += 4;
+          const float32x4_t vk${K}x0123 = vld1q_f32(w + ${(K + 1) * CHANNEL_TILE - 4});
+          $if 1 <= K < ACCUMULATORS:
+            float32x4_t vacc0123p${K} = vmulq_f32(vi${K}x0123, vk${K}x0123);
+          $else:
+            vacc0123p${K % ACCUMULATORS} = ${VMULADDQ_F32}(vacc0123p${K % ACCUMULATORS}, vi${K}x0123, vk${K}x0123);
+
+        $if ACCUMULATORS > 1:
+          // Add up all accumulators to vacc0123p0
+          $ACC_STEP = 1
+          $while ACC_STEP < ACCUMULATORS:
+            $for A in range(0, ACCUMULATORS, ACC_STEP * 2):
+              $if A + ACC_STEP < ACCUMULATORS:
+                vacc0123p${A} = vaddq_f32(vacc0123p${A}, vacc0123p${A + ACC_STEP});
+            $ACC_STEP *= 2
+
+        float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
+        vacc0123 = vminq_f32(vacc0123, vmax);
+
+        vst1q_f32(output, vacc0123); output += 4;
+      }
     if XNN_UNLIKELY(c != 0) {
-      $for C in range(0, CR, 4):
-        float32x4_t vacc${ABC[C:C+4]} = vld1q_f32(w); w += 4;
+      $if CHANNEL_TILE == 4:
+        float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
+      $else:
+        float32x4_t vacc0123p0 = vld1q_f32(w);
 
-      $for M in range(MR):
+      $for K in range(KERNEL_TILE):
 
-        $for C in range(0, CR, 4):
-          const float32x4_t vi${M}x${ABC[C:C+4]} = vld1q_f32(i${M}); i${M} += 4;
-        $for C in range(0, CR, 4):
-          const float32x4_t vk${M}x${ABC[C:C+4]} = vld1q_f32(w); w += 4;
-        $for C in range(0, CR, 4):
-          vacc${ABC[C:C+4]} = ${VMULADDQ_F32}(vacc${ABC[C:C+4]}, vi${M}x${ABC[C:C+4]}, vk${M}x${ABC[C:C+4]});
+        const float32x4_t vi${K}x0123 = vld1q_f32(i${K});
+        $if CHANNEL_TILE == 4:
+          const float32x4_t vk${K}x0123 = vld1q_f32(w); w += 4;
+        $else:
+          const float32x4_t vk${K}x0123 = vld1q_f32(w + ${(K + 1) * CHANNEL_TILE});
+        $if 1 <= K < ACCUMULATORS:
+          float32x4_t vacc0123p${K} = vmulq_f32(vi${K}x0123, vk${K}x0123);
+        $else:
+          vacc0123p${K % ACCUMULATORS} = ${VMULADDQ_F32}(vacc0123p${K % ACCUMULATORS}, vi${K}x0123, vk${K}x0123);
 
-      $for C in range(0, CR, 4):
-        vacc${ABC[C:C+4]} = vmaxq_f32(vacc${ABC[C:C+4]}, vmin);
-      $for C in range(0, CR, 4):
-        vacc${ABC[C:C+4]} = vminq_f32(vacc${ABC[C:C+4]}, vmax);
+      $if ACCUMULATORS > 1:
+        // Add up all accumulators to vacc0123p0
+        $ACC_STEP = 1
+        $while ACC_STEP < ACCUMULATORS:
+          $for A in range(0, ACCUMULATORS, ACC_STEP * 2):
+            $if A + ACC_STEP < ACCUMULATORS:
+              vacc0123p${A} = vaddq_f32(vacc0123p${A}, vacc0123p${A + ACC_STEP});
+          $ACC_STEP *= 2
 
-      $for LOG2C in reversed(range(CR.bit_length())):
-        $if CR != 1 << LOG2C:
-          if (c & ${1 << LOG2C}) {
-            $if LOG2C >= 2:
-              $for C in range(0, 1 << LOG2C, 4):
-                  vst1q_f32(output, vacc${ABC[C:C+4]}); output += 4;
-              $for C in range(0, 1 << (LOG2C - 1), 4):
-                vacc${ABC[C:C+4]} = vacc${ABC[C + (1 << LOG2C):C + (1 << LOG2C)+4]};
-            $elif LOG2C == 1:
-              vst1_f32(output, vacc${ABC[0:2]}); output += 2;
-              vacc${ABC[0:2]} = vget_high_f32(vacc${ABC[0:4]});
-            $elif LOG2C == 0:
-              vst1_lane_f32(output, vacc${ABC[0:2]}, 0); output += 1;
-          }
-        $if LOG2C == 2:
-          float32x2_t vacc${ABC[0:2]} = vget_low_f32(vacc${ABC[0:4]});
+      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
+      vacc0123 = vminq_f32(vacc0123, vmax);
+
+      float32x2_t vacc01 = vget_low_f32(vacc0123);
+      if (c & 2) {
+        vst1_f32(output, vacc01); output += 2;
+        vacc01 = vget_high_f32(vacc0123);
+      }
+      if (c & 1) {
+        vst1_lane_f32(output, vacc01, 0); output += 1;
+      }
     }
 
     output = (float*) ((uintptr_t) output + output_increment);
diff --git a/src/f32-dwconv/up-psimd.c.in b/src/f32-dwconv/up-psimd.c.in
index ed4b8ff..c546581 100644
--- a/src/f32-dwconv/up-psimd.c.in
+++ b/src/f32-dwconv/up-psimd.c.in
@@ -3,9 +3,10 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-$assert CR == 4
-$assert MR >= 2
-$assert AR >= 1
+$assert CHANNEL_TILE % 4 == 0
+$assert KERNEL_TILE >= 2
+$assert ACCUMULATORS >= 1
+$ABC = "0123456789ABCDEF"
 #include <assert.h>
 
 #include <psimd.h>
@@ -13,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_f32_dwconv_ukernel_up${CR}x${MR}__psimd(
+void xnn_f32_dwconv_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__psimd${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
     size_t channels,
     size_t output_width,
     const float** input,
@@ -29,59 +30,115 @@
   const psimd_f32 vmax = psimd_load_splat_f32(&params->scalar.max);
   const psimd_f32 vmin = psimd_load_splat_f32(&params->scalar.min);
   do {
-    $for M in range(MR):
-      const float* i${M} = input[${M}];
+    $for K in range(KERNEL_TILE):
+      const float* i${K} = input[${K}];
     input = (const float**) ((uintptr_t) input + input_stride);
 
     size_t c = channels;
     const float* w = weights;
-    for (; c >= 4; c -= 4) {
-      psimd_f32 vacc0 = psimd_load_f32(w);
-      $for M in range(MR):
+    for (; c >= ${CHANNEL_TILE}; c -= ${CHANNEL_TILE}) {
+      psimd_f32 vacc${ABC[0:4]}p0 = psimd_load_f32(w);
+      $for C in range(4, CHANNEL_TILE, 4):
+        psimd_f32 vacc${ABC[C:C+4]}p0 = psimd_load_f32(w + ${C});
 
-        const psimd_f32 vi${M} = psimd_load_f32(i${M});
-        const psimd_f32 vk${M} = psimd_load_f32(w + ${(M+1) * CR});
-        $if 1 <= M < AR:
-          psimd_f32 vacc${M} = psimd_mul_f32(vi${M}, vk${M});
-        $else:
-          vacc${M % AR} = psimd_qfma_f32(vacc${M % AR}, vi${M}, vk${M});
-        i${M} += ${CR};
+      $for K in range(KERNEL_TILE):
 
-      w += ${(MR + 1) * CR};
+        const psimd_f32 vi${K}x${ABC[0:4]} = psimd_load_f32(i${K});
+        $for C in range(4, CHANNEL_TILE, 4):
+          const psimd_f32 vi${K}x${ABC[C:C+4]} = psimd_load_f32(i${K} + ${C});
+        i${K} += ${CHANNEL_TILE};
 
-      $STEPA = 1
-      $while STEPA < AR:
-        $for A in range(0, AR, STEPA * 2):
-          $if A + STEPA < AR:
-            vacc${A} = psimd_add_f32(vacc${A}, vacc${A + STEPA});
-        $STEPA *= 2
+        $for C in range(0, CHANNEL_TILE, 4):
+          const psimd_f32 vk${K}x${ABC[C:C+4]} = psimd_load_f32(w + ${(K + 1) * CHANNEL_TILE + C});
+        $for C in range(0, CHANNEL_TILE, 4):
+          $if 1 <= K < ACCUMULATORS:
+            psimd_f32 vacc${ABC[C:C+4]}p${K} = psimd_mul_f32(vi${K}x${ABC[C:C+4]}, vk${K}x${ABC[C:C+4]});
+          $else:
+            vacc${ABC[C:C+4]}p${K % ACCUMULATORS} = psimd_qfma_f32(vacc${ABC[C:C+4]}p${K % ACCUMULATORS}, vi${K}x${ABC[C:C+4]}, vk${K}x${ABC[C:C+4]});
 
-      vacc0 = psimd_max_f32(vacc0, vmin);
-      vacc0 = psimd_min_f32(vacc0, vmax);
+      w += ${(KERNEL_TILE + 1) * CHANNEL_TILE};
 
-      psimd_store_f32(output, vacc0);
-      output += ${CR};
+      $if ACCUMULATORS > 1:
+        // Add up all accumulators to vacc${ABC[0:CHANNEL_TILE]}p0
+        $ACC_SLICE = 1
+        $while ACC_SLICE < ACCUMULATORS:
+          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
+            $if A + ACC_SLICE < ACCUMULATORS:
+              $for C in range(0, CHANNEL_TILE, 4):
+                vacc${ABC[C:C+4]}p${A} = psimd_add_f32(vacc${ABC[C:C+4]}p${A}, vacc${ABC[C:C+4]}p${A + ACC_SLICE});
+          $ACC_SLICE *= 2
+
+      $for C in range(0, CHANNEL_TILE, 4):
+        psimd_f32 vacc${ABC[C:C+4]} = psimd_max_f32(vacc${ABC[C:C+4]}p0, vmin);
+      $for C in range(0, CHANNEL_TILE, 4):
+        vacc${ABC[C:C+4]} = psimd_min_f32(vacc${ABC[C:C+4]}, vmax);
+
+      psimd_store_f32(output, vacc${ABC[0:4]});
+      $for C in range(4, CHANNEL_TILE, 4):
+        psimd_store_f32(output + ${C}, vacc${ABC[C:C+4]});
+      output += ${CHANNEL_TILE};
     }
+    $if CHANNEL_TILE > 4:
+      for (; c >= 4; c -= 4) {
+        psimd_f32 vacc0123p0 = psimd_load_f32(w);
+        $for K in range(KERNEL_TILE):
+
+          const psimd_f32 vi${K}x0123 = psimd_load_f32(i${K});
+          i${K} += 4;
+
+          const psimd_f32 vk${K}x0123 = psimd_load_f32(w + ${(K + 1) * CHANNEL_TILE});
+          $if 1 <= K < ACCUMULATORS:
+            psimd_f32 vacc0123p${K} = psimd_mul_f32(vi${K}x0123, vk${K}x0123);
+          $else:
+            vacc0123p${K % ACCUMULATORS} = psimd_qfma_f32(vacc0123p${K % ACCUMULATORS}, vi${K}x0123, vk${K}x0123);
+
+        w += 4;
+
+        $if ACCUMULATORS > 1:
+          // Add up all accumulators to vacc0123p0
+          $ACC_SLICE = 1
+          $while ACC_SLICE < ACCUMULATORS:
+            $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
+              $if A + ACC_SLICE < ACCUMULATORS:
+                vacc0123p${A} = psimd_add_f32(vacc0123p${A}, vacc0123p${A + ACC_SLICE});
+            $ACC_SLICE *= 2
+
+        psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+        vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+        psimd_store_f32(output, vacc0123);
+        output += 4;
+      }
     if XNN_UNLIKELY(c != 0) {
-      psimd_f32 vacc = psimd_load_f32(w);
-      $for M in range(MR):
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+      $for K in range(KERNEL_TILE):
 
-        const psimd_f32 vi${M} = psimd_load_f32(i${M});
-        const psimd_f32 vk${M} = psimd_load_f32(w + ${(M+1) * CR});
-        vacc = psimd_qfma_f32(vacc, vi${M}, vk${M});
+        const psimd_f32 vi${K}x0123 = psimd_load_f32(i${K});
+        const psimd_f32 vk${K}x0123 = psimd_load_f32(w + ${(K+1) * CHANNEL_TILE});
+        $if 1 <= K < ACCUMULATORS:
+          psimd_f32 vacc0123p${K} = psimd_mul_f32(vi${K}x0123, vk${K}x0123);
+        $else:
+          vacc0123p${K % ACCUMULATORS} = psimd_qfma_f32(vacc0123p${K % ACCUMULATORS}, vi${K}x0123, vk${K}x0123);
 
-      w += ${(MR + 1) * CR};
+      $if ACCUMULATORS > 1:
+        // Add up all accumulators to vacc0123p0
+        $ACC_SLICE = 1
+        $while ACC_SLICE < ACCUMULATORS:
+          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
+            $if A + ACC_SLICE < ACCUMULATORS:
+              vacc0123p${A} = psimd_add_f32(vacc0123p${A}, vacc0123p${A + ACC_SLICE});
+          $ACC_SLICE *= 2
 
-      vacc = psimd_max_f32(vacc, vmin);
-      vacc = psimd_min_f32(vacc, vmax);
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
 
       if (c & 2) {
-        psimd_store2_f32(output, vacc);
-        vacc = psimd_concat_hi_f32(vacc, vacc);
+        psimd_store2_f32(output, vacc0123);
+        vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
         output += 2;
       }
       if (c & 1) {
-        psimd_store1_f32(output, vacc);
+        psimd_store1_f32(output, vacc0123);
         output += 1;
       }
     }
diff --git a/src/f32-dwconv/up-scalar.c.in b/src/f32-dwconv/up-scalar.c.in
index b05e545..a789bbb 100644
--- a/src/f32-dwconv/up-scalar.c.in
+++ b/src/f32-dwconv/up-scalar.c.in
@@ -3,16 +3,17 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-$assert CR == 1
-$assert MR >= 2
-$assert AR >= 1
+$assert CHANNEL_TILE >= 1
+$assert KERNEL_TILE >= 2
+$assert ACCUMULATORS >= 1
+$ABC = "0123456789ABCDEF"
 #include <assert.h>
 
 #include <xnnpack/dwconv.h>
 #include <xnnpack/math.h>
 
 
-void xnn_f32_dwconv_ukernel_up${CR}x${MR}__scalar(
+void xnn_f32_dwconv_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__scalar${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
     size_t channels,
     size_t output_width,
     const float** input,
@@ -28,37 +29,102 @@
   const float vmin = params->scalar.min;
   const float vmax = params->scalar.max;
   do {
-    $for M in range(MR):
-      const float* i${M} = input[${M}];
+    $for K in range(KERNEL_TILE):
+      const float* i${K} = input[${K}];
     input = (const float**) ((uintptr_t) input + input_stride);
 
     size_t c = channels;
     const float* w = weights;
-    do {
-      float vacc0 = w[0];
-      $for M in range(MR):
+    $if CHANNEL_TILE > 1:
+      for (; c >= ${CHANNEL_TILE}; c -= ${CHANNEL_TILE}) {
+        $for C in range(CHANNEL_TILE):
+          float vacc${C}p0 = w[${C}];
 
-        const float vi${M} = *i${M}++;
-        const float vk${M} = w[${M+1}];
-        $if 1 <= M < AR:
-          float vacc${M} = vi${M} * vk${M};
-        $else:
-          vacc${M % AR} += vi${M} * vk${M};
+        $for K in range(KERNEL_TILE):
 
-      w += ${MR + 1};
+          $for C in range(CHANNEL_TILE):
+            const float vi${K}x${C} = i${K}[${C}];
+          i${K} += ${CHANNEL_TILE};
 
-      $STEPA = 1
-      $while STEPA < AR:
-        $for A in range(0, AR, STEPA * 2):
-          $if A + STEPA < AR:
-            vacc${A} += vacc${A + STEPA};
-        $STEPA *= 2
+          $for C in range(CHANNEL_TILE):
+            const float vk${K}x${C} = w[${(K + 1) * CHANNEL_TILE + C}];
+            $if 1 <= K < ACCUMULATORS:
+              float vacc${C}p${K} = vi${K}x${C} * vk${K}x${C};
+            $else:
+              vacc${C}p${K % ACCUMULATORS} += vi${K}x${C} * vk${K}x${C};
 
-      vacc0 = math_max_f32(vacc0, vmin);
-      vacc0 = math_min_f32(vacc0, vmax);
+        w += ${(KERNEL_TILE + 1) * CHANNEL_TILE};
 
-      *output++ = vacc0;
-    } while (--c != 0);
+        $if ACCUMULATORS > 1:
+          // Add up all accumulators to vacc0p0..vacc${CHANNEL_TILE - 1}p0
+          $ACC_SLICE = 1
+          $while ACC_SLICE < ACCUMULATORS:
+            $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
+              $if A + ACC_SLICE < ACCUMULATORS:
+                $for C in range(CHANNEL_TILE):
+                  vacc${C}p${A} = vacc${C}p${A} + vacc${C}p${A + ACC_SLICE};
+            $ACC_SLICE *= 2
+
+        $for C in range(CHANNEL_TILE):
+          float vacc${C} = math_max_f32(vacc${C}p0, vmin);
+
+        $for C in range(CHANNEL_TILE):
+          vacc${C} = math_min_f32(vacc${C}, vmax);
+
+        $for C in range(CHANNEL_TILE):
+          output[${C}] = vacc${C};
+        output += ${CHANNEL_TILE};
+      }
+      for (; c >= 1; c -= 1) {
+        float vacc0p0 = *w++;
+
+        $for K in range(KERNEL_TILE):
+          const float vi${K} = *i${K}++;
+          const float vk${K} = w[${(K + 1) * CHANNEL_TILE - 1}];
+          $if 1 <= K < ACCUMULATORS:
+            float vacc0p${K} = vi${K} * vk${K};
+          $else:
+            vacc0p${K % ACCUMULATORS} += vi${K} * vk${K};
+
+        $if ACCUMULATORS > 1:
+          // Add up all accumulators to vacc0p0
+          $ACC_SLICE = 1
+          $while ACC_SLICE < ACCUMULATORS:
+            $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
+              $if A + ACC_SLICE < ACCUMULATORS:
+                vacc0p${A} = vacc0p${A} + vacc0p${A + ACC_SLICE};
+            $ACC_SLICE *= 2
+
+        float vacc0 = math_max_f32(vacc0p0, vmin);
+        vacc0 = math_min_f32(vacc0, vmax);
+        *output++ = vacc0;
+      }
+    $else:
+      do {
+        float vacc0p0 = w[0];
+        $for K in range(KERNEL_TILE):
+
+          const float vi${K} = *i${K}++;
+          const float vk${K} = w[${K+1}];
+          $if 1 <= K < ACCUMULATORS:
+            float vacc0p${K} = vi${K} * vk${K};
+          $else:
+            vacc0p${K % ACCUMULATORS} += vi${K} * vk${K};
+
+        w += ${KERNEL_TILE + 1};
+
+        $ACC_STEP = 1
+        $while ACC_STEP < ACCUMULATORS:
+          $for A in range(0, ACCUMULATORS, ACC_STEP * 2):
+            $if A + ACC_STEP < ACCUMULATORS:
+              vacc0p${A} += vacc0p${A + ACC_STEP};
+          $ACC_STEP *= 2
+
+        float vacc0 = math_max_f32(vacc0p0, vmin);
+        vacc0 = math_min_f32(vacc0, vmax);
+
+        *output++ = vacc0;
+      } while (--c != 0);
 
     output = (float*) ((uintptr_t) output + output_increment);
   } while (--output_width != 0);
diff --git a/src/f32-dwconv/up-sse.c.in b/src/f32-dwconv/up-sse.c.in
index 37a1257..9894be4 100644
--- a/src/f32-dwconv/up-sse.c.in
+++ b/src/f32-dwconv/up-sse.c.in
@@ -3,9 +3,10 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
-$assert CR == 4
-$assert MR >= 2
-$assert AR >= 1
+$assert CHANNEL_TILE % 4 == 0
+$assert KERNEL_TILE >= 2
+$assert ACCUMULATORS >= 1
+$ABC = "0123456789ABCDEF"
 #include <assert.h>
 
 #include <xmmintrin.h>
@@ -13,7 +14,7 @@
 #include <xnnpack/dwconv.h>
 
 
-void xnn_f32_dwconv_ukernel_up${CR}x${MR}__sse(
+void xnn_f32_dwconv_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__sse${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
     size_t channels,
     size_t output_width,
     const float** input,
@@ -29,59 +30,115 @@
   const __m128 vmax = _mm_load_ps(params->sse.max);
   const __m128 vmin = _mm_load_ps(params->sse.min);
   do {
-    $for M in range(MR):
-      const float* i${M} = input[${M}];
+    $for K in range(KERNEL_TILE):
+      const float* i${K} = input[${K}];
     input = (const float**) ((uintptr_t) input + input_stride);
 
     size_t c = channels;
     const float* w = weights;
-    for (; c >= 4; c -= 4) {
-      __m128 vacc0 = _mm_load_ps(w);
-      $for M in range(MR):
+    for (; c >= ${CHANNEL_TILE}; c -= ${CHANNEL_TILE}) {
+      __m128 vacc${ABC[0:4]}p0 = _mm_load_ps(w);
+      $for C in range(4, CHANNEL_TILE, 4):
+        __m128 vacc${ABC[C:C+4]}p0 = _mm_load_ps(w + ${C});
 
-        const __m128 vi${M} = _mm_loadu_ps(i${M});
-        const __m128 vk${M} = _mm_load_ps(w + ${(M+1) * CR});
-        $if 1 <= M < AR:
-          __m128 vacc${M} = _mm_mul_ps(vi${M}, vk${M});
-        $else:
-          vacc${M % AR} = _mm_add_ps(vacc${M % AR}, _mm_mul_ps(vi${M}, vk${M}));
-        i${M} += ${CR};
+      $for K in range(KERNEL_TILE):
 
-      w += ${(MR + 1) * CR};
+        const __m128 vi${K}x${ABC[0:4]} = _mm_loadu_ps(i${K});
+        $for C in range(4, CHANNEL_TILE, 4):
+          const __m128 vi${K}x${ABC[C:C+4]} = _mm_loadu_ps(i${K} + ${C});
+        i${K} += ${CHANNEL_TILE};
 
-      $STEPA = 1
-      $while STEPA < AR:
-        $for A in range(0, AR, STEPA * 2):
-          $if A + STEPA < AR:
-            vacc${A} = _mm_add_ps(vacc${A}, vacc${A + STEPA});
-        $STEPA *= 2
+        $for C in range(0, CHANNEL_TILE, 4):
+          const __m128 vk${K}x${ABC[C:C+4]} = _mm_load_ps(w + ${(K + 1) * CHANNEL_TILE + C});
+        $for C in range(0, CHANNEL_TILE, 4):
+          $if 1 <= K < ACCUMULATORS:
+            __m128 vacc${ABC[C:C+4]}p${K} = _mm_mul_ps(vi${K}x${ABC[C:C+4]}, vk${K}x${ABC[C:C+4]});
+          $else:
+            vacc${ABC[C:C+4]}p${K % ACCUMULATORS} = _mm_add_ps(vacc${ABC[C:C+4]}p${K % ACCUMULATORS}, _mm_mul_ps(vi${K}x${ABC[C:C+4]}, vk${K}x${ABC[C:C+4]}));
 
-      vacc0 = _mm_max_ps(vacc0, vmin);
-      vacc0 = _mm_min_ps(vacc0, vmax);
+      w += ${(KERNEL_TILE + 1) * CHANNEL_TILE};
 
-      _mm_storeu_ps(output, vacc0);
-      output += ${CR};
+      $if ACCUMULATORS > 1:
+        // Add up all accumulators to vacc${ABC[0:CHANNEL_TILE]}p0
+        $ACC_SLICE = 1
+        $while ACC_SLICE < ACCUMULATORS:
+          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
+            $if A + ACC_SLICE < ACCUMULATORS:
+              $for C in range(0, CHANNEL_TILE, 4):
+                vacc${ABC[C:C+4]}p${A} = _mm_add_ps(vacc${ABC[C:C+4]}p${A}, vacc${ABC[C:C+4]}p${A + ACC_SLICE});
+          $ACC_SLICE *= 2
+
+      $for C in range(0, CHANNEL_TILE, 4):
+        __m128 vacc${ABC[C:C+4]} = _mm_max_ps(vacc${ABC[C:C+4]}p0, vmin);
+      $for C in range(0, CHANNEL_TILE, 4):
+        vacc${ABC[C:C+4]} = _mm_min_ps(vacc${ABC[C:C+4]}, vmax);
+
+      _mm_storeu_ps(output, vacc${ABC[0:4]});
+      $for C in range(4, CHANNEL_TILE, 4):
+        _mm_storeu_ps(output + ${C}, vacc${ABC[C:C+4]});
+      output += ${CHANNEL_TILE};
     }
+    $if CHANNEL_TILE > 4:
+      for (; c >= 4; c -= 4) {
+        __m128 vacc0123p0 = _mm_load_ps(w);
+        $for K in range(KERNEL_TILE):
+
+          const __m128 vi${K}x0123 = _mm_loadu_ps(i${K});
+          i${K} += 4;
+
+          const __m128 vk${K}x0123 = _mm_load_ps(w + ${(K + 1) * CHANNEL_TILE});
+          $if 1 <= K < ACCUMULATORS:
+            __m128 vacc0123p${K} = _mm_mul_ps(vi${K}x0123, vk${K}x0123);
+          $else:
+            vacc0123p${K % ACCUMULATORS} = _mm_add_ps(vacc0123p${K % ACCUMULATORS}, _mm_mul_ps(vi${K}x0123, vk${K}x0123));
+
+        w += 4;
+
+        $if ACCUMULATORS > 1:
+          // Add up all accumulators to vacc0123p0
+          $ACC_SLICE = 1
+          $while ACC_SLICE < ACCUMULATORS:
+            $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
+              $if A + ACC_SLICE < ACCUMULATORS:
+                vacc0123p${A} = _mm_add_ps(vacc0123p${A}, vacc0123p${A + ACC_SLICE});
+            $ACC_SLICE *= 2
+
+        __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+        vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+        _mm_storeu_ps(output, vacc0123);
+        output += 4;
+      }
     if XNN_UNLIKELY(c != 0) {
-      __m128 vacc = _mm_load_ps(w);
-      $for M in range(MR):
+      __m128 vacc0123p0 = _mm_load_ps(w);
+      $for K in range(KERNEL_TILE):
 
-        const __m128 vi${M} = _mm_loadu_ps(i${M});
-        const __m128 vk${M} = _mm_load_ps(w + ${(M+1) * CR});
-        vacc = _mm_add_ps(vacc, _mm_mul_ps(vi${M}, vk${M}));
+        const __m128 vi${K}x0123 = _mm_loadu_ps(i${K});
+        const __m128 vk${K}x0123 = _mm_load_ps(w + ${(K + 1) * CHANNEL_TILE});
+        $if 1 <= K < ACCUMULATORS:
+          __m128 vacc0123p${K} = _mm_mul_ps(vi${K}x0123, vk${K}x0123);
+        $else:
+          vacc0123p${K % ACCUMULATORS} = _mm_add_ps(vacc0123p${K % ACCUMULATORS}, _mm_mul_ps(vi${K}x0123, vk${K}x0123));
 
-      w += ${(MR + 1) * CR};
+      $if ACCUMULATORS > 1:
+        // Add up all accumulators to vacc0123p0
+        $ACC_SLICE = 1
+        $while ACC_SLICE < ACCUMULATORS:
+          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
+            $if A + ACC_SLICE < ACCUMULATORS:
+              vacc0123p${A} = _mm_add_ps(vacc0123p${A}, vacc0123p${A + ACC_SLICE});
+          $ACC_SLICE *= 2
 
-      vacc = _mm_max_ps(vacc, vmin);
-      vacc = _mm_min_ps(vacc, vmax);
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
 
       if (c & 2) {
-        _mm_storel_pi((__m64*) output, vacc);
-        vacc = _mm_movehl_ps(vacc, vacc);
+        _mm_storel_pi((__m64*) output, vacc0123);
+        vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
         output += 2;
       }
       if (c & 1) {
-        _mm_store_ss(output, vacc);
+        _mm_store_ss(output, vacc0123);
         output += 1;
       }
     }
diff --git a/src/f32-dwconv/up1x25-scalar-acc2.c b/src/f32-dwconv/up1x25-scalar-acc2.c
new file mode 100644
index 0000000..6fce6c0
--- /dev/null
+++ b/src/f32-dwconv/up1x25-scalar-acc2.c
@@ -0,0 +1,176 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up1x25__scalar_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    const float* i9 = input[9];
+    const float* i10 = input[10];
+    const float* i11 = input[11];
+    const float* i12 = input[12];
+    const float* i13 = input[13];
+    const float* i14 = input[14];
+    const float* i15 = input[15];
+    const float* i16 = input[16];
+    const float* i17 = input[17];
+    const float* i18 = input[18];
+    const float* i19 = input[19];
+    const float* i20 = input[20];
+    const float* i21 = input[21];
+    const float* i22 = input[22];
+    const float* i23 = input[23];
+    const float* i24 = input[24];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    do {
+      float vacc0p0 = w[0];
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+
+      const float vi1 = *i1++;
+      const float vk1 = w[2];
+      float vacc0p1 = vi1 * vk1;
+
+      const float vi2 = *i2++;
+      const float vk2 = w[3];
+      vacc0p0 += vi2 * vk2;
+
+      const float vi3 = *i3++;
+      const float vk3 = w[4];
+      vacc0p1 += vi3 * vk3;
+
+      const float vi4 = *i4++;
+      const float vk4 = w[5];
+      vacc0p0 += vi4 * vk4;
+
+      const float vi5 = *i5++;
+      const float vk5 = w[6];
+      vacc0p1 += vi5 * vk5;
+
+      const float vi6 = *i6++;
+      const float vk6 = w[7];
+      vacc0p0 += vi6 * vk6;
+
+      const float vi7 = *i7++;
+      const float vk7 = w[8];
+      vacc0p1 += vi7 * vk7;
+
+      const float vi8 = *i8++;
+      const float vk8 = w[9];
+      vacc0p0 += vi8 * vk8;
+
+      const float vi9 = *i9++;
+      const float vk9 = w[10];
+      vacc0p1 += vi9 * vk9;
+
+      const float vi10 = *i10++;
+      const float vk10 = w[11];
+      vacc0p0 += vi10 * vk10;
+
+      const float vi11 = *i11++;
+      const float vk11 = w[12];
+      vacc0p1 += vi11 * vk11;
+
+      const float vi12 = *i12++;
+      const float vk12 = w[13];
+      vacc0p0 += vi12 * vk12;
+
+      const float vi13 = *i13++;
+      const float vk13 = w[14];
+      vacc0p1 += vi13 * vk13;
+
+      const float vi14 = *i14++;
+      const float vk14 = w[15];
+      vacc0p0 += vi14 * vk14;
+
+      const float vi15 = *i15++;
+      const float vk15 = w[16];
+      vacc0p1 += vi15 * vk15;
+
+      const float vi16 = *i16++;
+      const float vk16 = w[17];
+      vacc0p0 += vi16 * vk16;
+
+      const float vi17 = *i17++;
+      const float vk17 = w[18];
+      vacc0p1 += vi17 * vk17;
+
+      const float vi18 = *i18++;
+      const float vk18 = w[19];
+      vacc0p0 += vi18 * vk18;
+
+      const float vi19 = *i19++;
+      const float vk19 = w[20];
+      vacc0p1 += vi19 * vk19;
+
+      const float vi20 = *i20++;
+      const float vk20 = w[21];
+      vacc0p0 += vi20 * vk20;
+
+      const float vi21 = *i21++;
+      const float vk21 = w[22];
+      vacc0p1 += vi21 * vk21;
+
+      const float vi22 = *i22++;
+      const float vk22 = w[23];
+      vacc0p0 += vi22 * vk22;
+
+      const float vi23 = *i23++;
+      const float vk23 = w[24];
+      vacc0p1 += vi23 * vk23;
+
+      const float vi24 = *i24++;
+      const float vk24 = w[25];
+      vacc0p0 += vi24 * vk24;
+
+      w += 26;
+
+      vacc0p0 += vacc0p1;
+
+      float vacc0 = math_max_f32(vacc0p0, vmin);
+      vacc0 = math_min_f32(vacc0, vmax);
+
+      *output++ = vacc0;
+    } while (--c != 0);
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up1x25-scalar.c b/src/f32-dwconv/up1x25-scalar.c
index c0912af..e0ab4a0 100644
--- a/src/f32-dwconv/up1x25-scalar.c
+++ b/src/f32-dwconv/up1x25-scalar.c
@@ -59,113 +59,112 @@
     size_t c = channels;
     const float* w = weights;
     do {
-      float vacc0 = w[0];
+      float vacc0p0 = w[0];
 
       const float vi0 = *i0++;
       const float vk0 = w[1];
-      vacc0 += vi0 * vk0;
+      vacc0p0 += vi0 * vk0;
 
       const float vi1 = *i1++;
       const float vk1 = w[2];
-      float vacc1 = vi1 * vk1;
+      vacc0p0 += vi1 * vk1;
 
       const float vi2 = *i2++;
       const float vk2 = w[3];
-      vacc0 += vi2 * vk2;
+      vacc0p0 += vi2 * vk2;
 
       const float vi3 = *i3++;
       const float vk3 = w[4];
-      vacc1 += vi3 * vk3;
+      vacc0p0 += vi3 * vk3;
 
       const float vi4 = *i4++;
       const float vk4 = w[5];
-      vacc0 += vi4 * vk4;
+      vacc0p0 += vi4 * vk4;
 
       const float vi5 = *i5++;
       const float vk5 = w[6];
-      vacc1 += vi5 * vk5;
+      vacc0p0 += vi5 * vk5;
 
       const float vi6 = *i6++;
       const float vk6 = w[7];
-      vacc0 += vi6 * vk6;
+      vacc0p0 += vi6 * vk6;
 
       const float vi7 = *i7++;
       const float vk7 = w[8];
-      vacc1 += vi7 * vk7;
+      vacc0p0 += vi7 * vk7;
 
       const float vi8 = *i8++;
       const float vk8 = w[9];
-      vacc0 += vi8 * vk8;
+      vacc0p0 += vi8 * vk8;
 
       const float vi9 = *i9++;
       const float vk9 = w[10];
-      vacc1 += vi9 * vk9;
+      vacc0p0 += vi9 * vk9;
 
       const float vi10 = *i10++;
       const float vk10 = w[11];
-      vacc0 += vi10 * vk10;
+      vacc0p0 += vi10 * vk10;
 
       const float vi11 = *i11++;
       const float vk11 = w[12];
-      vacc1 += vi11 * vk11;
+      vacc0p0 += vi11 * vk11;
 
       const float vi12 = *i12++;
       const float vk12 = w[13];
-      vacc0 += vi12 * vk12;
+      vacc0p0 += vi12 * vk12;
 
       const float vi13 = *i13++;
       const float vk13 = w[14];
-      vacc1 += vi13 * vk13;
+      vacc0p0 += vi13 * vk13;
 
       const float vi14 = *i14++;
       const float vk14 = w[15];
-      vacc0 += vi14 * vk14;
+      vacc0p0 += vi14 * vk14;
 
       const float vi15 = *i15++;
       const float vk15 = w[16];
-      vacc1 += vi15 * vk15;
+      vacc0p0 += vi15 * vk15;
 
       const float vi16 = *i16++;
       const float vk16 = w[17];
-      vacc0 += vi16 * vk16;
+      vacc0p0 += vi16 * vk16;
 
       const float vi17 = *i17++;
       const float vk17 = w[18];
-      vacc1 += vi17 * vk17;
+      vacc0p0 += vi17 * vk17;
 
       const float vi18 = *i18++;
       const float vk18 = w[19];
-      vacc0 += vi18 * vk18;
+      vacc0p0 += vi18 * vk18;
 
       const float vi19 = *i19++;
       const float vk19 = w[20];
-      vacc1 += vi19 * vk19;
+      vacc0p0 += vi19 * vk19;
 
       const float vi20 = *i20++;
       const float vk20 = w[21];
-      vacc0 += vi20 * vk20;
+      vacc0p0 += vi20 * vk20;
 
       const float vi21 = *i21++;
       const float vk21 = w[22];
-      vacc1 += vi21 * vk21;
+      vacc0p0 += vi21 * vk21;
 
       const float vi22 = *i22++;
       const float vk22 = w[23];
-      vacc0 += vi22 * vk22;
+      vacc0p0 += vi22 * vk22;
 
       const float vi23 = *i23++;
       const float vk23 = w[24];
-      vacc1 += vi23 * vk23;
+      vacc0p0 += vi23 * vk23;
 
       const float vi24 = *i24++;
       const float vk24 = w[25];
-      vacc0 += vi24 * vk24;
+      vacc0p0 += vi24 * vk24;
 
       w += 26;
 
-      vacc0 += vacc1;
 
-      vacc0 = math_max_f32(vacc0, vmin);
+      float vacc0 = math_max_f32(vacc0p0, vmin);
       vacc0 = math_min_f32(vacc0, vmax);
 
       *output++ = vacc0;
diff --git a/src/f32-dwconv/up1x4-scalar-acc2.c b/src/f32-dwconv/up1x4-scalar-acc2.c
new file mode 100644
index 0000000..577f99d
--- /dev/null
+++ b/src/f32-dwconv/up1x4-scalar-acc2.c
@@ -0,0 +1,71 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up1x4__scalar_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    do {
+      float vacc0p0 = w[0];
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+
+      const float vi1 = *i1++;
+      const float vk1 = w[2];
+      float vacc0p1 = vi1 * vk1;
+
+      const float vi2 = *i2++;
+      const float vk2 = w[3];
+      vacc0p0 += vi2 * vk2;
+
+      const float vi3 = *i3++;
+      const float vk3 = w[4];
+      vacc0p1 += vi3 * vk3;
+
+      w += 5;
+
+      vacc0p0 += vacc0p1;
+
+      float vacc0 = math_max_f32(vacc0p0, vmin);
+      vacc0 = math_min_f32(vacc0, vmax);
+
+      *output++ = vacc0;
+    } while (--c != 0);
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up1x4-scalar.c b/src/f32-dwconv/up1x4-scalar.c
index 6f89579..7eafad9 100644
--- a/src/f32-dwconv/up1x4-scalar.c
+++ b/src/f32-dwconv/up1x4-scalar.c
@@ -38,29 +38,28 @@
     size_t c = channels;
     const float* w = weights;
     do {
-      float vacc0 = w[0];
+      float vacc0p0 = w[0];
 
       const float vi0 = *i0++;
       const float vk0 = w[1];
-      vacc0 += vi0 * vk0;
+      vacc0p0 += vi0 * vk0;
 
       const float vi1 = *i1++;
       const float vk1 = w[2];
-      float vacc1 = vi1 * vk1;
+      vacc0p0 += vi1 * vk1;
 
       const float vi2 = *i2++;
       const float vk2 = w[3];
-      vacc0 += vi2 * vk2;
+      vacc0p0 += vi2 * vk2;
 
       const float vi3 = *i3++;
       const float vk3 = w[4];
-      vacc1 += vi3 * vk3;
+      vacc0p0 += vi3 * vk3;
 
       w += 5;
 
-      vacc0 += vacc1;
 
-      vacc0 = math_max_f32(vacc0, vmin);
+      float vacc0 = math_max_f32(vacc0p0, vmin);
       vacc0 = math_min_f32(vacc0, vmax);
 
       *output++ = vacc0;
diff --git a/src/f32-dwconv/up1x9-scalar-acc2.c b/src/f32-dwconv/up1x9-scalar-acc2.c
new file mode 100644
index 0000000..d7d660b
--- /dev/null
+++ b/src/f32-dwconv/up1x9-scalar-acc2.c
@@ -0,0 +1,96 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up1x9__scalar_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    do {
+      float vacc0p0 = w[0];
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+
+      const float vi1 = *i1++;
+      const float vk1 = w[2];
+      float vacc0p1 = vi1 * vk1;
+
+      const float vi2 = *i2++;
+      const float vk2 = w[3];
+      vacc0p0 += vi2 * vk2;
+
+      const float vi3 = *i3++;
+      const float vk3 = w[4];
+      vacc0p1 += vi3 * vk3;
+
+      const float vi4 = *i4++;
+      const float vk4 = w[5];
+      vacc0p0 += vi4 * vk4;
+
+      const float vi5 = *i5++;
+      const float vk5 = w[6];
+      vacc0p1 += vi5 * vk5;
+
+      const float vi6 = *i6++;
+      const float vk6 = w[7];
+      vacc0p0 += vi6 * vk6;
+
+      const float vi7 = *i7++;
+      const float vk7 = w[8];
+      vacc0p1 += vi7 * vk7;
+
+      const float vi8 = *i8++;
+      const float vk8 = w[9];
+      vacc0p0 += vi8 * vk8;
+
+      w += 10;
+
+      vacc0p0 += vacc0p1;
+
+      float vacc0 = math_max_f32(vacc0p0, vmin);
+      vacc0 = math_min_f32(vacc0, vmax);
+
+      *output++ = vacc0;
+    } while (--c != 0);
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up1x9-scalar.c b/src/f32-dwconv/up1x9-scalar.c
index 69b10c1..82f4a90 100644
--- a/src/f32-dwconv/up1x9-scalar.c
+++ b/src/f32-dwconv/up1x9-scalar.c
@@ -43,49 +43,48 @@
     size_t c = channels;
     const float* w = weights;
     do {
-      float vacc0 = w[0];
+      float vacc0p0 = w[0];
 
       const float vi0 = *i0++;
       const float vk0 = w[1];
-      vacc0 += vi0 * vk0;
+      vacc0p0 += vi0 * vk0;
 
       const float vi1 = *i1++;
       const float vk1 = w[2];
-      float vacc1 = vi1 * vk1;
+      vacc0p0 += vi1 * vk1;
 
       const float vi2 = *i2++;
       const float vk2 = w[3];
-      vacc0 += vi2 * vk2;
+      vacc0p0 += vi2 * vk2;
 
       const float vi3 = *i3++;
       const float vk3 = w[4];
-      vacc1 += vi3 * vk3;
+      vacc0p0 += vi3 * vk3;
 
       const float vi4 = *i4++;
       const float vk4 = w[5];
-      vacc0 += vi4 * vk4;
+      vacc0p0 += vi4 * vk4;
 
       const float vi5 = *i5++;
       const float vk5 = w[6];
-      vacc1 += vi5 * vk5;
+      vacc0p0 += vi5 * vk5;
 
       const float vi6 = *i6++;
       const float vk6 = w[7];
-      vacc0 += vi6 * vk6;
+      vacc0p0 += vi6 * vk6;
 
       const float vi7 = *i7++;
       const float vk7 = w[8];
-      vacc1 += vi7 * vk7;
+      vacc0p0 += vi7 * vk7;
 
       const float vi8 = *i8++;
       const float vk8 = w[9];
-      vacc0 += vi8 * vk8;
+      vacc0p0 += vi8 * vk8;
 
       w += 10;
 
-      vacc0 += vacc1;
 
-      vacc0 = math_max_f32(vacc0, vmin);
+      float vacc0 = math_max_f32(vacc0p0, vmin);
       vacc0 = math_min_f32(vacc0, vmax);
 
       *output++ = vacc0;
diff --git a/src/f32-dwconv/up2x25-scalar-acc2.c b/src/f32-dwconv/up2x25-scalar-acc2.c
new file mode 100644
index 0000000..7f13945
--- /dev/null
+++ b/src/f32-dwconv/up2x25-scalar-acc2.c
@@ -0,0 +1,396 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up2x25__scalar_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  // 25-tap depthwise convolution, scalar code, channel tile 2 ("up2x"), with
+  // two partial accumulators per channel ("acc2") to shorten the floating-
+  // point dependency chain of the 25 multiply-adds.
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    const float* i9 = input[9];
+    const float* i10 = input[10];
+    const float* i11 = input[11];
+    const float* i12 = input[12];
+    const float* i13 = input[13];
+    const float* i14 = input[14];
+    const float* i15 = input[15];
+    const float* i16 = input[16];
+    const float* i17 = input[17];
+    const float* i18 = input[18];
+    const float* i19 = input[19];
+    const float* i20 = input[20];
+    const float* i21 = input[21];
+    const float* i22 = input[22];
+    const float* i23 = input[23];
+    const float* i24 = input[24];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    // Weights are interleaved per channel pair:
+    // [bias(c), bias(c+1), k0(c), k0(c+1), ..., k24(c), k24(c+1)] = 52 floats.
+    for (; c >= 2; c -= 2) {
+      float vacc0p0 = w[0];
+      float vacc1p0 = w[1];
+
+
+      const float vi0x0 = i0[0];
+      const float vi0x1 = i0[1];
+      i0 += 2;
+
+      const float vk0x0 = w[2];
+      vacc0p0 += vi0x0 * vk0x0;
+      const float vk0x1 = w[3];
+      vacc1p0 += vi0x1 * vk0x1;
+
+      const float vi1x0 = i1[0];
+      const float vi1x1 = i1[1];
+      i1 += 2;
+
+      const float vk1x0 = w[4];
+      float vacc0p1 = vi1x0 * vk1x0;
+      const float vk1x1 = w[5];
+      float vacc1p1 = vi1x1 * vk1x1;
+
+      const float vi2x0 = i2[0];
+      const float vi2x1 = i2[1];
+      i2 += 2;
+
+      const float vk2x0 = w[6];
+      vacc0p0 += vi2x0 * vk2x0;
+      const float vk2x1 = w[7];
+      vacc1p0 += vi2x1 * vk2x1;
+
+      const float vi3x0 = i3[0];
+      const float vi3x1 = i3[1];
+      i3 += 2;
+
+      const float vk3x0 = w[8];
+      vacc0p1 += vi3x0 * vk3x0;
+      const float vk3x1 = w[9];
+      vacc1p1 += vi3x1 * vk3x1;
+
+      const float vi4x0 = i4[0];
+      const float vi4x1 = i4[1];
+      i4 += 2;
+
+      const float vk4x0 = w[10];
+      vacc0p0 += vi4x0 * vk4x0;
+      const float vk4x1 = w[11];
+      vacc1p0 += vi4x1 * vk4x1;
+
+      const float vi5x0 = i5[0];
+      const float vi5x1 = i5[1];
+      i5 += 2;
+
+      const float vk5x0 = w[12];
+      vacc0p1 += vi5x0 * vk5x0;
+      const float vk5x1 = w[13];
+      vacc1p1 += vi5x1 * vk5x1;
+
+      const float vi6x0 = i6[0];
+      const float vi6x1 = i6[1];
+      i6 += 2;
+
+      const float vk6x0 = w[14];
+      vacc0p0 += vi6x0 * vk6x0;
+      const float vk6x1 = w[15];
+      vacc1p0 += vi6x1 * vk6x1;
+
+      const float vi7x0 = i7[0];
+      const float vi7x1 = i7[1];
+      i7 += 2;
+
+      const float vk7x0 = w[16];
+      vacc0p1 += vi7x0 * vk7x0;
+      const float vk7x1 = w[17];
+      vacc1p1 += vi7x1 * vk7x1;
+
+      const float vi8x0 = i8[0];
+      const float vi8x1 = i8[1];
+      i8 += 2;
+
+      const float vk8x0 = w[18];
+      vacc0p0 += vi8x0 * vk8x0;
+      const float vk8x1 = w[19];
+      vacc1p0 += vi8x1 * vk8x1;
+
+      const float vi9x0 = i9[0];
+      const float vi9x1 = i9[1];
+      i9 += 2;
+
+      const float vk9x0 = w[20];
+      vacc0p1 += vi9x0 * vk9x0;
+      const float vk9x1 = w[21];
+      vacc1p1 += vi9x1 * vk9x1;
+
+      const float vi10x0 = i10[0];
+      const float vi10x1 = i10[1];
+      i10 += 2;
+
+      const float vk10x0 = w[22];
+      vacc0p0 += vi10x0 * vk10x0;
+      const float vk10x1 = w[23];
+      vacc1p0 += vi10x1 * vk10x1;
+
+      const float vi11x0 = i11[0];
+      const float vi11x1 = i11[1];
+      i11 += 2;
+
+      const float vk11x0 = w[24];
+      vacc0p1 += vi11x0 * vk11x0;
+      const float vk11x1 = w[25];
+      vacc1p1 += vi11x1 * vk11x1;
+
+      const float vi12x0 = i12[0];
+      const float vi12x1 = i12[1];
+      i12 += 2;
+
+      const float vk12x0 = w[26];
+      vacc0p0 += vi12x0 * vk12x0;
+      const float vk12x1 = w[27];
+      vacc1p0 += vi12x1 * vk12x1;
+
+      const float vi13x0 = i13[0];
+      const float vi13x1 = i13[1];
+      i13 += 2;
+
+      const float vk13x0 = w[28];
+      vacc0p1 += vi13x0 * vk13x0;
+      const float vk13x1 = w[29];
+      vacc1p1 += vi13x1 * vk13x1;
+
+      const float vi14x0 = i14[0];
+      const float vi14x1 = i14[1];
+      i14 += 2;
+
+      const float vk14x0 = w[30];
+      vacc0p0 += vi14x0 * vk14x0;
+      const float vk14x1 = w[31];
+      vacc1p0 += vi14x1 * vk14x1;
+
+      const float vi15x0 = i15[0];
+      const float vi15x1 = i15[1];
+      i15 += 2;
+
+      const float vk15x0 = w[32];
+      vacc0p1 += vi15x0 * vk15x0;
+      const float vk15x1 = w[33];
+      vacc1p1 += vi15x1 * vk15x1;
+
+      const float vi16x0 = i16[0];
+      const float vi16x1 = i16[1];
+      i16 += 2;
+
+      const float vk16x0 = w[34];
+      vacc0p0 += vi16x0 * vk16x0;
+      const float vk16x1 = w[35];
+      vacc1p0 += vi16x1 * vk16x1;
+
+      const float vi17x0 = i17[0];
+      const float vi17x1 = i17[1];
+      i17 += 2;
+
+      const float vk17x0 = w[36];
+      vacc0p1 += vi17x0 * vk17x0;
+      const float vk17x1 = w[37];
+      vacc1p1 += vi17x1 * vk17x1;
+
+      const float vi18x0 = i18[0];
+      const float vi18x1 = i18[1];
+      i18 += 2;
+
+      const float vk18x0 = w[38];
+      vacc0p0 += vi18x0 * vk18x0;
+      const float vk18x1 = w[39];
+      vacc1p0 += vi18x1 * vk18x1;
+
+      const float vi19x0 = i19[0];
+      const float vi19x1 = i19[1];
+      i19 += 2;
+
+      const float vk19x0 = w[40];
+      vacc0p1 += vi19x0 * vk19x0;
+      const float vk19x1 = w[41];
+      vacc1p1 += vi19x1 * vk19x1;
+
+      const float vi20x0 = i20[0];
+      const float vi20x1 = i20[1];
+      i20 += 2;
+
+      const float vk20x0 = w[42];
+      vacc0p0 += vi20x0 * vk20x0;
+      const float vk20x1 = w[43];
+      vacc1p0 += vi20x1 * vk20x1;
+
+      const float vi21x0 = i21[0];
+      const float vi21x1 = i21[1];
+      i21 += 2;
+
+      const float vk21x0 = w[44];
+      vacc0p1 += vi21x0 * vk21x0;
+      const float vk21x1 = w[45];
+      vacc1p1 += vi21x1 * vk21x1;
+
+      const float vi22x0 = i22[0];
+      const float vi22x1 = i22[1];
+      i22 += 2;
+
+      const float vk22x0 = w[46];
+      vacc0p0 += vi22x0 * vk22x0;
+      const float vk22x1 = w[47];
+      vacc1p0 += vi22x1 * vk22x1;
+
+      const float vi23x0 = i23[0];
+      const float vi23x1 = i23[1];
+      i23 += 2;
+
+      const float vk23x0 = w[48];
+      vacc0p1 += vi23x0 * vk23x0;
+      const float vk23x1 = w[49];
+      vacc1p1 += vi23x1 * vk23x1;
+
+      const float vi24x0 = i24[0];
+      const float vi24x1 = i24[1];
+      i24 += 2;
+
+      const float vk24x0 = w[50];
+      vacc0p0 += vi24x0 * vk24x0;
+      const float vk24x1 = w[51];
+      vacc1p0 += vi24x1 * vk24x1;
+
+      w += 52;
+
+      // Add up all accumulators to vacc01p0
+      vacc0p0 = vacc0p0 + vacc0p1;
+      vacc1p0 = vacc1p0 + vacc1p1;
+
+      float vacc0 = math_max_f32(vacc0p0, vmin);
+      float vacc1 = math_max_f32(vacc1p0, vmin);
+
+      vacc0 = math_min_f32(vacc0, vmax);
+      vacc1 = math_min_f32(vacc1, vmax);
+
+      output[0] = vacc0;
+      output[1] = vacc1;
+      output += 2;
+    }
+    // Remainder: at most one channel left (c < 2), so this loop runs at most
+    // once. *w++ consumes the bias of the leftover channel; its kernel taps
+    // stay interleaved with stride 2, i.e. at odd offsets w[1], w[3], ..., w[49].
+    for (; c >= 1; c -= 1) {
+      float vacc0p0 = *w++;
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+      const float vi1 = *i1++;
+      const float vk1 = w[3];
+      float vacc0p1 = vi1 * vk1;
+      const float vi2 = *i2++;
+      const float vk2 = w[5];
+      vacc0p0 += vi2 * vk2;
+      const float vi3 = *i3++;
+      const float vk3 = w[7];
+      vacc0p1 += vi3 * vk3;
+      const float vi4 = *i4++;
+      const float vk4 = w[9];
+      vacc0p0 += vi4 * vk4;
+      const float vi5 = *i5++;
+      const float vk5 = w[11];
+      vacc0p1 += vi5 * vk5;
+      const float vi6 = *i6++;
+      const float vk6 = w[13];
+      vacc0p0 += vi6 * vk6;
+      const float vi7 = *i7++;
+      const float vk7 = w[15];
+      vacc0p1 += vi7 * vk7;
+      const float vi8 = *i8++;
+      const float vk8 = w[17];
+      vacc0p0 += vi8 * vk8;
+      const float vi9 = *i9++;
+      const float vk9 = w[19];
+      vacc0p1 += vi9 * vk9;
+      const float vi10 = *i10++;
+      const float vk10 = w[21];
+      vacc0p0 += vi10 * vk10;
+      const float vi11 = *i11++;
+      const float vk11 = w[23];
+      vacc0p1 += vi11 * vk11;
+      const float vi12 = *i12++;
+      const float vk12 = w[25];
+      vacc0p0 += vi12 * vk12;
+      const float vi13 = *i13++;
+      const float vk13 = w[27];
+      vacc0p1 += vi13 * vk13;
+      const float vi14 = *i14++;
+      const float vk14 = w[29];
+      vacc0p0 += vi14 * vk14;
+      const float vi15 = *i15++;
+      const float vk15 = w[31];
+      vacc0p1 += vi15 * vk15;
+      const float vi16 = *i16++;
+      const float vk16 = w[33];
+      vacc0p0 += vi16 * vk16;
+      const float vi17 = *i17++;
+      const float vk17 = w[35];
+      vacc0p1 += vi17 * vk17;
+      const float vi18 = *i18++;
+      const float vk18 = w[37];
+      vacc0p0 += vi18 * vk18;
+      const float vi19 = *i19++;
+      const float vk19 = w[39];
+      vacc0p1 += vi19 * vk19;
+      const float vi20 = *i20++;
+      const float vk20 = w[41];
+      vacc0p0 += vi20 * vk20;
+      const float vi21 = *i21++;
+      const float vk21 = w[43];
+      vacc0p1 += vi21 * vk21;
+      const float vi22 = *i22++;
+      const float vk22 = w[45];
+      vacc0p0 += vi22 * vk22;
+      const float vi23 = *i23++;
+      const float vk23 = w[47];
+      vacc0p1 += vi23 * vk23;
+      const float vi24 = *i24++;
+      const float vk24 = w[49];
+      vacc0p0 += vi24 * vk24;
+
+      // Add up all accumulators to vacc01p0
+      vacc0p0 = vacc0p0 + vacc0p1;
+
+      float vacc0 = math_max_f32(vacc0p0, vmin);
+      vacc0 = math_min_f32(vacc0, vmax);
+      *output++ = vacc0;
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up2x25-scalar.c b/src/f32-dwconv/up2x25-scalar.c
new file mode 100644
index 0000000..57b5e47
--- /dev/null
+++ b/src/f32-dwconv/up2x25-scalar.c
@@ -0,0 +1,391 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up2x25__scalar(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  // 25-tap depthwise convolution, scalar code, channel tile 2 ("up2x"),
+  // single accumulator per channel.
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    const float* i9 = input[9];
+    const float* i10 = input[10];
+    const float* i11 = input[11];
+    const float* i12 = input[12];
+    const float* i13 = input[13];
+    const float* i14 = input[14];
+    const float* i15 = input[15];
+    const float* i16 = input[16];
+    const float* i17 = input[17];
+    const float* i18 = input[18];
+    const float* i19 = input[19];
+    const float* i20 = input[20];
+    const float* i21 = input[21];
+    const float* i22 = input[22];
+    const float* i23 = input[23];
+    const float* i24 = input[24];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    // Weights are interleaved per channel pair:
+    // [bias(c), bias(c+1), k0(c), k0(c+1), ..., k24(c), k24(c+1)] = 52 floats.
+    for (; c >= 2; c -= 2) {
+      float vacc0p0 = w[0];
+      float vacc1p0 = w[1];
+
+
+      const float vi0x0 = i0[0];
+      const float vi0x1 = i0[1];
+      i0 += 2;
+
+      const float vk0x0 = w[2];
+      vacc0p0 += vi0x0 * vk0x0;
+      const float vk0x1 = w[3];
+      vacc1p0 += vi0x1 * vk0x1;
+
+      const float vi1x0 = i1[0];
+      const float vi1x1 = i1[1];
+      i1 += 2;
+
+      const float vk1x0 = w[4];
+      vacc0p0 += vi1x0 * vk1x0;
+      const float vk1x1 = w[5];
+      vacc1p0 += vi1x1 * vk1x1;
+
+      const float vi2x0 = i2[0];
+      const float vi2x1 = i2[1];
+      i2 += 2;
+
+      const float vk2x0 = w[6];
+      vacc0p0 += vi2x0 * vk2x0;
+      const float vk2x1 = w[7];
+      vacc1p0 += vi2x1 * vk2x1;
+
+      const float vi3x0 = i3[0];
+      const float vi3x1 = i3[1];
+      i3 += 2;
+
+      const float vk3x0 = w[8];
+      vacc0p0 += vi3x0 * vk3x0;
+      const float vk3x1 = w[9];
+      vacc1p0 += vi3x1 * vk3x1;
+
+      const float vi4x0 = i4[0];
+      const float vi4x1 = i4[1];
+      i4 += 2;
+
+      const float vk4x0 = w[10];
+      vacc0p0 += vi4x0 * vk4x0;
+      const float vk4x1 = w[11];
+      vacc1p0 += vi4x1 * vk4x1;
+
+      const float vi5x0 = i5[0];
+      const float vi5x1 = i5[1];
+      i5 += 2;
+
+      const float vk5x0 = w[12];
+      vacc0p0 += vi5x0 * vk5x0;
+      const float vk5x1 = w[13];
+      vacc1p0 += vi5x1 * vk5x1;
+
+      const float vi6x0 = i6[0];
+      const float vi6x1 = i6[1];
+      i6 += 2;
+
+      const float vk6x0 = w[14];
+      vacc0p0 += vi6x0 * vk6x0;
+      const float vk6x1 = w[15];
+      vacc1p0 += vi6x1 * vk6x1;
+
+      const float vi7x0 = i7[0];
+      const float vi7x1 = i7[1];
+      i7 += 2;
+
+      const float vk7x0 = w[16];
+      vacc0p0 += vi7x0 * vk7x0;
+      const float vk7x1 = w[17];
+      vacc1p0 += vi7x1 * vk7x1;
+
+      const float vi8x0 = i8[0];
+      const float vi8x1 = i8[1];
+      i8 += 2;
+
+      const float vk8x0 = w[18];
+      vacc0p0 += vi8x0 * vk8x0;
+      const float vk8x1 = w[19];
+      vacc1p0 += vi8x1 * vk8x1;
+
+      const float vi9x0 = i9[0];
+      const float vi9x1 = i9[1];
+      i9 += 2;
+
+      const float vk9x0 = w[20];
+      vacc0p0 += vi9x0 * vk9x0;
+      const float vk9x1 = w[21];
+      vacc1p0 += vi9x1 * vk9x1;
+
+      const float vi10x0 = i10[0];
+      const float vi10x1 = i10[1];
+      i10 += 2;
+
+      const float vk10x0 = w[22];
+      vacc0p0 += vi10x0 * vk10x0;
+      const float vk10x1 = w[23];
+      vacc1p0 += vi10x1 * vk10x1;
+
+      const float vi11x0 = i11[0];
+      const float vi11x1 = i11[1];
+      i11 += 2;
+
+      const float vk11x0 = w[24];
+      vacc0p0 += vi11x0 * vk11x0;
+      const float vk11x1 = w[25];
+      vacc1p0 += vi11x1 * vk11x1;
+
+      const float vi12x0 = i12[0];
+      const float vi12x1 = i12[1];
+      i12 += 2;
+
+      const float vk12x0 = w[26];
+      vacc0p0 += vi12x0 * vk12x0;
+      const float vk12x1 = w[27];
+      vacc1p0 += vi12x1 * vk12x1;
+
+      const float vi13x0 = i13[0];
+      const float vi13x1 = i13[1];
+      i13 += 2;
+
+      const float vk13x0 = w[28];
+      vacc0p0 += vi13x0 * vk13x0;
+      const float vk13x1 = w[29];
+      vacc1p0 += vi13x1 * vk13x1;
+
+      const float vi14x0 = i14[0];
+      const float vi14x1 = i14[1];
+      i14 += 2;
+
+      const float vk14x0 = w[30];
+      vacc0p0 += vi14x0 * vk14x0;
+      const float vk14x1 = w[31];
+      vacc1p0 += vi14x1 * vk14x1;
+
+      const float vi15x0 = i15[0];
+      const float vi15x1 = i15[1];
+      i15 += 2;
+
+      const float vk15x0 = w[32];
+      vacc0p0 += vi15x0 * vk15x0;
+      const float vk15x1 = w[33];
+      vacc1p0 += vi15x1 * vk15x1;
+
+      const float vi16x0 = i16[0];
+      const float vi16x1 = i16[1];
+      i16 += 2;
+
+      const float vk16x0 = w[34];
+      vacc0p0 += vi16x0 * vk16x0;
+      const float vk16x1 = w[35];
+      vacc1p0 += vi16x1 * vk16x1;
+
+      const float vi17x0 = i17[0];
+      const float vi17x1 = i17[1];
+      i17 += 2;
+
+      const float vk17x0 = w[36];
+      vacc0p0 += vi17x0 * vk17x0;
+      const float vk17x1 = w[37];
+      vacc1p0 += vi17x1 * vk17x1;
+
+      const float vi18x0 = i18[0];
+      const float vi18x1 = i18[1];
+      i18 += 2;
+
+      const float vk18x0 = w[38];
+      vacc0p0 += vi18x0 * vk18x0;
+      const float vk18x1 = w[39];
+      vacc1p0 += vi18x1 * vk18x1;
+
+      const float vi19x0 = i19[0];
+      const float vi19x1 = i19[1];
+      i19 += 2;
+
+      const float vk19x0 = w[40];
+      vacc0p0 += vi19x0 * vk19x0;
+      const float vk19x1 = w[41];
+      vacc1p0 += vi19x1 * vk19x1;
+
+      const float vi20x0 = i20[0];
+      const float vi20x1 = i20[1];
+      i20 += 2;
+
+      const float vk20x0 = w[42];
+      vacc0p0 += vi20x0 * vk20x0;
+      const float vk20x1 = w[43];
+      vacc1p0 += vi20x1 * vk20x1;
+
+      const float vi21x0 = i21[0];
+      const float vi21x1 = i21[1];
+      i21 += 2;
+
+      const float vk21x0 = w[44];
+      vacc0p0 += vi21x0 * vk21x0;
+      const float vk21x1 = w[45];
+      vacc1p0 += vi21x1 * vk21x1;
+
+      const float vi22x0 = i22[0];
+      const float vi22x1 = i22[1];
+      i22 += 2;
+
+      const float vk22x0 = w[46];
+      vacc0p0 += vi22x0 * vk22x0;
+      const float vk22x1 = w[47];
+      vacc1p0 += vi22x1 * vk22x1;
+
+      const float vi23x0 = i23[0];
+      const float vi23x1 = i23[1];
+      i23 += 2;
+
+      const float vk23x0 = w[48];
+      vacc0p0 += vi23x0 * vk23x0;
+      const float vk23x1 = w[49];
+      vacc1p0 += vi23x1 * vk23x1;
+
+      const float vi24x0 = i24[0];
+      const float vi24x1 = i24[1];
+      i24 += 2;
+
+      const float vk24x0 = w[50];
+      vacc0p0 += vi24x0 * vk24x0;
+      const float vk24x1 = w[51];
+      vacc1p0 += vi24x1 * vk24x1;
+
+      w += 52;
+
+
+      float vacc0 = math_max_f32(vacc0p0, vmin);
+      float vacc1 = math_max_f32(vacc1p0, vmin);
+
+      vacc0 = math_min_f32(vacc0, vmax);
+      vacc1 = math_min_f32(vacc1, vmax);
+
+      output[0] = vacc0;
+      output[1] = vacc1;
+      output += 2;
+    }
+    // Remainder: at most one channel left (c < 2), so this loop runs at most
+    // once. *w++ consumes the bias of the leftover channel; its kernel taps
+    // stay interleaved with stride 2, i.e. at odd offsets w[1], w[3], ..., w[49].
+    for (; c >= 1; c -= 1) {
+      float vacc0p0 = *w++;
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+      const float vi1 = *i1++;
+      const float vk1 = w[3];
+      vacc0p0 += vi1 * vk1;
+      const float vi2 = *i2++;
+      const float vk2 = w[5];
+      vacc0p0 += vi2 * vk2;
+      const float vi3 = *i3++;
+      const float vk3 = w[7];
+      vacc0p0 += vi3 * vk3;
+      const float vi4 = *i4++;
+      const float vk4 = w[9];
+      vacc0p0 += vi4 * vk4;
+      const float vi5 = *i5++;
+      const float vk5 = w[11];
+      vacc0p0 += vi5 * vk5;
+      const float vi6 = *i6++;
+      const float vk6 = w[13];
+      vacc0p0 += vi6 * vk6;
+      const float vi7 = *i7++;
+      const float vk7 = w[15];
+      vacc0p0 += vi7 * vk7;
+      const float vi8 = *i8++;
+      const float vk8 = w[17];
+      vacc0p0 += vi8 * vk8;
+      const float vi9 = *i9++;
+      const float vk9 = w[19];
+      vacc0p0 += vi9 * vk9;
+      const float vi10 = *i10++;
+      const float vk10 = w[21];
+      vacc0p0 += vi10 * vk10;
+      const float vi11 = *i11++;
+      const float vk11 = w[23];
+      vacc0p0 += vi11 * vk11;
+      const float vi12 = *i12++;
+      const float vk12 = w[25];
+      vacc0p0 += vi12 * vk12;
+      const float vi13 = *i13++;
+      const float vk13 = w[27];
+      vacc0p0 += vi13 * vk13;
+      const float vi14 = *i14++;
+      const float vk14 = w[29];
+      vacc0p0 += vi14 * vk14;
+      const float vi15 = *i15++;
+      const float vk15 = w[31];
+      vacc0p0 += vi15 * vk15;
+      const float vi16 = *i16++;
+      const float vk16 = w[33];
+      vacc0p0 += vi16 * vk16;
+      const float vi17 = *i17++;
+      const float vk17 = w[35];
+      vacc0p0 += vi17 * vk17;
+      const float vi18 = *i18++;
+      const float vk18 = w[37];
+      vacc0p0 += vi18 * vk18;
+      const float vi19 = *i19++;
+      const float vk19 = w[39];
+      vacc0p0 += vi19 * vk19;
+      const float vi20 = *i20++;
+      const float vk20 = w[41];
+      vacc0p0 += vi20 * vk20;
+      const float vi21 = *i21++;
+      const float vk21 = w[43];
+      vacc0p0 += vi21 * vk21;
+      const float vi22 = *i22++;
+      const float vk22 = w[45];
+      vacc0p0 += vi22 * vk22;
+      const float vi23 = *i23++;
+      const float vk23 = w[47];
+      vacc0p0 += vi23 * vk23;
+      const float vi24 = *i24++;
+      const float vk24 = w[49];
+      vacc0p0 += vi24 * vk24;
+
+
+      float vacc0 = math_max_f32(vacc0p0, vmin);
+      vacc0 = math_min_f32(vacc0, vmax);
+      *output++ = vacc0;
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up2x4-scalar-acc2.c b/src/f32-dwconv/up2x4-scalar-acc2.c
new file mode 100644
index 0000000..4d721a1
--- /dev/null
+++ b/src/f32-dwconv/up2x4-scalar-acc2.c
@@ -0,0 +1,123 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up2x4__scalar_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  // 4-tap depthwise convolution, scalar code, channel tile 2 ("up2x"), with
+  // two partial accumulators per channel ("acc2") to shorten the floating-
+  // point dependency chain of the multiply-adds.
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    // Weights are interleaved per channel pair:
+    // [bias(c), bias(c+1), k0(c), k0(c+1), ..., k3(c), k3(c+1)] = 10 floats.
+    for (; c >= 2; c -= 2) {
+      float vacc0p0 = w[0];
+      float vacc1p0 = w[1];
+
+
+      const float vi0x0 = i0[0];
+      const float vi0x1 = i0[1];
+      i0 += 2;
+
+      const float vk0x0 = w[2];
+      vacc0p0 += vi0x0 * vk0x0;
+      const float vk0x1 = w[3];
+      vacc1p0 += vi0x1 * vk0x1;
+
+      const float vi1x0 = i1[0];
+      const float vi1x1 = i1[1];
+      i1 += 2;
+
+      const float vk1x0 = w[4];
+      float vacc0p1 = vi1x0 * vk1x0;
+      const float vk1x1 = w[5];
+      float vacc1p1 = vi1x1 * vk1x1;
+
+      const float vi2x0 = i2[0];
+      const float vi2x1 = i2[1];
+      i2 += 2;
+
+      const float vk2x0 = w[6];
+      vacc0p0 += vi2x0 * vk2x0;
+      const float vk2x1 = w[7];
+      vacc1p0 += vi2x1 * vk2x1;
+
+      const float vi3x0 = i3[0];
+      const float vi3x1 = i3[1];
+      i3 += 2;
+
+      const float vk3x0 = w[8];
+      vacc0p1 += vi3x0 * vk3x0;
+      const float vk3x1 = w[9];
+      vacc1p1 += vi3x1 * vk3x1;
+
+      w += 10;
+
+      // Add up all accumulators to vacc01p0
+      vacc0p0 = vacc0p0 + vacc0p1;
+      vacc1p0 = vacc1p0 + vacc1p1;
+
+      float vacc0 = math_max_f32(vacc0p0, vmin);
+      float vacc1 = math_max_f32(vacc1p0, vmin);
+
+      vacc0 = math_min_f32(vacc0, vmax);
+      vacc1 = math_min_f32(vacc1, vmax);
+
+      output[0] = vacc0;
+      output[1] = vacc1;
+      output += 2;
+    }
+    // Remainder: at most one channel left (c < 2), so this loop runs at most
+    // once. *w++ consumes the bias of the leftover channel; its kernel taps
+    // stay interleaved with stride 2, i.e. at odd offsets w[1], w[3], w[5], w[7].
+    for (; c >= 1; c -= 1) {
+      float vacc0p0 = *w++;
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+      const float vi1 = *i1++;
+      const float vk1 = w[3];
+      float vacc0p1 = vi1 * vk1;
+      const float vi2 = *i2++;
+      const float vk2 = w[5];
+      vacc0p0 += vi2 * vk2;
+      const float vi3 = *i3++;
+      const float vk3 = w[7];
+      vacc0p1 += vi3 * vk3;
+
+      // Add up all accumulators to vacc01p0
+      vacc0p0 = vacc0p0 + vacc0p1;
+
+      float vacc0 = math_max_f32(vacc0p0, vmin);
+      vacc0 = math_min_f32(vacc0, vmax);
+      *output++ = vacc0;
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up2x4-scalar.c b/src/f32-dwconv/up2x4-scalar.c
new file mode 100644
index 0000000..508c054
--- /dev/null
+++ b/src/f32-dwconv/up2x4-scalar.c
@@ -0,0 +1,118 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up2x4__scalar(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  // 4-tap depthwise convolution, scalar code, channel tile 2 ("up2x"),
+  // single accumulator per channel.
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    // Weights are interleaved per channel pair:
+    // [bias(c), bias(c+1), k0(c), k0(c+1), ..., k3(c), k3(c+1)] = 10 floats.
+    for (; c >= 2; c -= 2) {
+      float vacc0p0 = w[0];
+      float vacc1p0 = w[1];
+
+
+      const float vi0x0 = i0[0];
+      const float vi0x1 = i0[1];
+      i0 += 2;
+
+      const float vk0x0 = w[2];
+      vacc0p0 += vi0x0 * vk0x0;
+      const float vk0x1 = w[3];
+      vacc1p0 += vi0x1 * vk0x1;
+
+      const float vi1x0 = i1[0];
+      const float vi1x1 = i1[1];
+      i1 += 2;
+
+      const float vk1x0 = w[4];
+      vacc0p0 += vi1x0 * vk1x0;
+      const float vk1x1 = w[5];
+      vacc1p0 += vi1x1 * vk1x1;
+
+      const float vi2x0 = i2[0];
+      const float vi2x1 = i2[1];
+      i2 += 2;
+
+      const float vk2x0 = w[6];
+      vacc0p0 += vi2x0 * vk2x0;
+      const float vk2x1 = w[7];
+      vacc1p0 += vi2x1 * vk2x1;
+
+      const float vi3x0 = i3[0];
+      const float vi3x1 = i3[1];
+      i3 += 2;
+
+      const float vk3x0 = w[8];
+      vacc0p0 += vi3x0 * vk3x0;
+      const float vk3x1 = w[9];
+      vacc1p0 += vi3x1 * vk3x1;
+
+      w += 10;
+
+
+      float vacc0 = math_max_f32(vacc0p0, vmin);
+      float vacc1 = math_max_f32(vacc1p0, vmin);
+
+      vacc0 = math_min_f32(vacc0, vmax);
+      vacc1 = math_min_f32(vacc1, vmax);
+
+      output[0] = vacc0;
+      output[1] = vacc1;
+      output += 2;
+    }
+    // Remainder: at most one channel left (c < 2), so this loop runs at most
+    // once. *w++ consumes the bias of the leftover channel; its kernel taps
+    // stay interleaved with stride 2, i.e. at odd offsets w[1], w[3], w[5], w[7].
+    for (; c >= 1; c -= 1) {
+      float vacc0p0 = *w++;
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+      const float vi1 = *i1++;
+      const float vk1 = w[3];
+      vacc0p0 += vi1 * vk1;
+      const float vi2 = *i2++;
+      const float vk2 = w[5];
+      vacc0p0 += vi2 * vk2;
+      const float vi3 = *i3++;
+      const float vk3 = w[7];
+      vacc0p0 += vi3 * vk3;
+
+
+      float vacc0 = math_max_f32(vacc0p0, vmin);
+      vacc0 = math_min_f32(vacc0, vmax);
+      *output++ = vacc0;
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up2x9-scalar-acc2.c b/src/f32-dwconv/up2x9-scalar-acc2.c
new file mode 100644
index 0000000..ea29fb7
--- /dev/null
+++ b/src/f32-dwconv/up2x9-scalar-acc2.c
@@ -0,0 +1,188 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+// Depthwise convolution micro-kernel: 9 kernel taps ("x9"), channel tile of 2
+// ("up2"), scalar arithmetic, with 2 partial accumulators ("acc2") so that
+// alternating taps feed independent sums, shortening the FP dependency chain.
+void xnn_f32_dwconv_ukernel_up2x9__scalar_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    // One input-row pointer per kernel tap, taken from the indirection buffer.
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    // Main loop: 2 channels per iteration.  Per-tile weights are interleaved
+    // as [bias(c0), bias(c1), k0(c0), k0(c1), ..., k8(c0), k8(c1)] -- 20
+    // floats, matching the `w += 20` below.  The biases seed accumulator p0.
+    for (; c >= 2; c -= 2) {
+      float vacc0p0 = w[0];
+      float vacc1p0 = w[1];
+
+
+      const float vi0x0 = i0[0];
+      const float vi0x1 = i0[1];
+      i0 += 2;
+
+      const float vk0x0 = w[2];
+      vacc0p0 += vi0x0 * vk0x0;
+      const float vk0x1 = w[3];
+      vacc1p0 += vi0x1 * vk0x1;
+
+      const float vi1x0 = i1[0];
+      const float vi1x1 = i1[1];
+      i1 += 2;
+
+      // Odd-numbered taps feed the second accumulator (p1); tap 1 initializes
+      // it with a plain multiply rather than accumulating into p0.
+      const float vk1x0 = w[4];
+      float vacc0p1 = vi1x0 * vk1x0;
+      const float vk1x1 = w[5];
+      float vacc1p1 = vi1x1 * vk1x1;
+
+      const float vi2x0 = i2[0];
+      const float vi2x1 = i2[1];
+      i2 += 2;
+
+      const float vk2x0 = w[6];
+      vacc0p0 += vi2x0 * vk2x0;
+      const float vk2x1 = w[7];
+      vacc1p0 += vi2x1 * vk2x1;
+
+      const float vi3x0 = i3[0];
+      const float vi3x1 = i3[1];
+      i3 += 2;
+
+      const float vk3x0 = w[8];
+      vacc0p1 += vi3x0 * vk3x0;
+      const float vk3x1 = w[9];
+      vacc1p1 += vi3x1 * vk3x1;
+
+      const float vi4x0 = i4[0];
+      const float vi4x1 = i4[1];
+      i4 += 2;
+
+      const float vk4x0 = w[10];
+      vacc0p0 += vi4x0 * vk4x0;
+      const float vk4x1 = w[11];
+      vacc1p0 += vi4x1 * vk4x1;
+
+      const float vi5x0 = i5[0];
+      const float vi5x1 = i5[1];
+      i5 += 2;
+
+      const float vk5x0 = w[12];
+      vacc0p1 += vi5x0 * vk5x0;
+      const float vk5x1 = w[13];
+      vacc1p1 += vi5x1 * vk5x1;
+
+      const float vi6x0 = i6[0];
+      const float vi6x1 = i6[1];
+      i6 += 2;
+
+      const float vk6x0 = w[14];
+      vacc0p0 += vi6x0 * vk6x0;
+      const float vk6x1 = w[15];
+      vacc1p0 += vi6x1 * vk6x1;
+
+      const float vi7x0 = i7[0];
+      const float vi7x1 = i7[1];
+      i7 += 2;
+
+      const float vk7x0 = w[16];
+      vacc0p1 += vi7x0 * vk7x0;
+      const float vk7x1 = w[17];
+      vacc1p1 += vi7x1 * vk7x1;
+
+      const float vi8x0 = i8[0];
+      const float vi8x1 = i8[1];
+      i8 += 2;
+
+      const float vk8x0 = w[18];
+      vacc0p0 += vi8x0 * vk8x0;
+      const float vk8x1 = w[19];
+      vacc1p0 += vi8x1 * vk8x1;
+
+      w += 20;
+
+      // Add up all accumulators to vacc01p0
+      vacc0p0 = vacc0p0 + vacc0p1;
+      vacc1p0 = vacc1p0 + vacc1p1;
+
+      // Clamp to [vmin, vmax] and write both channels.
+      float vacc0 = math_max_f32(vacc0p0, vmin);
+      float vacc1 = math_max_f32(vacc1p0, vmin);
+
+      vacc0 = math_min_f32(vacc0, vmax);
+      vacc1 = math_min_f32(vacc1, vmax);
+
+      output[0] = vacc0;
+      output[1] = vacc1;
+      output += 2;
+    }
+    // Remainder: at most one leftover channel (the tile loop handled c >= 2).
+    // Weights remain packed with the channel-tile-2 stride, so after consuming
+    // the bias via *w++, tap i for the remaining channel sits at odd offset
+    // 1 + 2*i (w[1], w[3], ..., w[17]).
+    for (; c >= 1; c -= 1) {
+      float vacc0p0 = *w++;
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+      const float vi1 = *i1++;
+      const float vk1 = w[3];
+      float vacc0p1 = vi1 * vk1;
+      const float vi2 = *i2++;
+      const float vk2 = w[5];
+      vacc0p0 += vi2 * vk2;
+      const float vi3 = *i3++;
+      const float vk3 = w[7];
+      vacc0p1 += vi3 * vk3;
+      const float vi4 = *i4++;
+      const float vk4 = w[9];
+      vacc0p0 += vi4 * vk4;
+      const float vi5 = *i5++;
+      const float vk5 = w[11];
+      vacc0p1 += vi5 * vk5;
+      const float vi6 = *i6++;
+      const float vk6 = w[13];
+      vacc0p0 += vi6 * vk6;
+      const float vi7 = *i7++;
+      const float vk7 = w[15];
+      vacc0p1 += vi7 * vk7;
+      const float vi8 = *i8++;
+      const float vk8 = w[17];
+      vacc0p0 += vi8 * vk8;
+
+      // Add up all accumulators to vacc01p0
+      vacc0p0 = vacc0p0 + vacc0p1;
+
+      float vacc0 = math_max_f32(vacc0p0, vmin);
+      vacc0 = math_min_f32(vacc0, vmax);
+      *output++ = vacc0;
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up2x9-scalar.c b/src/f32-dwconv/up2x9-scalar.c
new file mode 100644
index 0000000..812c11b
--- /dev/null
+++ b/src/f32-dwconv/up2x9-scalar.c
@@ -0,0 +1,183 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+// Depthwise convolution micro-kernel: 9 kernel taps ("x9"), channel tile of 2
+// ("up2"), scalar arithmetic, single accumulator per channel.
+void xnn_f32_dwconv_ukernel_up2x9__scalar(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    // One input-row pointer per kernel tap, taken from the indirection buffer.
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    // Main loop: 2 channels per iteration.  Per-tile weights are interleaved
+    // as [bias(c0), bias(c1), k0(c0), k0(c1), ..., k8(c0), k8(c1)] -- 20
+    // floats, matching the `w += 20` below.  The biases seed the accumulators.
+    for (; c >= 2; c -= 2) {
+      float vacc0p0 = w[0];
+      float vacc1p0 = w[1];
+
+
+      const float vi0x0 = i0[0];
+      const float vi0x1 = i0[1];
+      i0 += 2;
+
+      const float vk0x0 = w[2];
+      vacc0p0 += vi0x0 * vk0x0;
+      const float vk0x1 = w[3];
+      vacc1p0 += vi0x1 * vk0x1;
+
+      const float vi1x0 = i1[0];
+      const float vi1x1 = i1[1];
+      i1 += 2;
+
+      const float vk1x0 = w[4];
+      vacc0p0 += vi1x0 * vk1x0;
+      const float vk1x1 = w[5];
+      vacc1p0 += vi1x1 * vk1x1;
+
+      const float vi2x0 = i2[0];
+      const float vi2x1 = i2[1];
+      i2 += 2;
+
+      const float vk2x0 = w[6];
+      vacc0p0 += vi2x0 * vk2x0;
+      const float vk2x1 = w[7];
+      vacc1p0 += vi2x1 * vk2x1;
+
+      const float vi3x0 = i3[0];
+      const float vi3x1 = i3[1];
+      i3 += 2;
+
+      const float vk3x0 = w[8];
+      vacc0p0 += vi3x0 * vk3x0;
+      const float vk3x1 = w[9];
+      vacc1p0 += vi3x1 * vk3x1;
+
+      const float vi4x0 = i4[0];
+      const float vi4x1 = i4[1];
+      i4 += 2;
+
+      const float vk4x0 = w[10];
+      vacc0p0 += vi4x0 * vk4x0;
+      const float vk4x1 = w[11];
+      vacc1p0 += vi4x1 * vk4x1;
+
+      const float vi5x0 = i5[0];
+      const float vi5x1 = i5[1];
+      i5 += 2;
+
+      const float vk5x0 = w[12];
+      vacc0p0 += vi5x0 * vk5x0;
+      const float vk5x1 = w[13];
+      vacc1p0 += vi5x1 * vk5x1;
+
+      const float vi6x0 = i6[0];
+      const float vi6x1 = i6[1];
+      i6 += 2;
+
+      const float vk6x0 = w[14];
+      vacc0p0 += vi6x0 * vk6x0;
+      const float vk6x1 = w[15];
+      vacc1p0 += vi6x1 * vk6x1;
+
+      const float vi7x0 = i7[0];
+      const float vi7x1 = i7[1];
+      i7 += 2;
+
+      const float vk7x0 = w[16];
+      vacc0p0 += vi7x0 * vk7x0;
+      const float vk7x1 = w[17];
+      vacc1p0 += vi7x1 * vk7x1;
+
+      const float vi8x0 = i8[0];
+      const float vi8x1 = i8[1];
+      i8 += 2;
+
+      const float vk8x0 = w[18];
+      vacc0p0 += vi8x0 * vk8x0;
+      const float vk8x1 = w[19];
+      vacc1p0 += vi8x1 * vk8x1;
+
+      w += 20;
+
+
+      // Clamp to [vmin, vmax] and write both channels.
+      float vacc0 = math_max_f32(vacc0p0, vmin);
+      float vacc1 = math_max_f32(vacc1p0, vmin);
+
+      vacc0 = math_min_f32(vacc0, vmax);
+      vacc1 = math_min_f32(vacc1, vmax);
+
+      output[0] = vacc0;
+      output[1] = vacc1;
+      output += 2;
+    }
+    // Remainder: at most one leftover channel (the tile loop handled c >= 2).
+    // Weights remain packed with the channel-tile-2 stride, so after consuming
+    // the bias via *w++, tap i for the remaining channel sits at odd offset
+    // 1 + 2*i (w[1], w[3], ..., w[17]).
+    for (; c >= 1; c -= 1) {
+      float vacc0p0 = *w++;
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+      const float vi1 = *i1++;
+      const float vk1 = w[3];
+      vacc0p0 += vi1 * vk1;
+      const float vi2 = *i2++;
+      const float vk2 = w[5];
+      vacc0p0 += vi2 * vk2;
+      const float vi3 = *i3++;
+      const float vk3 = w[7];
+      vacc0p0 += vi3 * vk3;
+      const float vi4 = *i4++;
+      const float vk4 = w[9];
+      vacc0p0 += vi4 * vk4;
+      const float vi5 = *i5++;
+      const float vk5 = w[11];
+      vacc0p0 += vi5 * vk5;
+      const float vi6 = *i6++;
+      const float vk6 = w[13];
+      vacc0p0 += vi6 * vk6;
+      const float vi7 = *i7++;
+      const float vk7 = w[15];
+      vacc0p0 += vi7 * vk7;
+      const float vi8 = *i8++;
+      const float vk8 = w[17];
+      vacc0p0 += vi8 * vk8;
+
+
+      float vacc0 = math_max_f32(vacc0p0, vmin);
+      vacc0 = math_min_f32(vacc0, vmax);
+      *output++ = vacc0;
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up4x25-psimd-acc2.c b/src/f32-dwconv/up4x25-psimd-acc2.c
new file mode 100644
index 0000000..639874d
--- /dev/null
+++ b/src/f32-dwconv/up4x25-psimd-acc2.c
@@ -0,0 +1,349 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-psimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/dwconv.h>
+
+
+// Depthwise convolution micro-kernel: 25 kernel taps ("x25"), channel tile of
+// 4 ("up4"), portable SIMD (psimd), with 2 partial accumulators ("acc2") --
+// even taps accumulate into p0, odd taps into p1, shortening the FP
+// dependency chain.
+void xnn_f32_dwconv_ukernel_up4x25__psimd_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const psimd_f32 vmax = psimd_load_splat_f32(&params->scalar.max);
+  const psimd_f32 vmin = psimd_load_splat_f32(&params->scalar.min);
+  do {
+    // One input-row pointer per kernel tap, taken from the indirection buffer.
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    const float* i9 = input[9];
+    const float* i10 = input[10];
+    const float* i11 = input[11];
+    const float* i12 = input[12];
+    const float* i13 = input[13];
+    const float* i14 = input[14];
+    const float* i15 = input[15];
+    const float* i16 = input[16];
+    const float* i17 = input[17];
+    const float* i18 = input[18];
+    const float* i19 = input[19];
+    const float* i20 = input[20];
+    const float* i21 = input[21];
+    const float* i22 = input[22];
+    const float* i23 = input[23];
+    const float* i24 = input[24];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    // Main loop: 4 channels per iteration.  Per-tile weight layout is
+    // [4 biases, tap0 x 4, tap1 x 4, ..., tap24 x 4] = 104 floats, matching
+    // the weight offsets w+4 ... w+100 and the `w += 104` below.
+    for (; c >= 4; c -= 4) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      i0 += 4;
+
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 4);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      i1 += 4;
+
+      // Tap 1 initializes the second accumulator with a plain multiply.
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 8);
+      psimd_f32 vacc0123p1 = psimd_mul_f32(vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      i2 += 4;
+
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 12);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      i3 += 4;
+
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 16);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
+      i4 += 4;
+
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 20);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
+      i5 += 4;
+
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 24);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi5x0123, vk5x0123);
+
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
+      i6 += 4;
+
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 28);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
+      i7 += 4;
+
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 32);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi7x0123, vk7x0123);
+
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
+      i8 += 4;
+
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 36);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      const psimd_f32 vi9x0123 = psimd_load_f32(i9);
+      i9 += 4;
+
+      const psimd_f32 vk9x0123 = psimd_load_f32(w + 40);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi9x0123, vk9x0123);
+
+      const psimd_f32 vi10x0123 = psimd_load_f32(i10);
+      i10 += 4;
+
+      const psimd_f32 vk10x0123 = psimd_load_f32(w + 44);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123);
+
+      const psimd_f32 vi11x0123 = psimd_load_f32(i11);
+      i11 += 4;
+
+      const psimd_f32 vk11x0123 = psimd_load_f32(w + 48);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi11x0123, vk11x0123);
+
+      const psimd_f32 vi12x0123 = psimd_load_f32(i12);
+      i12 += 4;
+
+      const psimd_f32 vk12x0123 = psimd_load_f32(w + 52);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123);
+
+      const psimd_f32 vi13x0123 = psimd_load_f32(i13);
+      i13 += 4;
+
+      const psimd_f32 vk13x0123 = psimd_load_f32(w + 56);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi13x0123, vk13x0123);
+
+      const psimd_f32 vi14x0123 = psimd_load_f32(i14);
+      i14 += 4;
+
+      const psimd_f32 vk14x0123 = psimd_load_f32(w + 60);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123);
+
+      const psimd_f32 vi15x0123 = psimd_load_f32(i15);
+      i15 += 4;
+
+      const psimd_f32 vk15x0123 = psimd_load_f32(w + 64);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi15x0123, vk15x0123);
+
+      const psimd_f32 vi16x0123 = psimd_load_f32(i16);
+      i16 += 4;
+
+      const psimd_f32 vk16x0123 = psimd_load_f32(w + 68);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123);
+
+      const psimd_f32 vi17x0123 = psimd_load_f32(i17);
+      i17 += 4;
+
+      const psimd_f32 vk17x0123 = psimd_load_f32(w + 72);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi17x0123, vk17x0123);
+
+      const psimd_f32 vi18x0123 = psimd_load_f32(i18);
+      i18 += 4;
+
+      const psimd_f32 vk18x0123 = psimd_load_f32(w + 76);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi18x0123, vk18x0123);
+
+      const psimd_f32 vi19x0123 = psimd_load_f32(i19);
+      i19 += 4;
+
+      const psimd_f32 vk19x0123 = psimd_load_f32(w + 80);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi19x0123, vk19x0123);
+
+      const psimd_f32 vi20x0123 = psimd_load_f32(i20);
+      i20 += 4;
+
+      const psimd_f32 vk20x0123 = psimd_load_f32(w + 84);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi20x0123, vk20x0123);
+
+      const psimd_f32 vi21x0123 = psimd_load_f32(i21);
+      i21 += 4;
+
+      const psimd_f32 vk21x0123 = psimd_load_f32(w + 88);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi21x0123, vk21x0123);
+
+      const psimd_f32 vi22x0123 = psimd_load_f32(i22);
+      i22 += 4;
+
+      const psimd_f32 vk22x0123 = psimd_load_f32(w + 92);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi22x0123, vk22x0123);
+
+      const psimd_f32 vi23x0123 = psimd_load_f32(i23);
+      i23 += 4;
+
+      const psimd_f32 vk23x0123 = psimd_load_f32(w + 96);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi23x0123, vk23x0123);
+
+      const psimd_f32 vi24x0123 = psimd_load_f32(i24);
+      i24 += 4;
+
+      const psimd_f32 vk24x0123 = psimd_load_f32(w + 100);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi24x0123, vk24x0123);
+
+      w += 104;
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1);
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+      psimd_store_f32(output, vacc0123);
+      output += 4;
+    }
+    // Remainder: 1-3 leftover channels.  Same tap/weight schedule as the main
+    // loop, but input pointers are not advanced (this is the final block).
+    // NOTE(review): full 4-lane loads are issued even though fewer than 4
+    // channels remain; presumably the input rows and packed weights are
+    // padded to a full vector -- confirm against the packing code.
+    if XNN_UNLIKELY(c != 0) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 4);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 8);
+      psimd_f32 vacc0123p1 = psimd_mul_f32(vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 12);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 16);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 20);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 24);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi5x0123, vk5x0123);
+
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 28);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 32);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi7x0123, vk7x0123);
+
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 36);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      const psimd_f32 vi9x0123 = psimd_load_f32(i9);
+      const psimd_f32 vk9x0123 = psimd_load_f32(w + 40);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi9x0123, vk9x0123);
+
+      const psimd_f32 vi10x0123 = psimd_load_f32(i10);
+      const psimd_f32 vk10x0123 = psimd_load_f32(w + 44);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123);
+
+      const psimd_f32 vi11x0123 = psimd_load_f32(i11);
+      const psimd_f32 vk11x0123 = psimd_load_f32(w + 48);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi11x0123, vk11x0123);
+
+      const psimd_f32 vi12x0123 = psimd_load_f32(i12);
+      const psimd_f32 vk12x0123 = psimd_load_f32(w + 52);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123);
+
+      const psimd_f32 vi13x0123 = psimd_load_f32(i13);
+      const psimd_f32 vk13x0123 = psimd_load_f32(w + 56);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi13x0123, vk13x0123);
+
+      const psimd_f32 vi14x0123 = psimd_load_f32(i14);
+      const psimd_f32 vk14x0123 = psimd_load_f32(w + 60);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123);
+
+      const psimd_f32 vi15x0123 = psimd_load_f32(i15);
+      const psimd_f32 vk15x0123 = psimd_load_f32(w + 64);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi15x0123, vk15x0123);
+
+      const psimd_f32 vi16x0123 = psimd_load_f32(i16);
+      const psimd_f32 vk16x0123 = psimd_load_f32(w + 68);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123);
+
+      const psimd_f32 vi17x0123 = psimd_load_f32(i17);
+      const psimd_f32 vk17x0123 = psimd_load_f32(w + 72);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi17x0123, vk17x0123);
+
+      const psimd_f32 vi18x0123 = psimd_load_f32(i18);
+      const psimd_f32 vk18x0123 = psimd_load_f32(w + 76);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi18x0123, vk18x0123);
+
+      const psimd_f32 vi19x0123 = psimd_load_f32(i19);
+      const psimd_f32 vk19x0123 = psimd_load_f32(w + 80);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi19x0123, vk19x0123);
+
+      const psimd_f32 vi20x0123 = psimd_load_f32(i20);
+      const psimd_f32 vk20x0123 = psimd_load_f32(w + 84);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi20x0123, vk20x0123);
+
+      const psimd_f32 vi21x0123 = psimd_load_f32(i21);
+      const psimd_f32 vk21x0123 = psimd_load_f32(w + 88);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi21x0123, vk21x0123);
+
+      const psimd_f32 vi22x0123 = psimd_load_f32(i22);
+      const psimd_f32 vk22x0123 = psimd_load_f32(w + 92);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi22x0123, vk22x0123);
+
+      const psimd_f32 vi23x0123 = psimd_load_f32(i23);
+      const psimd_f32 vk23x0123 = psimd_load_f32(w + 96);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi23x0123, vk23x0123);
+
+      const psimd_f32 vi24x0123 = psimd_load_f32(i24);
+      const psimd_f32 vk24x0123 = psimd_load_f32(w + 100);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi24x0123, vk24x0123);
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1);
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+      // Partial store covering c in {1,2,3}: write 2 lanes, shift the high
+      // half into the low lanes, then write 1 lane if needed.
+      if (c & 2) {
+        psimd_store2_f32(output, vacc0123);
+        vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        psimd_store1_f32(output, vacc0123);
+        output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up4x25-psimd.c b/src/f32-dwconv/up4x25-psimd.c
index 27ea94f..59be17f 100644
--- a/src/f32-dwconv/up4x25-psimd.c
+++ b/src/f32-dwconv/up4x25-psimd.c
@@ -60,258 +60,282 @@
     size_t c = channels;
     const float* w = weights;
     for (; c >= 4; c -= 4) {
-      psimd_f32 vacc0 = psimd_load_f32(w);
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
 
-      const psimd_f32 vi0 = psimd_load_f32(i0);
-      const psimd_f32 vk0 = psimd_load_f32(w + 4);
-      vacc0 = psimd_qfma_f32(vacc0, vi0, vk0);
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
       i0 += 4;
 
-      const psimd_f32 vi1 = psimd_load_f32(i1);
-      const psimd_f32 vk1 = psimd_load_f32(w + 8);
-      psimd_f32 vacc1 = psimd_mul_f32(vi1, vk1);
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 4);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
       i1 += 4;
 
-      const psimd_f32 vi2 = psimd_load_f32(i2);
-      const psimd_f32 vk2 = psimd_load_f32(w + 12);
-      vacc0 = psimd_qfma_f32(vacc0, vi2, vk2);
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 8);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
       i2 += 4;
 
-      const psimd_f32 vi3 = psimd_load_f32(i3);
-      const psimd_f32 vk3 = psimd_load_f32(w + 16);
-      vacc1 = psimd_qfma_f32(vacc1, vi3, vk3);
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 12);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
       i3 += 4;
 
-      const psimd_f32 vi4 = psimd_load_f32(i4);
-      const psimd_f32 vk4 = psimd_load_f32(w + 20);
-      vacc0 = psimd_qfma_f32(vacc0, vi4, vk4);
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 16);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
+
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
       i4 += 4;
 
-      const psimd_f32 vi5 = psimd_load_f32(i5);
-      const psimd_f32 vk5 = psimd_load_f32(w + 24);
-      vacc1 = psimd_qfma_f32(vacc1, vi5, vk5);
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 20);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
       i5 += 4;
 
-      const psimd_f32 vi6 = psimd_load_f32(i6);
-      const psimd_f32 vk6 = psimd_load_f32(w + 28);
-      vacc0 = psimd_qfma_f32(vacc0, vi6, vk6);
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 24);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123);
+
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
       i6 += 4;
 
-      const psimd_f32 vi7 = psimd_load_f32(i7);
-      const psimd_f32 vk7 = psimd_load_f32(w + 32);
-      vacc1 = psimd_qfma_f32(vacc1, vi7, vk7);
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 28);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
       i7 += 4;
 
-      const psimd_f32 vi8 = psimd_load_f32(i8);
-      const psimd_f32 vk8 = psimd_load_f32(w + 36);
-      vacc0 = psimd_qfma_f32(vacc0, vi8, vk8);
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 32);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123);
+
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
       i8 += 4;
 
-      const psimd_f32 vi9 = psimd_load_f32(i9);
-      const psimd_f32 vk9 = psimd_load_f32(w + 40);
-      vacc1 = psimd_qfma_f32(vacc1, vi9, vk9);
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 36);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      const psimd_f32 vi9x0123 = psimd_load_f32(i9);
       i9 += 4;
 
-      const psimd_f32 vi10 = psimd_load_f32(i10);
-      const psimd_f32 vk10 = psimd_load_f32(w + 44);
-      vacc0 = psimd_qfma_f32(vacc0, vi10, vk10);
+      const psimd_f32 vk9x0123 = psimd_load_f32(w + 40);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi9x0123, vk9x0123);
+
+      const psimd_f32 vi10x0123 = psimd_load_f32(i10);
       i10 += 4;
 
-      const psimd_f32 vi11 = psimd_load_f32(i11);
-      const psimd_f32 vk11 = psimd_load_f32(w + 48);
-      vacc1 = psimd_qfma_f32(vacc1, vi11, vk11);
+      const psimd_f32 vk10x0123 = psimd_load_f32(w + 44);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123);
+
+      const psimd_f32 vi11x0123 = psimd_load_f32(i11);
       i11 += 4;
 
-      const psimd_f32 vi12 = psimd_load_f32(i12);
-      const psimd_f32 vk12 = psimd_load_f32(w + 52);
-      vacc0 = psimd_qfma_f32(vacc0, vi12, vk12);
+      const psimd_f32 vk11x0123 = psimd_load_f32(w + 48);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi11x0123, vk11x0123);
+
+      const psimd_f32 vi12x0123 = psimd_load_f32(i12);
       i12 += 4;
 
-      const psimd_f32 vi13 = psimd_load_f32(i13);
-      const psimd_f32 vk13 = psimd_load_f32(w + 56);
-      vacc1 = psimd_qfma_f32(vacc1, vi13, vk13);
+      const psimd_f32 vk12x0123 = psimd_load_f32(w + 52);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123);
+
+      const psimd_f32 vi13x0123 = psimd_load_f32(i13);
       i13 += 4;
 
-      const psimd_f32 vi14 = psimd_load_f32(i14);
-      const psimd_f32 vk14 = psimd_load_f32(w + 60);
-      vacc0 = psimd_qfma_f32(vacc0, vi14, vk14);
+      const psimd_f32 vk13x0123 = psimd_load_f32(w + 56);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi13x0123, vk13x0123);
+
+      const psimd_f32 vi14x0123 = psimd_load_f32(i14);
       i14 += 4;
 
-      const psimd_f32 vi15 = psimd_load_f32(i15);
-      const psimd_f32 vk15 = psimd_load_f32(w + 64);
-      vacc1 = psimd_qfma_f32(vacc1, vi15, vk15);
+      const psimd_f32 vk14x0123 = psimd_load_f32(w + 60);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123);
+
+      const psimd_f32 vi15x0123 = psimd_load_f32(i15);
       i15 += 4;
 
-      const psimd_f32 vi16 = psimd_load_f32(i16);
-      const psimd_f32 vk16 = psimd_load_f32(w + 68);
-      vacc0 = psimd_qfma_f32(vacc0, vi16, vk16);
+      const psimd_f32 vk15x0123 = psimd_load_f32(w + 64);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi15x0123, vk15x0123);
+
+      const psimd_f32 vi16x0123 = psimd_load_f32(i16);
       i16 += 4;
 
-      const psimd_f32 vi17 = psimd_load_f32(i17);
-      const psimd_f32 vk17 = psimd_load_f32(w + 72);
-      vacc1 = psimd_qfma_f32(vacc1, vi17, vk17);
+      const psimd_f32 vk16x0123 = psimd_load_f32(w + 68);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123);
+
+      const psimd_f32 vi17x0123 = psimd_load_f32(i17);
       i17 += 4;
 
-      const psimd_f32 vi18 = psimd_load_f32(i18);
-      const psimd_f32 vk18 = psimd_load_f32(w + 76);
-      vacc0 = psimd_qfma_f32(vacc0, vi18, vk18);
+      const psimd_f32 vk17x0123 = psimd_load_f32(w + 72);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi17x0123, vk17x0123);
+
+      const psimd_f32 vi18x0123 = psimd_load_f32(i18);
       i18 += 4;
 
-      const psimd_f32 vi19 = psimd_load_f32(i19);
-      const psimd_f32 vk19 = psimd_load_f32(w + 80);
-      vacc1 = psimd_qfma_f32(vacc1, vi19, vk19);
+      const psimd_f32 vk18x0123 = psimd_load_f32(w + 76);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi18x0123, vk18x0123);
+
+      const psimd_f32 vi19x0123 = psimd_load_f32(i19);
       i19 += 4;
 
-      const psimd_f32 vi20 = psimd_load_f32(i20);
-      const psimd_f32 vk20 = psimd_load_f32(w + 84);
-      vacc0 = psimd_qfma_f32(vacc0, vi20, vk20);
+      const psimd_f32 vk19x0123 = psimd_load_f32(w + 80);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi19x0123, vk19x0123);
+
+      const psimd_f32 vi20x0123 = psimd_load_f32(i20);
       i20 += 4;
 
-      const psimd_f32 vi21 = psimd_load_f32(i21);
-      const psimd_f32 vk21 = psimd_load_f32(w + 88);
-      vacc1 = psimd_qfma_f32(vacc1, vi21, vk21);
+      const psimd_f32 vk20x0123 = psimd_load_f32(w + 84);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi20x0123, vk20x0123);
+
+      const psimd_f32 vi21x0123 = psimd_load_f32(i21);
       i21 += 4;
 
-      const psimd_f32 vi22 = psimd_load_f32(i22);
-      const psimd_f32 vk22 = psimd_load_f32(w + 92);
-      vacc0 = psimd_qfma_f32(vacc0, vi22, vk22);
+      const psimd_f32 vk21x0123 = psimd_load_f32(w + 88);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi21x0123, vk21x0123);
+
+      const psimd_f32 vi22x0123 = psimd_load_f32(i22);
       i22 += 4;
 
-      const psimd_f32 vi23 = psimd_load_f32(i23);
-      const psimd_f32 vk23 = psimd_load_f32(w + 96);
-      vacc1 = psimd_qfma_f32(vacc1, vi23, vk23);
+      const psimd_f32 vk22x0123 = psimd_load_f32(w + 92);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi22x0123, vk22x0123);
+
+      const psimd_f32 vi23x0123 = psimd_load_f32(i23);
       i23 += 4;
 
-      const psimd_f32 vi24 = psimd_load_f32(i24);
-      const psimd_f32 vk24 = psimd_load_f32(w + 100);
-      vacc0 = psimd_qfma_f32(vacc0, vi24, vk24);
+      const psimd_f32 vk23x0123 = psimd_load_f32(w + 96);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi23x0123, vk23x0123);
+
+      const psimd_f32 vi24x0123 = psimd_load_f32(i24);
       i24 += 4;
 
+      const psimd_f32 vk24x0123 = psimd_load_f32(w + 100);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi24x0123, vk24x0123);
+
       w += 104;
 
-      vacc0 = psimd_add_f32(vacc0, vacc1);
 
-      vacc0 = psimd_max_f32(vacc0, vmin);
-      vacc0 = psimd_min_f32(vacc0, vmax);
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
 
-      psimd_store_f32(output, vacc0);
+      psimd_store_f32(output, vacc0123);
       output += 4;
     }
     if XNN_UNLIKELY(c != 0) {
-      psimd_f32 vacc = psimd_load_f32(w);
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
 
-      const psimd_f32 vi0 = psimd_load_f32(i0);
-      const psimd_f32 vk0 = psimd_load_f32(w + 4);
-      vacc = psimd_qfma_f32(vacc, vi0, vk0);
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 4);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
 
-      const psimd_f32 vi1 = psimd_load_f32(i1);
-      const psimd_f32 vk1 = psimd_load_f32(w + 8);
-      vacc = psimd_qfma_f32(vacc, vi1, vk1);
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 8);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
 
-      const psimd_f32 vi2 = psimd_load_f32(i2);
-      const psimd_f32 vk2 = psimd_load_f32(w + 12);
-      vacc = psimd_qfma_f32(vacc, vi2, vk2);
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 12);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
 
-      const psimd_f32 vi3 = psimd_load_f32(i3);
-      const psimd_f32 vk3 = psimd_load_f32(w + 16);
-      vacc = psimd_qfma_f32(vacc, vi3, vk3);
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 16);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
 
-      const psimd_f32 vi4 = psimd_load_f32(i4);
-      const psimd_f32 vk4 = psimd_load_f32(w + 20);
-      vacc = psimd_qfma_f32(vacc, vi4, vk4);
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 20);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
 
-      const psimd_f32 vi5 = psimd_load_f32(i5);
-      const psimd_f32 vk5 = psimd_load_f32(w + 24);
-      vacc = psimd_qfma_f32(vacc, vi5, vk5);
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 24);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123);
 
-      const psimd_f32 vi6 = psimd_load_f32(i6);
-      const psimd_f32 vk6 = psimd_load_f32(w + 28);
-      vacc = psimd_qfma_f32(vacc, vi6, vk6);
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 28);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
 
-      const psimd_f32 vi7 = psimd_load_f32(i7);
-      const psimd_f32 vk7 = psimd_load_f32(w + 32);
-      vacc = psimd_qfma_f32(vacc, vi7, vk7);
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 32);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123);
 
-      const psimd_f32 vi8 = psimd_load_f32(i8);
-      const psimd_f32 vk8 = psimd_load_f32(w + 36);
-      vacc = psimd_qfma_f32(vacc, vi8, vk8);
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 36);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
 
-      const psimd_f32 vi9 = psimd_load_f32(i9);
-      const psimd_f32 vk9 = psimd_load_f32(w + 40);
-      vacc = psimd_qfma_f32(vacc, vi9, vk9);
+      const psimd_f32 vi9x0123 = psimd_load_f32(i9);
+      const psimd_f32 vk9x0123 = psimd_load_f32(w + 40);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi9x0123, vk9x0123);
 
-      const psimd_f32 vi10 = psimd_load_f32(i10);
-      const psimd_f32 vk10 = psimd_load_f32(w + 44);
-      vacc = psimd_qfma_f32(vacc, vi10, vk10);
+      const psimd_f32 vi10x0123 = psimd_load_f32(i10);
+      const psimd_f32 vk10x0123 = psimd_load_f32(w + 44);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123);
 
-      const psimd_f32 vi11 = psimd_load_f32(i11);
-      const psimd_f32 vk11 = psimd_load_f32(w + 48);
-      vacc = psimd_qfma_f32(vacc, vi11, vk11);
+      const psimd_f32 vi11x0123 = psimd_load_f32(i11);
+      const psimd_f32 vk11x0123 = psimd_load_f32(w + 48);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi11x0123, vk11x0123);
 
-      const psimd_f32 vi12 = psimd_load_f32(i12);
-      const psimd_f32 vk12 = psimd_load_f32(w + 52);
-      vacc = psimd_qfma_f32(vacc, vi12, vk12);
+      const psimd_f32 vi12x0123 = psimd_load_f32(i12);
+      const psimd_f32 vk12x0123 = psimd_load_f32(w + 52);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123);
 
-      const psimd_f32 vi13 = psimd_load_f32(i13);
-      const psimd_f32 vk13 = psimd_load_f32(w + 56);
-      vacc = psimd_qfma_f32(vacc, vi13, vk13);
+      const psimd_f32 vi13x0123 = psimd_load_f32(i13);
+      const psimd_f32 vk13x0123 = psimd_load_f32(w + 56);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi13x0123, vk13x0123);
 
-      const psimd_f32 vi14 = psimd_load_f32(i14);
-      const psimd_f32 vk14 = psimd_load_f32(w + 60);
-      vacc = psimd_qfma_f32(vacc, vi14, vk14);
+      const psimd_f32 vi14x0123 = psimd_load_f32(i14);
+      const psimd_f32 vk14x0123 = psimd_load_f32(w + 60);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123);
 
-      const psimd_f32 vi15 = psimd_load_f32(i15);
-      const psimd_f32 vk15 = psimd_load_f32(w + 64);
-      vacc = psimd_qfma_f32(vacc, vi15, vk15);
+      const psimd_f32 vi15x0123 = psimd_load_f32(i15);
+      const psimd_f32 vk15x0123 = psimd_load_f32(w + 64);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi15x0123, vk15x0123);
 
-      const psimd_f32 vi16 = psimd_load_f32(i16);
-      const psimd_f32 vk16 = psimd_load_f32(w + 68);
-      vacc = psimd_qfma_f32(vacc, vi16, vk16);
+      const psimd_f32 vi16x0123 = psimd_load_f32(i16);
+      const psimd_f32 vk16x0123 = psimd_load_f32(w + 68);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123);
 
-      const psimd_f32 vi17 = psimd_load_f32(i17);
-      const psimd_f32 vk17 = psimd_load_f32(w + 72);
-      vacc = psimd_qfma_f32(vacc, vi17, vk17);
+      const psimd_f32 vi17x0123 = psimd_load_f32(i17);
+      const psimd_f32 vk17x0123 = psimd_load_f32(w + 72);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi17x0123, vk17x0123);
 
-      const psimd_f32 vi18 = psimd_load_f32(i18);
-      const psimd_f32 vk18 = psimd_load_f32(w + 76);
-      vacc = psimd_qfma_f32(vacc, vi18, vk18);
+      const psimd_f32 vi18x0123 = psimd_load_f32(i18);
+      const psimd_f32 vk18x0123 = psimd_load_f32(w + 76);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi18x0123, vk18x0123);
 
-      const psimd_f32 vi19 = psimd_load_f32(i19);
-      const psimd_f32 vk19 = psimd_load_f32(w + 80);
-      vacc = psimd_qfma_f32(vacc, vi19, vk19);
+      const psimd_f32 vi19x0123 = psimd_load_f32(i19);
+      const psimd_f32 vk19x0123 = psimd_load_f32(w + 80);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi19x0123, vk19x0123);
 
-      const psimd_f32 vi20 = psimd_load_f32(i20);
-      const psimd_f32 vk20 = psimd_load_f32(w + 84);
-      vacc = psimd_qfma_f32(vacc, vi20, vk20);
+      const psimd_f32 vi20x0123 = psimd_load_f32(i20);
+      const psimd_f32 vk20x0123 = psimd_load_f32(w + 84);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi20x0123, vk20x0123);
 
-      const psimd_f32 vi21 = psimd_load_f32(i21);
-      const psimd_f32 vk21 = psimd_load_f32(w + 88);
-      vacc = psimd_qfma_f32(vacc, vi21, vk21);
+      const psimd_f32 vi21x0123 = psimd_load_f32(i21);
+      const psimd_f32 vk21x0123 = psimd_load_f32(w + 88);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi21x0123, vk21x0123);
 
-      const psimd_f32 vi22 = psimd_load_f32(i22);
-      const psimd_f32 vk22 = psimd_load_f32(w + 92);
-      vacc = psimd_qfma_f32(vacc, vi22, vk22);
+      const psimd_f32 vi22x0123 = psimd_load_f32(i22);
+      const psimd_f32 vk22x0123 = psimd_load_f32(w + 92);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi22x0123, vk22x0123);
 
-      const psimd_f32 vi23 = psimd_load_f32(i23);
-      const psimd_f32 vk23 = psimd_load_f32(w + 96);
-      vacc = psimd_qfma_f32(vacc, vi23, vk23);
+      const psimd_f32 vi23x0123 = psimd_load_f32(i23);
+      const psimd_f32 vk23x0123 = psimd_load_f32(w + 96);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi23x0123, vk23x0123);
 
-      const psimd_f32 vi24 = psimd_load_f32(i24);
-      const psimd_f32 vk24 = psimd_load_f32(w + 100);
-      vacc = psimd_qfma_f32(vacc, vi24, vk24);
+      const psimd_f32 vi24x0123 = psimd_load_f32(i24);
+      const psimd_f32 vk24x0123 = psimd_load_f32(w + 100);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi24x0123, vk24x0123);
 
-      w += 104;
 
-      vacc = psimd_max_f32(vacc, vmin);
-      vacc = psimd_min_f32(vacc, vmax);
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
 
       if (c & 2) {
-        psimd_store2_f32(output, vacc);
-        vacc = psimd_concat_hi_f32(vacc, vacc);
+        psimd_store2_f32(output, vacc0123);
+        vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
         output += 2;
       }
       if (c & 1) {
-        psimd_store1_f32(output, vacc);
+        psimd_store1_f32(output, vacc0123);
         output += 1;
       }
     }
diff --git a/src/f32-dwconv/up4x25-sse-acc2.c b/src/f32-dwconv/up4x25-sse-acc2.c
new file mode 100644
index 0000000..e1185a8
--- /dev/null
+++ b/src/f32-dwconv/up4x25-sse-acc2.c
@@ -0,0 +1,349 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xmmintrin.h>
+
+#include <xnnpack/dwconv.h>
+
+
+// Depthwise-convolution micro-kernel: 25 taps (e.g. a 5x5 kernel), SSE,
+// channel tile of 4, with 2 partial accumulators (acc2) to break the
+// add-dependency chain.
+//
+//   channels         - number of channels; processed 4 at a time, remainder
+//                      of 1-3 handled separately (must be non-zero).
+//   output_width     - number of output pixels (must be non-zero).
+//   input            - per-pixel array of 25 row pointers (input[0]..input[24]);
+//                      advanced by input_stride bytes after each pixel.
+//   weights          - packed as [4 biases][25 x 4 weights] per channel tile,
+//                      i.e. 104 floats per group of 4 channels (16-byte aligned).
+//   output           - output pointer; advanced by output_increment bytes
+//                      after each pixel.
+//   params           - clamping parameters (params->sse.min / params->sse.max).
+void xnn_f32_dwconv_ukernel_up4x25__sse_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m128 vmax = _mm_load_ps(params->sse.max);
+  const __m128 vmin = _mm_load_ps(params->sse.min);
+  // Outer loop: one iteration per output pixel.
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    const float* i9 = input[9];
+    const float* i10 = input[10];
+    const float* i11 = input[11];
+    const float* i12 = input[12];
+    const float* i13 = input[13];
+    const float* i14 = input[14];
+    const float* i15 = input[15];
+    const float* i16 = input[16];
+    const float* i17 = input[17];
+    const float* i18 = input[18];
+    const float* i19 = input[19];
+    const float* i20 = input[20];
+    const float* i21 = input[21];
+    const float* i22 = input[22];
+    const float* i23 = input[23];
+    const float* i24 = input[24];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    // Main loop: 4 channels per iteration. Even-numbered taps accumulate into
+    // vacc0123p0, odd-numbered taps into vacc0123p1; the two partial sums are
+    // combined after the last tap.
+    for (; c >= 4; c -= 4) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      i0 += 4;
+
+      const __m128 vk0x0123 = _mm_load_ps(w + 4);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      i1 += 4;
+
+      const __m128 vk1x0123 = _mm_load_ps(w + 8);
+      // Second accumulator chain starts here (initialized, not added to bias).
+      __m128 vacc0123p1 = _mm_mul_ps(vi1x0123, vk1x0123);
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      i2 += 4;
+
+      const __m128 vk2x0123 = _mm_load_ps(w + 12);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      i3 += 4;
+
+      const __m128 vk3x0123 = _mm_load_ps(w + 16);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
+      i4 += 4;
+
+      const __m128 vk4x0123 = _mm_load_ps(w + 20);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
+
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
+      i5 += 4;
+
+      const __m128 vk5x0123 = _mm_load_ps(w + 24);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi5x0123, vk5x0123));
+
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
+      i6 += 4;
+
+      const __m128 vk6x0123 = _mm_load_ps(w + 28);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
+
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
+      i7 += 4;
+
+      const __m128 vk7x0123 = _mm_load_ps(w + 32);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi7x0123, vk7x0123));
+
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
+      i8 += 4;
+
+      const __m128 vk8x0123 = _mm_load_ps(w + 36);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
+
+      const __m128 vi9x0123 = _mm_loadu_ps(i9);
+      i9 += 4;
+
+      const __m128 vk9x0123 = _mm_load_ps(w + 40);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi9x0123, vk9x0123));
+
+      const __m128 vi10x0123 = _mm_loadu_ps(i10);
+      i10 += 4;
+
+      const __m128 vk10x0123 = _mm_load_ps(w + 44);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
+
+      const __m128 vi11x0123 = _mm_loadu_ps(i11);
+      i11 += 4;
+
+      const __m128 vk11x0123 = _mm_load_ps(w + 48);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi11x0123, vk11x0123));
+
+      const __m128 vi12x0123 = _mm_loadu_ps(i12);
+      i12 += 4;
+
+      const __m128 vk12x0123 = _mm_load_ps(w + 52);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
+
+      const __m128 vi13x0123 = _mm_loadu_ps(i13);
+      i13 += 4;
+
+      const __m128 vk13x0123 = _mm_load_ps(w + 56);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi13x0123, vk13x0123));
+
+      const __m128 vi14x0123 = _mm_loadu_ps(i14);
+      i14 += 4;
+
+      const __m128 vk14x0123 = _mm_load_ps(w + 60);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
+
+      const __m128 vi15x0123 = _mm_loadu_ps(i15);
+      i15 += 4;
+
+      const __m128 vk15x0123 = _mm_load_ps(w + 64);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi15x0123, vk15x0123));
+
+      const __m128 vi16x0123 = _mm_loadu_ps(i16);
+      i16 += 4;
+
+      const __m128 vk16x0123 = _mm_load_ps(w + 68);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
+
+      const __m128 vi17x0123 = _mm_loadu_ps(i17);
+      i17 += 4;
+
+      const __m128 vk17x0123 = _mm_load_ps(w + 72);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi17x0123, vk17x0123));
+
+      const __m128 vi18x0123 = _mm_loadu_ps(i18);
+      i18 += 4;
+
+      const __m128 vk18x0123 = _mm_load_ps(w + 76);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
+
+      const __m128 vi19x0123 = _mm_loadu_ps(i19);
+      i19 += 4;
+
+      const __m128 vk19x0123 = _mm_load_ps(w + 80);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi19x0123, vk19x0123));
+
+      const __m128 vi20x0123 = _mm_loadu_ps(i20);
+      i20 += 4;
+
+      const __m128 vk20x0123 = _mm_load_ps(w + 84);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
+
+      const __m128 vi21x0123 = _mm_loadu_ps(i21);
+      i21 += 4;
+
+      const __m128 vk21x0123 = _mm_load_ps(w + 88);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi21x0123, vk21x0123));
+
+      const __m128 vi22x0123 = _mm_loadu_ps(i22);
+      i22 += 4;
+
+      const __m128 vk22x0123 = _mm_load_ps(w + 92);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
+
+      const __m128 vi23x0123 = _mm_loadu_ps(i23);
+      i23 += 4;
+
+      const __m128 vk23x0123 = _mm_load_ps(w + 96);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi23x0123, vk23x0123));
+
+      const __m128 vi24x0123 = _mm_loadu_ps(i24);
+      i24 += 4;
+
+      const __m128 vk24x0123 = _mm_load_ps(w + 100);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
+
+      w += 104;
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1);
+
+      // Clamp to [vmin, vmax] and store 4 output channels.
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+      _mm_storeu_ps(output, vacc0123);
+      output += 4;
+    }
+    // Remainder: 1-3 trailing channels. Full 4-float vectors are loaded
+    // (unaligned loads read past the last channel), but only c results are
+    // stored below via the c & 2 / c & 1 partial stores.
+    if XNN_UNLIKELY(c != 0) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      const __m128 vk0x0123 = _mm_load_ps(w + 4);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      const __m128 vk1x0123 = _mm_load_ps(w + 8);
+      __m128 vacc0123p1 = _mm_mul_ps(vi1x0123, vk1x0123);
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      const __m128 vk2x0123 = _mm_load_ps(w + 12);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      const __m128 vk3x0123 = _mm_load_ps(w + 16);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
+      const __m128 vk4x0123 = _mm_load_ps(w + 20);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
+
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
+      const __m128 vk5x0123 = _mm_load_ps(w + 24);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi5x0123, vk5x0123));
+
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
+      const __m128 vk6x0123 = _mm_load_ps(w + 28);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
+
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
+      const __m128 vk7x0123 = _mm_load_ps(w + 32);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi7x0123, vk7x0123));
+
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
+      const __m128 vk8x0123 = _mm_load_ps(w + 36);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
+
+      const __m128 vi9x0123 = _mm_loadu_ps(i9);
+      const __m128 vk9x0123 = _mm_load_ps(w + 40);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi9x0123, vk9x0123));
+
+      const __m128 vi10x0123 = _mm_loadu_ps(i10);
+      const __m128 vk10x0123 = _mm_load_ps(w + 44);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
+
+      const __m128 vi11x0123 = _mm_loadu_ps(i11);
+      const __m128 vk11x0123 = _mm_load_ps(w + 48);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi11x0123, vk11x0123));
+
+      const __m128 vi12x0123 = _mm_loadu_ps(i12);
+      const __m128 vk12x0123 = _mm_load_ps(w + 52);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
+
+      const __m128 vi13x0123 = _mm_loadu_ps(i13);
+      const __m128 vk13x0123 = _mm_load_ps(w + 56);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi13x0123, vk13x0123));
+
+      const __m128 vi14x0123 = _mm_loadu_ps(i14);
+      const __m128 vk14x0123 = _mm_load_ps(w + 60);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
+
+      const __m128 vi15x0123 = _mm_loadu_ps(i15);
+      const __m128 vk15x0123 = _mm_load_ps(w + 64);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi15x0123, vk15x0123));
+
+      const __m128 vi16x0123 = _mm_loadu_ps(i16);
+      const __m128 vk16x0123 = _mm_load_ps(w + 68);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
+
+      const __m128 vi17x0123 = _mm_loadu_ps(i17);
+      const __m128 vk17x0123 = _mm_load_ps(w + 72);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi17x0123, vk17x0123));
+
+      const __m128 vi18x0123 = _mm_loadu_ps(i18);
+      const __m128 vk18x0123 = _mm_load_ps(w + 76);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
+
+      const __m128 vi19x0123 = _mm_loadu_ps(i19);
+      const __m128 vk19x0123 = _mm_load_ps(w + 80);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi19x0123, vk19x0123));
+
+      const __m128 vi20x0123 = _mm_loadu_ps(i20);
+      const __m128 vk20x0123 = _mm_load_ps(w + 84);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
+
+      const __m128 vi21x0123 = _mm_loadu_ps(i21);
+      const __m128 vk21x0123 = _mm_load_ps(w + 88);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi21x0123, vk21x0123));
+
+      const __m128 vi22x0123 = _mm_loadu_ps(i22);
+      const __m128 vk22x0123 = _mm_load_ps(w + 92);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
+
+      const __m128 vi23x0123 = _mm_loadu_ps(i23);
+      const __m128 vk23x0123 = _mm_load_ps(w + 96);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi23x0123, vk23x0123));
+
+      const __m128 vi24x0123 = _mm_loadu_ps(i24);
+      const __m128 vk24x0123 = _mm_load_ps(w + 100);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1);
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+      // Store low 2 lanes, then shift lanes 2-3 down for the final 1-lane store.
+      if (c & 2) {
+        _mm_storel_pi((__m64*) output, vacc0123);
+        vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        _mm_store_ss(output, vacc0123);
+        output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up4x25-sse.c b/src/f32-dwconv/up4x25-sse.c
index be8b1e9..d232a58 100644
--- a/src/f32-dwconv/up4x25-sse.c
+++ b/src/f32-dwconv/up4x25-sse.c
@@ -60,258 +60,282 @@
     size_t c = channels;
     const float* w = weights;
     for (; c >= 4; c -= 4) {
-      __m128 vacc0 = _mm_load_ps(w);
+      __m128 vacc0123p0 = _mm_load_ps(w);
 
-      const __m128 vi0 = _mm_loadu_ps(i0);
-      const __m128 vk0 = _mm_load_ps(w + 4);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi0, vk0));
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
       i0 += 4;
 
-      const __m128 vi1 = _mm_loadu_ps(i1);
-      const __m128 vk1 = _mm_load_ps(w + 8);
-      __m128 vacc1 = _mm_mul_ps(vi1, vk1);
+      const __m128 vk0x0123 = _mm_load_ps(w + 4);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
       i1 += 4;
 
-      const __m128 vi2 = _mm_loadu_ps(i2);
-      const __m128 vk2 = _mm_load_ps(w + 12);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi2, vk2));
+      const __m128 vk1x0123 = _mm_load_ps(w + 8);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
       i2 += 4;
 
-      const __m128 vi3 = _mm_loadu_ps(i3);
-      const __m128 vk3 = _mm_load_ps(w + 16);
-      vacc1 = _mm_add_ps(vacc1, _mm_mul_ps(vi3, vk3));
+      const __m128 vk2x0123 = _mm_load_ps(w + 12);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
       i3 += 4;
 
-      const __m128 vi4 = _mm_loadu_ps(i4);
-      const __m128 vk4 = _mm_load_ps(w + 20);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi4, vk4));
+      const __m128 vk3x0123 = _mm_load_ps(w + 16);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
       i4 += 4;
 
-      const __m128 vi5 = _mm_loadu_ps(i5);
-      const __m128 vk5 = _mm_load_ps(w + 24);
-      vacc1 = _mm_add_ps(vacc1, _mm_mul_ps(vi5, vk5));
+      const __m128 vk4x0123 = _mm_load_ps(w + 20);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
+
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
       i5 += 4;
 
-      const __m128 vi6 = _mm_loadu_ps(i6);
-      const __m128 vk6 = _mm_load_ps(w + 28);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi6, vk6));
+      const __m128 vk5x0123 = _mm_load_ps(w + 24);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
+
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
       i6 += 4;
 
-      const __m128 vi7 = _mm_loadu_ps(i7);
-      const __m128 vk7 = _mm_load_ps(w + 32);
-      vacc1 = _mm_add_ps(vacc1, _mm_mul_ps(vi7, vk7));
+      const __m128 vk6x0123 = _mm_load_ps(w + 28);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
+
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
       i7 += 4;
 
-      const __m128 vi8 = _mm_loadu_ps(i8);
-      const __m128 vk8 = _mm_load_ps(w + 36);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi8, vk8));
+      const __m128 vk7x0123 = _mm_load_ps(w + 32);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
+
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
       i8 += 4;
 
-      const __m128 vi9 = _mm_loadu_ps(i9);
-      const __m128 vk9 = _mm_load_ps(w + 40);
-      vacc1 = _mm_add_ps(vacc1, _mm_mul_ps(vi9, vk9));
+      const __m128 vk8x0123 = _mm_load_ps(w + 36);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
+
+      const __m128 vi9x0123 = _mm_loadu_ps(i9);
       i9 += 4;
 
-      const __m128 vi10 = _mm_loadu_ps(i10);
-      const __m128 vk10 = _mm_load_ps(w + 44);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi10, vk10));
+      const __m128 vk9x0123 = _mm_load_ps(w + 40);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi9x0123, vk9x0123));
+
+      const __m128 vi10x0123 = _mm_loadu_ps(i10);
       i10 += 4;
 
-      const __m128 vi11 = _mm_loadu_ps(i11);
-      const __m128 vk11 = _mm_load_ps(w + 48);
-      vacc1 = _mm_add_ps(vacc1, _mm_mul_ps(vi11, vk11));
+      const __m128 vk10x0123 = _mm_load_ps(w + 44);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
+
+      const __m128 vi11x0123 = _mm_loadu_ps(i11);
       i11 += 4;
 
-      const __m128 vi12 = _mm_loadu_ps(i12);
-      const __m128 vk12 = _mm_load_ps(w + 52);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi12, vk12));
+      const __m128 vk11x0123 = _mm_load_ps(w + 48);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi11x0123, vk11x0123));
+
+      const __m128 vi12x0123 = _mm_loadu_ps(i12);
       i12 += 4;
 
-      const __m128 vi13 = _mm_loadu_ps(i13);
-      const __m128 vk13 = _mm_load_ps(w + 56);
-      vacc1 = _mm_add_ps(vacc1, _mm_mul_ps(vi13, vk13));
+      const __m128 vk12x0123 = _mm_load_ps(w + 52);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
+
+      const __m128 vi13x0123 = _mm_loadu_ps(i13);
       i13 += 4;
 
-      const __m128 vi14 = _mm_loadu_ps(i14);
-      const __m128 vk14 = _mm_load_ps(w + 60);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi14, vk14));
+      const __m128 vk13x0123 = _mm_load_ps(w + 56);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi13x0123, vk13x0123));
+
+      const __m128 vi14x0123 = _mm_loadu_ps(i14);
       i14 += 4;
 
-      const __m128 vi15 = _mm_loadu_ps(i15);
-      const __m128 vk15 = _mm_load_ps(w + 64);
-      vacc1 = _mm_add_ps(vacc1, _mm_mul_ps(vi15, vk15));
+      const __m128 vk14x0123 = _mm_load_ps(w + 60);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
+
+      const __m128 vi15x0123 = _mm_loadu_ps(i15);
       i15 += 4;
 
-      const __m128 vi16 = _mm_loadu_ps(i16);
-      const __m128 vk16 = _mm_load_ps(w + 68);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi16, vk16));
+      const __m128 vk15x0123 = _mm_load_ps(w + 64);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi15x0123, vk15x0123));
+
+      const __m128 vi16x0123 = _mm_loadu_ps(i16);
       i16 += 4;
 
-      const __m128 vi17 = _mm_loadu_ps(i17);
-      const __m128 vk17 = _mm_load_ps(w + 72);
-      vacc1 = _mm_add_ps(vacc1, _mm_mul_ps(vi17, vk17));
+      const __m128 vk16x0123 = _mm_load_ps(w + 68);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
+
+      const __m128 vi17x0123 = _mm_loadu_ps(i17);
       i17 += 4;
 
-      const __m128 vi18 = _mm_loadu_ps(i18);
-      const __m128 vk18 = _mm_load_ps(w + 76);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi18, vk18));
+      const __m128 vk17x0123 = _mm_load_ps(w + 72);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi17x0123, vk17x0123));
+
+      const __m128 vi18x0123 = _mm_loadu_ps(i18);
       i18 += 4;
 
-      const __m128 vi19 = _mm_loadu_ps(i19);
-      const __m128 vk19 = _mm_load_ps(w + 80);
-      vacc1 = _mm_add_ps(vacc1, _mm_mul_ps(vi19, vk19));
+      const __m128 vk18x0123 = _mm_load_ps(w + 76);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
+
+      const __m128 vi19x0123 = _mm_loadu_ps(i19);
       i19 += 4;
 
-      const __m128 vi20 = _mm_loadu_ps(i20);
-      const __m128 vk20 = _mm_load_ps(w + 84);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi20, vk20));
+      const __m128 vk19x0123 = _mm_load_ps(w + 80);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi19x0123, vk19x0123));
+
+      const __m128 vi20x0123 = _mm_loadu_ps(i20);
       i20 += 4;
 
-      const __m128 vi21 = _mm_loadu_ps(i21);
-      const __m128 vk21 = _mm_load_ps(w + 88);
-      vacc1 = _mm_add_ps(vacc1, _mm_mul_ps(vi21, vk21));
+      const __m128 vk20x0123 = _mm_load_ps(w + 84);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
+
+      const __m128 vi21x0123 = _mm_loadu_ps(i21);
       i21 += 4;
 
-      const __m128 vi22 = _mm_loadu_ps(i22);
-      const __m128 vk22 = _mm_load_ps(w + 92);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi22, vk22));
+      const __m128 vk21x0123 = _mm_load_ps(w + 88);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi21x0123, vk21x0123));
+
+      const __m128 vi22x0123 = _mm_loadu_ps(i22);
       i22 += 4;
 
-      const __m128 vi23 = _mm_loadu_ps(i23);
-      const __m128 vk23 = _mm_load_ps(w + 96);
-      vacc1 = _mm_add_ps(vacc1, _mm_mul_ps(vi23, vk23));
+      const __m128 vk22x0123 = _mm_load_ps(w + 92);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
+
+      const __m128 vi23x0123 = _mm_loadu_ps(i23);
       i23 += 4;
 
-      const __m128 vi24 = _mm_loadu_ps(i24);
-      const __m128 vk24 = _mm_load_ps(w + 100);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi24, vk24));
+      const __m128 vk23x0123 = _mm_load_ps(w + 96);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi23x0123, vk23x0123));
+
+      const __m128 vi24x0123 = _mm_loadu_ps(i24);
       i24 += 4;
 
+      const __m128 vk24x0123 = _mm_load_ps(w + 100);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
+
       w += 104;
 
-      vacc0 = _mm_add_ps(vacc0, vacc1);
 
-      vacc0 = _mm_max_ps(vacc0, vmin);
-      vacc0 = _mm_min_ps(vacc0, vmax);
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
 
-      _mm_storeu_ps(output, vacc0);
+      _mm_storeu_ps(output, vacc0123);
       output += 4;
     }
     if XNN_UNLIKELY(c != 0) {
-      __m128 vacc = _mm_load_ps(w);
+      __m128 vacc0123p0 = _mm_load_ps(w);
 
-      const __m128 vi0 = _mm_loadu_ps(i0);
-      const __m128 vk0 = _mm_load_ps(w + 4);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi0, vk0));
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      const __m128 vk0x0123 = _mm_load_ps(w + 4);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
 
-      const __m128 vi1 = _mm_loadu_ps(i1);
-      const __m128 vk1 = _mm_load_ps(w + 8);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi1, vk1));
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      const __m128 vk1x0123 = _mm_load_ps(w + 8);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
 
-      const __m128 vi2 = _mm_loadu_ps(i2);
-      const __m128 vk2 = _mm_load_ps(w + 12);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi2, vk2));
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      const __m128 vk2x0123 = _mm_load_ps(w + 12);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
 
-      const __m128 vi3 = _mm_loadu_ps(i3);
-      const __m128 vk3 = _mm_load_ps(w + 16);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi3, vk3));
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      const __m128 vk3x0123 = _mm_load_ps(w + 16);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
 
-      const __m128 vi4 = _mm_loadu_ps(i4);
-      const __m128 vk4 = _mm_load_ps(w + 20);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi4, vk4));
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
+      const __m128 vk4x0123 = _mm_load_ps(w + 20);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
 
-      const __m128 vi5 = _mm_loadu_ps(i5);
-      const __m128 vk5 = _mm_load_ps(w + 24);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi5, vk5));
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
+      const __m128 vk5x0123 = _mm_load_ps(w + 24);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
 
-      const __m128 vi6 = _mm_loadu_ps(i6);
-      const __m128 vk6 = _mm_load_ps(w + 28);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi6, vk6));
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
+      const __m128 vk6x0123 = _mm_load_ps(w + 28);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
 
-      const __m128 vi7 = _mm_loadu_ps(i7);
-      const __m128 vk7 = _mm_load_ps(w + 32);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi7, vk7));
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
+      const __m128 vk7x0123 = _mm_load_ps(w + 32);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
 
-      const __m128 vi8 = _mm_loadu_ps(i8);
-      const __m128 vk8 = _mm_load_ps(w + 36);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi8, vk8));
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
+      const __m128 vk8x0123 = _mm_load_ps(w + 36);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
 
-      const __m128 vi9 = _mm_loadu_ps(i9);
-      const __m128 vk9 = _mm_load_ps(w + 40);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi9, vk9));
+      const __m128 vi9x0123 = _mm_loadu_ps(i9);
+      const __m128 vk9x0123 = _mm_load_ps(w + 40);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi9x0123, vk9x0123));
 
-      const __m128 vi10 = _mm_loadu_ps(i10);
-      const __m128 vk10 = _mm_load_ps(w + 44);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi10, vk10));
+      const __m128 vi10x0123 = _mm_loadu_ps(i10);
+      const __m128 vk10x0123 = _mm_load_ps(w + 44);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
 
-      const __m128 vi11 = _mm_loadu_ps(i11);
-      const __m128 vk11 = _mm_load_ps(w + 48);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi11, vk11));
+      const __m128 vi11x0123 = _mm_loadu_ps(i11);
+      const __m128 vk11x0123 = _mm_load_ps(w + 48);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi11x0123, vk11x0123));
 
-      const __m128 vi12 = _mm_loadu_ps(i12);
-      const __m128 vk12 = _mm_load_ps(w + 52);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi12, vk12));
+      const __m128 vi12x0123 = _mm_loadu_ps(i12);
+      const __m128 vk12x0123 = _mm_load_ps(w + 52);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
 
-      const __m128 vi13 = _mm_loadu_ps(i13);
-      const __m128 vk13 = _mm_load_ps(w + 56);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi13, vk13));
+      const __m128 vi13x0123 = _mm_loadu_ps(i13);
+      const __m128 vk13x0123 = _mm_load_ps(w + 56);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi13x0123, vk13x0123));
 
-      const __m128 vi14 = _mm_loadu_ps(i14);
-      const __m128 vk14 = _mm_load_ps(w + 60);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi14, vk14));
+      const __m128 vi14x0123 = _mm_loadu_ps(i14);
+      const __m128 vk14x0123 = _mm_load_ps(w + 60);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
 
-      const __m128 vi15 = _mm_loadu_ps(i15);
-      const __m128 vk15 = _mm_load_ps(w + 64);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi15, vk15));
+      const __m128 vi15x0123 = _mm_loadu_ps(i15);
+      const __m128 vk15x0123 = _mm_load_ps(w + 64);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi15x0123, vk15x0123));
 
-      const __m128 vi16 = _mm_loadu_ps(i16);
-      const __m128 vk16 = _mm_load_ps(w + 68);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi16, vk16));
+      const __m128 vi16x0123 = _mm_loadu_ps(i16);
+      const __m128 vk16x0123 = _mm_load_ps(w + 68);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
 
-      const __m128 vi17 = _mm_loadu_ps(i17);
-      const __m128 vk17 = _mm_load_ps(w + 72);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi17, vk17));
+      const __m128 vi17x0123 = _mm_loadu_ps(i17);
+      const __m128 vk17x0123 = _mm_load_ps(w + 72);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi17x0123, vk17x0123));
 
-      const __m128 vi18 = _mm_loadu_ps(i18);
-      const __m128 vk18 = _mm_load_ps(w + 76);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi18, vk18));
+      const __m128 vi18x0123 = _mm_loadu_ps(i18);
+      const __m128 vk18x0123 = _mm_load_ps(w + 76);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
 
-      const __m128 vi19 = _mm_loadu_ps(i19);
-      const __m128 vk19 = _mm_load_ps(w + 80);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi19, vk19));
+      const __m128 vi19x0123 = _mm_loadu_ps(i19);
+      const __m128 vk19x0123 = _mm_load_ps(w + 80);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi19x0123, vk19x0123));
 
-      const __m128 vi20 = _mm_loadu_ps(i20);
-      const __m128 vk20 = _mm_load_ps(w + 84);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi20, vk20));
+      const __m128 vi20x0123 = _mm_loadu_ps(i20);
+      const __m128 vk20x0123 = _mm_load_ps(w + 84);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
 
-      const __m128 vi21 = _mm_loadu_ps(i21);
-      const __m128 vk21 = _mm_load_ps(w + 88);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi21, vk21));
+      const __m128 vi21x0123 = _mm_loadu_ps(i21);
+      const __m128 vk21x0123 = _mm_load_ps(w + 88);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi21x0123, vk21x0123));
 
-      const __m128 vi22 = _mm_loadu_ps(i22);
-      const __m128 vk22 = _mm_load_ps(w + 92);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi22, vk22));
+      const __m128 vi22x0123 = _mm_loadu_ps(i22);
+      const __m128 vk22x0123 = _mm_load_ps(w + 92);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
 
-      const __m128 vi23 = _mm_loadu_ps(i23);
-      const __m128 vk23 = _mm_load_ps(w + 96);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi23, vk23));
+      const __m128 vi23x0123 = _mm_loadu_ps(i23);
+      const __m128 vk23x0123 = _mm_load_ps(w + 96);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi23x0123, vk23x0123));
 
-      const __m128 vi24 = _mm_loadu_ps(i24);
-      const __m128 vk24 = _mm_load_ps(w + 100);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi24, vk24));
+      const __m128 vi24x0123 = _mm_loadu_ps(i24);
+      const __m128 vk24x0123 = _mm_load_ps(w + 100);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
 
-      w += 104;
 
-      vacc = _mm_max_ps(vacc, vmin);
-      vacc = _mm_min_ps(vacc, vmax);
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
 
       if (c & 2) {
-        _mm_storel_pi((__m64*) output, vacc);
-        vacc = _mm_movehl_ps(vacc, vacc);
+        _mm_storel_pi((__m64*) output, vacc0123);
+        vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
         output += 2;
       }
       if (c & 1) {
-        _mm_store_ss(output, vacc);
+        _mm_store_ss(output, vacc0123);
         output += 1;
       }
     }
diff --git a/src/f32-dwconv/up4x4-psimd-acc2.c b/src/f32-dwconv/up4x4-psimd-acc2.c
new file mode 100644
index 0000000..806f8bb
--- /dev/null
+++ b/src/f32-dwconv/up4x4-psimd-acc2.c
@@ -0,0 +1,118 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-psimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up4x4__psimd_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const psimd_f32 vmax = psimd_load_splat_f32(&params->scalar.max);
+  const psimd_f32 vmin = psimd_load_splat_f32(&params->scalar.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 4; c -= 4) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      i0 += 4;
+
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 4);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      i1 += 4;
+
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 8);
+      psimd_f32 vacc0123p1 = psimd_mul_f32(vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      i2 += 4;
+
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 12);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      i3 += 4;
+
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 16);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      w += 20;
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1);
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+      psimd_store_f32(output, vacc0123);
+      output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 4);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 8);
+      psimd_f32 vacc0123p1 = psimd_mul_f32(vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 12);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 16);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1);
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+      if (c & 2) {
+        psimd_store2_f32(output, vacc0123);
+        vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        psimd_store1_f32(output, vacc0123);
+        output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up4x4-psimd.c b/src/f32-dwconv/up4x4-psimd.c
index 204c00f..3dcb768 100644
--- a/src/f32-dwconv/up4x4-psimd.c
+++ b/src/f32-dwconv/up4x4-psimd.c
@@ -39,69 +39,72 @@
     size_t c = channels;
     const float* w = weights;
     for (; c >= 4; c -= 4) {
-      psimd_f32 vacc0 = psimd_load_f32(w);
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
 
-      const psimd_f32 vi0 = psimd_load_f32(i0);
-      const psimd_f32 vk0 = psimd_load_f32(w + 4);
-      vacc0 = psimd_qfma_f32(vacc0, vi0, vk0);
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
       i0 += 4;
 
-      const psimd_f32 vi1 = psimd_load_f32(i1);
-      const psimd_f32 vk1 = psimd_load_f32(w + 8);
-      psimd_f32 vacc1 = psimd_mul_f32(vi1, vk1);
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 4);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
       i1 += 4;
 
-      const psimd_f32 vi2 = psimd_load_f32(i2);
-      const psimd_f32 vk2 = psimd_load_f32(w + 12);
-      vacc0 = psimd_qfma_f32(vacc0, vi2, vk2);
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 8);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
       i2 += 4;
 
-      const psimd_f32 vi3 = psimd_load_f32(i3);
-      const psimd_f32 vk3 = psimd_load_f32(w + 16);
-      vacc1 = psimd_qfma_f32(vacc1, vi3, vk3);
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 12);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
       i3 += 4;
 
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 16);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
+
       w += 20;
 
-      vacc0 = psimd_add_f32(vacc0, vacc1);
 
-      vacc0 = psimd_max_f32(vacc0, vmin);
-      vacc0 = psimd_min_f32(vacc0, vmax);
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
 
-      psimd_store_f32(output, vacc0);
+      psimd_store_f32(output, vacc0123);
       output += 4;
     }
     if XNN_UNLIKELY(c != 0) {
-      psimd_f32 vacc = psimd_load_f32(w);
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
 
-      const psimd_f32 vi0 = psimd_load_f32(i0);
-      const psimd_f32 vk0 = psimd_load_f32(w + 4);
-      vacc = psimd_qfma_f32(vacc, vi0, vk0);
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 4);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
 
-      const psimd_f32 vi1 = psimd_load_f32(i1);
-      const psimd_f32 vk1 = psimd_load_f32(w + 8);
-      vacc = psimd_qfma_f32(vacc, vi1, vk1);
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 8);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
 
-      const psimd_f32 vi2 = psimd_load_f32(i2);
-      const psimd_f32 vk2 = psimd_load_f32(w + 12);
-      vacc = psimd_qfma_f32(vacc, vi2, vk2);
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 12);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
 
-      const psimd_f32 vi3 = psimd_load_f32(i3);
-      const psimd_f32 vk3 = psimd_load_f32(w + 16);
-      vacc = psimd_qfma_f32(vacc, vi3, vk3);
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 16);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
 
-      w += 20;
 
-      vacc = psimd_max_f32(vacc, vmin);
-      vacc = psimd_min_f32(vacc, vmax);
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
 
       if (c & 2) {
-        psimd_store2_f32(output, vacc);
-        vacc = psimd_concat_hi_f32(vacc, vacc);
+        psimd_store2_f32(output, vacc0123);
+        vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
         output += 2;
       }
       if (c & 1) {
-        psimd_store1_f32(output, vacc);
+        psimd_store1_f32(output, vacc0123);
         output += 1;
       }
     }
diff --git a/src/f32-dwconv/up4x4-sse-acc2.c b/src/f32-dwconv/up4x4-sse-acc2.c
new file mode 100644
index 0000000..38c3905
--- /dev/null
+++ b/src/f32-dwconv/up4x4-sse-acc2.c
@@ -0,0 +1,118 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xmmintrin.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up4x4__sse_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m128 vmax = _mm_load_ps(params->sse.max);
+  const __m128 vmin = _mm_load_ps(params->sse.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 4; c -= 4) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      i0 += 4;
+
+      const __m128 vk0x0123 = _mm_load_ps(w + 4);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      i1 += 4;
+
+      const __m128 vk1x0123 = _mm_load_ps(w + 8);
+      __m128 vacc0123p1 = _mm_mul_ps(vi1x0123, vk1x0123);
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      i2 += 4;
+
+      const __m128 vk2x0123 = _mm_load_ps(w + 12);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      i3 += 4;
+
+      const __m128 vk3x0123 = _mm_load_ps(w + 16);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      w += 20;
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1);
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+      _mm_storeu_ps(output, vacc0123);
+      output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      const __m128 vk0x0123 = _mm_load_ps(w + 4);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      const __m128 vk1x0123 = _mm_load_ps(w + 8);
+      __m128 vacc0123p1 = _mm_mul_ps(vi1x0123, vk1x0123);
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      const __m128 vk2x0123 = _mm_load_ps(w + 12);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      const __m128 vk3x0123 = _mm_load_ps(w + 16);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1);
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+      if (c & 2) {
+        _mm_storel_pi((__m64*) output, vacc0123);
+        vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        _mm_store_ss(output, vacc0123);
+        output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up4x4-sse.c b/src/f32-dwconv/up4x4-sse.c
index e2353b1..19de5a4 100644
--- a/src/f32-dwconv/up4x4-sse.c
+++ b/src/f32-dwconv/up4x4-sse.c
@@ -39,69 +39,72 @@
     size_t c = channels;
     const float* w = weights;
     for (; c >= 4; c -= 4) {
-      __m128 vacc0 = _mm_load_ps(w);
+      __m128 vacc0123p0 = _mm_load_ps(w);
 
-      const __m128 vi0 = _mm_loadu_ps(i0);
-      const __m128 vk0 = _mm_load_ps(w + 4);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi0, vk0));
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
       i0 += 4;
 
-      const __m128 vi1 = _mm_loadu_ps(i1);
-      const __m128 vk1 = _mm_load_ps(w + 8);
-      __m128 vacc1 = _mm_mul_ps(vi1, vk1);
+      const __m128 vk0x0123 = _mm_load_ps(w + 4);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
       i1 += 4;
 
-      const __m128 vi2 = _mm_loadu_ps(i2);
-      const __m128 vk2 = _mm_load_ps(w + 12);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi2, vk2));
+      const __m128 vk1x0123 = _mm_load_ps(w + 8);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
       i2 += 4;
 
-      const __m128 vi3 = _mm_loadu_ps(i3);
-      const __m128 vk3 = _mm_load_ps(w + 16);
-      vacc1 = _mm_add_ps(vacc1, _mm_mul_ps(vi3, vk3));
+      const __m128 vk2x0123 = _mm_load_ps(w + 12);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
       i3 += 4;
 
+      const __m128 vk3x0123 = _mm_load_ps(w + 16);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
+
       w += 20;
 
-      vacc0 = _mm_add_ps(vacc0, vacc1);
 
-      vacc0 = _mm_max_ps(vacc0, vmin);
-      vacc0 = _mm_min_ps(vacc0, vmax);
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
 
-      _mm_storeu_ps(output, vacc0);
+      _mm_storeu_ps(output, vacc0123);
       output += 4;
     }
     if XNN_UNLIKELY(c != 0) {
-      __m128 vacc = _mm_load_ps(w);
+      __m128 vacc0123p0 = _mm_load_ps(w);
 
-      const __m128 vi0 = _mm_loadu_ps(i0);
-      const __m128 vk0 = _mm_load_ps(w + 4);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi0, vk0));
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      const __m128 vk0x0123 = _mm_load_ps(w + 4);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
 
-      const __m128 vi1 = _mm_loadu_ps(i1);
-      const __m128 vk1 = _mm_load_ps(w + 8);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi1, vk1));
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      const __m128 vk1x0123 = _mm_load_ps(w + 8);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
 
-      const __m128 vi2 = _mm_loadu_ps(i2);
-      const __m128 vk2 = _mm_load_ps(w + 12);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi2, vk2));
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      const __m128 vk2x0123 = _mm_load_ps(w + 12);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
 
-      const __m128 vi3 = _mm_loadu_ps(i3);
-      const __m128 vk3 = _mm_load_ps(w + 16);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi3, vk3));
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      const __m128 vk3x0123 = _mm_load_ps(w + 16);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
 
-      w += 20;
 
-      vacc = _mm_max_ps(vacc, vmin);
-      vacc = _mm_min_ps(vacc, vmax);
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
 
       if (c & 2) {
-        _mm_storel_pi((__m64*) output, vacc);
-        vacc = _mm_movehl_ps(vacc, vacc);
+        _mm_storel_pi((__m64*) output, vacc0123);
+        vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
         output += 2;
       }
       if (c & 1) {
-        _mm_store_ss(output, vacc);
+        _mm_store_ss(output, vacc0123);
         output += 1;
       }
     }
diff --git a/src/f32-dwconv/up4x9-neon-acc2.c b/src/f32-dwconv/up4x9-neon-acc2.c
new file mode 100644
index 0000000..7de9fab
--- /dev/null
+++ b/src/f32-dwconv/up4x9-neon-acc2.c
@@ -0,0 +1,153 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-neon.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up4x9__neon_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
+  assert(output_width != 0);
+
+  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
+  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 4; c -= 4) {
+      float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
+
+
+      const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
+      const float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
+      const float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
+      float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);
+
+      const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
+      const float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
+      const float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
+      vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
+      const float32x4_t vk4x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4;
+      const float32x4_t vk5x0123 = vld1q_f32(w); w += 4;
+      vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123);
+
+      const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4;
+      const float32x4_t vk6x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4;
+      const float32x4_t vk7x0123 = vld1q_f32(w); w += 4;
+      vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123);
+
+      const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4;
+      const float32x4_t vk8x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);
+
+      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
+      vacc0123 = vminq_f32(vacc0123, vmax);
+
+      vst1q_f32(output, vacc0123); output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
+
+
+      const float32x4_t vi0x0123 = vld1q_f32(i0);
+      const float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const float32x4_t vi1x0123 = vld1q_f32(i1);
+      const float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
+      float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);
+
+      const float32x4_t vi2x0123 = vld1q_f32(i2);
+      const float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const float32x4_t vi3x0123 = vld1q_f32(i3);
+      const float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
+      vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      const float32x4_t vi4x0123 = vld1q_f32(i4);
+      const float32x4_t vk4x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const float32x4_t vi5x0123 = vld1q_f32(i5);
+      const float32x4_t vk5x0123 = vld1q_f32(w); w += 4;
+      vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123);
+
+      const float32x4_t vi6x0123 = vld1q_f32(i6);
+      const float32x4_t vk6x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const float32x4_t vi7x0123 = vld1q_f32(i7);
+      const float32x4_t vk7x0123 = vld1q_f32(w); w += 4;
+      vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123);
+
+      const float32x4_t vi8x0123 = vld1q_f32(i8);
+      const float32x4_t vk8x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);
+
+      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
+      vacc0123 = vminq_f32(vacc0123, vmax);
+
+      float32x2_t vacc01 = vget_low_f32(vacc0123);
+      if (c & 2) {
+        vst1_f32(output, vacc01); output += 2;
+        vacc01 = vget_high_f32(vacc0123);
+      }
+      if (c & 1) {
+        vst1_lane_f32(output, vacc01, 0); output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up4x9-neon.c b/src/f32-dwconv/up4x9-neon.c
index 037d2bf..9339c55 100644
--- a/src/f32-dwconv/up4x9-neon.c
+++ b/src/f32-dwconv/up4x9-neon.c
@@ -25,6 +25,7 @@
     const union xnn_f32_output_params params[restrict static 1])
 {
   assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
   assert(output_width != 0);
 
   const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
@@ -90,46 +91,47 @@
       vst1q_f32(output, vacc0123); output += 4;
     }
     if XNN_UNLIKELY(c != 0) {
-      float32x4_t vacc0123 = vld1q_f32(w); w += 4;
+      float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
 
 
-      const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
+      const float32x4_t vi0x0123 = vld1q_f32(i0);
       const float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
-      vacc0123 = vmlaq_f32(vacc0123, vi0x0123, vk0x0123);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123);
 
-      const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
+      const float32x4_t vi1x0123 = vld1q_f32(i1);
       const float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
-      vacc0123 = vmlaq_f32(vacc0123, vi1x0123, vk1x0123);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123);
 
-      const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
+      const float32x4_t vi2x0123 = vld1q_f32(i2);
       const float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
-      vacc0123 = vmlaq_f32(vacc0123, vi2x0123, vk2x0123);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123);
 
-      const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
+      const float32x4_t vi3x0123 = vld1q_f32(i3);
       const float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
-      vacc0123 = vmlaq_f32(vacc0123, vi3x0123, vk3x0123);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi3x0123, vk3x0123);
 
-      const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
+      const float32x4_t vi4x0123 = vld1q_f32(i4);
       const float32x4_t vk4x0123 = vld1q_f32(w); w += 4;
-      vacc0123 = vmlaq_f32(vacc0123, vi4x0123, vk4x0123);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123);
 
-      const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4;
+      const float32x4_t vi5x0123 = vld1q_f32(i5);
       const float32x4_t vk5x0123 = vld1q_f32(w); w += 4;
-      vacc0123 = vmlaq_f32(vacc0123, vi5x0123, vk5x0123);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi5x0123, vk5x0123);
 
-      const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4;
+      const float32x4_t vi6x0123 = vld1q_f32(i6);
       const float32x4_t vk6x0123 = vld1q_f32(w); w += 4;
-      vacc0123 = vmlaq_f32(vacc0123, vi6x0123, vk6x0123);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123);
 
-      const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4;
+      const float32x4_t vi7x0123 = vld1q_f32(i7);
       const float32x4_t vk7x0123 = vld1q_f32(w); w += 4;
-      vacc0123 = vmlaq_f32(vacc0123, vi7x0123, vk7x0123);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi7x0123, vk7x0123);
 
-      const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4;
+      const float32x4_t vi8x0123 = vld1q_f32(i8);
       const float32x4_t vk8x0123 = vld1q_f32(w); w += 4;
-      vacc0123 = vmlaq_f32(vacc0123, vi8x0123, vk8x0123);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123);
 
-      vacc0123 = vmaxq_f32(vacc0123, vmin);
+
+      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
       vacc0123 = vminq_f32(vacc0123, vmax);
 
       float32x2_t vacc01 = vget_low_f32(vacc0123);
diff --git a/src/f32-dwconv/up4x9-neonfma-acc2.c b/src/f32-dwconv/up4x9-neonfma-acc2.c
new file mode 100644
index 0000000..2cf19f9
--- /dev/null
+++ b/src/f32-dwconv/up4x9-neonfma-acc2.c
@@ -0,0 +1,153 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-neon.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
+  assert(output_width != 0);
+
+  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
+  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 4; c -= 4) {
+      float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
+
+
+      const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
+      const float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
+      const float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
+      float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);
+
+      const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
+      const float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
+      const float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
+      vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
+      const float32x4_t vk4x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4;
+      const float32x4_t vk5x0123 = vld1q_f32(w); w += 4;
+      vacc0123p1 = vfmaq_f32(vacc0123p1, vi5x0123, vk5x0123);
+
+      const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4;
+      const float32x4_t vk6x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4;
+      const float32x4_t vk7x0123 = vld1q_f32(w); w += 4;
+      vacc0123p1 = vfmaq_f32(vacc0123p1, vi7x0123, vk7x0123);
+
+      const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4;
+      const float32x4_t vk8x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);
+
+      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
+      vacc0123 = vminq_f32(vacc0123, vmax);
+
+      vst1q_f32(output, vacc0123); output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
+
+
+      const float32x4_t vi0x0123 = vld1q_f32(i0);
+      const float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const float32x4_t vi1x0123 = vld1q_f32(i1);
+      const float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
+      float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);
+
+      const float32x4_t vi2x0123 = vld1q_f32(i2);
+      const float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const float32x4_t vi3x0123 = vld1q_f32(i3);
+      const float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
+      vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      const float32x4_t vi4x0123 = vld1q_f32(i4);
+      const float32x4_t vk4x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const float32x4_t vi5x0123 = vld1q_f32(i5);
+      const float32x4_t vk5x0123 = vld1q_f32(w); w += 4;
+      vacc0123p1 = vfmaq_f32(vacc0123p1, vi5x0123, vk5x0123);
+
+      const float32x4_t vi6x0123 = vld1q_f32(i6);
+      const float32x4_t vk6x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const float32x4_t vi7x0123 = vld1q_f32(i7);
+      const float32x4_t vk7x0123 = vld1q_f32(w); w += 4;
+      vacc0123p1 = vfmaq_f32(vacc0123p1, vi7x0123, vk7x0123);
+
+      const float32x4_t vi8x0123 = vld1q_f32(i8);
+      const float32x4_t vk8x0123 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);
+
+      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
+      vacc0123 = vminq_f32(vacc0123, vmax);
+
+      float32x2_t vacc01 = vget_low_f32(vacc0123);
+      if (c & 2) {
+        vst1_f32(output, vacc01); output += 2;
+        vacc01 = vget_high_f32(vacc0123);
+      }
+      if (c & 1) {
+        vst1_lane_f32(output, vacc01, 0); output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up4x9-neonfma.c b/src/f32-dwconv/up4x9-neonfma.c
index ccbb077..174d73f 100644
--- a/src/f32-dwconv/up4x9-neonfma.c
+++ b/src/f32-dwconv/up4x9-neonfma.c
@@ -25,6 +25,7 @@
     const union xnn_f32_output_params params[restrict static 1])
 {
   assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
   assert(output_width != 0);
 
   const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
@@ -90,46 +91,47 @@
       vst1q_f32(output, vacc0123); output += 4;
     }
     if XNN_UNLIKELY(c != 0) {
-      float32x4_t vacc0123 = vld1q_f32(w); w += 4;
+      float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
 
 
-      const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
+      const float32x4_t vi0x0123 = vld1q_f32(i0);
       const float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
-      vacc0123 = vfmaq_f32(vacc0123, vi0x0123, vk0x0123);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123);
 
-      const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
+      const float32x4_t vi1x0123 = vld1q_f32(i1);
       const float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
-      vacc0123 = vfmaq_f32(vacc0123, vi1x0123, vk1x0123);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123);
 
-      const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
+      const float32x4_t vi2x0123 = vld1q_f32(i2);
       const float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
-      vacc0123 = vfmaq_f32(vacc0123, vi2x0123, vk2x0123);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123);
 
-      const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
+      const float32x4_t vi3x0123 = vld1q_f32(i3);
       const float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
-      vacc0123 = vfmaq_f32(vacc0123, vi3x0123, vk3x0123);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123);
 
-      const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
+      const float32x4_t vi4x0123 = vld1q_f32(i4);
       const float32x4_t vk4x0123 = vld1q_f32(w); w += 4;
-      vacc0123 = vfmaq_f32(vacc0123, vi4x0123, vk4x0123);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123);
 
-      const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4;
+      const float32x4_t vi5x0123 = vld1q_f32(i5);
       const float32x4_t vk5x0123 = vld1q_f32(w); w += 4;
-      vacc0123 = vfmaq_f32(vacc0123, vi5x0123, vk5x0123);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi5x0123, vk5x0123);
 
-      const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4;
+      const float32x4_t vi6x0123 = vld1q_f32(i6);
       const float32x4_t vk6x0123 = vld1q_f32(w); w += 4;
-      vacc0123 = vfmaq_f32(vacc0123, vi6x0123, vk6x0123);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123);
 
-      const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4;
+      const float32x4_t vi7x0123 = vld1q_f32(i7);
       const float32x4_t vk7x0123 = vld1q_f32(w); w += 4;
-      vacc0123 = vfmaq_f32(vacc0123, vi7x0123, vk7x0123);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi7x0123, vk7x0123);
 
-      const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4;
+      const float32x4_t vi8x0123 = vld1q_f32(i8);
       const float32x4_t vk8x0123 = vld1q_f32(w); w += 4;
-      vacc0123 = vfmaq_f32(vacc0123, vi8x0123, vk8x0123);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123);
 
-      vacc0123 = vmaxq_f32(vacc0123, vmin);
+
+      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
       vacc0123 = vminq_f32(vacc0123, vmax);
 
       float32x2_t vacc01 = vget_low_f32(vacc0123);
diff --git a/src/f32-dwconv/up4x9-psimd-acc2.c b/src/f32-dwconv/up4x9-psimd-acc2.c
new file mode 100644
index 0000000..b1d0970
--- /dev/null
+++ b/src/f32-dwconv/up4x9-psimd-acc2.c
@@ -0,0 +1,173 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-psimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up4x9__psimd_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const psimd_f32 vmax = psimd_load_splat_f32(&params->scalar.max);
+  const psimd_f32 vmin = psimd_load_splat_f32(&params->scalar.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 4; c -= 4) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      i0 += 4;
+
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 4);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      i1 += 4;
+
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 8);
+      psimd_f32 vacc0123p1 = psimd_mul_f32(vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      i2 += 4;
+
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 12);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      i3 += 4;
+
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 16);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
+      i4 += 4;
+
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 20);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
+      i5 += 4;
+
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 24);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi5x0123, vk5x0123);
+
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
+      i6 += 4;
+
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 28);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
+      i7 += 4;
+
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 32);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi7x0123, vk7x0123);
+
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
+      i8 += 4;
+
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 36);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      w += 40;
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1);
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+      psimd_store_f32(output, vacc0123);
+      output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 4);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 8);
+      psimd_f32 vacc0123p1 = psimd_mul_f32(vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 12);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 16);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 20);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 24);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi5x0123, vk5x0123);
+
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 28);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 32);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi7x0123, vk7x0123);
+
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 36);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1);
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+      if (c & 2) {
+        psimd_store2_f32(output, vacc0123);
+        vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        psimd_store1_f32(output, vacc0123);
+        output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up4x9-psimd.c b/src/f32-dwconv/up4x9-psimd.c
index 8f820b7..6b41fe8 100644
--- a/src/f32-dwconv/up4x9-psimd.c
+++ b/src/f32-dwconv/up4x9-psimd.c
@@ -44,114 +44,122 @@
     size_t c = channels;
     const float* w = weights;
     for (; c >= 4; c -= 4) {
-      psimd_f32 vacc0 = psimd_load_f32(w);
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
 
-      const psimd_f32 vi0 = psimd_load_f32(i0);
-      const psimd_f32 vk0 = psimd_load_f32(w + 4);
-      vacc0 = psimd_qfma_f32(vacc0, vi0, vk0);
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
       i0 += 4;
 
-      const psimd_f32 vi1 = psimd_load_f32(i1);
-      const psimd_f32 vk1 = psimd_load_f32(w + 8);
-      psimd_f32 vacc1 = psimd_mul_f32(vi1, vk1);
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 4);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
       i1 += 4;
 
-      const psimd_f32 vi2 = psimd_load_f32(i2);
-      const psimd_f32 vk2 = psimd_load_f32(w + 12);
-      vacc0 = psimd_qfma_f32(vacc0, vi2, vk2);
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 8);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
       i2 += 4;
 
-      const psimd_f32 vi3 = psimd_load_f32(i3);
-      const psimd_f32 vk3 = psimd_load_f32(w + 16);
-      vacc1 = psimd_qfma_f32(vacc1, vi3, vk3);
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 12);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
       i3 += 4;
 
-      const psimd_f32 vi4 = psimd_load_f32(i4);
-      const psimd_f32 vk4 = psimd_load_f32(w + 20);
-      vacc0 = psimd_qfma_f32(vacc0, vi4, vk4);
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 16);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
+
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
       i4 += 4;
 
-      const psimd_f32 vi5 = psimd_load_f32(i5);
-      const psimd_f32 vk5 = psimd_load_f32(w + 24);
-      vacc1 = psimd_qfma_f32(vacc1, vi5, vk5);
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 20);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
       i5 += 4;
 
-      const psimd_f32 vi6 = psimd_load_f32(i6);
-      const psimd_f32 vk6 = psimd_load_f32(w + 28);
-      vacc0 = psimd_qfma_f32(vacc0, vi6, vk6);
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 24);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123);
+
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
       i6 += 4;
 
-      const psimd_f32 vi7 = psimd_load_f32(i7);
-      const psimd_f32 vk7 = psimd_load_f32(w + 32);
-      vacc1 = psimd_qfma_f32(vacc1, vi7, vk7);
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 28);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
       i7 += 4;
 
-      const psimd_f32 vi8 = psimd_load_f32(i8);
-      const psimd_f32 vk8 = psimd_load_f32(w + 36);
-      vacc0 = psimd_qfma_f32(vacc0, vi8, vk8);
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 32);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123);
+
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
       i8 += 4;
 
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 36);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
+
       w += 40;
 
-      vacc0 = psimd_add_f32(vacc0, vacc1);
 
-      vacc0 = psimd_max_f32(vacc0, vmin);
-      vacc0 = psimd_min_f32(vacc0, vmax);
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
 
-      psimd_store_f32(output, vacc0);
+      psimd_store_f32(output, vacc0123);
       output += 4;
     }
     if XNN_UNLIKELY(c != 0) {
-      psimd_f32 vacc = psimd_load_f32(w);
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
 
-      const psimd_f32 vi0 = psimd_load_f32(i0);
-      const psimd_f32 vk0 = psimd_load_f32(w + 4);
-      vacc = psimd_qfma_f32(vacc, vi0, vk0);
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 4);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
 
-      const psimd_f32 vi1 = psimd_load_f32(i1);
-      const psimd_f32 vk1 = psimd_load_f32(w + 8);
-      vacc = psimd_qfma_f32(vacc, vi1, vk1);
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 8);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
 
-      const psimd_f32 vi2 = psimd_load_f32(i2);
-      const psimd_f32 vk2 = psimd_load_f32(w + 12);
-      vacc = psimd_qfma_f32(vacc, vi2, vk2);
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 12);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
 
-      const psimd_f32 vi3 = psimd_load_f32(i3);
-      const psimd_f32 vk3 = psimd_load_f32(w + 16);
-      vacc = psimd_qfma_f32(vacc, vi3, vk3);
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 16);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
 
-      const psimd_f32 vi4 = psimd_load_f32(i4);
-      const psimd_f32 vk4 = psimd_load_f32(w + 20);
-      vacc = psimd_qfma_f32(vacc, vi4, vk4);
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 20);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
 
-      const psimd_f32 vi5 = psimd_load_f32(i5);
-      const psimd_f32 vk5 = psimd_load_f32(w + 24);
-      vacc = psimd_qfma_f32(vacc, vi5, vk5);
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 24);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123);
 
-      const psimd_f32 vi6 = psimd_load_f32(i6);
-      const psimd_f32 vk6 = psimd_load_f32(w + 28);
-      vacc = psimd_qfma_f32(vacc, vi6, vk6);
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 28);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
 
-      const psimd_f32 vi7 = psimd_load_f32(i7);
-      const psimd_f32 vk7 = psimd_load_f32(w + 32);
-      vacc = psimd_qfma_f32(vacc, vi7, vk7);
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 32);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123);
 
-      const psimd_f32 vi8 = psimd_load_f32(i8);
-      const psimd_f32 vk8 = psimd_load_f32(w + 36);
-      vacc = psimd_qfma_f32(vacc, vi8, vk8);
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 36);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
 
-      w += 40;
 
-      vacc = psimd_max_f32(vacc, vmin);
-      vacc = psimd_min_f32(vacc, vmax);
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
 
       if (c & 2) {
-        psimd_store2_f32(output, vacc);
-        vacc = psimd_concat_hi_f32(vacc, vacc);
+        psimd_store2_f32(output, vacc0123);
+        vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
         output += 2;
       }
       if (c & 1) {
-        psimd_store1_f32(output, vacc);
+        psimd_store1_f32(output, vacc0123);
         output += 1;
       }
     }
diff --git a/src/f32-dwconv/up4x9-sse-acc2.c b/src/f32-dwconv/up4x9-sse-acc2.c
new file mode 100644
index 0000000..9b1d782
--- /dev/null
+++ b/src/f32-dwconv/up4x9-sse-acc2.c
@@ -0,0 +1,173 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xmmintrin.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up4x9__sse_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m128 vmax = _mm_load_ps(params->sse.max);
+  const __m128 vmin = _mm_load_ps(params->sse.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 4; c -= 4) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      i0 += 4;
+
+      const __m128 vk0x0123 = _mm_load_ps(w + 4);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      i1 += 4;
+
+      const __m128 vk1x0123 = _mm_load_ps(w + 8);
+      __m128 vacc0123p1 = _mm_mul_ps(vi1x0123, vk1x0123);
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      i2 += 4;
+
+      const __m128 vk2x0123 = _mm_load_ps(w + 12);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      i3 += 4;
+
+      const __m128 vk3x0123 = _mm_load_ps(w + 16);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
+      i4 += 4;
+
+      const __m128 vk4x0123 = _mm_load_ps(w + 20);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
+
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
+      i5 += 4;
+
+      const __m128 vk5x0123 = _mm_load_ps(w + 24);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi5x0123, vk5x0123));
+
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
+      i6 += 4;
+
+      const __m128 vk6x0123 = _mm_load_ps(w + 28);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
+
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
+      i7 += 4;
+
+      const __m128 vk7x0123 = _mm_load_ps(w + 32);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi7x0123, vk7x0123));
+
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
+      i8 += 4;
+
+      const __m128 vk8x0123 = _mm_load_ps(w + 36);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
+
+      w += 40;
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1);
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+      _mm_storeu_ps(output, vacc0123);
+      output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      const __m128 vk0x0123 = _mm_load_ps(w + 4);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      const __m128 vk1x0123 = _mm_load_ps(w + 8);
+      __m128 vacc0123p1 = _mm_mul_ps(vi1x0123, vk1x0123);
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      const __m128 vk2x0123 = _mm_load_ps(w + 12);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      const __m128 vk3x0123 = _mm_load_ps(w + 16);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
+      const __m128 vk4x0123 = _mm_load_ps(w + 20);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
+
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
+      const __m128 vk5x0123 = _mm_load_ps(w + 24);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi5x0123, vk5x0123));
+
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
+      const __m128 vk6x0123 = _mm_load_ps(w + 28);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
+
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
+      const __m128 vk7x0123 = _mm_load_ps(w + 32);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi7x0123, vk7x0123));
+
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
+      const __m128 vk8x0123 = _mm_load_ps(w + 36);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1);
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+      if (c & 2) {
+        _mm_storel_pi((__m64*) output, vacc0123);
+        vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        _mm_store_ss(output, vacc0123);
+        output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up4x9-sse.c b/src/f32-dwconv/up4x9-sse.c
index 6779891..1011f23 100644
--- a/src/f32-dwconv/up4x9-sse.c
+++ b/src/f32-dwconv/up4x9-sse.c
@@ -44,114 +44,122 @@
     size_t c = channels;
     const float* w = weights;
     for (; c >= 4; c -= 4) {
-      __m128 vacc0 = _mm_load_ps(w);
+      __m128 vacc0123p0 = _mm_load_ps(w);
 
-      const __m128 vi0 = _mm_loadu_ps(i0);
-      const __m128 vk0 = _mm_load_ps(w + 4);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi0, vk0));
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
       i0 += 4;
 
-      const __m128 vi1 = _mm_loadu_ps(i1);
-      const __m128 vk1 = _mm_load_ps(w + 8);
-      __m128 vacc1 = _mm_mul_ps(vi1, vk1);
+      const __m128 vk0x0123 = _mm_load_ps(w + 4);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
       i1 += 4;
 
-      const __m128 vi2 = _mm_loadu_ps(i2);
-      const __m128 vk2 = _mm_load_ps(w + 12);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi2, vk2));
+      const __m128 vk1x0123 = _mm_load_ps(w + 8);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
       i2 += 4;
 
-      const __m128 vi3 = _mm_loadu_ps(i3);
-      const __m128 vk3 = _mm_load_ps(w + 16);
-      vacc1 = _mm_add_ps(vacc1, _mm_mul_ps(vi3, vk3));
+      const __m128 vk2x0123 = _mm_load_ps(w + 12);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
       i3 += 4;
 
-      const __m128 vi4 = _mm_loadu_ps(i4);
-      const __m128 vk4 = _mm_load_ps(w + 20);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi4, vk4));
+      const __m128 vk3x0123 = _mm_load_ps(w + 16);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
       i4 += 4;
 
-      const __m128 vi5 = _mm_loadu_ps(i5);
-      const __m128 vk5 = _mm_load_ps(w + 24);
-      vacc1 = _mm_add_ps(vacc1, _mm_mul_ps(vi5, vk5));
+      const __m128 vk4x0123 = _mm_load_ps(w + 20);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
+
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
       i5 += 4;
 
-      const __m128 vi6 = _mm_loadu_ps(i6);
-      const __m128 vk6 = _mm_load_ps(w + 28);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi6, vk6));
+      const __m128 vk5x0123 = _mm_load_ps(w + 24);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
+
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
       i6 += 4;
 
-      const __m128 vi7 = _mm_loadu_ps(i7);
-      const __m128 vk7 = _mm_load_ps(w + 32);
-      vacc1 = _mm_add_ps(vacc1, _mm_mul_ps(vi7, vk7));
+      const __m128 vk6x0123 = _mm_load_ps(w + 28);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
+
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
       i7 += 4;
 
-      const __m128 vi8 = _mm_loadu_ps(i8);
-      const __m128 vk8 = _mm_load_ps(w + 36);
-      vacc0 = _mm_add_ps(vacc0, _mm_mul_ps(vi8, vk8));
+      const __m128 vk7x0123 = _mm_load_ps(w + 32);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
+
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
       i8 += 4;
 
+      const __m128 vk8x0123 = _mm_load_ps(w + 36);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
+
       w += 40;
 
-      vacc0 = _mm_add_ps(vacc0, vacc1);
 
-      vacc0 = _mm_max_ps(vacc0, vmin);
-      vacc0 = _mm_min_ps(vacc0, vmax);
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
 
-      _mm_storeu_ps(output, vacc0);
+      _mm_storeu_ps(output, vacc0123);
       output += 4;
     }
     if XNN_UNLIKELY(c != 0) {
-      __m128 vacc = _mm_load_ps(w);
+      __m128 vacc0123p0 = _mm_load_ps(w);
 
-      const __m128 vi0 = _mm_loadu_ps(i0);
-      const __m128 vk0 = _mm_load_ps(w + 4);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi0, vk0));
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      const __m128 vk0x0123 = _mm_load_ps(w + 4);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
 
-      const __m128 vi1 = _mm_loadu_ps(i1);
-      const __m128 vk1 = _mm_load_ps(w + 8);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi1, vk1));
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      const __m128 vk1x0123 = _mm_load_ps(w + 8);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
 
-      const __m128 vi2 = _mm_loadu_ps(i2);
-      const __m128 vk2 = _mm_load_ps(w + 12);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi2, vk2));
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      const __m128 vk2x0123 = _mm_load_ps(w + 12);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
 
-      const __m128 vi3 = _mm_loadu_ps(i3);
-      const __m128 vk3 = _mm_load_ps(w + 16);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi3, vk3));
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      const __m128 vk3x0123 = _mm_load_ps(w + 16);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
 
-      const __m128 vi4 = _mm_loadu_ps(i4);
-      const __m128 vk4 = _mm_load_ps(w + 20);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi4, vk4));
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
+      const __m128 vk4x0123 = _mm_load_ps(w + 20);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
 
-      const __m128 vi5 = _mm_loadu_ps(i5);
-      const __m128 vk5 = _mm_load_ps(w + 24);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi5, vk5));
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
+      const __m128 vk5x0123 = _mm_load_ps(w + 24);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
 
-      const __m128 vi6 = _mm_loadu_ps(i6);
-      const __m128 vk6 = _mm_load_ps(w + 28);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi6, vk6));
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
+      const __m128 vk6x0123 = _mm_load_ps(w + 28);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
 
-      const __m128 vi7 = _mm_loadu_ps(i7);
-      const __m128 vk7 = _mm_load_ps(w + 32);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi7, vk7));
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
+      const __m128 vk7x0123 = _mm_load_ps(w + 32);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
 
-      const __m128 vi8 = _mm_loadu_ps(i8);
-      const __m128 vk8 = _mm_load_ps(w + 36);
-      vacc = _mm_add_ps(vacc, _mm_mul_ps(vi8, vk8));
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
+      const __m128 vk8x0123 = _mm_load_ps(w + 36);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
 
-      w += 40;
 
-      vacc = _mm_max_ps(vacc, vmin);
-      vacc = _mm_min_ps(vacc, vmax);
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
 
       if (c & 2) {
-        _mm_storel_pi((__m64*) output, vacc);
-        vacc = _mm_movehl_ps(vacc, vacc);
+        _mm_storel_pi((__m64*) output, vacc0123);
+        vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
         output += 2;
       }
       if (c & 1) {
-        _mm_store_ss(output, vacc);
+        _mm_store_ss(output, vacc0123);
         output += 1;
       }
     }
diff --git a/src/f32-dwconv/up8x25-psimd-acc2.c b/src/f32-dwconv/up8x25-psimd-acc2.c
new file mode 100644
index 0000000..cb13273
--- /dev/null
+++ b/src/f32-dwconv/up8x25-psimd-acc2.c
@@ -0,0 +1,593 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-psimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up8x25__psimd_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const psimd_f32 vmax = psimd_load_splat_f32(&params->scalar.max);
+  const psimd_f32 vmin = psimd_load_splat_f32(&params->scalar.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    const float* i9 = input[9];
+    const float* i10 = input[10];
+    const float* i11 = input[11];
+    const float* i12 = input[12];
+    const float* i13 = input[13];
+    const float* i14 = input[14];
+    const float* i15 = input[15];
+    const float* i16 = input[16];
+    const float* i17 = input[17];
+    const float* i18 = input[18];
+    const float* i19 = input[19];
+    const float* i20 = input[20];
+    const float* i21 = input[21];
+    const float* i22 = input[22];
+    const float* i23 = input[23];
+    const float* i24 = input[24];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 8; c -= 8) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+      psimd_f32 vacc4567p0 = psimd_load_f32(w + 4);
+
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      const psimd_f32 vi0x4567 = psimd_load_f32(i0 + 4);
+      i0 += 8;
+
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
+      const psimd_f32 vk0x4567 = psimd_load_f32(w + 12);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi0x4567, vk0x4567);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      const psimd_f32 vi1x4567 = psimd_load_f32(i1 + 4);
+      i1 += 8;
+
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
+      const psimd_f32 vk1x4567 = psimd_load_f32(w + 20);
+      psimd_f32 vacc0123p1 = psimd_mul_f32(vi1x0123, vk1x0123);
+      psimd_f32 vacc4567p1 = psimd_mul_f32(vi1x4567, vk1x4567);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      const psimd_f32 vi2x4567 = psimd_load_f32(i2 + 4);
+      i2 += 8;
+
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
+      const psimd_f32 vk2x4567 = psimd_load_f32(w + 28);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi2x4567, vk2x4567);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      const psimd_f32 vi3x4567 = psimd_load_f32(i3 + 4);
+      i3 += 8;
+
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
+      const psimd_f32 vk3x4567 = psimd_load_f32(w + 36);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi3x0123, vk3x0123);
+      vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi3x4567, vk3x4567);
+
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
+      const psimd_f32 vi4x4567 = psimd_load_f32(i4 + 4);
+      i4 += 8;
+
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 40);
+      const psimd_f32 vk4x4567 = psimd_load_f32(w + 44);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi4x4567, vk4x4567);
+
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
+      const psimd_f32 vi5x4567 = psimd_load_f32(i5 + 4);
+      i5 += 8;
+
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 48);
+      const psimd_f32 vk5x4567 = psimd_load_f32(w + 52);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi5x0123, vk5x0123);
+      vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi5x4567, vk5x4567);
+
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
+      const psimd_f32 vi6x4567 = psimd_load_f32(i6 + 4);
+      i6 += 8;
+
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 56);
+      const psimd_f32 vk6x4567 = psimd_load_f32(w + 60);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi6x4567, vk6x4567);
+
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
+      const psimd_f32 vi7x4567 = psimd_load_f32(i7 + 4);
+      i7 += 8;
+
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 64);
+      const psimd_f32 vk7x4567 = psimd_load_f32(w + 68);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi7x0123, vk7x0123);
+      vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi7x4567, vk7x4567);
+
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
+      const psimd_f32 vi8x4567 = psimd_load_f32(i8 + 4);
+      i8 += 8;
+
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 72);
+      const psimd_f32 vk8x4567 = psimd_load_f32(w + 76);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi8x4567, vk8x4567);
+
+      const psimd_f32 vi9x0123 = psimd_load_f32(i9);
+      const psimd_f32 vi9x4567 = psimd_load_f32(i9 + 4);
+      i9 += 8;
+
+      const psimd_f32 vk9x0123 = psimd_load_f32(w + 80);
+      const psimd_f32 vk9x4567 = psimd_load_f32(w + 84);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi9x0123, vk9x0123);
+      vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi9x4567, vk9x4567);
+
+      const psimd_f32 vi10x0123 = psimd_load_f32(i10);
+      const psimd_f32 vi10x4567 = psimd_load_f32(i10 + 4);
+      i10 += 8;
+
+      const psimd_f32 vk10x0123 = psimd_load_f32(w + 88);
+      const psimd_f32 vk10x4567 = psimd_load_f32(w + 92);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi10x4567, vk10x4567);
+
+      const psimd_f32 vi11x0123 = psimd_load_f32(i11);
+      const psimd_f32 vi11x4567 = psimd_load_f32(i11 + 4);
+      i11 += 8;
+
+      const psimd_f32 vk11x0123 = psimd_load_f32(w + 96);
+      const psimd_f32 vk11x4567 = psimd_load_f32(w + 100);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi11x0123, vk11x0123);
+      vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi11x4567, vk11x4567);
+
+      const psimd_f32 vi12x0123 = psimd_load_f32(i12);
+      const psimd_f32 vi12x4567 = psimd_load_f32(i12 + 4);
+      i12 += 8;
+
+      const psimd_f32 vk12x0123 = psimd_load_f32(w + 104);
+      const psimd_f32 vk12x4567 = psimd_load_f32(w + 108);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi12x4567, vk12x4567);
+
+      const psimd_f32 vi13x0123 = psimd_load_f32(i13);
+      const psimd_f32 vi13x4567 = psimd_load_f32(i13 + 4);
+      i13 += 8;
+
+      const psimd_f32 vk13x0123 = psimd_load_f32(w + 112);
+      const psimd_f32 vk13x4567 = psimd_load_f32(w + 116);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi13x0123, vk13x0123);
+      vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi13x4567, vk13x4567);
+
+      const psimd_f32 vi14x0123 = psimd_load_f32(i14);
+      const psimd_f32 vi14x4567 = psimd_load_f32(i14 + 4);
+      i14 += 8;
+
+      const psimd_f32 vk14x0123 = psimd_load_f32(w + 120);
+      const psimd_f32 vk14x4567 = psimd_load_f32(w + 124);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi14x4567, vk14x4567);
+
+      const psimd_f32 vi15x0123 = psimd_load_f32(i15);
+      const psimd_f32 vi15x4567 = psimd_load_f32(i15 + 4);
+      i15 += 8;
+
+      const psimd_f32 vk15x0123 = psimd_load_f32(w + 128);
+      const psimd_f32 vk15x4567 = psimd_load_f32(w + 132);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi15x0123, vk15x0123);
+      vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi15x4567, vk15x4567);
+
+      const psimd_f32 vi16x0123 = psimd_load_f32(i16);
+      const psimd_f32 vi16x4567 = psimd_load_f32(i16 + 4);
+      i16 += 8;
+
+      const psimd_f32 vk16x0123 = psimd_load_f32(w + 136);
+      const psimd_f32 vk16x4567 = psimd_load_f32(w + 140);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi16x4567, vk16x4567);
+
+      const psimd_f32 vi17x0123 = psimd_load_f32(i17);
+      const psimd_f32 vi17x4567 = psimd_load_f32(i17 + 4);
+      i17 += 8;
+
+      const psimd_f32 vk17x0123 = psimd_load_f32(w + 144);
+      const psimd_f32 vk17x4567 = psimd_load_f32(w + 148);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi17x0123, vk17x0123);
+      vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi17x4567, vk17x4567);
+
+      const psimd_f32 vi18x0123 = psimd_load_f32(i18);
+      const psimd_f32 vi18x4567 = psimd_load_f32(i18 + 4);
+      i18 += 8;
+
+      const psimd_f32 vk18x0123 = psimd_load_f32(w + 152);
+      const psimd_f32 vk18x4567 = psimd_load_f32(w + 156);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi18x0123, vk18x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi18x4567, vk18x4567);
+
+      const psimd_f32 vi19x0123 = psimd_load_f32(i19);
+      const psimd_f32 vi19x4567 = psimd_load_f32(i19 + 4);
+      i19 += 8;
+
+      const psimd_f32 vk19x0123 = psimd_load_f32(w + 160);
+      const psimd_f32 vk19x4567 = psimd_load_f32(w + 164);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi19x0123, vk19x0123);
+      vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi19x4567, vk19x4567);
+
+      const psimd_f32 vi20x0123 = psimd_load_f32(i20);
+      const psimd_f32 vi20x4567 = psimd_load_f32(i20 + 4);
+      i20 += 8;
+
+      const psimd_f32 vk20x0123 = psimd_load_f32(w + 168);
+      const psimd_f32 vk20x4567 = psimd_load_f32(w + 172);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi20x0123, vk20x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi20x4567, vk20x4567);
+
+      const psimd_f32 vi21x0123 = psimd_load_f32(i21);
+      const psimd_f32 vi21x4567 = psimd_load_f32(i21 + 4);
+      i21 += 8;
+
+      const psimd_f32 vk21x0123 = psimd_load_f32(w + 176);
+      const psimd_f32 vk21x4567 = psimd_load_f32(w + 180);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi21x0123, vk21x0123);
+      vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi21x4567, vk21x4567);
+
+      const psimd_f32 vi22x0123 = psimd_load_f32(i22);
+      const psimd_f32 vi22x4567 = psimd_load_f32(i22 + 4);
+      i22 += 8;
+
+      const psimd_f32 vk22x0123 = psimd_load_f32(w + 184);
+      const psimd_f32 vk22x4567 = psimd_load_f32(w + 188);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi22x0123, vk22x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi22x4567, vk22x4567);
+
+      const psimd_f32 vi23x0123 = psimd_load_f32(i23);
+      const psimd_f32 vi23x4567 = psimd_load_f32(i23 + 4);
+      i23 += 8;
+
+      const psimd_f32 vk23x0123 = psimd_load_f32(w + 192);
+      const psimd_f32 vk23x4567 = psimd_load_f32(w + 196);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi23x0123, vk23x0123);
+      vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi23x4567, vk23x4567);
+
+      const psimd_f32 vi24x0123 = psimd_load_f32(i24);
+      const psimd_f32 vi24x4567 = psimd_load_f32(i24 + 4);
+      i24 += 8;
+
+      const psimd_f32 vk24x0123 = psimd_load_f32(w + 200);
+      const psimd_f32 vk24x4567 = psimd_load_f32(w + 204);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi24x0123, vk24x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi24x4567, vk24x4567);
+
+      w += 208;
+
+      // Add up all accumulators to vacc0123p0 and vacc4567p0
+      vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1);
+      vacc4567p0 = psimd_add_f32(vacc4567p0, vacc4567p1);
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      psimd_f32 vacc4567 = psimd_max_f32(vacc4567p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+      vacc4567 = psimd_min_f32(vacc4567, vmax);
+
+      psimd_store_f32(output, vacc0123);
+      psimd_store_f32(output + 4, vacc4567);
+      output += 8;
+    }
+    for (; c >= 4; c -= 4) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      i0 += 4;
+
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      i1 += 4;
+
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
+      psimd_f32 vacc0123p1 = psimd_mul_f32(vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      i2 += 4;
+
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      i3 += 4;
+
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
+      i4 += 4;
+
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 40);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
+      i5 += 4;
+
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 48);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi5x0123, vk5x0123);
+
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
+      i6 += 4;
+
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 56);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
+      i7 += 4;
+
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 64);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi7x0123, vk7x0123);
+
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
+      i8 += 4;
+
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 72);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      const psimd_f32 vi9x0123 = psimd_load_f32(i9);
+      i9 += 4;
+
+      const psimd_f32 vk9x0123 = psimd_load_f32(w + 80);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi9x0123, vk9x0123);
+
+      const psimd_f32 vi10x0123 = psimd_load_f32(i10);
+      i10 += 4;
+
+      const psimd_f32 vk10x0123 = psimd_load_f32(w + 88);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123);
+
+      const psimd_f32 vi11x0123 = psimd_load_f32(i11);
+      i11 += 4;
+
+      const psimd_f32 vk11x0123 = psimd_load_f32(w + 96);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi11x0123, vk11x0123);
+
+      const psimd_f32 vi12x0123 = psimd_load_f32(i12);
+      i12 += 4;
+
+      const psimd_f32 vk12x0123 = psimd_load_f32(w + 104);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123);
+
+      const psimd_f32 vi13x0123 = psimd_load_f32(i13);
+      i13 += 4;
+
+      const psimd_f32 vk13x0123 = psimd_load_f32(w + 112);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi13x0123, vk13x0123);
+
+      const psimd_f32 vi14x0123 = psimd_load_f32(i14);
+      i14 += 4;
+
+      const psimd_f32 vk14x0123 = psimd_load_f32(w + 120);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123);
+
+      const psimd_f32 vi15x0123 = psimd_load_f32(i15);
+      i15 += 4;
+
+      const psimd_f32 vk15x0123 = psimd_load_f32(w + 128);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi15x0123, vk15x0123);
+
+      const psimd_f32 vi16x0123 = psimd_load_f32(i16);
+      i16 += 4;
+
+      const psimd_f32 vk16x0123 = psimd_load_f32(w + 136);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123);
+
+      const psimd_f32 vi17x0123 = psimd_load_f32(i17);
+      i17 += 4;
+
+      const psimd_f32 vk17x0123 = psimd_load_f32(w + 144);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi17x0123, vk17x0123);
+
+      const psimd_f32 vi18x0123 = psimd_load_f32(i18);
+      i18 += 4;
+
+      const psimd_f32 vk18x0123 = psimd_load_f32(w + 152);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi18x0123, vk18x0123);
+
+      const psimd_f32 vi19x0123 = psimd_load_f32(i19);
+      i19 += 4;
+
+      const psimd_f32 vk19x0123 = psimd_load_f32(w + 160);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi19x0123, vk19x0123);
+
+      const psimd_f32 vi20x0123 = psimd_load_f32(i20);
+      i20 += 4;
+
+      const psimd_f32 vk20x0123 = psimd_load_f32(w + 168);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi20x0123, vk20x0123);
+
+      const psimd_f32 vi21x0123 = psimd_load_f32(i21);
+      i21 += 4;
+
+      const psimd_f32 vk21x0123 = psimd_load_f32(w + 176);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi21x0123, vk21x0123);
+
+      const psimd_f32 vi22x0123 = psimd_load_f32(i22);
+      i22 += 4;
+
+      const psimd_f32 vk22x0123 = psimd_load_f32(w + 184);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi22x0123, vk22x0123);
+
+      const psimd_f32 vi23x0123 = psimd_load_f32(i23);
+      i23 += 4;
+
+      const psimd_f32 vk23x0123 = psimd_load_f32(w + 192);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi23x0123, vk23x0123);
+
+      const psimd_f32 vi24x0123 = psimd_load_f32(i24);
+      i24 += 4;
+
+      const psimd_f32 vk24x0123 = psimd_load_f32(w + 200);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi24x0123, vk24x0123);
+
+      w += 4;
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1);
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+      psimd_store_f32(output, vacc0123);
+      output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
+      psimd_f32 vacc0123p1 = psimd_mul_f32(vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 40);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 48);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi5x0123, vk5x0123);
+
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 56);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 64);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi7x0123, vk7x0123);
+
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 72);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      const psimd_f32 vi9x0123 = psimd_load_f32(i9);
+      const psimd_f32 vk9x0123 = psimd_load_f32(w + 80);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi9x0123, vk9x0123);
+
+      const psimd_f32 vi10x0123 = psimd_load_f32(i10);
+      const psimd_f32 vk10x0123 = psimd_load_f32(w + 88);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123);
+
+      const psimd_f32 vi11x0123 = psimd_load_f32(i11);
+      const psimd_f32 vk11x0123 = psimd_load_f32(w + 96);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi11x0123, vk11x0123);
+
+      const psimd_f32 vi12x0123 = psimd_load_f32(i12);
+      const psimd_f32 vk12x0123 = psimd_load_f32(w + 104);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123);
+
+      const psimd_f32 vi13x0123 = psimd_load_f32(i13);
+      const psimd_f32 vk13x0123 = psimd_load_f32(w + 112);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi13x0123, vk13x0123);
+
+      const psimd_f32 vi14x0123 = psimd_load_f32(i14);
+      const psimd_f32 vk14x0123 = psimd_load_f32(w + 120);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123);
+
+      const psimd_f32 vi15x0123 = psimd_load_f32(i15);
+      const psimd_f32 vk15x0123 = psimd_load_f32(w + 128);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi15x0123, vk15x0123);
+
+      const psimd_f32 vi16x0123 = psimd_load_f32(i16);
+      const psimd_f32 vk16x0123 = psimd_load_f32(w + 136);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123);
+
+      const psimd_f32 vi17x0123 = psimd_load_f32(i17);
+      const psimd_f32 vk17x0123 = psimd_load_f32(w + 144);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi17x0123, vk17x0123);
+
+      const psimd_f32 vi18x0123 = psimd_load_f32(i18);
+      const psimd_f32 vk18x0123 = psimd_load_f32(w + 152);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi18x0123, vk18x0123);
+
+      const psimd_f32 vi19x0123 = psimd_load_f32(i19);
+      const psimd_f32 vk19x0123 = psimd_load_f32(w + 160);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi19x0123, vk19x0123);
+
+      const psimd_f32 vi20x0123 = psimd_load_f32(i20);
+      const psimd_f32 vk20x0123 = psimd_load_f32(w + 168);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi20x0123, vk20x0123);
+
+      const psimd_f32 vi21x0123 = psimd_load_f32(i21);
+      const psimd_f32 vk21x0123 = psimd_load_f32(w + 176);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi21x0123, vk21x0123);
+
+      const psimd_f32 vi22x0123 = psimd_load_f32(i22);
+      const psimd_f32 vk22x0123 = psimd_load_f32(w + 184);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi22x0123, vk22x0123);
+
+      const psimd_f32 vi23x0123 = psimd_load_f32(i23);
+      const psimd_f32 vk23x0123 = psimd_load_f32(w + 192);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi23x0123, vk23x0123);
+
+      const psimd_f32 vi24x0123 = psimd_load_f32(i24);
+      const psimd_f32 vk24x0123 = psimd_load_f32(w + 200);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi24x0123, vk24x0123);
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1);
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+      if (c & 2) {
+        psimd_store2_f32(output, vacc0123);
+        vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        psimd_store1_f32(output, vacc0123);
+        output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up8x25-psimd.c b/src/f32-dwconv/up8x25-psimd.c
new file mode 100644
index 0000000..20a60a6
--- /dev/null
+++ b/src/f32-dwconv/up8x25-psimd.c
@@ -0,0 +1,586 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-psimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up8x25__psimd(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const psimd_f32 vmax = psimd_load_splat_f32(&params->scalar.max);
+  const psimd_f32 vmin = psimd_load_splat_f32(&params->scalar.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    const float* i9 = input[9];
+    const float* i10 = input[10];
+    const float* i11 = input[11];
+    const float* i12 = input[12];
+    const float* i13 = input[13];
+    const float* i14 = input[14];
+    const float* i15 = input[15];
+    const float* i16 = input[16];
+    const float* i17 = input[17];
+    const float* i18 = input[18];
+    const float* i19 = input[19];
+    const float* i20 = input[20];
+    const float* i21 = input[21];
+    const float* i22 = input[22];
+    const float* i23 = input[23];
+    const float* i24 = input[24];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 8; c -= 8) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+      psimd_f32 vacc4567p0 = psimd_load_f32(w + 4);
+
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      const psimd_f32 vi0x4567 = psimd_load_f32(i0 + 4);
+      i0 += 8;
+
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
+      const psimd_f32 vk0x4567 = psimd_load_f32(w + 12);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi0x4567, vk0x4567);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      const psimd_f32 vi1x4567 = psimd_load_f32(i1 + 4);
+      i1 += 8;
+
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
+      const psimd_f32 vk1x4567 = psimd_load_f32(w + 20);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi1x4567, vk1x4567);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      const psimd_f32 vi2x4567 = psimd_load_f32(i2 + 4);
+      i2 += 8;
+
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
+      const psimd_f32 vk2x4567 = psimd_load_f32(w + 28);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi2x4567, vk2x4567);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      const psimd_f32 vi3x4567 = psimd_load_f32(i3 + 4);
+      i3 += 8;
+
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
+      const psimd_f32 vk3x4567 = psimd_load_f32(w + 36);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi3x4567, vk3x4567);
+
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
+      const psimd_f32 vi4x4567 = psimd_load_f32(i4 + 4);
+      i4 += 8;
+
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 40);
+      const psimd_f32 vk4x4567 = psimd_load_f32(w + 44);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi4x4567, vk4x4567);
+
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
+      const psimd_f32 vi5x4567 = psimd_load_f32(i5 + 4);
+      i5 += 8;
+
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 48);
+      const psimd_f32 vk5x4567 = psimd_load_f32(w + 52);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi5x4567, vk5x4567);
+
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
+      const psimd_f32 vi6x4567 = psimd_load_f32(i6 + 4);
+      i6 += 8;
+
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 56);
+      const psimd_f32 vk6x4567 = psimd_load_f32(w + 60);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi6x4567, vk6x4567);
+
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
+      const psimd_f32 vi7x4567 = psimd_load_f32(i7 + 4);
+      i7 += 8;
+
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 64);
+      const psimd_f32 vk7x4567 = psimd_load_f32(w + 68);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi7x4567, vk7x4567);
+
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
+      const psimd_f32 vi8x4567 = psimd_load_f32(i8 + 4);
+      i8 += 8;
+
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 72);
+      const psimd_f32 vk8x4567 = psimd_load_f32(w + 76);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi8x4567, vk8x4567);
+
+      const psimd_f32 vi9x0123 = psimd_load_f32(i9);
+      const psimd_f32 vi9x4567 = psimd_load_f32(i9 + 4);
+      i9 += 8;
+
+      const psimd_f32 vk9x0123 = psimd_load_f32(w + 80);
+      const psimd_f32 vk9x4567 = psimd_load_f32(w + 84);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi9x0123, vk9x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi9x4567, vk9x4567);
+
+      const psimd_f32 vi10x0123 = psimd_load_f32(i10);
+      const psimd_f32 vi10x4567 = psimd_load_f32(i10 + 4);
+      i10 += 8;
+
+      const psimd_f32 vk10x0123 = psimd_load_f32(w + 88);
+      const psimd_f32 vk10x4567 = psimd_load_f32(w + 92);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi10x4567, vk10x4567);
+
+      const psimd_f32 vi11x0123 = psimd_load_f32(i11);
+      const psimd_f32 vi11x4567 = psimd_load_f32(i11 + 4);
+      i11 += 8;
+
+      const psimd_f32 vk11x0123 = psimd_load_f32(w + 96);
+      const psimd_f32 vk11x4567 = psimd_load_f32(w + 100);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi11x0123, vk11x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi11x4567, vk11x4567);
+
+      const psimd_f32 vi12x0123 = psimd_load_f32(i12);
+      const psimd_f32 vi12x4567 = psimd_load_f32(i12 + 4);
+      i12 += 8;
+
+      const psimd_f32 vk12x0123 = psimd_load_f32(w + 104);
+      const psimd_f32 vk12x4567 = psimd_load_f32(w + 108);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi12x4567, vk12x4567);
+
+      const psimd_f32 vi13x0123 = psimd_load_f32(i13);
+      const psimd_f32 vi13x4567 = psimd_load_f32(i13 + 4);
+      i13 += 8;
+
+      const psimd_f32 vk13x0123 = psimd_load_f32(w + 112);
+      const psimd_f32 vk13x4567 = psimd_load_f32(w + 116);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi13x0123, vk13x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi13x4567, vk13x4567);
+
+      const psimd_f32 vi14x0123 = psimd_load_f32(i14);
+      const psimd_f32 vi14x4567 = psimd_load_f32(i14 + 4);
+      i14 += 8;
+
+      const psimd_f32 vk14x0123 = psimd_load_f32(w + 120);
+      const psimd_f32 vk14x4567 = psimd_load_f32(w + 124);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi14x4567, vk14x4567);
+
+      const psimd_f32 vi15x0123 = psimd_load_f32(i15);
+      const psimd_f32 vi15x4567 = psimd_load_f32(i15 + 4);
+      i15 += 8;
+
+      const psimd_f32 vk15x0123 = psimd_load_f32(w + 128);
+      const psimd_f32 vk15x4567 = psimd_load_f32(w + 132);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi15x0123, vk15x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi15x4567, vk15x4567);
+
+      const psimd_f32 vi16x0123 = psimd_load_f32(i16);
+      const psimd_f32 vi16x4567 = psimd_load_f32(i16 + 4);
+      i16 += 8;
+
+      const psimd_f32 vk16x0123 = psimd_load_f32(w + 136);
+      const psimd_f32 vk16x4567 = psimd_load_f32(w + 140);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi16x4567, vk16x4567);
+
+      const psimd_f32 vi17x0123 = psimd_load_f32(i17);
+      const psimd_f32 vi17x4567 = psimd_load_f32(i17 + 4);
+      i17 += 8;
+
+      const psimd_f32 vk17x0123 = psimd_load_f32(w + 144);
+      const psimd_f32 vk17x4567 = psimd_load_f32(w + 148);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi17x0123, vk17x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi17x4567, vk17x4567);
+
+      const psimd_f32 vi18x0123 = psimd_load_f32(i18);
+      const psimd_f32 vi18x4567 = psimd_load_f32(i18 + 4);
+      i18 += 8;
+
+      const psimd_f32 vk18x0123 = psimd_load_f32(w + 152);
+      const psimd_f32 vk18x4567 = psimd_load_f32(w + 156);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi18x0123, vk18x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi18x4567, vk18x4567);
+
+      const psimd_f32 vi19x0123 = psimd_load_f32(i19);
+      const psimd_f32 vi19x4567 = psimd_load_f32(i19 + 4);
+      i19 += 8;
+
+      const psimd_f32 vk19x0123 = psimd_load_f32(w + 160);
+      const psimd_f32 vk19x4567 = psimd_load_f32(w + 164);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi19x0123, vk19x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi19x4567, vk19x4567);
+
+      const psimd_f32 vi20x0123 = psimd_load_f32(i20);
+      const psimd_f32 vi20x4567 = psimd_load_f32(i20 + 4);
+      i20 += 8;
+
+      const psimd_f32 vk20x0123 = psimd_load_f32(w + 168);
+      const psimd_f32 vk20x4567 = psimd_load_f32(w + 172);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi20x0123, vk20x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi20x4567, vk20x4567);
+
+      const psimd_f32 vi21x0123 = psimd_load_f32(i21);
+      const psimd_f32 vi21x4567 = psimd_load_f32(i21 + 4);
+      i21 += 8;
+
+      const psimd_f32 vk21x0123 = psimd_load_f32(w + 176);
+      const psimd_f32 vk21x4567 = psimd_load_f32(w + 180);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi21x0123, vk21x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi21x4567, vk21x4567);
+
+      const psimd_f32 vi22x0123 = psimd_load_f32(i22);
+      const psimd_f32 vi22x4567 = psimd_load_f32(i22 + 4);
+      i22 += 8;
+
+      const psimd_f32 vk22x0123 = psimd_load_f32(w + 184);
+      const psimd_f32 vk22x4567 = psimd_load_f32(w + 188);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi22x0123, vk22x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi22x4567, vk22x4567);
+
+      const psimd_f32 vi23x0123 = psimd_load_f32(i23);
+      const psimd_f32 vi23x4567 = psimd_load_f32(i23 + 4);
+      i23 += 8;
+
+      const psimd_f32 vk23x0123 = psimd_load_f32(w + 192);
+      const psimd_f32 vk23x4567 = psimd_load_f32(w + 196);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi23x0123, vk23x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi23x4567, vk23x4567);
+
+      const psimd_f32 vi24x0123 = psimd_load_f32(i24);
+      const psimd_f32 vi24x4567 = psimd_load_f32(i24 + 4);
+      i24 += 8;
+
+      const psimd_f32 vk24x0123 = psimd_load_f32(w + 200);
+      const psimd_f32 vk24x4567 = psimd_load_f32(w + 204);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi24x0123, vk24x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi24x4567, vk24x4567);
+
+      w += 208;
+
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      psimd_f32 vacc4567 = psimd_max_f32(vacc4567p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+      vacc4567 = psimd_min_f32(vacc4567, vmax);
+
+      psimd_store_f32(output, vacc0123);
+      psimd_store_f32(output + 4, vacc4567);
+      output += 8;
+    }
+    for (; c >= 4; c -= 4) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      i0 += 4;
+
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      i1 += 4;
+
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      i2 += 4;
+
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      i3 += 4;
+
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
+
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
+      i4 += 4;
+
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 40);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
+      i5 += 4;
+
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 48);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123);
+
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
+      i6 += 4;
+
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 56);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
+      i7 += 4;
+
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 64);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123);
+
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
+      i8 += 4;
+
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 72);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      const psimd_f32 vi9x0123 = psimd_load_f32(i9);
+      i9 += 4;
+
+      const psimd_f32 vk9x0123 = psimd_load_f32(w + 80);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi9x0123, vk9x0123);
+
+      const psimd_f32 vi10x0123 = psimd_load_f32(i10);
+      i10 += 4;
+
+      const psimd_f32 vk10x0123 = psimd_load_f32(w + 88);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123);
+
+      const psimd_f32 vi11x0123 = psimd_load_f32(i11);
+      i11 += 4;
+
+      const psimd_f32 vk11x0123 = psimd_load_f32(w + 96);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi11x0123, vk11x0123);
+
+      const psimd_f32 vi12x0123 = psimd_load_f32(i12);
+      i12 += 4;
+
+      const psimd_f32 vk12x0123 = psimd_load_f32(w + 104);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123);
+
+      const psimd_f32 vi13x0123 = psimd_load_f32(i13);
+      i13 += 4;
+
+      const psimd_f32 vk13x0123 = psimd_load_f32(w + 112);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi13x0123, vk13x0123);
+
+      const psimd_f32 vi14x0123 = psimd_load_f32(i14);
+      i14 += 4;
+
+      const psimd_f32 vk14x0123 = psimd_load_f32(w + 120);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123);
+
+      const psimd_f32 vi15x0123 = psimd_load_f32(i15);
+      i15 += 4;
+
+      const psimd_f32 vk15x0123 = psimd_load_f32(w + 128);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi15x0123, vk15x0123);
+
+      const psimd_f32 vi16x0123 = psimd_load_f32(i16);
+      i16 += 4;
+
+      const psimd_f32 vk16x0123 = psimd_load_f32(w + 136);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123);
+
+      const psimd_f32 vi17x0123 = psimd_load_f32(i17);
+      i17 += 4;
+
+      const psimd_f32 vk17x0123 = psimd_load_f32(w + 144);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi17x0123, vk17x0123);
+
+      const psimd_f32 vi18x0123 = psimd_load_f32(i18);
+      i18 += 4;
+
+      const psimd_f32 vk18x0123 = psimd_load_f32(w + 152);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi18x0123, vk18x0123);
+
+      const psimd_f32 vi19x0123 = psimd_load_f32(i19);
+      i19 += 4;
+
+      const psimd_f32 vk19x0123 = psimd_load_f32(w + 160);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi19x0123, vk19x0123);
+
+      const psimd_f32 vi20x0123 = psimd_load_f32(i20);
+      i20 += 4;
+
+      const psimd_f32 vk20x0123 = psimd_load_f32(w + 168);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi20x0123, vk20x0123);
+
+      const psimd_f32 vi21x0123 = psimd_load_f32(i21);
+      i21 += 4;
+
+      const psimd_f32 vk21x0123 = psimd_load_f32(w + 176);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi21x0123, vk21x0123);
+
+      const psimd_f32 vi22x0123 = psimd_load_f32(i22);
+      i22 += 4;
+
+      const psimd_f32 vk22x0123 = psimd_load_f32(w + 184);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi22x0123, vk22x0123);
+
+      const psimd_f32 vi23x0123 = psimd_load_f32(i23);
+      i23 += 4;
+
+      const psimd_f32 vk23x0123 = psimd_load_f32(w + 192);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi23x0123, vk23x0123);
+
+      const psimd_f32 vi24x0123 = psimd_load_f32(i24);
+      i24 += 4;
+
+      const psimd_f32 vk24x0123 = psimd_load_f32(w + 200);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi24x0123, vk24x0123);
+
+      w += 4;
+
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+      psimd_store_f32(output, vacc0123);
+      output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
+
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 40);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 48);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123);
+
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 56);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 64);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123);
+
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 72);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      const psimd_f32 vi9x0123 = psimd_load_f32(i9);
+      const psimd_f32 vk9x0123 = psimd_load_f32(w + 80);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi9x0123, vk9x0123);
+
+      const psimd_f32 vi10x0123 = psimd_load_f32(i10);
+      const psimd_f32 vk10x0123 = psimd_load_f32(w + 88);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi10x0123, vk10x0123);
+
+      const psimd_f32 vi11x0123 = psimd_load_f32(i11);
+      const psimd_f32 vk11x0123 = psimd_load_f32(w + 96);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi11x0123, vk11x0123);
+
+      const psimd_f32 vi12x0123 = psimd_load_f32(i12);
+      const psimd_f32 vk12x0123 = psimd_load_f32(w + 104);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi12x0123, vk12x0123);
+
+      const psimd_f32 vi13x0123 = psimd_load_f32(i13);
+      const psimd_f32 vk13x0123 = psimd_load_f32(w + 112);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi13x0123, vk13x0123);
+
+      const psimd_f32 vi14x0123 = psimd_load_f32(i14);
+      const psimd_f32 vk14x0123 = psimd_load_f32(w + 120);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi14x0123, vk14x0123);
+
+      const psimd_f32 vi15x0123 = psimd_load_f32(i15);
+      const psimd_f32 vk15x0123 = psimd_load_f32(w + 128);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi15x0123, vk15x0123);
+
+      const psimd_f32 vi16x0123 = psimd_load_f32(i16);
+      const psimd_f32 vk16x0123 = psimd_load_f32(w + 136);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi16x0123, vk16x0123);
+
+      const psimd_f32 vi17x0123 = psimd_load_f32(i17);
+      const psimd_f32 vk17x0123 = psimd_load_f32(w + 144);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi17x0123, vk17x0123);
+
+      const psimd_f32 vi18x0123 = psimd_load_f32(i18);
+      const psimd_f32 vk18x0123 = psimd_load_f32(w + 152);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi18x0123, vk18x0123);
+
+      const psimd_f32 vi19x0123 = psimd_load_f32(i19);
+      const psimd_f32 vk19x0123 = psimd_load_f32(w + 160);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi19x0123, vk19x0123);
+
+      const psimd_f32 vi20x0123 = psimd_load_f32(i20);
+      const psimd_f32 vk20x0123 = psimd_load_f32(w + 168);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi20x0123, vk20x0123);
+
+      const psimd_f32 vi21x0123 = psimd_load_f32(i21);
+      const psimd_f32 vk21x0123 = psimd_load_f32(w + 176);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi21x0123, vk21x0123);
+
+      const psimd_f32 vi22x0123 = psimd_load_f32(i22);
+      const psimd_f32 vk22x0123 = psimd_load_f32(w + 184);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi22x0123, vk22x0123);
+
+      const psimd_f32 vi23x0123 = psimd_load_f32(i23);
+      const psimd_f32 vk23x0123 = psimd_load_f32(w + 192);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi23x0123, vk23x0123);
+
+      const psimd_f32 vi24x0123 = psimd_load_f32(i24);
+      const psimd_f32 vk24x0123 = psimd_load_f32(w + 200);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi24x0123, vk24x0123);
+
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+      if (c & 2) {
+        psimd_store2_f32(output, vacc0123);
+        vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        psimd_store1_f32(output, vacc0123);
+        output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up8x25-sse-acc2.c b/src/f32-dwconv/up8x25-sse-acc2.c
new file mode 100644
index 0000000..3181ecc
--- /dev/null
+++ b/src/f32-dwconv/up8x25-sse-acc2.c
@@ -0,0 +1,593 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xmmintrin.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up8x25__sse_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m128 vmax = _mm_load_ps(params->sse.max);
+  const __m128 vmin = _mm_load_ps(params->sse.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    const float* i9 = input[9];
+    const float* i10 = input[10];
+    const float* i11 = input[11];
+    const float* i12 = input[12];
+    const float* i13 = input[13];
+    const float* i14 = input[14];
+    const float* i15 = input[15];
+    const float* i16 = input[16];
+    const float* i17 = input[17];
+    const float* i18 = input[18];
+    const float* i19 = input[19];
+    const float* i20 = input[20];
+    const float* i21 = input[21];
+    const float* i22 = input[22];
+    const float* i23 = input[23];
+    const float* i24 = input[24];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 8; c -= 8) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+      __m128 vacc4567p0 = _mm_load_ps(w + 4);
+
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
+      i0 += 8;
+
+      const __m128 vk0x0123 = _mm_load_ps(w + 8);
+      const __m128 vk0x4567 = _mm_load_ps(w + 12);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
+      i1 += 8;
+
+      const __m128 vk1x0123 = _mm_load_ps(w + 16);
+      const __m128 vk1x4567 = _mm_load_ps(w + 20);
+      __m128 vacc0123p1 = _mm_mul_ps(vi1x0123, vk1x0123);
+      __m128 vacc4567p1 = _mm_mul_ps(vi1x4567, vk1x4567);
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
+      i2 += 8;
+
+      const __m128 vk2x0123 = _mm_load_ps(w + 24);
+      const __m128 vk2x4567 = _mm_load_ps(w + 28);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
+      i3 += 8;
+
+      const __m128 vk3x0123 = _mm_load_ps(w + 32);
+      const __m128 vk3x4567 = _mm_load_ps(w + 36);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi3x0123, vk3x0123));
+      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi3x4567, vk3x4567));
+
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
+      const __m128 vi4x4567 = _mm_loadu_ps(i4 + 4);
+      i4 += 8;
+
+      const __m128 vk4x0123 = _mm_load_ps(w + 40);
+      const __m128 vk4x4567 = _mm_load_ps(w + 44);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi4x4567, vk4x4567));
+
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
+      const __m128 vi5x4567 = _mm_loadu_ps(i5 + 4);
+      i5 += 8;
+
+      const __m128 vk5x0123 = _mm_load_ps(w + 48);
+      const __m128 vk5x4567 = _mm_load_ps(w + 52);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi5x0123, vk5x0123));
+      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi5x4567, vk5x4567));
+
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
+      const __m128 vi6x4567 = _mm_loadu_ps(i6 + 4);
+      i6 += 8;
+
+      const __m128 vk6x0123 = _mm_load_ps(w + 56);
+      const __m128 vk6x4567 = _mm_load_ps(w + 60);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi6x4567, vk6x4567));
+
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
+      const __m128 vi7x4567 = _mm_loadu_ps(i7 + 4);
+      i7 += 8;
+
+      const __m128 vk7x0123 = _mm_load_ps(w + 64);
+      const __m128 vk7x4567 = _mm_load_ps(w + 68);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi7x0123, vk7x0123));
+      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi7x4567, vk7x4567));
+
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
+      const __m128 vi8x4567 = _mm_loadu_ps(i8 + 4);
+      i8 += 8;
+
+      const __m128 vk8x0123 = _mm_load_ps(w + 72);
+      const __m128 vk8x4567 = _mm_load_ps(w + 76);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi8x4567, vk8x4567));
+
+      const __m128 vi9x0123 = _mm_loadu_ps(i9);
+      const __m128 vi9x4567 = _mm_loadu_ps(i9 + 4);
+      i9 += 8;
+
+      const __m128 vk9x0123 = _mm_load_ps(w + 80);
+      const __m128 vk9x4567 = _mm_load_ps(w + 84);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi9x0123, vk9x0123));
+      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi9x4567, vk9x4567));
+
+      const __m128 vi10x0123 = _mm_loadu_ps(i10);
+      const __m128 vi10x4567 = _mm_loadu_ps(i10 + 4);
+      i10 += 8;
+
+      const __m128 vk10x0123 = _mm_load_ps(w + 88);
+      const __m128 vk10x4567 = _mm_load_ps(w + 92);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi10x4567, vk10x4567));
+
+      const __m128 vi11x0123 = _mm_loadu_ps(i11);
+      const __m128 vi11x4567 = _mm_loadu_ps(i11 + 4);
+      i11 += 8;
+
+      const __m128 vk11x0123 = _mm_load_ps(w + 96);
+      const __m128 vk11x4567 = _mm_load_ps(w + 100);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi11x0123, vk11x0123));
+      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi11x4567, vk11x4567));
+
+      const __m128 vi12x0123 = _mm_loadu_ps(i12);
+      const __m128 vi12x4567 = _mm_loadu_ps(i12 + 4);
+      i12 += 8;
+
+      const __m128 vk12x0123 = _mm_load_ps(w + 104);
+      const __m128 vk12x4567 = _mm_load_ps(w + 108);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi12x4567, vk12x4567));
+
+      const __m128 vi13x0123 = _mm_loadu_ps(i13);
+      const __m128 vi13x4567 = _mm_loadu_ps(i13 + 4);
+      i13 += 8;
+
+      const __m128 vk13x0123 = _mm_load_ps(w + 112);
+      const __m128 vk13x4567 = _mm_load_ps(w + 116);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi13x0123, vk13x0123));
+      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi13x4567, vk13x4567));
+
+      const __m128 vi14x0123 = _mm_loadu_ps(i14);
+      const __m128 vi14x4567 = _mm_loadu_ps(i14 + 4);
+      i14 += 8;
+
+      const __m128 vk14x0123 = _mm_load_ps(w + 120);
+      const __m128 vk14x4567 = _mm_load_ps(w + 124);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi14x4567, vk14x4567));
+
+      const __m128 vi15x0123 = _mm_loadu_ps(i15);
+      const __m128 vi15x4567 = _mm_loadu_ps(i15 + 4);
+      i15 += 8;
+
+      const __m128 vk15x0123 = _mm_load_ps(w + 128);
+      const __m128 vk15x4567 = _mm_load_ps(w + 132);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi15x0123, vk15x0123));
+      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi15x4567, vk15x4567));
+
+      const __m128 vi16x0123 = _mm_loadu_ps(i16);
+      const __m128 vi16x4567 = _mm_loadu_ps(i16 + 4);
+      i16 += 8;
+
+      const __m128 vk16x0123 = _mm_load_ps(w + 136);
+      const __m128 vk16x4567 = _mm_load_ps(w + 140);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi16x4567, vk16x4567));
+
+      const __m128 vi17x0123 = _mm_loadu_ps(i17);
+      const __m128 vi17x4567 = _mm_loadu_ps(i17 + 4);
+      i17 += 8;
+
+      const __m128 vk17x0123 = _mm_load_ps(w + 144);
+      const __m128 vk17x4567 = _mm_load_ps(w + 148);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi17x0123, vk17x0123));
+      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi17x4567, vk17x4567));
+
+      const __m128 vi18x0123 = _mm_loadu_ps(i18);
+      const __m128 vi18x4567 = _mm_loadu_ps(i18 + 4);
+      i18 += 8;
+
+      const __m128 vk18x0123 = _mm_load_ps(w + 152);
+      const __m128 vk18x4567 = _mm_load_ps(w + 156);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi18x4567, vk18x4567));
+
+      const __m128 vi19x0123 = _mm_loadu_ps(i19);
+      const __m128 vi19x4567 = _mm_loadu_ps(i19 + 4);
+      i19 += 8;
+
+      const __m128 vk19x0123 = _mm_load_ps(w + 160);
+      const __m128 vk19x4567 = _mm_load_ps(w + 164);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi19x0123, vk19x0123));
+      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi19x4567, vk19x4567));
+
+      const __m128 vi20x0123 = _mm_loadu_ps(i20);
+      const __m128 vi20x4567 = _mm_loadu_ps(i20 + 4);
+      i20 += 8;
+
+      const __m128 vk20x0123 = _mm_load_ps(w + 168);
+      const __m128 vk20x4567 = _mm_load_ps(w + 172);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi20x4567, vk20x4567));
+
+      const __m128 vi21x0123 = _mm_loadu_ps(i21);
+      const __m128 vi21x4567 = _mm_loadu_ps(i21 + 4);
+      i21 += 8;
+
+      const __m128 vk21x0123 = _mm_load_ps(w + 176);
+      const __m128 vk21x4567 = _mm_load_ps(w + 180);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi21x0123, vk21x0123));
+      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi21x4567, vk21x4567));
+
+      const __m128 vi22x0123 = _mm_loadu_ps(i22);
+      const __m128 vi22x4567 = _mm_loadu_ps(i22 + 4);
+      i22 += 8;
+
+      const __m128 vk22x0123 = _mm_load_ps(w + 184);
+      const __m128 vk22x4567 = _mm_load_ps(w + 188);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi22x4567, vk22x4567));
+
+      const __m128 vi23x0123 = _mm_loadu_ps(i23);
+      const __m128 vi23x4567 = _mm_loadu_ps(i23 + 4);
+      i23 += 8;
+
+      const __m128 vk23x0123 = _mm_load_ps(w + 192);
+      const __m128 vk23x4567 = _mm_load_ps(w + 196);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi23x0123, vk23x0123));
+      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi23x4567, vk23x4567));
+
+      const __m128 vi24x0123 = _mm_loadu_ps(i24);
+      const __m128 vi24x4567 = _mm_loadu_ps(i24 + 4);
+      i24 += 8;
+
+      const __m128 vk24x0123 = _mm_load_ps(w + 200);
+      const __m128 vk24x4567 = _mm_load_ps(w + 204);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi24x4567, vk24x4567));
+
+      w += 208;
+
+      // Add up all accumulators to vacc0123p0 and vacc4567p0
+      vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1);
+      vacc4567p0 = _mm_add_ps(vacc4567p0, vacc4567p1);
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+      vacc4567 = _mm_min_ps(vacc4567, vmax);
+
+      _mm_storeu_ps(output, vacc0123);
+      _mm_storeu_ps(output + 4, vacc4567);
+      output += 8;
+    }
+    for (; c >= 4; c -= 4) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      i0 += 4;
+
+      const __m128 vk0x0123 = _mm_load_ps(w + 8);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      i1 += 4;
+
+      const __m128 vk1x0123 = _mm_load_ps(w + 16);
+      __m128 vacc0123p1 = _mm_mul_ps(vi1x0123, vk1x0123);
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      i2 += 4;
+
+      const __m128 vk2x0123 = _mm_load_ps(w + 24);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      i3 += 4;
+
+      const __m128 vk3x0123 = _mm_load_ps(w + 32);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
+      i4 += 4;
+
+      const __m128 vk4x0123 = _mm_load_ps(w + 40);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
+
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
+      i5 += 4;
+
+      const __m128 vk5x0123 = _mm_load_ps(w + 48);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi5x0123, vk5x0123));
+
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
+      i6 += 4;
+
+      const __m128 vk6x0123 = _mm_load_ps(w + 56);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
+
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
+      i7 += 4;
+
+      const __m128 vk7x0123 = _mm_load_ps(w + 64);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi7x0123, vk7x0123));
+
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
+      i8 += 4;
+
+      const __m128 vk8x0123 = _mm_load_ps(w + 72);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
+
+      const __m128 vi9x0123 = _mm_loadu_ps(i9);
+      i9 += 4;
+
+      const __m128 vk9x0123 = _mm_load_ps(w + 80);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi9x0123, vk9x0123));
+
+      const __m128 vi10x0123 = _mm_loadu_ps(i10);
+      i10 += 4;
+
+      const __m128 vk10x0123 = _mm_load_ps(w + 88);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
+
+      const __m128 vi11x0123 = _mm_loadu_ps(i11);
+      i11 += 4;
+
+      const __m128 vk11x0123 = _mm_load_ps(w + 96);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi11x0123, vk11x0123));
+
+      const __m128 vi12x0123 = _mm_loadu_ps(i12);
+      i12 += 4;
+
+      const __m128 vk12x0123 = _mm_load_ps(w + 104);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
+
+      const __m128 vi13x0123 = _mm_loadu_ps(i13);
+      i13 += 4;
+
+      const __m128 vk13x0123 = _mm_load_ps(w + 112);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi13x0123, vk13x0123));
+
+      const __m128 vi14x0123 = _mm_loadu_ps(i14);
+      i14 += 4;
+
+      const __m128 vk14x0123 = _mm_load_ps(w + 120);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
+
+      const __m128 vi15x0123 = _mm_loadu_ps(i15);
+      i15 += 4;
+
+      const __m128 vk15x0123 = _mm_load_ps(w + 128);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi15x0123, vk15x0123));
+
+      const __m128 vi16x0123 = _mm_loadu_ps(i16);
+      i16 += 4;
+
+      const __m128 vk16x0123 = _mm_load_ps(w + 136);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
+
+      const __m128 vi17x0123 = _mm_loadu_ps(i17);
+      i17 += 4;
+
+      const __m128 vk17x0123 = _mm_load_ps(w + 144);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi17x0123, vk17x0123));
+
+      const __m128 vi18x0123 = _mm_loadu_ps(i18);
+      i18 += 4;
+
+      const __m128 vk18x0123 = _mm_load_ps(w + 152);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
+
+      const __m128 vi19x0123 = _mm_loadu_ps(i19);
+      i19 += 4;
+
+      const __m128 vk19x0123 = _mm_load_ps(w + 160);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi19x0123, vk19x0123));
+
+      const __m128 vi20x0123 = _mm_loadu_ps(i20);
+      i20 += 4;
+
+      const __m128 vk20x0123 = _mm_load_ps(w + 168);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
+
+      const __m128 vi21x0123 = _mm_loadu_ps(i21);
+      i21 += 4;
+
+      const __m128 vk21x0123 = _mm_load_ps(w + 176);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi21x0123, vk21x0123));
+
+      const __m128 vi22x0123 = _mm_loadu_ps(i22);
+      i22 += 4;
+
+      const __m128 vk22x0123 = _mm_load_ps(w + 184);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
+
+      const __m128 vi23x0123 = _mm_loadu_ps(i23);
+      i23 += 4;
+
+      const __m128 vk23x0123 = _mm_load_ps(w + 192);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi23x0123, vk23x0123));
+
+      const __m128 vi24x0123 = _mm_loadu_ps(i24);
+      i24 += 4;
+
+      const __m128 vk24x0123 = _mm_load_ps(w + 200);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
+
+      w += 4;
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1);
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+      _mm_storeu_ps(output, vacc0123);
+      output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      const __m128 vk0x0123 = _mm_load_ps(w + 8);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      const __m128 vk1x0123 = _mm_load_ps(w + 16);
+      __m128 vacc0123p1 = _mm_mul_ps(vi1x0123, vk1x0123);
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      const __m128 vk2x0123 = _mm_load_ps(w + 24);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      const __m128 vk3x0123 = _mm_load_ps(w + 32);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
+      const __m128 vk4x0123 = _mm_load_ps(w + 40);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
+
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
+      const __m128 vk5x0123 = _mm_load_ps(w + 48);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi5x0123, vk5x0123));
+
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
+      const __m128 vk6x0123 = _mm_load_ps(w + 56);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
+
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
+      const __m128 vk7x0123 = _mm_load_ps(w + 64);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi7x0123, vk7x0123));
+
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
+      const __m128 vk8x0123 = _mm_load_ps(w + 72);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
+
+      const __m128 vi9x0123 = _mm_loadu_ps(i9);
+      const __m128 vk9x0123 = _mm_load_ps(w + 80);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi9x0123, vk9x0123));
+
+      const __m128 vi10x0123 = _mm_loadu_ps(i10);
+      const __m128 vk10x0123 = _mm_load_ps(w + 88);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
+
+      const __m128 vi11x0123 = _mm_loadu_ps(i11);
+      const __m128 vk11x0123 = _mm_load_ps(w + 96);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi11x0123, vk11x0123));
+
+      const __m128 vi12x0123 = _mm_loadu_ps(i12);
+      const __m128 vk12x0123 = _mm_load_ps(w + 104);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
+
+      const __m128 vi13x0123 = _mm_loadu_ps(i13);
+      const __m128 vk13x0123 = _mm_load_ps(w + 112);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi13x0123, vk13x0123));
+
+      const __m128 vi14x0123 = _mm_loadu_ps(i14);
+      const __m128 vk14x0123 = _mm_load_ps(w + 120);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
+
+      const __m128 vi15x0123 = _mm_loadu_ps(i15);
+      const __m128 vk15x0123 = _mm_load_ps(w + 128);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi15x0123, vk15x0123));
+
+      const __m128 vi16x0123 = _mm_loadu_ps(i16);
+      const __m128 vk16x0123 = _mm_load_ps(w + 136);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
+
+      const __m128 vi17x0123 = _mm_loadu_ps(i17);
+      const __m128 vk17x0123 = _mm_load_ps(w + 144);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi17x0123, vk17x0123));
+
+      const __m128 vi18x0123 = _mm_loadu_ps(i18);
+      const __m128 vk18x0123 = _mm_load_ps(w + 152);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
+
+      const __m128 vi19x0123 = _mm_loadu_ps(i19);
+      const __m128 vk19x0123 = _mm_load_ps(w + 160);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi19x0123, vk19x0123));
+
+      const __m128 vi20x0123 = _mm_loadu_ps(i20);
+      const __m128 vk20x0123 = _mm_load_ps(w + 168);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
+
+      const __m128 vi21x0123 = _mm_loadu_ps(i21);
+      const __m128 vk21x0123 = _mm_load_ps(w + 176);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi21x0123, vk21x0123));
+
+      const __m128 vi22x0123 = _mm_loadu_ps(i22);
+      const __m128 vk22x0123 = _mm_load_ps(w + 184);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
+
+      const __m128 vi23x0123 = _mm_loadu_ps(i23);
+      const __m128 vk23x0123 = _mm_load_ps(w + 192);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi23x0123, vk23x0123));
+
+      const __m128 vi24x0123 = _mm_loadu_ps(i24);
+      const __m128 vk24x0123 = _mm_load_ps(w + 200);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1);
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+      if (c & 2) {
+        _mm_storel_pi((__m64*) output, vacc0123);
+        vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        _mm_store_ss(output, vacc0123);
+        output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up8x25-sse.c b/src/f32-dwconv/up8x25-sse.c
new file mode 100644
index 0000000..1943a5f
--- /dev/null
+++ b/src/f32-dwconv/up8x25-sse.c
@@ -0,0 +1,586 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xmmintrin.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up8x25__sse(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m128 vmax = _mm_load_ps(params->sse.max);
+  const __m128 vmin = _mm_load_ps(params->sse.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    const float* i9 = input[9];
+    const float* i10 = input[10];
+    const float* i11 = input[11];
+    const float* i12 = input[12];
+    const float* i13 = input[13];
+    const float* i14 = input[14];
+    const float* i15 = input[15];
+    const float* i16 = input[16];
+    const float* i17 = input[17];
+    const float* i18 = input[18];
+    const float* i19 = input[19];
+    const float* i20 = input[20];
+    const float* i21 = input[21];
+    const float* i22 = input[22];
+    const float* i23 = input[23];
+    const float* i24 = input[24];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 8; c -= 8) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+      __m128 vacc4567p0 = _mm_load_ps(w + 4);
+
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
+      i0 += 8;
+
+      const __m128 vk0x0123 = _mm_load_ps(w + 8);
+      const __m128 vk0x4567 = _mm_load_ps(w + 12);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
+      i1 += 8;
+
+      const __m128 vk1x0123 = _mm_load_ps(w + 16);
+      const __m128 vk1x4567 = _mm_load_ps(w + 20);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
+      i2 += 8;
+
+      const __m128 vk2x0123 = _mm_load_ps(w + 24);
+      const __m128 vk2x4567 = _mm_load_ps(w + 28);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
+      i3 += 8;
+
+      const __m128 vk3x0123 = _mm_load_ps(w + 32);
+      const __m128 vk3x4567 = _mm_load_ps(w + 36);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi3x4567, vk3x4567));
+
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
+      const __m128 vi4x4567 = _mm_loadu_ps(i4 + 4);
+      i4 += 8;
+
+      const __m128 vk4x0123 = _mm_load_ps(w + 40);
+      const __m128 vk4x4567 = _mm_load_ps(w + 44);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi4x4567, vk4x4567));
+
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
+      const __m128 vi5x4567 = _mm_loadu_ps(i5 + 4);
+      i5 += 8;
+
+      const __m128 vk5x0123 = _mm_load_ps(w + 48);
+      const __m128 vk5x4567 = _mm_load_ps(w + 52);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi5x4567, vk5x4567));
+
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
+      const __m128 vi6x4567 = _mm_loadu_ps(i6 + 4);
+      i6 += 8;
+
+      const __m128 vk6x0123 = _mm_load_ps(w + 56);
+      const __m128 vk6x4567 = _mm_load_ps(w + 60);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi6x4567, vk6x4567));
+
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
+      const __m128 vi7x4567 = _mm_loadu_ps(i7 + 4);
+      i7 += 8;
+
+      const __m128 vk7x0123 = _mm_load_ps(w + 64);
+      const __m128 vk7x4567 = _mm_load_ps(w + 68);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi7x4567, vk7x4567));
+
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
+      const __m128 vi8x4567 = _mm_loadu_ps(i8 + 4);
+      i8 += 8;
+
+      const __m128 vk8x0123 = _mm_load_ps(w + 72);
+      const __m128 vk8x4567 = _mm_load_ps(w + 76);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi8x4567, vk8x4567));
+
+      const __m128 vi9x0123 = _mm_loadu_ps(i9);
+      const __m128 vi9x4567 = _mm_loadu_ps(i9 + 4);
+      i9 += 8;
+
+      const __m128 vk9x0123 = _mm_load_ps(w + 80);
+      const __m128 vk9x4567 = _mm_load_ps(w + 84);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi9x0123, vk9x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi9x4567, vk9x4567));
+
+      const __m128 vi10x0123 = _mm_loadu_ps(i10);
+      const __m128 vi10x4567 = _mm_loadu_ps(i10 + 4);
+      i10 += 8;
+
+      const __m128 vk10x0123 = _mm_load_ps(w + 88);
+      const __m128 vk10x4567 = _mm_load_ps(w + 92);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi10x4567, vk10x4567));
+
+      const __m128 vi11x0123 = _mm_loadu_ps(i11);
+      const __m128 vi11x4567 = _mm_loadu_ps(i11 + 4);
+      i11 += 8;
+
+      const __m128 vk11x0123 = _mm_load_ps(w + 96);
+      const __m128 vk11x4567 = _mm_load_ps(w + 100);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi11x0123, vk11x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi11x4567, vk11x4567));
+
+      const __m128 vi12x0123 = _mm_loadu_ps(i12);
+      const __m128 vi12x4567 = _mm_loadu_ps(i12 + 4);
+      i12 += 8;
+
+      const __m128 vk12x0123 = _mm_load_ps(w + 104);
+      const __m128 vk12x4567 = _mm_load_ps(w + 108);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi12x4567, vk12x4567));
+
+      const __m128 vi13x0123 = _mm_loadu_ps(i13);
+      const __m128 vi13x4567 = _mm_loadu_ps(i13 + 4);
+      i13 += 8;
+
+      const __m128 vk13x0123 = _mm_load_ps(w + 112);
+      const __m128 vk13x4567 = _mm_load_ps(w + 116);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi13x0123, vk13x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi13x4567, vk13x4567));
+
+      const __m128 vi14x0123 = _mm_loadu_ps(i14);
+      const __m128 vi14x4567 = _mm_loadu_ps(i14 + 4);
+      i14 += 8;
+
+      const __m128 vk14x0123 = _mm_load_ps(w + 120);
+      const __m128 vk14x4567 = _mm_load_ps(w + 124);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi14x4567, vk14x4567));
+
+      const __m128 vi15x0123 = _mm_loadu_ps(i15);
+      const __m128 vi15x4567 = _mm_loadu_ps(i15 + 4);
+      i15 += 8;
+
+      const __m128 vk15x0123 = _mm_load_ps(w + 128);
+      const __m128 vk15x4567 = _mm_load_ps(w + 132);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi15x0123, vk15x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi15x4567, vk15x4567));
+
+      const __m128 vi16x0123 = _mm_loadu_ps(i16);
+      const __m128 vi16x4567 = _mm_loadu_ps(i16 + 4);
+      i16 += 8;
+
+      const __m128 vk16x0123 = _mm_load_ps(w + 136);
+      const __m128 vk16x4567 = _mm_load_ps(w + 140);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi16x4567, vk16x4567));
+
+      const __m128 vi17x0123 = _mm_loadu_ps(i17);
+      const __m128 vi17x4567 = _mm_loadu_ps(i17 + 4);
+      i17 += 8;
+
+      const __m128 vk17x0123 = _mm_load_ps(w + 144);
+      const __m128 vk17x4567 = _mm_load_ps(w + 148);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi17x0123, vk17x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi17x4567, vk17x4567));
+
+      const __m128 vi18x0123 = _mm_loadu_ps(i18);
+      const __m128 vi18x4567 = _mm_loadu_ps(i18 + 4);
+      i18 += 8;
+
+      const __m128 vk18x0123 = _mm_load_ps(w + 152);
+      const __m128 vk18x4567 = _mm_load_ps(w + 156);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi18x4567, vk18x4567));
+
+      const __m128 vi19x0123 = _mm_loadu_ps(i19);
+      const __m128 vi19x4567 = _mm_loadu_ps(i19 + 4);
+      i19 += 8;
+
+      const __m128 vk19x0123 = _mm_load_ps(w + 160);
+      const __m128 vk19x4567 = _mm_load_ps(w + 164);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi19x0123, vk19x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi19x4567, vk19x4567));
+
+      const __m128 vi20x0123 = _mm_loadu_ps(i20);
+      const __m128 vi20x4567 = _mm_loadu_ps(i20 + 4);
+      i20 += 8;
+
+      const __m128 vk20x0123 = _mm_load_ps(w + 168);
+      const __m128 vk20x4567 = _mm_load_ps(w + 172);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi20x4567, vk20x4567));
+
+      const __m128 vi21x0123 = _mm_loadu_ps(i21);
+      const __m128 vi21x4567 = _mm_loadu_ps(i21 + 4);
+      i21 += 8;
+
+      const __m128 vk21x0123 = _mm_load_ps(w + 176);
+      const __m128 vk21x4567 = _mm_load_ps(w + 180);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi21x0123, vk21x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi21x4567, vk21x4567));
+
+      const __m128 vi22x0123 = _mm_loadu_ps(i22);
+      const __m128 vi22x4567 = _mm_loadu_ps(i22 + 4);
+      i22 += 8;
+
+      const __m128 vk22x0123 = _mm_load_ps(w + 184);
+      const __m128 vk22x4567 = _mm_load_ps(w + 188);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi22x4567, vk22x4567));
+
+      const __m128 vi23x0123 = _mm_loadu_ps(i23);
+      const __m128 vi23x4567 = _mm_loadu_ps(i23 + 4);
+      i23 += 8;
+
+      const __m128 vk23x0123 = _mm_load_ps(w + 192);
+      const __m128 vk23x4567 = _mm_load_ps(w + 196);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi23x0123, vk23x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi23x4567, vk23x4567));
+
+      const __m128 vi24x0123 = _mm_loadu_ps(i24);
+      const __m128 vi24x4567 = _mm_loadu_ps(i24 + 4);
+      i24 += 8;
+
+      const __m128 vk24x0123 = _mm_load_ps(w + 200);
+      const __m128 vk24x4567 = _mm_load_ps(w + 204);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi24x4567, vk24x4567));
+
+      w += 208;
+
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+      vacc4567 = _mm_min_ps(vacc4567, vmax);
+
+      _mm_storeu_ps(output, vacc0123);
+      _mm_storeu_ps(output + 4, vacc4567);
+      output += 8;
+    }
+    for (; c >= 4; c -= 4) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      i0 += 4;
+
+      const __m128 vk0x0123 = _mm_load_ps(w + 8);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      i1 += 4;
+
+      const __m128 vk1x0123 = _mm_load_ps(w + 16);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      i2 += 4;
+
+      const __m128 vk2x0123 = _mm_load_ps(w + 24);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      i3 += 4;
+
+      const __m128 vk3x0123 = _mm_load_ps(w + 32);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
+      i4 += 4;
+
+      const __m128 vk4x0123 = _mm_load_ps(w + 40);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
+
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
+      i5 += 4;
+
+      const __m128 vk5x0123 = _mm_load_ps(w + 48);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
+
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
+      i6 += 4;
+
+      const __m128 vk6x0123 = _mm_load_ps(w + 56);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
+
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
+      i7 += 4;
+
+      const __m128 vk7x0123 = _mm_load_ps(w + 64);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
+
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
+      i8 += 4;
+
+      const __m128 vk8x0123 = _mm_load_ps(w + 72);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
+
+      const __m128 vi9x0123 = _mm_loadu_ps(i9);
+      i9 += 4;
+
+      const __m128 vk9x0123 = _mm_load_ps(w + 80);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi9x0123, vk9x0123));
+
+      const __m128 vi10x0123 = _mm_loadu_ps(i10);
+      i10 += 4;
+
+      const __m128 vk10x0123 = _mm_load_ps(w + 88);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
+
+      const __m128 vi11x0123 = _mm_loadu_ps(i11);
+      i11 += 4;
+
+      const __m128 vk11x0123 = _mm_load_ps(w + 96);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi11x0123, vk11x0123));
+
+      const __m128 vi12x0123 = _mm_loadu_ps(i12);
+      i12 += 4;
+
+      const __m128 vk12x0123 = _mm_load_ps(w + 104);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
+
+      const __m128 vi13x0123 = _mm_loadu_ps(i13);
+      i13 += 4;
+
+      const __m128 vk13x0123 = _mm_load_ps(w + 112);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi13x0123, vk13x0123));
+
+      const __m128 vi14x0123 = _mm_loadu_ps(i14);
+      i14 += 4;
+
+      const __m128 vk14x0123 = _mm_load_ps(w + 120);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
+
+      const __m128 vi15x0123 = _mm_loadu_ps(i15);
+      i15 += 4;
+
+      const __m128 vk15x0123 = _mm_load_ps(w + 128);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi15x0123, vk15x0123));
+
+      const __m128 vi16x0123 = _mm_loadu_ps(i16);
+      i16 += 4;
+
+      const __m128 vk16x0123 = _mm_load_ps(w + 136);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
+
+      const __m128 vi17x0123 = _mm_loadu_ps(i17);
+      i17 += 4;
+
+      const __m128 vk17x0123 = _mm_load_ps(w + 144);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi17x0123, vk17x0123));
+
+      const __m128 vi18x0123 = _mm_loadu_ps(i18);
+      i18 += 4;
+
+      const __m128 vk18x0123 = _mm_load_ps(w + 152);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
+
+      const __m128 vi19x0123 = _mm_loadu_ps(i19);
+      i19 += 4;
+
+      const __m128 vk19x0123 = _mm_load_ps(w + 160);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi19x0123, vk19x0123));
+
+      const __m128 vi20x0123 = _mm_loadu_ps(i20);
+      i20 += 4;
+
+      const __m128 vk20x0123 = _mm_load_ps(w + 168);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
+
+      const __m128 vi21x0123 = _mm_loadu_ps(i21);
+      i21 += 4;
+
+      const __m128 vk21x0123 = _mm_load_ps(w + 176);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi21x0123, vk21x0123));
+
+      const __m128 vi22x0123 = _mm_loadu_ps(i22);
+      i22 += 4;
+
+      const __m128 vk22x0123 = _mm_load_ps(w + 184);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
+
+      const __m128 vi23x0123 = _mm_loadu_ps(i23);
+      i23 += 4;
+
+      const __m128 vk23x0123 = _mm_load_ps(w + 192);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi23x0123, vk23x0123));
+
+      const __m128 vi24x0123 = _mm_loadu_ps(i24);
+      i24 += 4;
+
+      const __m128 vk24x0123 = _mm_load_ps(w + 200);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
+
+      w += 4;
+
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+      _mm_storeu_ps(output, vacc0123);
+      output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      const __m128 vk0x0123 = _mm_load_ps(w + 8);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      const __m128 vk1x0123 = _mm_load_ps(w + 16);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      const __m128 vk2x0123 = _mm_load_ps(w + 24);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      const __m128 vk3x0123 = _mm_load_ps(w + 32);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
+      const __m128 vk4x0123 = _mm_load_ps(w + 40);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
+
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
+      const __m128 vk5x0123 = _mm_load_ps(w + 48);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
+
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
+      const __m128 vk6x0123 = _mm_load_ps(w + 56);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
+
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
+      const __m128 vk7x0123 = _mm_load_ps(w + 64);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
+
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
+      const __m128 vk8x0123 = _mm_load_ps(w + 72);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
+
+      const __m128 vi9x0123 = _mm_loadu_ps(i9);
+      const __m128 vk9x0123 = _mm_load_ps(w + 80);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi9x0123, vk9x0123));
+
+      const __m128 vi10x0123 = _mm_loadu_ps(i10);
+      const __m128 vk10x0123 = _mm_load_ps(w + 88);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi10x0123, vk10x0123));
+
+      const __m128 vi11x0123 = _mm_loadu_ps(i11);
+      const __m128 vk11x0123 = _mm_load_ps(w + 96);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi11x0123, vk11x0123));
+
+      const __m128 vi12x0123 = _mm_loadu_ps(i12);
+      const __m128 vk12x0123 = _mm_load_ps(w + 104);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi12x0123, vk12x0123));
+
+      const __m128 vi13x0123 = _mm_loadu_ps(i13);
+      const __m128 vk13x0123 = _mm_load_ps(w + 112);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi13x0123, vk13x0123));
+
+      const __m128 vi14x0123 = _mm_loadu_ps(i14);
+      const __m128 vk14x0123 = _mm_load_ps(w + 120);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi14x0123, vk14x0123));
+
+      const __m128 vi15x0123 = _mm_loadu_ps(i15);
+      const __m128 vk15x0123 = _mm_load_ps(w + 128);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi15x0123, vk15x0123));
+
+      const __m128 vi16x0123 = _mm_loadu_ps(i16);
+      const __m128 vk16x0123 = _mm_load_ps(w + 136);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi16x0123, vk16x0123));
+
+      const __m128 vi17x0123 = _mm_loadu_ps(i17);
+      const __m128 vk17x0123 = _mm_load_ps(w + 144);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi17x0123, vk17x0123));
+
+      const __m128 vi18x0123 = _mm_loadu_ps(i18);
+      const __m128 vk18x0123 = _mm_load_ps(w + 152);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi18x0123, vk18x0123));
+
+      const __m128 vi19x0123 = _mm_loadu_ps(i19);
+      const __m128 vk19x0123 = _mm_load_ps(w + 160);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi19x0123, vk19x0123));
+
+      const __m128 vi20x0123 = _mm_loadu_ps(i20);
+      const __m128 vk20x0123 = _mm_load_ps(w + 168);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi20x0123, vk20x0123));
+
+      const __m128 vi21x0123 = _mm_loadu_ps(i21);
+      const __m128 vk21x0123 = _mm_load_ps(w + 176);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi21x0123, vk21x0123));
+
+      const __m128 vi22x0123 = _mm_loadu_ps(i22);
+      const __m128 vk22x0123 = _mm_load_ps(w + 184);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi22x0123, vk22x0123));
+
+      const __m128 vi23x0123 = _mm_loadu_ps(i23);
+      const __m128 vk23x0123 = _mm_load_ps(w + 192);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi23x0123, vk23x0123));
+
+      const __m128 vi24x0123 = _mm_loadu_ps(i24);
+      const __m128 vk24x0123 = _mm_load_ps(w + 200);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi24x0123, vk24x0123));
+
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+      if (c & 2) {
+        _mm_storel_pi((__m64*) output, vacc0123);
+        vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        _mm_store_ss(output, vacc0123);
+        output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up8x4-psimd-acc2.c b/src/f32-dwconv/up8x4-psimd-acc2.c
new file mode 100644
index 0000000..be51137
--- /dev/null
+++ b/src/f32-dwconv/up8x4-psimd-acc2.c
@@ -0,0 +1,173 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-psimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up8x4__psimd_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const psimd_f32 vmax = psimd_load_splat_f32(&params->scalar.max);
+  const psimd_f32 vmin = psimd_load_splat_f32(&params->scalar.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 8; c -= 8) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+      psimd_f32 vacc4567p0 = psimd_load_f32(w + 4);
+
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      const psimd_f32 vi0x4567 = psimd_load_f32(i0 + 4);
+      i0 += 8;
+
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
+      const psimd_f32 vk0x4567 = psimd_load_f32(w + 12);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi0x4567, vk0x4567);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      const psimd_f32 vi1x4567 = psimd_load_f32(i1 + 4);
+      i1 += 8;
+
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
+      const psimd_f32 vk1x4567 = psimd_load_f32(w + 20);
+      psimd_f32 vacc0123p1 = psimd_mul_f32(vi1x0123, vk1x0123);
+      psimd_f32 vacc4567p1 = psimd_mul_f32(vi1x4567, vk1x4567);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      const psimd_f32 vi2x4567 = psimd_load_f32(i2 + 4);
+      i2 += 8;
+
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
+      const psimd_f32 vk2x4567 = psimd_load_f32(w + 28);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi2x4567, vk2x4567);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      const psimd_f32 vi3x4567 = psimd_load_f32(i3 + 4);
+      i3 += 8;
+
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
+      const psimd_f32 vk3x4567 = psimd_load_f32(w + 36);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi3x0123, vk3x0123);
+      vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi3x4567, vk3x4567);
+
+      w += 40;
+
+      // Add up all accumulators to vacc01234567p0
+      vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1);
+      vacc4567p0 = psimd_add_f32(vacc4567p0, vacc4567p1);
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      psimd_f32 vacc4567 = psimd_max_f32(vacc4567p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+      vacc4567 = psimd_min_f32(vacc4567, vmax);
+
+      psimd_store_f32(output, vacc0123);
+      psimd_store_f32(output + 4, vacc4567);
+      output += 8;
+    }
+    for (; c >= 4; c -= 4) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      i0 += 4;
+
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      i1 += 4;
+
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
+      psimd_f32 vacc0123p1 = psimd_mul_f32(vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      i2 += 4;
+
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      i3 += 4;
+
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      w += 4;
+
+      // Add up all accumulators to vacc01234567p0
+      vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1);
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+      psimd_store_f32(output, vacc0123);
+      output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
+      psimd_f32 vacc0123p1 = psimd_mul_f32(vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      // Add up all accumulators to vacc01234567p0
+      vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1);
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+      if (c & 2) {
+        psimd_store2_f32(output, vacc0123);
+        vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        psimd_store1_f32(output, vacc0123);
+        output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up8x4-psimd.c b/src/f32-dwconv/up8x4-psimd.c
new file mode 100644
index 0000000..d54e49c
--- /dev/null
+++ b/src/f32-dwconv/up8x4-psimd.c
@@ -0,0 +1,166 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-psimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up8x4__psimd(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const psimd_f32 vmax = psimd_load_splat_f32(&params->scalar.max);
+  const psimd_f32 vmin = psimd_load_splat_f32(&params->scalar.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 8; c -= 8) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+      psimd_f32 vacc4567p0 = psimd_load_f32(w + 4);
+
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      const psimd_f32 vi0x4567 = psimd_load_f32(i0 + 4);
+      i0 += 8;
+
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
+      const psimd_f32 vk0x4567 = psimd_load_f32(w + 12);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi0x4567, vk0x4567);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      const psimd_f32 vi1x4567 = psimd_load_f32(i1 + 4);
+      i1 += 8;
+
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
+      const psimd_f32 vk1x4567 = psimd_load_f32(w + 20);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi1x4567, vk1x4567);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      const psimd_f32 vi2x4567 = psimd_load_f32(i2 + 4);
+      i2 += 8;
+
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
+      const psimd_f32 vk2x4567 = psimd_load_f32(w + 28);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi2x4567, vk2x4567);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      const psimd_f32 vi3x4567 = psimd_load_f32(i3 + 4);
+      i3 += 8;
+
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
+      const psimd_f32 vk3x4567 = psimd_load_f32(w + 36);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi3x4567, vk3x4567);
+
+      w += 40;
+
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      psimd_f32 vacc4567 = psimd_max_f32(vacc4567p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+      vacc4567 = psimd_min_f32(vacc4567, vmax);
+
+      psimd_store_f32(output, vacc0123);
+      psimd_store_f32(output + 4, vacc4567);
+      output += 8;
+    }
+    for (; c >= 4; c -= 4) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      i0 += 4;
+
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      i1 += 4;
+
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      i2 += 4;
+
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      i3 += 4;
+
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
+
+      w += 4;
+
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+      psimd_store_f32(output, vacc0123);
+      output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
+
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+      if (c & 2) {
+        psimd_store2_f32(output, vacc0123);
+        vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        psimd_store1_f32(output, vacc0123);
+        output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up8x4-sse-acc2.c b/src/f32-dwconv/up8x4-sse-acc2.c
new file mode 100644
index 0000000..e3251fe
--- /dev/null
+++ b/src/f32-dwconv/up8x4-sse-acc2.c
@@ -0,0 +1,173 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xmmintrin.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up8x4__sse_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m128 vmax = _mm_load_ps(params->sse.max);
+  const __m128 vmin = _mm_load_ps(params->sse.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 8; c -= 8) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+      __m128 vacc4567p0 = _mm_load_ps(w + 4);
+
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
+      i0 += 8;
+
+      const __m128 vk0x0123 = _mm_load_ps(w + 8);
+      const __m128 vk0x4567 = _mm_load_ps(w + 12);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
+      i1 += 8;
+
+      const __m128 vk1x0123 = _mm_load_ps(w + 16);
+      const __m128 vk1x4567 = _mm_load_ps(w + 20);
+      __m128 vacc0123p1 = _mm_mul_ps(vi1x0123, vk1x0123);
+      __m128 vacc4567p1 = _mm_mul_ps(vi1x4567, vk1x4567);
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
+      i2 += 8;
+
+      const __m128 vk2x0123 = _mm_load_ps(w + 24);
+      const __m128 vk2x4567 = _mm_load_ps(w + 28);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
+      i3 += 8;
+
+      const __m128 vk3x0123 = _mm_load_ps(w + 32);
+      const __m128 vk3x4567 = _mm_load_ps(w + 36);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi3x0123, vk3x0123));
+      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi3x4567, vk3x4567));
+
+      w += 40;
+
+      // Add up all accumulators to vacc01234567p0
+      vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1);
+      vacc4567p0 = _mm_add_ps(vacc4567p0, vacc4567p1);
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+      vacc4567 = _mm_min_ps(vacc4567, vmax);
+
+      _mm_storeu_ps(output, vacc0123);
+      _mm_storeu_ps(output + 4, vacc4567);
+      output += 8;
+    }
+    for (; c >= 4; c -= 4) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      i0 += 4;
+
+      const __m128 vk0x0123 = _mm_load_ps(w + 8);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      i1 += 4;
+
+      const __m128 vk1x0123 = _mm_load_ps(w + 16);
+      __m128 vacc0123p1 = _mm_mul_ps(vi1x0123, vk1x0123);
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      i2 += 4;
+
+      const __m128 vk2x0123 = _mm_load_ps(w + 24);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      i3 += 4;
+
+      const __m128 vk3x0123 = _mm_load_ps(w + 32);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      w += 4;
+
+      // Add up all accumulators to vacc01234567p0
+      vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1);
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+      _mm_storeu_ps(output, vacc0123);
+      output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      const __m128 vk0x0123 = _mm_load_ps(w + 8);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      const __m128 vk1x0123 = _mm_load_ps(w + 16);
+      __m128 vacc0123p1 = _mm_mul_ps(vi1x0123, vk1x0123);
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      const __m128 vk2x0123 = _mm_load_ps(w + 24);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      const __m128 vk3x0123 = _mm_load_ps(w + 32);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      // Add up all accumulators to vacc01234567p0
+      vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1);
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+      if (c & 2) {
+        _mm_storel_pi((__m64*) output, vacc0123);
+        vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        _mm_store_ss(output, vacc0123);
+        output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up8x4-sse.c b/src/f32-dwconv/up8x4-sse.c
new file mode 100644
index 0000000..9e0e678
--- /dev/null
+++ b/src/f32-dwconv/up8x4-sse.c
@@ -0,0 +1,166 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xmmintrin.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up8x4__sse(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m128 vmax = _mm_load_ps(params->sse.max);
+  const __m128 vmin = _mm_load_ps(params->sse.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 8; c -= 8) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+      __m128 vacc4567p0 = _mm_load_ps(w + 4);
+
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
+      i0 += 8;
+
+      const __m128 vk0x0123 = _mm_load_ps(w + 8);
+      const __m128 vk0x4567 = _mm_load_ps(w + 12);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
+      i1 += 8;
+
+      const __m128 vk1x0123 = _mm_load_ps(w + 16);
+      const __m128 vk1x4567 = _mm_load_ps(w + 20);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
+      i2 += 8;
+
+      const __m128 vk2x0123 = _mm_load_ps(w + 24);
+      const __m128 vk2x4567 = _mm_load_ps(w + 28);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
+      i3 += 8;
+
+      const __m128 vk3x0123 = _mm_load_ps(w + 32);
+      const __m128 vk3x4567 = _mm_load_ps(w + 36);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi3x4567, vk3x4567));
+
+      w += 40;
+
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+      vacc4567 = _mm_min_ps(vacc4567, vmax);
+
+      _mm_storeu_ps(output, vacc0123);
+      _mm_storeu_ps(output + 4, vacc4567);
+      output += 8;
+    }
+    for (; c >= 4; c -= 4) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      i0 += 4;
+
+      const __m128 vk0x0123 = _mm_load_ps(w + 8);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      i1 += 4;
+
+      const __m128 vk1x0123 = _mm_load_ps(w + 16);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      i2 += 4;
+
+      const __m128 vk2x0123 = _mm_load_ps(w + 24);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      i3 += 4;
+
+      const __m128 vk3x0123 = _mm_load_ps(w + 32);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      w += 4;
+
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+      _mm_storeu_ps(output, vacc0123);
+      output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      const __m128 vk0x0123 = _mm_load_ps(w + 8);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      const __m128 vk1x0123 = _mm_load_ps(w + 16);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      const __m128 vk2x0123 = _mm_load_ps(w + 24);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      const __m128 vk3x0123 = _mm_load_ps(w + 32);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
+
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+      if (c & 2) {
+        _mm_storel_pi((__m64*) output, vacc0123);
+        vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        _mm_store_ss(output, vacc0123);
+        output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up8x9-neon-acc2.c b/src/f32-dwconv/up8x9-neon-acc2.c
new file mode 100644
index 0000000..7d43a53
--- /dev/null
+++ b/src/f32-dwconv/up8x9-neon-acc2.c
@@ -0,0 +1,233 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-neon.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up8x9__neon_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
+  assert(output_width != 0);
+
+  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
+  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 8; c -= 8) {
+      float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
+      float32x4_t vacc4567p0 = vld1q_f32(w); w += 4;
+
+
+      const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
+      const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4;
+      const float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk0x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123);
+      vacc4567p0 = vmlaq_f32(vacc4567p0, vi0x4567, vk0x4567);
+
+      const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
+      const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;
+      const float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk1x4567 = vld1q_f32(w); w += 4;
+      float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);
+      float32x4_t vacc4567p1 = vmulq_f32(vi1x4567, vk1x4567);
+
+      const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
+      const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4;
+      const float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk2x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123);
+      vacc4567p0 = vmlaq_f32(vacc4567p0, vi2x4567, vk2x4567);
+
+      const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
+      const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4;
+      const float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk3x4567 = vld1q_f32(w); w += 4;
+      vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123);
+      vacc4567p1 = vmlaq_f32(vacc4567p1, vi3x4567, vk3x4567);
+
+      const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
+      const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;
+      const float32x4_t vk4x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk4x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123);
+      vacc4567p0 = vmlaq_f32(vacc4567p0, vi4x4567, vk4x4567);
+
+      const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4;
+      const float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4;
+      const float32x4_t vk5x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk5x4567 = vld1q_f32(w); w += 4;
+      vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123);
+      vacc4567p1 = vmlaq_f32(vacc4567p1, vi5x4567, vk5x4567);
+
+      const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4;
+      const float32x4_t vi6x4567 = vld1q_f32(i6); i6 += 4;
+      const float32x4_t vk6x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk6x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123);
+      vacc4567p0 = vmlaq_f32(vacc4567p0, vi6x4567, vk6x4567);
+
+      const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4;
+      const float32x4_t vi7x4567 = vld1q_f32(i7); i7 += 4;
+      const float32x4_t vk7x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk7x4567 = vld1q_f32(w); w += 4;
+      vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123);
+      vacc4567p1 = vmlaq_f32(vacc4567p1, vi7x4567, vk7x4567);
+
+      const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4;
+      const float32x4_t vi8x4567 = vld1q_f32(i8); i8 += 4;
+      const float32x4_t vk8x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk8x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123);
+      vacc4567p0 = vmlaq_f32(vacc4567p0, vi8x4567, vk8x4567);
+
+      // Add up all accumulators to vacc0123p0 and vacc4567p0
+      vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);
+      vacc4567p0 = vaddq_f32(vacc4567p0, vacc4567p1);
+
+      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
+      float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin);
+      vacc0123 = vminq_f32(vacc0123, vmax);
+      vacc4567 = vminq_f32(vacc4567, vmax);
+
+      vst1q_f32(output, vacc0123); output += 4;
+      vst1q_f32(output, vacc4567); output += 4;
+    }
+    for (; c >= 4; c -= 4) {
+      float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
+
+
+      const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
+      const float32x4_t vk0x0123 = vld1q_f32(w + 4);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
+      const float32x4_t vk1x0123 = vld1q_f32(w + 12);
+      float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);
+
+      const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
+      const float32x4_t vk2x0123 = vld1q_f32(w + 20);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
+      const float32x4_t vk3x0123 = vld1q_f32(w + 28);
+      vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
+      const float32x4_t vk4x0123 = vld1q_f32(w + 36);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4;
+      const float32x4_t vk5x0123 = vld1q_f32(w + 44);
+      vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123);
+
+      const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4;
+      const float32x4_t vk6x0123 = vld1q_f32(w + 52);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4;
+      const float32x4_t vk7x0123 = vld1q_f32(w + 60);
+      vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123);
+
+      const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4;
+      const float32x4_t vk8x0123 = vld1q_f32(w + 68);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);
+
+      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
+      vacc0123 = vminq_f32(vacc0123, vmax);
+
+      vst1q_f32(output, vacc0123); output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      float32x4_t vacc0123p0 = vld1q_f32(w);
+
+
+      const float32x4_t vi0x0123 = vld1q_f32(i0);
+      const float32x4_t vk0x0123 = vld1q_f32(w + 8);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const float32x4_t vi1x0123 = vld1q_f32(i1);
+      const float32x4_t vk1x0123 = vld1q_f32(w + 16);
+      float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);
+
+      const float32x4_t vi2x0123 = vld1q_f32(i2);
+      const float32x4_t vk2x0123 = vld1q_f32(w + 24);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const float32x4_t vi3x0123 = vld1q_f32(i3);
+      const float32x4_t vk3x0123 = vld1q_f32(w + 32);
+      vacc0123p1 = vmlaq_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      const float32x4_t vi4x0123 = vld1q_f32(i4);
+      const float32x4_t vk4x0123 = vld1q_f32(w + 40);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const float32x4_t vi5x0123 = vld1q_f32(i5);
+      const float32x4_t vk5x0123 = vld1q_f32(w + 48);
+      vacc0123p1 = vmlaq_f32(vacc0123p1, vi5x0123, vk5x0123);
+
+      const float32x4_t vi6x0123 = vld1q_f32(i6);
+      const float32x4_t vk6x0123 = vld1q_f32(w + 56);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const float32x4_t vi7x0123 = vld1q_f32(i7);
+      const float32x4_t vk7x0123 = vld1q_f32(w + 64);
+      vacc0123p1 = vmlaq_f32(vacc0123p1, vi7x0123, vk7x0123);
+
+      const float32x4_t vi8x0123 = vld1q_f32(i8);
+      const float32x4_t vk8x0123 = vld1q_f32(w + 72);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);
+
+      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
+      vacc0123 = vminq_f32(vacc0123, vmax);
+
+      float32x2_t vacc01 = vget_low_f32(vacc0123);
+      if (c & 2) {
+        vst1_f32(output, vacc01); output += 2;
+        vacc01 = vget_high_f32(vacc0123);
+      }
+      if (c & 1) {
+        vst1_lane_f32(output, vacc01, 0); output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up8x9-neon.c b/src/f32-dwconv/up8x9-neon.c
new file mode 100644
index 0000000..6b91158
--- /dev/null
+++ b/src/f32-dwconv/up8x9-neon.c
@@ -0,0 +1,226 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-neon.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up8x9__neon(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
+  assert(output_width != 0);
+
+  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
+  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 8; c -= 8) {
+      float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
+      float32x4_t vacc4567p0 = vld1q_f32(w); w += 4;
+
+
+      const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
+      const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4;
+      const float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk0x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123);
+      vacc4567p0 = vmlaq_f32(vacc4567p0, vi0x4567, vk0x4567);
+
+      const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
+      const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;
+      const float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk1x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123);
+      vacc4567p0 = vmlaq_f32(vacc4567p0, vi1x4567, vk1x4567);
+
+      const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
+      const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4;
+      const float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk2x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123);
+      vacc4567p0 = vmlaq_f32(vacc4567p0, vi2x4567, vk2x4567);
+
+      const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
+      const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4;
+      const float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk3x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi3x0123, vk3x0123);
+      vacc4567p0 = vmlaq_f32(vacc4567p0, vi3x4567, vk3x4567);
+
+      const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
+      const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;
+      const float32x4_t vk4x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk4x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123);
+      vacc4567p0 = vmlaq_f32(vacc4567p0, vi4x4567, vk4x4567);
+
+      const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4;
+      const float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4;
+      const float32x4_t vk5x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk5x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi5x0123, vk5x0123);
+      vacc4567p0 = vmlaq_f32(vacc4567p0, vi5x4567, vk5x4567);
+
+      const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4;
+      const float32x4_t vi6x4567 = vld1q_f32(i6); i6 += 4;
+      const float32x4_t vk6x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk6x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123);
+      vacc4567p0 = vmlaq_f32(vacc4567p0, vi6x4567, vk6x4567);
+
+      const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4;
+      const float32x4_t vi7x4567 = vld1q_f32(i7); i7 += 4;
+      const float32x4_t vk7x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk7x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi7x0123, vk7x0123);
+      vacc4567p0 = vmlaq_f32(vacc4567p0, vi7x4567, vk7x4567);
+
+      const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4;
+      const float32x4_t vi8x4567 = vld1q_f32(i8); i8 += 4;
+      const float32x4_t vk8x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk8x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123);
+      vacc4567p0 = vmlaq_f32(vacc4567p0, vi8x4567, vk8x4567);
+
+
+      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
+      float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin);
+      vacc0123 = vminq_f32(vacc0123, vmax);
+      vacc4567 = vminq_f32(vacc4567, vmax);
+
+      vst1q_f32(output, vacc0123); output += 4;
+      vst1q_f32(output, vacc4567); output += 4;
+    }
+    for (; c >= 4; c -= 4) {
+      float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
+
+
+      const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
+      const float32x4_t vk0x0123 = vld1q_f32(w + 4);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
+      const float32x4_t vk1x0123 = vld1q_f32(w + 12);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123);
+
+      const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
+      const float32x4_t vk2x0123 = vld1q_f32(w + 20);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
+      const float32x4_t vk3x0123 = vld1q_f32(w + 28);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi3x0123, vk3x0123);
+
+      const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
+      const float32x4_t vk4x0123 = vld1q_f32(w + 36);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4;
+      const float32x4_t vk5x0123 = vld1q_f32(w + 44);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi5x0123, vk5x0123);
+
+      const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4;
+      const float32x4_t vk6x0123 = vld1q_f32(w + 52);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4;
+      const float32x4_t vk7x0123 = vld1q_f32(w + 60);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi7x0123, vk7x0123);
+
+      const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4;
+      const float32x4_t vk8x0123 = vld1q_f32(w + 68);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+
+      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
+      vacc0123 = vminq_f32(vacc0123, vmax);
+
+      vst1q_f32(output, vacc0123); output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      float32x4_t vacc0123p0 = vld1q_f32(w);
+
+
+      const float32x4_t vi0x0123 = vld1q_f32(i0);
+      const float32x4_t vk0x0123 = vld1q_f32(w + 8);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const float32x4_t vi1x0123 = vld1q_f32(i1);
+      const float32x4_t vk1x0123 = vld1q_f32(w + 16);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi1x0123, vk1x0123);
+
+      const float32x4_t vi2x0123 = vld1q_f32(i2);
+      const float32x4_t vk2x0123 = vld1q_f32(w + 24);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const float32x4_t vi3x0123 = vld1q_f32(i3);
+      const float32x4_t vk3x0123 = vld1q_f32(w + 32);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi3x0123, vk3x0123);
+
+      const float32x4_t vi4x0123 = vld1q_f32(i4);
+      const float32x4_t vk4x0123 = vld1q_f32(w + 40);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const float32x4_t vi5x0123 = vld1q_f32(i5);
+      const float32x4_t vk5x0123 = vld1q_f32(w + 48);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi5x0123, vk5x0123);
+
+      const float32x4_t vi6x0123 = vld1q_f32(i6);
+      const float32x4_t vk6x0123 = vld1q_f32(w + 56);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const float32x4_t vi7x0123 = vld1q_f32(i7);
+      const float32x4_t vk7x0123 = vld1q_f32(w + 64);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi7x0123, vk7x0123);
+
+      const float32x4_t vi8x0123 = vld1q_f32(i8);
+      const float32x4_t vk8x0123 = vld1q_f32(w + 72);
+      vacc0123p0 = vmlaq_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+
+      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
+      vacc0123 = vminq_f32(vacc0123, vmax);
+
+      float32x2_t vacc01 = vget_low_f32(vacc0123);
+      if (c & 2) {
+        vst1_f32(output, vacc01); output += 2;
+        vacc01 = vget_high_f32(vacc0123);
+      }
+      if (c & 1) {
+        vst1_lane_f32(output, vacc01, 0); output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up8x9-neonfma-acc2.c b/src/f32-dwconv/up8x9-neonfma-acc2.c
new file mode 100644
index 0000000..fc902db
--- /dev/null
+++ b/src/f32-dwconv/up8x9-neonfma-acc2.c
@@ -0,0 +1,233 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-neon.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
+  assert(output_width != 0);
+
+  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
+  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 8; c -= 8) {
+      float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
+      float32x4_t vacc4567p0 = vld1q_f32(w); w += 4;
+
+
+      const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
+      const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4;
+      const float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk0x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123);
+      vacc4567p0 = vfmaq_f32(vacc4567p0, vi0x4567, vk0x4567);
+
+      const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
+      const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;
+      const float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk1x4567 = vld1q_f32(w); w += 4;
+      float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);
+      float32x4_t vacc4567p1 = vmulq_f32(vi1x4567, vk1x4567);
+
+      const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
+      const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4;
+      const float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk2x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123);
+      vacc4567p0 = vfmaq_f32(vacc4567p0, vi2x4567, vk2x4567);
+
+      const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
+      const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4;
+      const float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk3x4567 = vld1q_f32(w); w += 4;
+      vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123);
+      vacc4567p1 = vfmaq_f32(vacc4567p1, vi3x4567, vk3x4567);
+
+      const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
+      const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;
+      const float32x4_t vk4x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk4x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123);
+      vacc4567p0 = vfmaq_f32(vacc4567p0, vi4x4567, vk4x4567);
+
+      const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4;
+      const float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4;
+      const float32x4_t vk5x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk5x4567 = vld1q_f32(w); w += 4;
+      vacc0123p1 = vfmaq_f32(vacc0123p1, vi5x0123, vk5x0123);
+      vacc4567p1 = vfmaq_f32(vacc4567p1, vi5x4567, vk5x4567);
+
+      const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4;
+      const float32x4_t vi6x4567 = vld1q_f32(i6); i6 += 4;
+      const float32x4_t vk6x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk6x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123);
+      vacc4567p0 = vfmaq_f32(vacc4567p0, vi6x4567, vk6x4567);
+
+      const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4;
+      const float32x4_t vi7x4567 = vld1q_f32(i7); i7 += 4;
+      const float32x4_t vk7x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk7x4567 = vld1q_f32(w); w += 4;
+      vacc0123p1 = vfmaq_f32(vacc0123p1, vi7x0123, vk7x0123);
+      vacc4567p1 = vfmaq_f32(vacc4567p1, vi7x4567, vk7x4567);
+
+      const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4;
+      const float32x4_t vi8x4567 = vld1q_f32(i8); i8 += 4;
+      const float32x4_t vk8x0123 = vld1q_f32(w); w += 4;
+      const float32x4_t vk8x4567 = vld1q_f32(w); w += 4;
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123);
+      vacc4567p0 = vfmaq_f32(vacc4567p0, vi8x4567, vk8x4567);
+
+      // Add up all accumulators to vacc0123p0 and vacc4567p0
+      vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);
+      vacc4567p0 = vaddq_f32(vacc4567p0, vacc4567p1);
+
+      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
+      float32x4_t vacc4567 = vmaxq_f32(vacc4567p0, vmin);
+      vacc0123 = vminq_f32(vacc0123, vmax);
+      vacc4567 = vminq_f32(vacc4567, vmax);
+
+      vst1q_f32(output, vacc0123); output += 4;
+      vst1q_f32(output, vacc4567); output += 4;
+    }
+    for (; c >= 4; c -= 4) {
+      float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
+
+
+      const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
+      const float32x4_t vk0x0123 = vld1q_f32(w + 4);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
+      const float32x4_t vk1x0123 = vld1q_f32(w + 12);
+      float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);
+
+      const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
+      const float32x4_t vk2x0123 = vld1q_f32(w + 20);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
+      const float32x4_t vk3x0123 = vld1q_f32(w + 28);
+      vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
+      const float32x4_t vk4x0123 = vld1q_f32(w + 36);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4;
+      const float32x4_t vk5x0123 = vld1q_f32(w + 44);
+      vacc0123p1 = vfmaq_f32(vacc0123p1, vi5x0123, vk5x0123);
+
+      const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4;
+      const float32x4_t vk6x0123 = vld1q_f32(w + 52);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4;
+      const float32x4_t vk7x0123 = vld1q_f32(w + 60);
+      vacc0123p1 = vfmaq_f32(vacc0123p1, vi7x0123, vk7x0123);
+
+      const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4;
+      const float32x4_t vk8x0123 = vld1q_f32(w + 68);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);
+
+      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
+      vacc0123 = vminq_f32(vacc0123, vmax);
+
+      vst1q_f32(output, vacc0123); output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      float32x4_t vacc0123p0 = vld1q_f32(w);
+
+
+      const float32x4_t vi0x0123 = vld1q_f32(i0);
+      const float32x4_t vk0x0123 = vld1q_f32(w + 8);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const float32x4_t vi1x0123 = vld1q_f32(i1);
+      const float32x4_t vk1x0123 = vld1q_f32(w + 16);
+      float32x4_t vacc0123p1 = vmulq_f32(vi1x0123, vk1x0123);
+
+      const float32x4_t vi2x0123 = vld1q_f32(i2);
+      const float32x4_t vk2x0123 = vld1q_f32(w + 24);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const float32x4_t vi3x0123 = vld1q_f32(i3);
+      const float32x4_t vk3x0123 = vld1q_f32(w + 32);
+      vacc0123p1 = vfmaq_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      const float32x4_t vi4x0123 = vld1q_f32(i4);
+      const float32x4_t vk4x0123 = vld1q_f32(w + 40);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const float32x4_t vi5x0123 = vld1q_f32(i5);
+      const float32x4_t vk5x0123 = vld1q_f32(w + 48);
+      vacc0123p1 = vfmaq_f32(vacc0123p1, vi5x0123, vk5x0123);
+
+      const float32x4_t vi6x0123 = vld1q_f32(i6);
+      const float32x4_t vk6x0123 = vld1q_f32(w + 56);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const float32x4_t vi7x0123 = vld1q_f32(i7);
+      const float32x4_t vk7x0123 = vld1q_f32(w + 64);
+      vacc0123p1 = vfmaq_f32(vacc0123p1, vi7x0123, vk7x0123);
+
+      const float32x4_t vi8x0123 = vld1q_f32(i8);
+      const float32x4_t vk8x0123 = vld1q_f32(w + 72);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = vaddq_f32(vacc0123p0, vacc0123p1);
+
+      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
+      vacc0123 = vminq_f32(vacc0123, vmax);
+
+      float32x2_t vacc01 = vget_low_f32(vacc0123);
+      if (c & 2) {
+        vst1_f32(output, vacc01); output += 2;
+        vacc01 = vget_high_f32(vacc0123);
+      }
+      if (c & 1) {
+        vst1_lane_f32(output, vacc01, 0); output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up8x9-neonfma.c b/src/f32-dwconv/up8x9-neonfma.c
index bf2bbbb..f6b358a 100644
--- a/src/f32-dwconv/up8x9-neonfma.c
+++ b/src/f32-dwconv/up8x9-neonfma.c
@@ -25,6 +25,7 @@
     const union xnn_f32_output_params params[restrict static 1])
 {
   assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
   assert(output_width != 0);
 
   const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
@@ -120,83 +121,96 @@
       vst1q_f32(output, vacc0123); output += 4;
       vst1q_f32(output, vacc4567); output += 4;
     }
-    if XNN_UNLIKELY(c != 0) {
-      float32x4_t vacc0123 = vld1q_f32(w); w += 4;
-      float32x4_t vacc4567 = vld1q_f32(w); w += 4;
+    for (; c >= 4; c -= 4) {
+      float32x4_t vacc0123p0 = vld1q_f32(w); w += 4;
 
 
       const float32x4_t vi0x0123 = vld1q_f32(i0); i0 += 4;
-      const float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4;
-      const float32x4_t vk0x0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vk0x4567 = vld1q_f32(w); w += 4;
-      vacc0123 = vfmaq_f32(vacc0123, vi0x0123, vk0x0123);
-      vacc4567 = vfmaq_f32(vacc4567, vi0x4567, vk0x4567);
+      const float32x4_t vk0x0123 = vld1q_f32(w + 4);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123);
 
       const float32x4_t vi1x0123 = vld1q_f32(i1); i1 += 4;
-      const float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;
-      const float32x4_t vk1x0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vk1x4567 = vld1q_f32(w); w += 4;
-      vacc0123 = vfmaq_f32(vacc0123, vi1x0123, vk1x0123);
-      vacc4567 = vfmaq_f32(vacc4567, vi1x4567, vk1x4567);
+      const float32x4_t vk1x0123 = vld1q_f32(w + 12);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123);
 
       const float32x4_t vi2x0123 = vld1q_f32(i2); i2 += 4;
-      const float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4;
-      const float32x4_t vk2x0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vk2x4567 = vld1q_f32(w); w += 4;
-      vacc0123 = vfmaq_f32(vacc0123, vi2x0123, vk2x0123);
-      vacc4567 = vfmaq_f32(vacc4567, vi2x4567, vk2x4567);
+      const float32x4_t vk2x0123 = vld1q_f32(w + 20);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123);
 
       const float32x4_t vi3x0123 = vld1q_f32(i3); i3 += 4;
-      const float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4;
-      const float32x4_t vk3x0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vk3x4567 = vld1q_f32(w); w += 4;
-      vacc0123 = vfmaq_f32(vacc0123, vi3x0123, vk3x0123);
-      vacc4567 = vfmaq_f32(vacc4567, vi3x4567, vk3x4567);
+      const float32x4_t vk3x0123 = vld1q_f32(w + 28);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123);
 
       const float32x4_t vi4x0123 = vld1q_f32(i4); i4 += 4;
-      const float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;
-      const float32x4_t vk4x0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vk4x4567 = vld1q_f32(w); w += 4;
-      vacc0123 = vfmaq_f32(vacc0123, vi4x0123, vk4x0123);
-      vacc4567 = vfmaq_f32(vacc4567, vi4x4567, vk4x4567);
+      const float32x4_t vk4x0123 = vld1q_f32(w + 36);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123);
 
       const float32x4_t vi5x0123 = vld1q_f32(i5); i5 += 4;
-      const float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4;
-      const float32x4_t vk5x0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vk5x4567 = vld1q_f32(w); w += 4;
-      vacc0123 = vfmaq_f32(vacc0123, vi5x0123, vk5x0123);
-      vacc4567 = vfmaq_f32(vacc4567, vi5x4567, vk5x4567);
+      const float32x4_t vk5x0123 = vld1q_f32(w + 44);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi5x0123, vk5x0123);
 
       const float32x4_t vi6x0123 = vld1q_f32(i6); i6 += 4;
-      const float32x4_t vi6x4567 = vld1q_f32(i6); i6 += 4;
-      const float32x4_t vk6x0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vk6x4567 = vld1q_f32(w); w += 4;
-      vacc0123 = vfmaq_f32(vacc0123, vi6x0123, vk6x0123);
-      vacc4567 = vfmaq_f32(vacc4567, vi6x4567, vk6x4567);
+      const float32x4_t vk6x0123 = vld1q_f32(w + 52);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123);
 
       const float32x4_t vi7x0123 = vld1q_f32(i7); i7 += 4;
-      const float32x4_t vi7x4567 = vld1q_f32(i7); i7 += 4;
-      const float32x4_t vk7x0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vk7x4567 = vld1q_f32(w); w += 4;
-      vacc0123 = vfmaq_f32(vacc0123, vi7x0123, vk7x0123);
-      vacc4567 = vfmaq_f32(vacc4567, vi7x4567, vk7x4567);
+      const float32x4_t vk7x0123 = vld1q_f32(w + 60);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi7x0123, vk7x0123);
 
       const float32x4_t vi8x0123 = vld1q_f32(i8); i8 += 4;
-      const float32x4_t vi8x4567 = vld1q_f32(i8); i8 += 4;
-      const float32x4_t vk8x0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vk8x4567 = vld1q_f32(w); w += 4;
-      vacc0123 = vfmaq_f32(vacc0123, vi8x0123, vk8x0123);
-      vacc4567 = vfmaq_f32(vacc4567, vi8x4567, vk8x4567);
+      const float32x4_t vk8x0123 = vld1q_f32(w + 68);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123);
 
-      vacc0123 = vmaxq_f32(vacc0123, vmin);
-      vacc4567 = vmaxq_f32(vacc4567, vmin);
+
+      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
       vacc0123 = vminq_f32(vacc0123, vmax);
-      vacc4567 = vminq_f32(vacc4567, vmax);
 
-      if (c & 4) {
-        vst1q_f32(output, vacc0123); output += 4;
-        vacc0123 = vacc4567;
-      }
+      vst1q_f32(output, vacc0123); output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      float32x4_t vacc0123p0 = vld1q_f32(w);
+
+
+      const float32x4_t vi0x0123 = vld1q_f32(i0);
+      const float32x4_t vk0x0123 = vld1q_f32(w + 8);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const float32x4_t vi1x0123 = vld1q_f32(i1);
+      const float32x4_t vk1x0123 = vld1q_f32(w + 16);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi1x0123, vk1x0123);
+
+      const float32x4_t vi2x0123 = vld1q_f32(i2);
+      const float32x4_t vk2x0123 = vld1q_f32(w + 24);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const float32x4_t vi3x0123 = vld1q_f32(i3);
+      const float32x4_t vk3x0123 = vld1q_f32(w + 32);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi3x0123, vk3x0123);
+
+      const float32x4_t vi4x0123 = vld1q_f32(i4);
+      const float32x4_t vk4x0123 = vld1q_f32(w + 40);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const float32x4_t vi5x0123 = vld1q_f32(i5);
+      const float32x4_t vk5x0123 = vld1q_f32(w + 48);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi5x0123, vk5x0123);
+
+      const float32x4_t vi6x0123 = vld1q_f32(i6);
+      const float32x4_t vk6x0123 = vld1q_f32(w + 56);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const float32x4_t vi7x0123 = vld1q_f32(i7);
+      const float32x4_t vk7x0123 = vld1q_f32(w + 64);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi7x0123, vk7x0123);
+
+      const float32x4_t vi8x0123 = vld1q_f32(i8);
+      const float32x4_t vk8x0123 = vld1q_f32(w + 72);
+      vacc0123p0 = vfmaq_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+
+      float32x4_t vacc0123 = vmaxq_f32(vacc0123p0, vmin);
+      vacc0123 = vminq_f32(vacc0123, vmax);
+
       float32x2_t vacc01 = vget_low_f32(vacc0123);
       if (c & 2) {
         vst1_f32(output, vacc01); output += 2;
diff --git a/src/f32-dwconv/up8x9-psimd-acc2.c b/src/f32-dwconv/up8x9-psimd-acc2.c
new file mode 100644
index 0000000..67f42c9
--- /dev/null
+++ b/src/f32-dwconv/up8x9-psimd-acc2.c
@@ -0,0 +1,273 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-psimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up8x9__psimd_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const psimd_f32 vmax = psimd_load_splat_f32(&params->scalar.max);
+  const psimd_f32 vmin = psimd_load_splat_f32(&params->scalar.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 8; c -= 8) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+      psimd_f32 vacc4567p0 = psimd_load_f32(w + 4);
+
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      const psimd_f32 vi0x4567 = psimd_load_f32(i0 + 4);
+      i0 += 8;
+
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
+      const psimd_f32 vk0x4567 = psimd_load_f32(w + 12);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi0x4567, vk0x4567);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      const psimd_f32 vi1x4567 = psimd_load_f32(i1 + 4);
+      i1 += 8;
+
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
+      const psimd_f32 vk1x4567 = psimd_load_f32(w + 20);
+      psimd_f32 vacc0123p1 = psimd_mul_f32(vi1x0123, vk1x0123);
+      psimd_f32 vacc4567p1 = psimd_mul_f32(vi1x4567, vk1x4567);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      const psimd_f32 vi2x4567 = psimd_load_f32(i2 + 4);
+      i2 += 8;
+
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
+      const psimd_f32 vk2x4567 = psimd_load_f32(w + 28);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi2x4567, vk2x4567);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      const psimd_f32 vi3x4567 = psimd_load_f32(i3 + 4);
+      i3 += 8;
+
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
+      const psimd_f32 vk3x4567 = psimd_load_f32(w + 36);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi3x0123, vk3x0123);
+      vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi3x4567, vk3x4567);
+
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
+      const psimd_f32 vi4x4567 = psimd_load_f32(i4 + 4);
+      i4 += 8;
+
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 40);
+      const psimd_f32 vk4x4567 = psimd_load_f32(w + 44);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi4x4567, vk4x4567);
+
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
+      const psimd_f32 vi5x4567 = psimd_load_f32(i5 + 4);
+      i5 += 8;
+
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 48);
+      const psimd_f32 vk5x4567 = psimd_load_f32(w + 52);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi5x0123, vk5x0123);
+      vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi5x4567, vk5x4567);
+
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
+      const psimd_f32 vi6x4567 = psimd_load_f32(i6 + 4);
+      i6 += 8;
+
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 56);
+      const psimd_f32 vk6x4567 = psimd_load_f32(w + 60);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi6x4567, vk6x4567);
+
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
+      const psimd_f32 vi7x4567 = psimd_load_f32(i7 + 4);
+      i7 += 8;
+
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 64);
+      const psimd_f32 vk7x4567 = psimd_load_f32(w + 68);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi7x0123, vk7x0123);
+      vacc4567p1 = psimd_qfma_f32(vacc4567p1, vi7x4567, vk7x4567);
+
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
+      const psimd_f32 vi8x4567 = psimd_load_f32(i8 + 4);
+      i8 += 8;
+
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 72);
+      const psimd_f32 vk8x4567 = psimd_load_f32(w + 76);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi8x4567, vk8x4567);
+
+      w += 80;
+
+      // Add up all accumulators to vacc0123p0 and vacc4567p0
+      vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1);
+      vacc4567p0 = psimd_add_f32(vacc4567p0, vacc4567p1);
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      psimd_f32 vacc4567 = psimd_max_f32(vacc4567p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+      vacc4567 = psimd_min_f32(vacc4567, vmax);
+
+      psimd_store_f32(output, vacc0123);
+      psimd_store_f32(output + 4, vacc4567);
+      output += 8;
+    }
+    for (; c >= 4; c -= 4) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      i0 += 4;
+
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      i1 += 4;
+
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
+      psimd_f32 vacc0123p1 = psimd_mul_f32(vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      i2 += 4;
+
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      i3 += 4;
+
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
+      i4 += 4;
+
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 40);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
+      i5 += 4;
+
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 48);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi5x0123, vk5x0123);
+
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
+      i6 += 4;
+
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 56);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
+      i7 += 4;
+
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 64);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi7x0123, vk7x0123);
+
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
+      i8 += 4;
+
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 72);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      w += 4;
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1);
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+      psimd_store_f32(output, vacc0123);
+      output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
+      psimd_f32 vacc0123p1 = psimd_mul_f32(vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi3x0123, vk3x0123);
+
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 40);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 48);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi5x0123, vk5x0123);
+
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 56);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 64);
+      vacc0123p1 = psimd_qfma_f32(vacc0123p1, vi7x0123, vk7x0123);
+
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 72);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = psimd_add_f32(vacc0123p0, vacc0123p1);
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+      if (c & 2) {
+        psimd_store2_f32(output, vacc0123);
+        vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        psimd_store1_f32(output, vacc0123);
+        output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up8x9-psimd.c b/src/f32-dwconv/up8x9-psimd.c
new file mode 100644
index 0000000..2cf6c47
--- /dev/null
+++ b/src/f32-dwconv/up8x9-psimd.c
@@ -0,0 +1,266 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-psimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <psimd.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up8x9__psimd(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const psimd_f32 vmax = psimd_load_splat_f32(&params->scalar.max);
+  const psimd_f32 vmin = psimd_load_splat_f32(&params->scalar.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 8; c -= 8) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+      psimd_f32 vacc4567p0 = psimd_load_f32(w + 4);
+
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      const psimd_f32 vi0x4567 = psimd_load_f32(i0 + 4);
+      i0 += 8;
+
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
+      const psimd_f32 vk0x4567 = psimd_load_f32(w + 12);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi0x4567, vk0x4567);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      const psimd_f32 vi1x4567 = psimd_load_f32(i1 + 4);
+      i1 += 8;
+
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
+      const psimd_f32 vk1x4567 = psimd_load_f32(w + 20);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi1x4567, vk1x4567);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      const psimd_f32 vi2x4567 = psimd_load_f32(i2 + 4);
+      i2 += 8;
+
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
+      const psimd_f32 vk2x4567 = psimd_load_f32(w + 28);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi2x4567, vk2x4567);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      const psimd_f32 vi3x4567 = psimd_load_f32(i3 + 4);
+      i3 += 8;
+
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
+      const psimd_f32 vk3x4567 = psimd_load_f32(w + 36);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi3x4567, vk3x4567);
+
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
+      const psimd_f32 vi4x4567 = psimd_load_f32(i4 + 4);
+      i4 += 8;
+
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 40);
+      const psimd_f32 vk4x4567 = psimd_load_f32(w + 44);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi4x4567, vk4x4567);
+
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
+      const psimd_f32 vi5x4567 = psimd_load_f32(i5 + 4);
+      i5 += 8;
+
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 48);
+      const psimd_f32 vk5x4567 = psimd_load_f32(w + 52);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi5x4567, vk5x4567);
+
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
+      const psimd_f32 vi6x4567 = psimd_load_f32(i6 + 4);
+      i6 += 8;
+
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 56);
+      const psimd_f32 vk6x4567 = psimd_load_f32(w + 60);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi6x4567, vk6x4567);
+
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
+      const psimd_f32 vi7x4567 = psimd_load_f32(i7 + 4);
+      i7 += 8;
+
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 64);
+      const psimd_f32 vk7x4567 = psimd_load_f32(w + 68);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi7x4567, vk7x4567);
+
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
+      const psimd_f32 vi8x4567 = psimd_load_f32(i8 + 4);
+      i8 += 8;
+
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 72);
+      const psimd_f32 vk8x4567 = psimd_load_f32(w + 76);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
+      vacc4567p0 = psimd_qfma_f32(vacc4567p0, vi8x4567, vk8x4567);
+
+      w += 80;
+
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      psimd_f32 vacc4567 = psimd_max_f32(vacc4567p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+      vacc4567 = psimd_min_f32(vacc4567, vmax);
+
+      psimd_store_f32(output, vacc0123);
+      psimd_store_f32(output + 4, vacc4567);
+      output += 8;
+    }
+    for (; c >= 4; c -= 4) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      i0 += 4;
+
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      i1 += 4;
+
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      i2 += 4;
+
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      i3 += 4;
+
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
+
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
+      i4 += 4;
+
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 40);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
+      i5 += 4;
+
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 48);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123);
+
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
+      i6 += 4;
+
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 56);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
+      i7 += 4;
+
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 64);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123);
+
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
+      i8 += 4;
+
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 72);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+      w += 4;
+
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+      psimd_store_f32(output, vacc0123);
+      output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      psimd_f32 vacc0123p0 = psimd_load_f32(w);
+
+      const psimd_f32 vi0x0123 = psimd_load_f32(i0);
+      const psimd_f32 vk0x0123 = psimd_load_f32(w + 8);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi0x0123, vk0x0123);
+
+      const psimd_f32 vi1x0123 = psimd_load_f32(i1);
+      const psimd_f32 vk1x0123 = psimd_load_f32(w + 16);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi1x0123, vk1x0123);
+
+      const psimd_f32 vi2x0123 = psimd_load_f32(i2);
+      const psimd_f32 vk2x0123 = psimd_load_f32(w + 24);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi2x0123, vk2x0123);
+
+      const psimd_f32 vi3x0123 = psimd_load_f32(i3);
+      const psimd_f32 vk3x0123 = psimd_load_f32(w + 32);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi3x0123, vk3x0123);
+
+      const psimd_f32 vi4x0123 = psimd_load_f32(i4);
+      const psimd_f32 vk4x0123 = psimd_load_f32(w + 40);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi4x0123, vk4x0123);
+
+      const psimd_f32 vi5x0123 = psimd_load_f32(i5);
+      const psimd_f32 vk5x0123 = psimd_load_f32(w + 48);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi5x0123, vk5x0123);
+
+      const psimd_f32 vi6x0123 = psimd_load_f32(i6);
+      const psimd_f32 vk6x0123 = psimd_load_f32(w + 56);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi6x0123, vk6x0123);
+
+      const psimd_f32 vi7x0123 = psimd_load_f32(i7);
+      const psimd_f32 vk7x0123 = psimd_load_f32(w + 64);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi7x0123, vk7x0123);
+
+      const psimd_f32 vi8x0123 = psimd_load_f32(i8);
+      const psimd_f32 vk8x0123 = psimd_load_f32(w + 72);
+      vacc0123p0 = psimd_qfma_f32(vacc0123p0, vi8x0123, vk8x0123);
+
+
+      psimd_f32 vacc0123 = psimd_max_f32(vacc0123p0, vmin);
+      vacc0123 = psimd_min_f32(vacc0123, vmax);
+
+      if (c & 2) {
+        psimd_store2_f32(output, vacc0123);
+        vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        psimd_store1_f32(output, vacc0123);
+        output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up8x9-sse-acc2.c b/src/f32-dwconv/up8x9-sse-acc2.c
new file mode 100644
index 0000000..d55d8ba
--- /dev/null
+++ b/src/f32-dwconv/up8x9-sse-acc2.c
@@ -0,0 +1,273 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xmmintrin.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up8x9__sse_acc2(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m128 vmax = _mm_load_ps(params->sse.max);
+  const __m128 vmin = _mm_load_ps(params->sse.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 8; c -= 8) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+      __m128 vacc4567p0 = _mm_load_ps(w + 4);
+
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
+      i0 += 8;
+
+      const __m128 vk0x0123 = _mm_load_ps(w + 8);
+      const __m128 vk0x4567 = _mm_load_ps(w + 12);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
+      i1 += 8;
+
+      const __m128 vk1x0123 = _mm_load_ps(w + 16);
+      const __m128 vk1x4567 = _mm_load_ps(w + 20);
+      __m128 vacc0123p1 = _mm_mul_ps(vi1x0123, vk1x0123);
+      __m128 vacc4567p1 = _mm_mul_ps(vi1x4567, vk1x4567);
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
+      i2 += 8;
+
+      const __m128 vk2x0123 = _mm_load_ps(w + 24);
+      const __m128 vk2x4567 = _mm_load_ps(w + 28);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
+      i3 += 8;
+
+      const __m128 vk3x0123 = _mm_load_ps(w + 32);
+      const __m128 vk3x4567 = _mm_load_ps(w + 36);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi3x0123, vk3x0123));
+      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi3x4567, vk3x4567));
+
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
+      const __m128 vi4x4567 = _mm_loadu_ps(i4 + 4);
+      i4 += 8;
+
+      const __m128 vk4x0123 = _mm_load_ps(w + 40);
+      const __m128 vk4x4567 = _mm_load_ps(w + 44);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi4x4567, vk4x4567));
+
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
+      const __m128 vi5x4567 = _mm_loadu_ps(i5 + 4);
+      i5 += 8;
+
+      const __m128 vk5x0123 = _mm_load_ps(w + 48);
+      const __m128 vk5x4567 = _mm_load_ps(w + 52);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi5x0123, vk5x0123));
+      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi5x4567, vk5x4567));
+
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
+      const __m128 vi6x4567 = _mm_loadu_ps(i6 + 4);
+      i6 += 8;
+
+      const __m128 vk6x0123 = _mm_load_ps(w + 56);
+      const __m128 vk6x4567 = _mm_load_ps(w + 60);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi6x4567, vk6x4567));
+
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
+      const __m128 vi7x4567 = _mm_loadu_ps(i7 + 4);
+      i7 += 8;
+
+      const __m128 vk7x0123 = _mm_load_ps(w + 64);
+      const __m128 vk7x4567 = _mm_load_ps(w + 68);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi7x0123, vk7x0123));
+      vacc4567p1 = _mm_add_ps(vacc4567p1, _mm_mul_ps(vi7x4567, vk7x4567));
+
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
+      const __m128 vi8x4567 = _mm_loadu_ps(i8 + 4);
+      i8 += 8;
+
+      const __m128 vk8x0123 = _mm_load_ps(w + 72);
+      const __m128 vk8x4567 = _mm_load_ps(w + 76);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi8x4567, vk8x4567));
+
+      w += 80;
+
+      // Add up all accumulators to vacc0123p0 and vacc4567p0
+      vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1);
+      vacc4567p0 = _mm_add_ps(vacc4567p0, vacc4567p1);
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+      vacc4567 = _mm_min_ps(vacc4567, vmax);
+
+      _mm_storeu_ps(output, vacc0123);
+      _mm_storeu_ps(output + 4, vacc4567);
+      output += 8;
+    }
+    for (; c >= 4; c -= 4) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      i0 += 4;
+
+      const __m128 vk0x0123 = _mm_load_ps(w + 8);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      i1 += 4;
+
+      const __m128 vk1x0123 = _mm_load_ps(w + 16);
+      __m128 vacc0123p1 = _mm_mul_ps(vi1x0123, vk1x0123);
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      i2 += 4;
+
+      const __m128 vk2x0123 = _mm_load_ps(w + 24);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      i3 += 4;
+
+      const __m128 vk3x0123 = _mm_load_ps(w + 32);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
+      i4 += 4;
+
+      const __m128 vk4x0123 = _mm_load_ps(w + 40);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
+
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
+      i5 += 4;
+
+      const __m128 vk5x0123 = _mm_load_ps(w + 48);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi5x0123, vk5x0123));
+
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
+      i6 += 4;
+
+      const __m128 vk6x0123 = _mm_load_ps(w + 56);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
+
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
+      i7 += 4;
+
+      const __m128 vk7x0123 = _mm_load_ps(w + 64);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi7x0123, vk7x0123));
+
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
+      i8 += 4;
+
+      const __m128 vk8x0123 = _mm_load_ps(w + 72);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
+
+      w += 4;
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1);
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+      _mm_storeu_ps(output, vacc0123);
+      output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      const __m128 vk0x0123 = _mm_load_ps(w + 8);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      const __m128 vk1x0123 = _mm_load_ps(w + 16);
+      __m128 vacc0123p1 = _mm_mul_ps(vi1x0123, vk1x0123);
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      const __m128 vk2x0123 = _mm_load_ps(w + 24);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      const __m128 vk3x0123 = _mm_load_ps(w + 32);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
+      const __m128 vk4x0123 = _mm_load_ps(w + 40);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
+
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
+      const __m128 vk5x0123 = _mm_load_ps(w + 48);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi5x0123, vk5x0123));
+
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
+      const __m128 vk6x0123 = _mm_load_ps(w + 56);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
+
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
+      const __m128 vk7x0123 = _mm_load_ps(w + 64);
+      vacc0123p1 = _mm_add_ps(vacc0123p1, _mm_mul_ps(vi7x0123, vk7x0123));
+
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
+      const __m128 vk8x0123 = _mm_load_ps(w + 72);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
+
+      // Add up all accumulators to vacc0123p0
+      vacc0123p0 = _mm_add_ps(vacc0123p0, vacc0123p1);
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+      if (c & 2) {
+        _mm_storel_pi((__m64*) output, vacc0123);
+        vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        _mm_store_ss(output, vacc0123);
+        output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/f32-dwconv/up8x9-sse.c b/src/f32-dwconv/up8x9-sse.c
new file mode 100644
index 0000000..52ec2ec
--- /dev/null
+++ b/src/f32-dwconv/up8x9-sse.c
@@ -0,0 +1,266 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-sse.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xmmintrin.h>
+
+#include <xnnpack/dwconv.h>
+
+
+void xnn_f32_dwconv_ukernel_up8x9__sse(
+    size_t channels,
+    size_t output_width,
+    const float** input,
+    const float* weights,
+    float* output,
+    size_t input_stride,
+    size_t output_increment,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const __m128 vmax = _mm_load_ps(params->sse.max);
+  const __m128 vmin = _mm_load_ps(params->sse.min);
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 8; c -= 8) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+      __m128 vacc4567p0 = _mm_load_ps(w + 4);
+
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      const __m128 vi0x4567 = _mm_loadu_ps(i0 + 4);
+      i0 += 8;
+
+      const __m128 vk0x0123 = _mm_load_ps(w + 8);
+      const __m128 vk0x4567 = _mm_load_ps(w + 12);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi0x4567, vk0x4567));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      const __m128 vi1x4567 = _mm_loadu_ps(i1 + 4);
+      i1 += 8;
+
+      const __m128 vk1x0123 = _mm_load_ps(w + 16);
+      const __m128 vk1x4567 = _mm_load_ps(w + 20);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi1x4567, vk1x4567));
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      const __m128 vi2x4567 = _mm_loadu_ps(i2 + 4);
+      i2 += 8;
+
+      const __m128 vk2x0123 = _mm_load_ps(w + 24);
+      const __m128 vk2x4567 = _mm_load_ps(w + 28);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi2x4567, vk2x4567));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      const __m128 vi3x4567 = _mm_loadu_ps(i3 + 4);
+      i3 += 8;
+
+      const __m128 vk3x0123 = _mm_load_ps(w + 32);
+      const __m128 vk3x4567 = _mm_load_ps(w + 36);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi3x4567, vk3x4567));
+
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
+      const __m128 vi4x4567 = _mm_loadu_ps(i4 + 4);
+      i4 += 8;
+
+      const __m128 vk4x0123 = _mm_load_ps(w + 40);
+      const __m128 vk4x4567 = _mm_load_ps(w + 44);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi4x4567, vk4x4567));
+
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
+      const __m128 vi5x4567 = _mm_loadu_ps(i5 + 4);
+      i5 += 8;
+
+      const __m128 vk5x0123 = _mm_load_ps(w + 48);
+      const __m128 vk5x4567 = _mm_load_ps(w + 52);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi5x4567, vk5x4567));
+
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
+      const __m128 vi6x4567 = _mm_loadu_ps(i6 + 4);
+      i6 += 8;
+
+      const __m128 vk6x0123 = _mm_load_ps(w + 56);
+      const __m128 vk6x4567 = _mm_load_ps(w + 60);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi6x4567, vk6x4567));
+
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
+      const __m128 vi7x4567 = _mm_loadu_ps(i7 + 4);
+      i7 += 8;
+
+      const __m128 vk7x0123 = _mm_load_ps(w + 64);
+      const __m128 vk7x4567 = _mm_load_ps(w + 68);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi7x4567, vk7x4567));
+
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
+      const __m128 vi8x4567 = _mm_loadu_ps(i8 + 4);
+      i8 += 8;
+
+      const __m128 vk8x0123 = _mm_load_ps(w + 72);
+      const __m128 vk8x4567 = _mm_load_ps(w + 76);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
+      vacc4567p0 = _mm_add_ps(vacc4567p0, _mm_mul_ps(vi8x4567, vk8x4567));
+
+      w += 80;
+
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      __m128 vacc4567 = _mm_max_ps(vacc4567p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+      vacc4567 = _mm_min_ps(vacc4567, vmax);
+
+      _mm_storeu_ps(output, vacc0123);
+      _mm_storeu_ps(output + 4, vacc4567);
+      output += 8;
+    }
+    for (; c >= 4; c -= 4) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      i0 += 4;
+
+      const __m128 vk0x0123 = _mm_load_ps(w + 8);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      i1 += 4;
+
+      const __m128 vk1x0123 = _mm_load_ps(w + 16);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      i2 += 4;
+
+      const __m128 vk2x0123 = _mm_load_ps(w + 24);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      i3 += 4;
+
+      const __m128 vk3x0123 = _mm_load_ps(w + 32);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
+      i4 += 4;
+
+      const __m128 vk4x0123 = _mm_load_ps(w + 40);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
+
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
+      i5 += 4;
+
+      const __m128 vk5x0123 = _mm_load_ps(w + 48);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
+
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
+      i6 += 4;
+
+      const __m128 vk6x0123 = _mm_load_ps(w + 56);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
+
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
+      i7 += 4;
+
+      const __m128 vk7x0123 = _mm_load_ps(w + 64);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
+
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
+      i8 += 4;
+
+      const __m128 vk8x0123 = _mm_load_ps(w + 72);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
+
+      w += 4;
+
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+      _mm_storeu_ps(output, vacc0123);
+      output += 4;
+    }
+    if XNN_UNLIKELY(c != 0) {
+      __m128 vacc0123p0 = _mm_load_ps(w);
+
+      const __m128 vi0x0123 = _mm_loadu_ps(i0);
+      const __m128 vk0x0123 = _mm_load_ps(w + 8);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi0x0123, vk0x0123));
+
+      const __m128 vi1x0123 = _mm_loadu_ps(i1);
+      const __m128 vk1x0123 = _mm_load_ps(w + 16);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi1x0123, vk1x0123));
+
+      const __m128 vi2x0123 = _mm_loadu_ps(i2);
+      const __m128 vk2x0123 = _mm_load_ps(w + 24);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi2x0123, vk2x0123));
+
+      const __m128 vi3x0123 = _mm_loadu_ps(i3);
+      const __m128 vk3x0123 = _mm_load_ps(w + 32);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi3x0123, vk3x0123));
+
+      const __m128 vi4x0123 = _mm_loadu_ps(i4);
+      const __m128 vk4x0123 = _mm_load_ps(w + 40);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi4x0123, vk4x0123));
+
+      const __m128 vi5x0123 = _mm_loadu_ps(i5);
+      const __m128 vk5x0123 = _mm_load_ps(w + 48);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi5x0123, vk5x0123));
+
+      const __m128 vi6x0123 = _mm_loadu_ps(i6);
+      const __m128 vk6x0123 = _mm_load_ps(w + 56);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi6x0123, vk6x0123));
+
+      const __m128 vi7x0123 = _mm_loadu_ps(i7);
+      const __m128 vk7x0123 = _mm_load_ps(w + 64);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi7x0123, vk7x0123));
+
+      const __m128 vi8x0123 = _mm_loadu_ps(i8);
+      const __m128 vk8x0123 = _mm_load_ps(w + 72);
+      vacc0123p0 = _mm_add_ps(vacc0123p0, _mm_mul_ps(vi8x0123, vk8x0123));
+
+
+      __m128 vacc0123 = _mm_max_ps(vacc0123p0, vmin);
+      vacc0123 = _mm_min_ps(vacc0123, vmax);
+
+      if (c & 2) {
+        _mm_storel_pi((__m64*) output, vacc0123);
+        vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
+        output += 2;
+      }
+      if (c & 1) {
+        _mm_store_ss(output, vacc0123);
+        output += 1;
+      }
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--output_width != 0);
+}
diff --git a/src/init.c b/src/init.c
index ba9c716..86d43b9 100644
--- a/src/init.c
+++ b/src/init.c
@@ -586,18 +586,18 @@
       .log2_kr = 2,
     };
     xnn_params.f32.dwconv[0] = (struct dwconv_parameters) {
-      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x4__sse,
-      .cr = 4,
+      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up8x4__sse,
+      .cr = 8,
       .mr = 4,
     };
     xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
-      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__sse,
-      .cr = 4,
+      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up8x9__sse,
+      .cr = 8,
       .mr = 9,
     };
     xnn_params.f32.dwconv[2] = (struct dwconv_parameters) {
-      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x25__sse,
-      .cr = 4,
+      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up8x25__sse,
+      .cr = 8,
       .mr = 25,
     };
     xnn_params.f32.avgpool = (struct avgpool_parameters) {
@@ -776,17 +776,17 @@
       .log2_kr = 2,
     };
     xnn_params.f32.dwconv[0] = (struct dwconv_parameters) {
-      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x4__psimd,
+      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x4__psimd_acc2,
       .cr = 4,
       .mr = 4,
     };
     xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
-      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__psimd,
+      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__psimd_acc2,
       .cr = 4,
       .mr = 9,
     };
     xnn_params.f32.dwconv[2] = (struct dwconv_parameters) {
-      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x25__psimd,
+      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x25__psimd_acc2,
       .cr = 4,
       .mr = 25,
     };
@@ -941,17 +941,17 @@
       .nr = 2,
     };
     xnn_params.f32.dwconv[0] = (struct dwconv_parameters) {
-      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar,
+      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2,
       .cr = 1,
       .mr = 4,
     };
     xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
-      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar,
+      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2,
       .cr = 1,
       .mr = 9,
     };
     xnn_params.f32.dwconv[2] = (struct dwconv_parameters) {
-      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar,
+      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2,
       .cr = 1,
       .mr = 25,
     };
diff --git a/src/xnnpack/dwconv.h b/src/xnnpack/dwconv.h
index 6d8f79a..440a756 100644
--- a/src/xnnpack/dwconv.h
+++ b/src/xnnpack/dwconv.h
@@ -29,20 +29,54 @@
     size_t output_increment,                                 \
     const union xnn_f32_output_params* params);
 
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x25__scalar)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x4__scalar)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x9__scalar)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x25__psimd)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x25__sse)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x4__scalar_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x4__scalar)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x4__scalar_acc2)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x4__psimd)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x4__psimd_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x4__psimd)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x4__psimd_acc2)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x4__sse)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x4__sse_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x4__sse)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x4__sse_acc2)
+
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x9__scalar)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x9__scalar_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x9__scalar)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x9__scalar_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__neon)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__neon_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x9__neon)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x9__neon_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__neonfma)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x9__neonfma)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma_cortex_a55)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__neon)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__neonfma)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__psimd)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__psimd_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x9__psimd)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x9__psimd_acc2)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__sse)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x9__neonfma)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__sse_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x9__sse)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x9__sse_acc2)
+
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x25__scalar)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x25__scalar_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x25__scalar)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x25__scalar_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x25__psimd)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x25__psimd_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x25__psimd)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x25__psimd_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x25__sse)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x25__sse_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x25__sse)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x25__sse_acc2)
 
 
 #define DECLARE_Q8_DWCONV_UNIPASS_UKERNEL_FUNCTION(fn_name) \
diff --git a/test/f32-dwconv.cc b/test/f32-dwconv.cc
index 626ba01..ad9ffab 100644
--- a/test/f32-dwconv.cc
+++ b/test/f32-dwconv.cc
@@ -507,6 +507,483 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(F32_DWCONV_UP4X9__NEONFMA_ACC2, c_eq_4) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    DWConvMicrokernelTester()
+      .cr(4)
+      .kr(9)
+      .channels(4)
+      .Test(xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2);
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEONFMA_ACC2, c_div_4) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEONFMA_ACC2, c_div_4_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEONFMA_ACC2, c_div_4_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEONFMA_ACC2, c_lt_4) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 1; channels < 4; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEONFMA_ACC2, c_gt_4) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEONFMA_ACC2, c_gt_4_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEONFMA_ACC2, c_gt_4_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEONFMA_ACC2, multipixel) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEONFMA_ACC2, multipixel_with_step) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      for (size_t step = 2; step <= 9; step++) {
+        DWConvMicrokernelTester()
+          .cr(4)
+          .kr(9)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEONFMA_ACC2, multipixel_with_output_stride) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(4)
+        .width(5)
+        .output_stride(23)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEONFMA_ACC2, multipixel_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEONFMA_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2);
+    }
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(F32_DWCONV_UP8X9__NEONFMA, c_eq_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(9)
+      .channels(8)
+      .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma);
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA, c_div_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA, c_div_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA, c_div_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA, c_lt_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA, c_gt_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA, c_gt_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA, c_gt_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA, multipixel) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA, multipixel_with_step) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 9; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(9)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA, multipixel_with_output_stride) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(8)
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA, multipixel_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA, multipixel_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma);
+    }
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  TEST(F32_DWCONV_UP8X9__NEONFMA_ACC2, c_eq_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(9)
+      .channels(8)
+      .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2);
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA_ACC2, c_div_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA_ACC2, c_div_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA_ACC2, c_div_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA_ACC2, c_lt_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA_ACC2, c_gt_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA_ACC2, c_gt_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA_ACC2, c_gt_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA_ACC2, multipixel) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA_ACC2, multipixel_with_step) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 9; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(9)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA_ACC2, multipixel_with_output_stride) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(8)
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA_ACC2, multipixel_with_qmin) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEONFMA_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2);
+    }
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(F32_DWCONV_UP4X9__NEON, c_eq_4) {
     TEST_REQUIRES_ARM_NEON;
     DWConvMicrokernelTester()
@@ -665,6 +1142,483 @@
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  // Unit tests for the 4-channel-tile, 9-tap NEON dwconv micro-kernel with 2 accumulators:
+  // exact/divisible/smaller/larger channel counts, clamping, and multi-pixel runs.
+  TEST(F32_DWCONV_UP4X9__NEON_ACC2, c_eq_4) {
+    TEST_REQUIRES_ARM_NEON;
+    DWConvMicrokernelTester()
+      .cr(4)
+      .kr(9)
+      .channels(4)
+      .Test(xnn_f32_dwconv_ukernel_up4x9__neon_acc2);
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEON_ACC2, c_div_4) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEON_ACC2, c_div_4_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEON_ACC2, c_div_4_with_qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEON_ACC2, c_lt_4) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 1; channels < 4; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEON_ACC2, c_gt_4) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEON_ACC2, c_gt_4_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEON_ACC2, c_gt_4_with_qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEON_ACC2, multipixel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEON_ACC2, multipixel_with_step) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      for (size_t step = 2; step <= 9; step++) {
+        DWConvMicrokernelTester()
+          .cr(4)
+          .kr(9)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up4x9__neon_acc2);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEON_ACC2, multipixel_with_output_stride) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)  // was hard-coded to 4, leaving the loop variable unused and every iteration identical
+        .width(5)
+        .output_stride(23)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEON_ACC2, multipixel_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__NEON_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__neon_acc2);
+    }
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  // Unit tests for the 8-channel-tile, 9-tap NEON dwconv micro-kernel:
+  // exact/divisible/smaller/larger channel counts, clamping, and multi-pixel runs.
+  TEST(F32_DWCONV_UP8X9__NEON, c_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(9)
+      .channels(8)
+      .Test(xnn_f32_dwconv_ukernel_up8x9__neon);
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON, c_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON, c_div_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON, c_div_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON, c_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON, c_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON, c_gt_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON, c_gt_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON, multipixel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON, multipixel_with_step) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 9; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(9)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up8x9__neon);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON, multipixel_with_output_stride) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)  // was hard-coded to 8, leaving the loop variable unused and every iteration identical
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON, multipixel_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON, multipixel_with_qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon);
+    }
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  // Unit tests for the 8-channel-tile, 9-tap NEON dwconv micro-kernel with 2 accumulators:
+  // exact/divisible/smaller/larger channel counts, clamping, and multi-pixel runs.
+  TEST(F32_DWCONV_UP8X9__NEON_ACC2, c_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(9)
+      .channels(8)
+      .Test(xnn_f32_dwconv_ukernel_up8x9__neon_acc2);
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON_ACC2, c_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON_ACC2, c_div_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON_ACC2, c_div_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON_ACC2, c_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON_ACC2, c_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON_ACC2, c_gt_8_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON_ACC2, c_gt_8_with_qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON_ACC2, multipixel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON_ACC2, multipixel_with_step) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 9; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(9)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up8x9__neon_acc2);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON_ACC2, multipixel_with_output_stride) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)  // was hard-coded to 8, leaving the loop variable unused and every iteration identical
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON_ACC2, multipixel_with_qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__NEON_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__neon_acc2);
+    }
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_DWCONV_UP4X25__SSE, c_eq_4) {
     TEST_REQUIRES_X86_SSE;
@@ -825,6 +1779,483 @@
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Unit tests for the 4-channel-tile, 25-tap SSE dwconv micro-kernel with 2 accumulators:
+  // exact/divisible/smaller/larger channel counts, clamping, and multi-pixel runs.
+  TEST(F32_DWCONV_UP4X25__SSE_ACC2, c_eq_4) {
+    TEST_REQUIRES_X86_SSE;
+    DWConvMicrokernelTester()
+      .cr(4)
+      .kr(25)
+      .channels(4)
+      .Test(xnn_f32_dwconv_ukernel_up4x25__sse_acc2);
+  }
+
+  TEST(F32_DWCONV_UP4X25__SSE_ACC2, c_div_4) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__SSE_ACC2, c_div_4_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__SSE_ACC2, c_div_4_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__SSE_ACC2, c_lt_4) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 1; channels < 4; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__SSE_ACC2, c_gt_4) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__SSE_ACC2, c_gt_4_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__SSE_ACC2, c_gt_4_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__SSE_ACC2, multipixel) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__SSE_ACC2, multipixel_with_step) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      for (size_t step = 2; step <= 25; step++) {
+        DWConvMicrokernelTester()
+          .cr(4)
+          .kr(25)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up4x25__sse_acc2);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__SSE_ACC2, multipixel_with_output_stride) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)  // was hard-coded to 4, leaving the loop variable unused and every iteration identical
+        .width(5)
+        .output_stride(23)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__SSE_ACC2, multipixel_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__SSE_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__sse_acc2);
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Unit tests for the 8-channel-tile, 25-tap SSE dwconv micro-kernel:
+  // exact/divisible/smaller/larger channel counts, clamping, and multi-pixel runs.
+  TEST(F32_DWCONV_UP8X25__SSE, c_eq_8) {
+    TEST_REQUIRES_X86_SSE;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(25)
+      .channels(8)
+      .Test(xnn_f32_dwconv_ukernel_up8x25__sse);
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE, c_div_8) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE, c_div_8_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE, c_div_8_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE, c_lt_8) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE, c_gt_8) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE, c_gt_8_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE, c_gt_8_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE, multipixel) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE, multipixel_with_step) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 25; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(25)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up8x25__sse);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE, multipixel_with_output_stride) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)  // was hard-coded to 8, leaving the loop variable unused and every iteration identical
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE, multipixel_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE, multipixel_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse);
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Unit tests for the 8-channel-tile, 25-tap SSE dwconv micro-kernel with 2 accumulators:
+  // exact/divisible/smaller/larger channel counts, clamping, and multi-pixel runs.
+  TEST(F32_DWCONV_UP8X25__SSE_ACC2, c_eq_8) {
+    TEST_REQUIRES_X86_SSE;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(25)
+      .channels(8)
+      .Test(xnn_f32_dwconv_ukernel_up8x25__sse_acc2);
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE_ACC2, c_div_8) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE_ACC2, c_div_8_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE_ACC2, c_div_8_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE_ACC2, c_lt_8) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE_ACC2, c_gt_8) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE_ACC2, c_gt_8_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE_ACC2, c_gt_8_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE_ACC2, multipixel) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE_ACC2, multipixel_with_step) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 25; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(25)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up8x25__sse_acc2);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE_ACC2, multipixel_with_output_stride) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)  // was hard-coded to 8, leaving the loop variable unused and every iteration identical
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE_ACC2, multipixel_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__SSE_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__sse_acc2);
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_DWCONV_UP4X9__SSE, c_eq_4) {
     TEST_REQUIRES_X86_SSE;
     DWConvMicrokernelTester()
@@ -984,6 +2415,483 @@
 
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Unit tests for the 4-channel-tile, 9-tap SSE dwconv micro-kernel with 2 accumulators:
+  // exact/divisible/smaller/larger channel counts, clamping, and multi-pixel runs.
+  TEST(F32_DWCONV_UP4X9__SSE_ACC2, c_eq_4) {
+    TEST_REQUIRES_X86_SSE;
+    DWConvMicrokernelTester()
+      .cr(4)
+      .kr(9)
+      .channels(4)
+      .Test(xnn_f32_dwconv_ukernel_up4x9__sse_acc2);
+  }
+
+  TEST(F32_DWCONV_UP4X9__SSE_ACC2, c_div_4) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__SSE_ACC2, c_div_4_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__SSE_ACC2, c_div_4_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__SSE_ACC2, c_lt_4) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 1; channels < 4; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__SSE_ACC2, c_gt_4) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__SSE_ACC2, c_gt_4_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__SSE_ACC2, c_gt_4_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__SSE_ACC2, multipixel) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__SSE_ACC2, multipixel_with_step) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      for (size_t step = 2; step <= 9; step++) {
+        DWConvMicrokernelTester()
+          .cr(4)
+          .kr(9)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up4x9__sse_acc2);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__SSE_ACC2, multipixel_with_output_stride) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)  // was hard-coded to 4, leaving the loop variable unused and every iteration identical
+        .width(5)
+        .output_stride(23)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__SSE_ACC2, multipixel_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__SSE_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__sse_acc2);
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  // Unit tests for the 8-channel-tile, 9-tap SSE dwconv micro-kernel: exact,
+  // divisible, smaller, and larger channel counts, clamping, and multi-pixel runs.
+  TEST(F32_DWCONV_UP8X9__SSE, c_eq_8) {
+    TEST_REQUIRES_X86_SSE;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(9)
+      .channels(8)
+      .Test(xnn_f32_dwconv_ukernel_up8x9__sse);
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE, c_div_8) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE, c_div_8_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE, c_div_8_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE, c_lt_8) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE, c_gt_8) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE, c_gt_8_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE, c_gt_8_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE, multipixel) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE, multipixel_with_step) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 9; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(9)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up8x9__sse);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE, multipixel_with_output_stride) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)  // was .channels(8): loop variable was unused, so only one channel count was ever tested
+        .width(5)
+        .output_stride(43)   // stride 43 > max channels (40), valid for every iteration
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE, multipixel_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE, multipixel_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse);
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(F32_DWCONV_UP8X9__SSE_ACC2, c_eq_8) {
+    TEST_REQUIRES_X86_SSE;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(9)
+      .channels(8)
+      .Test(xnn_f32_dwconv_ukernel_up8x9__sse_acc2);
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE_ACC2, c_div_8) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE_ACC2, c_div_8_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE_ACC2, c_div_8_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE_ACC2, c_lt_8) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE_ACC2, c_gt_8) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE_ACC2, c_gt_8_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE_ACC2, c_gt_8_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE_ACC2, multipixel) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE_ACC2, multipixel_with_step) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 9; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(9)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up8x9__sse_acc2);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE_ACC2, multipixel_with_output_stride) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)  // was .channels(8): loop variable was unused, so only one channel count was ever tested
+        .width(5)
+        .output_stride(43)   // stride 43 > max channels (40), valid for every iteration
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE_ACC2, multipixel_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__SSE_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__sse_acc2);
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
   TEST(F32_DWCONV_UP4X4__SSE, c_eq_4) {
     TEST_REQUIRES_X86_SSE;
     DWConvMicrokernelTester()
@@ -1142,6 +3050,483 @@
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
 
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(F32_DWCONV_UP4X4__SSE_ACC2, c_eq_4) {
+    TEST_REQUIRES_X86_SSE;
+    DWConvMicrokernelTester()
+      .cr(4)
+      .kr(4)
+      .channels(4)
+      .Test(xnn_f32_dwconv_ukernel_up4x4__sse_acc2);
+  }
+
+  TEST(F32_DWCONV_UP4X4__SSE_ACC2, c_div_4) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__SSE_ACC2, c_div_4_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__SSE_ACC2, c_div_4_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__SSE_ACC2, c_lt_4) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 1; channels < 4; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__SSE_ACC2, c_gt_4) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__SSE_ACC2, c_gt_4_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__SSE_ACC2, c_gt_4_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__SSE_ACC2, multipixel) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__SSE_ACC2, multipixel_with_step) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      for (size_t step = 2; step <= 4; step++) {
+        DWConvMicrokernelTester()
+          .cr(4)
+          .kr(4)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up4x4__sse_acc2);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__SSE_ACC2, multipixel_with_output_stride) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)  // was .channels(4): loop variable was unused, so only one channel count was ever tested
+        .width(5)
+        .output_stride(23)   // stride 23 > max channels (20), valid for every iteration
+        .Test(xnn_f32_dwconv_ukernel_up4x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__SSE_ACC2, multipixel_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__SSE_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__sse_acc2);
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(F32_DWCONV_UP8X4__SSE, c_eq_8) {
+    TEST_REQUIRES_X86_SSE;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(4)
+      .channels(8)
+      .Test(xnn_f32_dwconv_ukernel_up8x4__sse);
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE, c_div_8) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE, c_div_8_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE, c_div_8_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE, c_lt_8) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE, c_gt_8) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE, c_gt_8_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE, c_gt_8_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE, multipixel) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE, multipixel_with_step) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 4; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(4)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up8x4__sse);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE, multipixel_with_output_stride) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)  // was .channels(8): loop variable was unused, so only one channel count was ever tested
+        .width(5)
+        .output_stride(43)   // stride 43 > max channels (40), valid for every iteration
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE, multipixel_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE, multipixel_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse);
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  TEST(F32_DWCONV_UP8X4__SSE_ACC2, c_eq_8) {
+    TEST_REQUIRES_X86_SSE;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(4)
+      .channels(8)
+      .Test(xnn_f32_dwconv_ukernel_up8x4__sse_acc2);
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE_ACC2, c_div_8) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE_ACC2, c_div_8_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE_ACC2, c_div_8_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE_ACC2, c_lt_8) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE_ACC2, c_gt_8) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE_ACC2, c_gt_8_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE_ACC2, c_gt_8_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE_ACC2, multipixel) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE_ACC2, multipixel_with_step) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 4; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(4)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up8x4__sse_acc2);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE_ACC2, multipixel_with_output_stride) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)  // was .channels(8): loop variable was unused, so only one channel count was ever tested
+        .width(5)
+        .output_stride(43)   // stride 43 > max channels (40), valid for every iteration
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE_ACC2, multipixel_with_qmin) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse_acc2);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__SSE_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_X86_SSE;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__sse_acc2);
+    }
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
 #if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_DWCONV_UP4X25__PSIMD, c_eq_4) {
     TEST_REQUIRES_PSIMD;
@@ -1302,6 +3687,483 @@
 
 
 #if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+  TEST(F32_DWCONV_UP4X25__PSIMD_ACC2, c_eq_4) {
+    TEST_REQUIRES_PSIMD;
+    DWConvMicrokernelTester()
+      .cr(4)
+      .kr(25)
+      .channels(4)
+      .Test(xnn_f32_dwconv_ukernel_up4x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_DWCONV_UP4X25__PSIMD_ACC2, c_div_4) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__PSIMD_ACC2, c_div_4_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__PSIMD_ACC2, c_div_4_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__PSIMD_ACC2, c_lt_4) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 1; channels < 4; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__PSIMD_ACC2, c_gt_4) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__PSIMD_ACC2, c_gt_4_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__PSIMD_ACC2, c_gt_4_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__PSIMD_ACC2, multipixel) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__PSIMD_ACC2, multipixel_with_step) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      for (size_t step = 2; step <= 25; step++) {
+        DWConvMicrokernelTester()
+          .cr(4)
+          .kr(25)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up4x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__PSIMD_ACC2, multipixel_with_output_stride) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)  // was .channels(4): loop variable was unused, so only one channel count was ever tested
+        .width(5)
+        .output_stride(23)   // stride 23 > max channels (20), valid for every iteration
+        .Test(xnn_f32_dwconv_ukernel_up4x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__PSIMD_ACC2, multipixel_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X25__PSIMD_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+
+
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+  TEST(F32_DWCONV_UP8X25__PSIMD, c_eq_8) {
+    TEST_REQUIRES_PSIMD;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(25)
+      .channels(8)
+      .Test(xnn_f32_dwconv_ukernel_up8x25__psimd, DWConvMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD, c_div_8) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD, c_div_8_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD, c_div_8_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD, c_lt_8) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD, c_gt_8) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD, c_gt_8_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD, c_gt_8_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD, multipixel) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD, multipixel_with_step) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 25; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(25)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up8x25__psimd, DWConvMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD, multipixel_with_output_stride) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)  // was .channels(8): loop variable was unused, so only one channel count was ever tested
+        .width(5)
+        .output_stride(43)   // stride 43 > max channels (40), valid for every iteration
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD, multipixel_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD, multipixel_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+
+
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+  TEST(F32_DWCONV_UP8X25__PSIMD_ACC2, c_eq_8) {
+    TEST_REQUIRES_PSIMD;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(25)
+      .channels(8)
+      .Test(xnn_f32_dwconv_ukernel_up8x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD_ACC2, c_div_8) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD_ACC2, c_div_8_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD_ACC2, c_div_8_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD_ACC2, c_lt_8) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD_ACC2, c_gt_8) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD_ACC2, c_gt_8_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD_ACC2, c_gt_8_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD_ACC2, multipixel) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD_ACC2, multipixel_with_step) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 25; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(25)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up8x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD_ACC2, multipixel_with_output_stride) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)  // was .channels(8): loop variable was unused, so only one channel count was ever tested
+        .width(5)
+        .output_stride(43)   // stride 43 > max channels (40), valid for every iteration
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD_ACC2, multipixel_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X25__PSIMD_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x25__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+
+
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_DWCONV_UP4X9__PSIMD, c_eq_4) {
     TEST_REQUIRES_PSIMD;
     DWConvMicrokernelTester()
@@ -1461,6 +4323,483 @@
 
 
 #if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+  TEST(F32_DWCONV_UP4X9__PSIMD_ACC2, c_eq_4) {
+    TEST_REQUIRES_PSIMD;
+    DWConvMicrokernelTester()
+      .cr(4)
+      .kr(9)
+      .channels(4)
+      .Test(xnn_f32_dwconv_ukernel_up4x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_DWCONV_UP4X9__PSIMD_ACC2, c_div_4) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__PSIMD_ACC2, c_div_4_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__PSIMD_ACC2, c_div_4_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__PSIMD_ACC2, c_lt_4) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 1; channels < 4; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__PSIMD_ACC2, c_gt_4) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__PSIMD_ACC2, c_gt_4_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__PSIMD_ACC2, c_gt_4_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__PSIMD_ACC2, multipixel) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__PSIMD_ACC2, multipixel_with_step) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      for (size_t step = 2; step <= 9; step++) {
+        DWConvMicrokernelTester()
+          .cr(4)
+          .kr(9)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up4x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__PSIMD_ACC2, multipixel_with_output_stride) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .width(5)
+        .output_stride(23)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__PSIMD_ACC2, multipixel_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X9__PSIMD_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+
+
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+  TEST(F32_DWCONV_UP8X9__PSIMD, c_eq_8) {
+    TEST_REQUIRES_PSIMD;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(9)
+      .channels(8)
+      .Test(xnn_f32_dwconv_ukernel_up8x9__psimd, DWConvMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD, c_div_8) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD, c_div_8_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD, c_div_8_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD, c_lt_8) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD, c_gt_8) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD, c_gt_8_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD, c_gt_8_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD, multipixel) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD, multipixel_with_step) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 9; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(9)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up8x9__psimd, DWConvMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD, multipixel_with_output_stride) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD, multipixel_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD, multipixel_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+
+
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+  TEST(F32_DWCONV_UP8X9__PSIMD_ACC2, c_eq_8) {
+    TEST_REQUIRES_PSIMD;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(9)
+      .channels(8)
+      .Test(xnn_f32_dwconv_ukernel_up8x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD_ACC2, c_div_8) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD_ACC2, c_div_8_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD_ACC2, c_div_8_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD_ACC2, c_lt_8) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD_ACC2, c_gt_8) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD_ACC2, c_gt_8_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD_ACC2, c_gt_8_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD_ACC2, multipixel) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD_ACC2, multipixel_with_step) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 9; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(9)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up8x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD_ACC2, multipixel_with_output_stride) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD_ACC2, multipixel_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X9__PSIMD_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x9__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+
+
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
   TEST(F32_DWCONV_UP4X4__PSIMD, c_eq_4) {
     TEST_REQUIRES_PSIMD;
     DWConvMicrokernelTester()
@@ -1619,6 +4958,483 @@
 #endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
 
 
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+  TEST(F32_DWCONV_UP4X4__PSIMD_ACC2, c_eq_4) {
+    TEST_REQUIRES_PSIMD;
+    DWConvMicrokernelTester()
+      .cr(4)
+      .kr(4)
+      .channels(4)
+      .Test(xnn_f32_dwconv_ukernel_up4x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_DWCONV_UP4X4__PSIMD_ACC2, c_div_4) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__PSIMD_ACC2, c_div_4_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__PSIMD_ACC2, c_div_4_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 8; channels < 64; channels += 12) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__PSIMD_ACC2, c_lt_4) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 1; channels < 4; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__PSIMD_ACC2, c_gt_4) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__PSIMD_ACC2, c_gt_4_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__PSIMD_ACC2, c_gt_4_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 5; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__PSIMD_ACC2, multipixel) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__PSIMD_ACC2, multipixel_with_step) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      for (size_t step = 2; step <= 4; step++) {
+        DWConvMicrokernelTester()
+          .cr(4)
+          .kr(4)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up4x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__PSIMD_ACC2, multipixel_with_output_stride) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .width(5)
+        .output_stride(23)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__PSIMD_ACC2, multipixel_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP4X4__PSIMD_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 20; channels += 3) {
+      DWConvMicrokernelTester()
+        .cr(4)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up4x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+
+
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+  TEST(F32_DWCONV_UP8X4__PSIMD, c_eq_8) {
+    TEST_REQUIRES_PSIMD;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(4)
+      .channels(8)
+      .Test(xnn_f32_dwconv_ukernel_up8x4__psimd, DWConvMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD, c_div_8) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD, c_div_8_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD, c_div_8_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD, c_lt_8) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD, c_gt_8) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD, c_gt_8_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD, c_gt_8_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD, multipixel) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD, multipixel_with_step) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 4; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(4)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up8x4__psimd, DWConvMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD, multipixel_with_output_stride) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD, multipixel_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD, multipixel_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+
+
+#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+  TEST(F32_DWCONV_UP8X4__PSIMD_ACC2, c_eq_8) {
+    TEST_REQUIRES_PSIMD;
+    DWConvMicrokernelTester()
+      .cr(8)
+      .kr(4)
+      .channels(8)
+      .Test(xnn_f32_dwconv_ukernel_up8x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD_ACC2, c_div_8) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD_ACC2, c_div_8_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD_ACC2, c_div_8_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 16; channels < 128; channels += 24) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD_ACC2, c_lt_8) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 1; channels < 8; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD_ACC2, c_gt_8) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD_ACC2, c_gt_8_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD_ACC2, c_gt_8_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (uint32_t channels = 9; channels < 16; channels++) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD_ACC2, multipixel) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD_ACC2, multipixel_with_step) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      for (size_t step = 2; step <= 4; step++) {
+        DWConvMicrokernelTester()
+          .cr(8)
+          .kr(4)
+          .channels(channels)
+          .width(3)
+          .step(step)
+          .Test(xnn_f32_dwconv_ukernel_up8x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+      }
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD_ACC2, multipixel_with_output_stride) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(5)
+        .output_stride(43)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD_ACC2, multipixel_with_qmin) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmin(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+
+  TEST(F32_DWCONV_UP8X4__PSIMD_ACC2, multipixel_with_qmax) {
+    TEST_REQUIRES_PSIMD;
+    for (size_t channels = 1; channels <= 40; channels += 7) {
+      DWConvMicrokernelTester()
+        .cr(8)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .qmax(128)
+        .Test(xnn_f32_dwconv_ukernel_up8x4__psimd_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+#endif  // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
+
+
 TEST(F32_DWCONV_UP1X4__SCALAR, c_eq_1) {
   DWConvMicrokernelTester()
     .cr(1)
@@ -1721,6 +5537,396 @@
 }
 
 
+TEST(F32_DWCONV_UP1X4__SCALAR_ACC2, c_eq_1) {
+  DWConvMicrokernelTester()
+    .cr(1)
+    .kr(4)
+    .channels(1)
+    .Test(xnn_f32_dwconv_ukernel_up1x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_DWCONV_UP1X4__SCALAR_ACC2, c_gt_1) {
+  for (uint32_t channels = 2; channels < 10; channels++) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(4)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up1x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP1X4__SCALAR_ACC2, c_gt_1_with_qmin) {
+  for (uint32_t channels = 2; channels < 10; channels++) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(4)
+      .channels(channels)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up1x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP1X4__SCALAR_ACC2, c_gt_1_with_qmax) {
+  for (uint32_t channels = 2; channels < 10; channels++) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(4)
+      .channels(channels)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up1x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP1X4__SCALAR_ACC2, multipixel) {
+  for (size_t channels = 1; channels <= 5; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(4)
+      .channels(channels)
+      .width(3)
+      .Test(xnn_f32_dwconv_ukernel_up1x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP1X4__SCALAR_ACC2, multipixel_with_step) {
+  for (size_t channels = 1; channels <= 5; channels += 1) {
+    for (size_t step = 2; step <= 4; step++) {
+      DWConvMicrokernelTester()
+        .cr(1)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .step(step)
+        .Test(xnn_f32_dwconv_ukernel_up1x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_DWCONV_UP1X4__SCALAR_ACC2, multipixel_with_output_stride) {
+  for (size_t channels = 1; channels <= 5; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(4)
+      .channels(1)
+      .width(5)
+      .output_stride(7)
+      .Test(xnn_f32_dwconv_ukernel_up1x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP1X4__SCALAR_ACC2, multipixel_with_qmin) {
+  for (size_t channels = 1; channels <= 5; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(4)
+      .channels(channels)
+      .width(3)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up1x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP1X4__SCALAR_ACC2, multipixel_with_qmax) {
+  for (size_t channels = 1; channels <= 5; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(4)
+      .channels(channels)
+      .width(3)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up1x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+
+TEST(F32_DWCONV_UP2X4__SCALAR, c_eq_2) {
+  DWConvMicrokernelTester()
+    .cr(2)
+    .kr(4)
+    .channels(2)
+    .Test(xnn_f32_dwconv_ukernel_up2x4__scalar, DWConvMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR, c_div_2) {
+  for (uint32_t channels = 4; channels < 32; channels += 6) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR, c_div_2_with_qmin) {
+  for (uint32_t channels = 4; channels < 32; channels += 6) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR, c_div_2_with_qmax) {
+  for (uint32_t channels = 4; channels < 32; channels += 6) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR, c_lt_2) {
+  for (uint32_t channels = 1; channels < 2; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR, c_gt_2) {
+  for (uint32_t channels = 3; channels < 4; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR, c_gt_2_with_qmin) {
+  for (uint32_t channels = 3; channels < 4; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR, c_gt_2_with_qmax) {
+  for (uint32_t channels = 3; channels < 4; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR, multipixel) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .width(3)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR, multipixel_with_step) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    for (size_t step = 2; step <= 4; step++) {
+      DWConvMicrokernelTester()
+        .cr(2)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .step(step)
+        .Test(xnn_f32_dwconv_ukernel_up2x4__scalar, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR, multipixel_with_output_stride) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(2)
+      .width(5)
+      .output_stride(13)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR, multipixel_with_qmin) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .width(3)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR, multipixel_with_qmax) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .width(3)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+
+TEST(F32_DWCONV_UP2X4__SCALAR_ACC2, c_eq_2) {
+  DWConvMicrokernelTester()
+    .cr(2)
+    .kr(4)
+    .channels(2)
+    .Test(xnn_f32_dwconv_ukernel_up2x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR_ACC2, c_div_2) {
+  for (uint32_t channels = 4; channels < 32; channels += 6) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR_ACC2, c_div_2_with_qmin) {
+  for (uint32_t channels = 4; channels < 32; channels += 6) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR_ACC2, c_div_2_with_qmax) {
+  for (uint32_t channels = 4; channels < 32; channels += 6) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR_ACC2, c_lt_2) {
+  for (uint32_t channels = 1; channels < 2; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR_ACC2, c_gt_2) {
+  for (uint32_t channels = 3; channels < 4; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR_ACC2, c_gt_2_with_qmin) {
+  for (uint32_t channels = 3; channels < 4; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR_ACC2, c_gt_2_with_qmax) {
+  for (uint32_t channels = 3; channels < 4; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR_ACC2, multipixel) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .width(3)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR_ACC2, multipixel_with_step) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    for (size_t step = 2; step <= 4; step++) {
+      DWConvMicrokernelTester()
+        .cr(2)
+        .kr(4)
+        .channels(channels)
+        .width(3)
+        .step(step)
+        .Test(xnn_f32_dwconv_ukernel_up2x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR_ACC2, multipixel_with_output_stride) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(2)
+      .width(5)
+      .output_stride(13)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR_ACC2, multipixel_with_qmin) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .width(3)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X4__SCALAR_ACC2, multipixel_with_qmax) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(4)
+      .channels(channels)
+      .width(3)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x4__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+
 TEST(F32_DWCONV_UP1X9__SCALAR, c_eq_1) {
   DWConvMicrokernelTester()
     .cr(1)
@@ -1823,6 +6029,396 @@
 }
 
 
+TEST(F32_DWCONV_UP1X9__SCALAR_ACC2, c_eq_1) {
+  DWConvMicrokernelTester()
+    .cr(1)
+    .kr(9)
+    .channels(1)
+    .Test(xnn_f32_dwconv_ukernel_up1x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_DWCONV_UP1X9__SCALAR_ACC2, c_gt_1) {
+  for (uint32_t channels = 2; channels < 10; channels++) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(9)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up1x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP1X9__SCALAR_ACC2, c_gt_1_with_qmin) {
+  for (uint32_t channels = 2; channels < 10; channels++) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(9)
+      .channels(channels)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up1x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP1X9__SCALAR_ACC2, c_gt_1_with_qmax) {
+  for (uint32_t channels = 2; channels < 10; channels++) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(9)
+      .channels(channels)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up1x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP1X9__SCALAR_ACC2, multipixel) {
+  for (size_t channels = 1; channels <= 5; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(9)
+      .channels(channels)
+      .width(3)
+      .Test(xnn_f32_dwconv_ukernel_up1x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP1X9__SCALAR_ACC2, multipixel_with_step) {
+  for (size_t channels = 1; channels <= 5; channels += 1) {
+    for (size_t step = 2; step <= 9; step++) {
+      DWConvMicrokernelTester()
+        .cr(1)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .step(step)
+        .Test(xnn_f32_dwconv_ukernel_up1x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_DWCONV_UP1X9__SCALAR_ACC2, multipixel_with_output_stride) {
+  for (size_t channels = 1; channels <= 5; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(9)
+      .channels(1)
+      .width(5)
+      .output_stride(7)
+      .Test(xnn_f32_dwconv_ukernel_up1x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP1X9__SCALAR_ACC2, multipixel_with_qmin) {
+  for (size_t channels = 1; channels <= 5; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(9)
+      .channels(channels)
+      .width(3)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up1x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP1X9__SCALAR_ACC2, multipixel_with_qmax) {
+  for (size_t channels = 1; channels <= 5; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(9)
+      .channels(channels)
+      .width(3)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up1x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+
+TEST(F32_DWCONV_UP2X9__SCALAR, c_eq_2) {
+  DWConvMicrokernelTester()
+    .cr(2)
+    .kr(9)
+    .channels(2)
+    .Test(xnn_f32_dwconv_ukernel_up2x9__scalar, DWConvMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR, c_div_2) {
+  for (uint32_t channels = 4; channels < 32; channels += 6) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR, c_div_2_with_qmin) {
+  for (uint32_t channels = 4; channels < 32; channels += 6) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR, c_div_2_with_qmax) {
+  for (uint32_t channels = 4; channels < 32; channels += 6) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR, c_lt_2) {
+  for (uint32_t channels = 1; channels < 2; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR, c_gt_2) {
+  for (uint32_t channels = 3; channels < 4; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR, c_gt_2_with_qmin) {
+  for (uint32_t channels = 3; channels < 4; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR, c_gt_2_with_qmax) {
+  for (uint32_t channels = 3; channels < 4; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR, multipixel) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .width(3)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR, multipixel_with_step) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    for (size_t step = 2; step <= 9; step++) {
+      DWConvMicrokernelTester()
+        .cr(2)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .step(step)
+        .Test(xnn_f32_dwconv_ukernel_up2x9__scalar, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR, multipixel_with_output_stride) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(2)
+      .width(5)
+      .output_stride(13)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR, multipixel_with_qmin) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .width(3)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR, multipixel_with_qmax) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .width(3)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+
+TEST(F32_DWCONV_UP2X9__SCALAR_ACC2, c_eq_2) {
+  DWConvMicrokernelTester()
+    .cr(2)
+    .kr(9)
+    .channels(2)
+    .Test(xnn_f32_dwconv_ukernel_up2x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR_ACC2, c_div_2) {
+  for (uint32_t channels = 4; channels < 32; channels += 6) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR_ACC2, c_div_2_with_qmin) {
+  for (uint32_t channels = 4; channels < 32; channels += 6) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR_ACC2, c_div_2_with_qmax) {
+  for (uint32_t channels = 4; channels < 32; channels += 6) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR_ACC2, c_lt_2) {
+  for (uint32_t channels = 1; channels < 2; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR_ACC2, c_gt_2) {
+  for (uint32_t channels = 3; channels < 4; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR_ACC2, c_gt_2_with_qmin) {
+  for (uint32_t channels = 3; channels < 4; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR_ACC2, c_gt_2_with_qmax) {
+  for (uint32_t channels = 3; channels < 4; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR_ACC2, multipixel) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .width(3)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR_ACC2, multipixel_with_step) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    for (size_t step = 2; step <= 9; step++) {
+      DWConvMicrokernelTester()
+        .cr(2)
+        .kr(9)
+        .channels(channels)
+        .width(3)
+        .step(step)
+        .Test(xnn_f32_dwconv_ukernel_up2x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR_ACC2, multipixel_with_output_stride) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(2)
+      .width(5)
+      .output_stride(13)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR_ACC2, multipixel_with_qmin) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .width(3)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X9__SCALAR_ACC2, multipixel_with_qmax) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(9)
+      .channels(channels)
+      .width(3)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x9__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+
 TEST(F32_DWCONV_UP1X25__SCALAR, c_eq_1) {
   DWConvMicrokernelTester()
     .cr(1)
@@ -1923,3 +6519,393 @@
       .Test(xnn_f32_dwconv_ukernel_up1x25__scalar, DWConvMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_DWCONV_UP1X25__SCALAR_ACC2, c_eq_1) {
+  DWConvMicrokernelTester()
+    .cr(1)
+    .kr(25)
+    .channels(1)
+    .Test(xnn_f32_dwconv_ukernel_up1x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_DWCONV_UP1X25__SCALAR_ACC2, c_gt_1) {
+  for (uint32_t channels = 2; channels < 10; channels++) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(25)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up1x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP1X25__SCALAR_ACC2, c_gt_1_with_qmin) {
+  for (uint32_t channels = 2; channels < 10; channels++) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(25)
+      .channels(channels)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up1x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP1X25__SCALAR_ACC2, c_gt_1_with_qmax) {
+  for (uint32_t channels = 2; channels < 10; channels++) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(25)
+      .channels(channels)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up1x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP1X25__SCALAR_ACC2, multipixel) {
+  for (size_t channels = 1; channels <= 5; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(25)
+      .channels(channels)
+      .width(3)
+      .Test(xnn_f32_dwconv_ukernel_up1x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP1X25__SCALAR_ACC2, multipixel_with_step) {
+  for (size_t channels = 1; channels <= 5; channels += 1) {
+    for (size_t step = 2; step <= 25; step++) {
+      DWConvMicrokernelTester()
+        .cr(1)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .step(step)
+        .Test(xnn_f32_dwconv_ukernel_up1x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_DWCONV_UP1X25__SCALAR_ACC2, multipixel_with_output_stride) {
+  for (size_t channels = 1; channels <= 5; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(25)
+      .channels(1)
+      .width(5)
+      .output_stride(7)
+      .Test(xnn_f32_dwconv_ukernel_up1x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP1X25__SCALAR_ACC2, multipixel_with_qmin) {
+  for (size_t channels = 1; channels <= 5; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(25)
+      .channels(channels)
+      .width(3)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up1x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP1X25__SCALAR_ACC2, multipixel_with_qmax) {
+  for (size_t channels = 1; channels <= 5; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(1)
+      .kr(25)
+      .channels(channels)
+      .width(3)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up1x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+
+TEST(F32_DWCONV_UP2X25__SCALAR, c_eq_2) {
+  DWConvMicrokernelTester()
+    .cr(2)
+    .kr(25)
+    .channels(2)
+    .Test(xnn_f32_dwconv_ukernel_up2x25__scalar, DWConvMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR, c_div_2) {
+  for (uint32_t channels = 4; channels < 32; channels += 6) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR, c_div_2_with_qmin) {
+  for (uint32_t channels = 4; channels < 32; channels += 6) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR, c_div_2_with_qmax) {
+  for (uint32_t channels = 4; channels < 32; channels += 6) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR, c_lt_2) {
+  for (uint32_t channels = 1; channels < 2; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR, c_gt_2) {
+  for (uint32_t channels = 3; channels < 4; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR, c_gt_2_with_qmin) {
+  for (uint32_t channels = 3; channels < 4; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR, c_gt_2_with_qmax) {
+  for (uint32_t channels = 3; channels < 4; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR, multipixel) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .width(3)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR, multipixel_with_step) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    for (size_t step = 2; step <= 25; step++) {
+      DWConvMicrokernelTester()
+        .cr(2)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .step(step)
+        .Test(xnn_f32_dwconv_ukernel_up2x25__scalar, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR, multipixel_with_output_stride) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(2)
+      .width(5)
+      .output_stride(13)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR, multipixel_with_qmin) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .width(3)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR, multipixel_with_qmax) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .width(3)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+
+TEST(F32_DWCONV_UP2X25__SCALAR_ACC2, c_eq_2) {
+  DWConvMicrokernelTester()
+    .cr(2)
+    .kr(25)
+    .channels(2)
+    .Test(xnn_f32_dwconv_ukernel_up2x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR_ACC2, c_div_2) {
+  for (uint32_t channels = 4; channels < 32; channels += 6) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR_ACC2, c_div_2_with_qmin) {
+  for (uint32_t channels = 4; channels < 32; channels += 6) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR_ACC2, c_div_2_with_qmax) {
+  for (uint32_t channels = 4; channels < 32; channels += 6) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR_ACC2, c_lt_2) {
+  for (uint32_t channels = 1; channels < 2; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR_ACC2, c_gt_2) {
+  for (uint32_t channels = 3; channels < 4; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR_ACC2, c_gt_2_with_qmin) {
+  for (uint32_t channels = 3; channels < 4; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR_ACC2, c_gt_2_with_qmax) {
+  for (uint32_t channels = 3; channels < 4; channels++) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR_ACC2, multipixel) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .width(3)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR_ACC2, multipixel_with_step) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    for (size_t step = 2; step <= 25; step++) {
+      DWConvMicrokernelTester()
+        .cr(2)
+        .kr(25)
+        .channels(channels)
+        .width(3)
+        .step(step)
+        .Test(xnn_f32_dwconv_ukernel_up2x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+    }
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR_ACC2, multipixel_with_output_stride) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(2)
+      .width(5)
+      .output_stride(13)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR_ACC2, multipixel_with_qmin) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .width(3)
+      .qmin(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_DWCONV_UP2X25__SCALAR_ACC2, multipixel_with_qmax) {
+  for (size_t channels = 1; channels <= 10; channels += 1) {
+    DWConvMicrokernelTester()
+      .cr(2)
+      .kr(25)
+      .channels(channels)
+      .width(3)
+      .qmax(128)
+      .Test(xnn_f32_dwconv_ukernel_up2x25__scalar_acc2, DWConvMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-dwconv.yaml b/test/f32-dwconv.yaml
index 94bb715..c13b647 100644
--- a/test/f32-dwconv.yaml
+++ b/test/f32-dwconv.yaml
@@ -7,13 +7,46 @@
   pipelined: true
   assembly: true
 - name: xnn_f32_dwconv_ukernel_up4x9__neonfma
+- name: xnn_f32_dwconv_ukernel_up4x9__neonfma_acc2
+- name: xnn_f32_dwconv_ukernel_up8x9__neonfma
+- name: xnn_f32_dwconv_ukernel_up8x9__neonfma_acc2
 - name: xnn_f32_dwconv_ukernel_up4x9__neon
+- name: xnn_f32_dwconv_ukernel_up4x9__neon_acc2
+- name: xnn_f32_dwconv_ukernel_up8x9__neon
+- name: xnn_f32_dwconv_ukernel_up8x9__neon_acc2
 - name: xnn_f32_dwconv_ukernel_up4x25__sse
+- name: xnn_f32_dwconv_ukernel_up4x25__sse_acc2
+- name: xnn_f32_dwconv_ukernel_up8x25__sse
+- name: xnn_f32_dwconv_ukernel_up8x25__sse_acc2
 - name: xnn_f32_dwconv_ukernel_up4x9__sse
+- name: xnn_f32_dwconv_ukernel_up4x9__sse_acc2
+- name: xnn_f32_dwconv_ukernel_up8x9__sse
+- name: xnn_f32_dwconv_ukernel_up8x9__sse_acc2
 - name: xnn_f32_dwconv_ukernel_up4x4__sse
+- name: xnn_f32_dwconv_ukernel_up4x4__sse_acc2
+- name: xnn_f32_dwconv_ukernel_up8x4__sse
+- name: xnn_f32_dwconv_ukernel_up8x4__sse_acc2
 - name: xnn_f32_dwconv_ukernel_up4x25__psimd
+- name: xnn_f32_dwconv_ukernel_up4x25__psimd_acc2
+- name: xnn_f32_dwconv_ukernel_up8x25__psimd
+- name: xnn_f32_dwconv_ukernel_up8x25__psimd_acc2
 - name: xnn_f32_dwconv_ukernel_up4x9__psimd
+- name: xnn_f32_dwconv_ukernel_up4x9__psimd_acc2
+- name: xnn_f32_dwconv_ukernel_up8x9__psimd
+- name: xnn_f32_dwconv_ukernel_up8x9__psimd_acc2
 - name: xnn_f32_dwconv_ukernel_up4x4__psimd
+- name: xnn_f32_dwconv_ukernel_up4x4__psimd_acc2
+- name: xnn_f32_dwconv_ukernel_up8x4__psimd
+- name: xnn_f32_dwconv_ukernel_up8x4__psimd_acc2
 - name: xnn_f32_dwconv_ukernel_up1x4__scalar
+- name: xnn_f32_dwconv_ukernel_up1x4__scalar_acc2
+- name: xnn_f32_dwconv_ukernel_up2x4__scalar
+- name: xnn_f32_dwconv_ukernel_up2x4__scalar_acc2
 - name: xnn_f32_dwconv_ukernel_up1x9__scalar
+- name: xnn_f32_dwconv_ukernel_up1x9__scalar_acc2
+- name: xnn_f32_dwconv_ukernel_up2x9__scalar
+- name: xnn_f32_dwconv_ukernel_up2x9__scalar_acc2
 - name: xnn_f32_dwconv_ukernel_up1x25__scalar
+- name: xnn_f32_dwconv_ukernel_up1x25__scalar_acc2
+- name: xnn_f32_dwconv_ukernel_up2x25__scalar
+- name: xnn_f32_dwconv_ukernel_up2x25__scalar_acc2