Add output_stride argument in SpMM microkernels

Fix multi-threaded sparse 1x1 NCHW Convolution

PiperOrigin-RevId: 342694850
diff --git a/test/f32-spmm-minmax.cc b/test/f32-spmm-minmax.cc
index 01aad66..eccf471 100644
--- a/test/f32-spmm-minmax.cc
+++ b/test/f32-spmm-minmax.cc
@@ -114,6 +114,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_4X1__NEONFMA, output_stride) {  // Runs the kernel with output_stride(11) while m is 8.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(4)
+          .nr(1)
+          .m(8)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(11)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_4x1__neonfma);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_4X1__NEONFMA, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 10; n += 2) {
@@ -324,6 +341,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_4X2__NEONFMA, output_stride) {  // Runs the kernel with output_stride(11) while m is 8.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 10; n += 3) {  // n = 1, 4, 7
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(4)
+          .nr(2)
+          .m(8)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(11)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_4x2__neonfma);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_4X2__NEONFMA, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 10; n += 3) {
@@ -534,6 +568,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_4X4__NEONFMA, output_stride) {  // Runs the kernel with output_stride(11) while m is 8.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 20; n += 5) {  // n = 1, 6, 11, 16
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .m(8)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(11)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_4x4__neonfma);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_4X4__NEONFMA, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 20; n += 5) {
@@ -699,6 +750,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_4X1__NEONFMA_PIPELINED, output_stride) {  // Runs the kernel with output_stride(11) while m is 8.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(4)
+          .nr(1)
+          .m(8)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(11)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_4x1__neonfma_pipelined);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_4X1__NEONFMA_PIPELINED, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 10; n += 2) {
@@ -892,6 +960,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_4X1__NEONFMA_X2, output_stride) {  // Runs the kernel with output_stride(11) while m is 8.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k = 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(4)
+          .nr(1)
+          .m(8)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(11)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_4x1__neonfma_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_4X1__NEONFMA_X2, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 10; n += 2) {
@@ -1057,6 +1142,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_8X1__NEONFMA, output_stride) {  // Runs the kernel with output_stride(19) while m is 16.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(8)
+          .nr(1)
+          .m(16)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(19)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_8x1__neonfma);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_8X1__NEONFMA, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 10; n += 2) {
@@ -1267,6 +1369,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_8X2__NEONFMA, output_stride) {  // Runs the kernel with output_stride(19) while m is 16.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 10; n += 3) {  // n = 1, 4, 7
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(8)
+          .nr(2)
+          .m(16)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(19)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_8x2__neonfma);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_8X2__NEONFMA, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 10; n += 3) {
@@ -1477,6 +1596,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_8X4__NEONFMA, output_stride) {  // Runs the kernel with output_stride(19) while m is 16.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 20; n += 5) {  // n = 1, 6, 11, 16
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(8)
+          .nr(4)
+          .m(16)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(19)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_8x4__neonfma);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_8X4__NEONFMA, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 20; n += 5) {
@@ -1642,6 +1778,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_8X1__NEONFMA_PIPELINED, output_stride) {  // Runs the kernel with output_stride(19) while m is 16.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(8)
+          .nr(1)
+          .m(16)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(19)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_8x1__neonfma_pipelined);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_8X1__NEONFMA_PIPELINED, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 10; n += 2) {
@@ -1835,6 +1988,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_8X1__NEONFMA_X2, output_stride) {  // Runs the kernel with output_stride(19) while m is 16.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k = 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(8)
+          .nr(1)
+          .m(16)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(19)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_8x1__neonfma_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_8X1__NEONFMA_X2, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 10; n += 2) {
@@ -2000,6 +2170,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_12X1__NEONFMA, output_stride) {  // Runs the kernel with output_stride(29) while m is 24.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(12)
+          .nr(1)
+          .m(24)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(29)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_12x1__neonfma);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_12X1__NEONFMA, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 10; n += 2) {
@@ -2210,6 +2397,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_12X2__NEONFMA, output_stride) {  // Runs the kernel with output_stride(29) while m is 24.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 10; n += 3) {  // n = 1, 4, 7
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(12)
+          .nr(2)
+          .m(24)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(29)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_12x2__neonfma);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_12X2__NEONFMA, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 10; n += 3) {
@@ -2420,6 +2624,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_12X4__NEONFMA, output_stride) {  // Runs the kernel with output_stride(29) while m is 24.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 20; n += 5) {  // n = 1, 6, 11, 16
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(12)
+          .nr(4)
+          .m(24)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(29)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_12x4__neonfma);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_12X4__NEONFMA, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 20; n += 5) {
@@ -2585,6 +2806,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_16X1__NEONFMA, output_stride) {  // Runs the kernel with output_stride(37) while m is 32.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(16)
+          .nr(1)
+          .m(32)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(37)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_16x1__neonfma);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_16X1__NEONFMA, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 10; n += 2) {
@@ -2795,6 +3033,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_16X2__NEONFMA, output_stride) {  // Runs the kernel with output_stride(37) while m is 32.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 10; n += 3) {  // n = 1, 4, 7
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(16)
+          .nr(2)
+          .m(32)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(37)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_16x2__neonfma);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_16X2__NEONFMA, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 10; n += 3) {
@@ -3005,6 +3260,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_16X4__NEONFMA, output_stride) {  // Runs the kernel with output_stride(37) while m is 32.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 20; n += 5) {  // n = 1, 6, 11, 16
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(16)
+          .nr(4)
+          .m(32)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(37)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_16x4__neonfma);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_16X4__NEONFMA, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 20; n += 5) {
@@ -3170,6 +3442,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_16X1__NEONFMA_PIPELINED, output_stride) {  // Runs the kernel with output_stride(37) while m is 32.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(16)
+          .nr(1)
+          .m(32)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(37)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_16x1__neonfma_pipelined);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_16X1__NEONFMA_PIPELINED, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 10; n += 2) {
@@ -3363,6 +3652,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_16X1__NEONFMA_X2, output_stride) {  // Runs the kernel with output_stride(37) while m is 32.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k = 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(16)
+          .nr(1)
+          .m(32)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(37)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_16x1__neonfma_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_16X1__NEONFMA_X2, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 10; n += 2) {
@@ -3528,6 +3834,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_32X1__NEONFMA, output_stride) {  // Runs the kernel with output_stride(67) while m is 64.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(32)
+          .nr(1)
+          .m(64)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(67)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_32x1__neonfma);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_32X1__NEONFMA, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 10; n += 2) {
@@ -3738,6 +4061,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_32X2__NEONFMA, output_stride) {  // Runs the kernel with output_stride(67) while m is 64.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 10; n += 3) {  // n = 1, 4, 7
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(32)
+          .nr(2)
+          .m(64)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(67)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_32x2__neonfma);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_32X2__NEONFMA, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 10; n += 3) {
@@ -3948,6 +4288,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_32X4__NEONFMA, output_stride) {  // Runs the kernel with output_stride(67) while m is 64.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 20; n += 5) {  // n = 1, 6, 11, 16
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(32)
+          .nr(4)
+          .m(64)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(67)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_32x4__neonfma);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_32X4__NEONFMA, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 20; n += 5) {
@@ -4113,6 +4470,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_32X1__NEONFMA_PIPELINED, output_stride) {  // Runs the kernel with output_stride(67) while m is 64.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(32)
+          .nr(1)
+          .m(64)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(67)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_32x1__neonfma_pipelined);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_32X1__NEONFMA_PIPELINED, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 10; n += 2) {
@@ -4306,6 +4680,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_32X1__NEONFMA_X2, output_stride) {  // Runs the kernel with output_stride(67) while m is 64.
+    TEST_REQUIRES_ARM_NEON_FMA;  // Macro from outside this diff; presumably skips when NEON FMA is unavailable.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k = 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(32)
+          .nr(1)
+          .m(64)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(67)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_32x1__neonfma_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_32X1__NEONFMA_X2, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n < 10; n += 2) {
@@ -4471,6 +4862,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_4X1__SSE, output_stride) {  // Runs the kernel with output_stride(11) while m is 8.
+    TEST_REQUIRES_X86_SSE;  // Macro from outside this diff; presumably skips when SSE is unavailable.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(4)
+          .nr(1)
+          .m(8)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(11)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_4x1__sse);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_4X1__SSE, qmin) {
     TEST_REQUIRES_X86_SSE;
     for (uint32_t n = 1; n < 10; n += 2) {
@@ -4636,6 +5044,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_8X1__SSE, output_stride) {  // Runs the kernel with output_stride(19) while m is 16.
+    TEST_REQUIRES_X86_SSE;  // Macro from outside this diff; presumably skips when SSE is unavailable.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(8)
+          .nr(1)
+          .m(16)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(19)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_8x1__sse);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_8X1__SSE, qmin) {
     TEST_REQUIRES_X86_SSE;
     for (uint32_t n = 1; n < 10; n += 2) {
@@ -4801,6 +5226,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_16X1__SSE, output_stride) {  // Runs the kernel with output_stride(37) while m is 32.
+    TEST_REQUIRES_X86_SSE;  // Macro from outside this diff; presumably skips when SSE is unavailable.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(16)
+          .nr(1)
+          .m(32)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(37)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_16x1__sse);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_16X1__SSE, qmin) {
     TEST_REQUIRES_X86_SSE;
     for (uint32_t n = 1; n < 10; n += 2) {
@@ -4966,6 +5408,23 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_32X1__SSE, output_stride) {  // Runs the kernel with output_stride(67) while m is 64.
+    TEST_REQUIRES_X86_SSE;  // Macro from outside this diff; presumably skips when SSE is unavailable.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(32)
+          .nr(1)
+          .m(64)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(67)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_32x1__sse);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_32X1__SSE, qmin) {
     TEST_REQUIRES_X86_SSE;
     for (uint32_t n = 1; n < 10; n += 2) {
@@ -5125,6 +5584,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_ARM, output_stride) {  // Runs the kernel with output_stride(11) while m is 8.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(4)
+          .nr(1)
+          .m(8)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(11)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_4x1__wasmsimd_arm);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_ARM, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 5; k += 2) {
@@ -5280,6 +5755,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_ARM, output_stride) {  // Runs the kernel with output_stride(19) while m is 16.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(8)
+          .nr(1)
+          .m(16)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(19)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_8x1__wasmsimd_arm);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_ARM, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 5; k += 2) {
@@ -5435,6 +5926,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_ARM, output_stride) {  // Runs the kernel with output_stride(37) while m is 32.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(16)
+          .nr(1)
+          .m(32)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(37)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_16x1__wasmsimd_arm);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_ARM, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 5; k += 2) {
@@ -5590,6 +6097,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_ARM, output_stride) {  // Runs the kernel with output_stride(67) while m is 64.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(32)
+          .nr(1)
+          .m(64)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(67)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_ARM, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 5; k += 2) {
@@ -5745,6 +6268,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_X86, output_stride) {  // Runs the kernel with output_stride(11) while m is 8.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(4)
+          .nr(1)
+          .m(8)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(11)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_4x1__wasmsimd_x86);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_X86, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 5; k += 2) {
@@ -5900,6 +6439,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_X86, output_stride) {  // Runs the kernel with output_stride(19) while m is 16.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(8)
+          .nr(1)
+          .m(16)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(19)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_8x1__wasmsimd_x86);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_X86, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 5; k += 2) {
@@ -6055,6 +6610,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_X86, output_stride) {  // Runs the kernel with output_stride(37) while m is 32.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(16)
+          .nr(1)
+          .m(32)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(37)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_16x1__wasmsimd_x86);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_X86, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 5; k += 2) {
@@ -6210,6 +6781,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_X86, output_stride) {  // Runs the kernel with output_stride(67) while m is 64.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(32)
+          .nr(1)
+          .m(64)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(67)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_X86, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 5; k += 2) {
@@ -6391,6 +6978,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_ARM_X2, output_stride) {  // Runs the kernel with output_stride(11) while m is 8.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k = 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(4)
+          .nr(1)
+          .m(8)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(11)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_4x1__wasmsimd_arm_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_ARM_X2, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6572,6 +7175,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_ARM_X2, output_stride) {  // Runs the kernel with output_stride(19) while m is 16.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k = 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(8)
+          .nr(1)
+          .m(16)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(19)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_8x1__wasmsimd_arm_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_ARM_X2, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6753,6 +7372,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_ARM_X2, output_stride) {  // Runs the kernel with output_stride(37) while m is 32.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k = 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(16)
+          .nr(1)
+          .m(32)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(37)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_16x1__wasmsimd_arm_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_ARM_X2, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6934,6 +7569,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_ARM_X2, output_stride) {  // Runs the kernel with output_stride(67) while m is 64.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k = 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(32)
+          .nr(1)
+          .m(64)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(67)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_ARM_X2, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -7115,6 +7766,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_X86_X2, output_stride) {  // Runs the kernel with output_stride(11) while m is 8.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k = 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(4)
+          .nr(1)
+          .m(8)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(11)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_4x1__wasmsimd_x86_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_X86_X2, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -7296,6 +7963,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_X86_X2, output_stride) {  // Runs the kernel with output_stride(19) while m is 16.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k = 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(8)
+          .nr(1)
+          .m(16)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(19)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_8x1__wasmsimd_x86_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_X86_X2, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -7477,6 +8160,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_X86_X2, output_stride) {  // Runs the kernel with output_stride(37) while m is 32.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k = 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(16)
+          .nr(1)
+          .m(32)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(37)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_16x1__wasmsimd_x86_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_X86_X2, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -7658,6 +8357,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_X86_X2, output_stride) {  // Runs the kernel with output_stride(67) while m is 64.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k = 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(32)
+          .nr(1)
+          .m(64)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(67)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_X86_X2, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -7839,6 +8554,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_ARM_X4, output_stride) {  // Runs the kernel with output_stride(11) while m is 8.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 20; k += 5) {  // k = 1, 6, 11, 16
+        SpMMMicrokernelTester()
+          .mr(4)
+          .nr(1)
+          .m(8)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(11)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_4x1__wasmsimd_arm_x4);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_ARM_X4, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -8020,6 +8751,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_ARM_X4, output_stride) {  // Runs the kernel with output_stride(19) while m is 16.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 20; k += 5) {  // k = 1, 6, 11, 16
+        SpMMMicrokernelTester()
+          .mr(8)
+          .nr(1)
+          .m(16)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(19)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_8x1__wasmsimd_arm_x4);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_ARM_X4, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -8201,6 +8948,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_ARM_X4, output_stride) {  // Runs the kernel with output_stride(37) while m is 32.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 20; k += 5) {  // k = 1, 6, 11, 16
+        SpMMMicrokernelTester()
+          .mr(16)
+          .nr(1)
+          .m(32)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(37)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_16x1__wasmsimd_arm_x4);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_ARM_X4, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -8382,6 +9145,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_ARM_X4, output_stride) {  // Runs the kernel with output_stride(67) while m is 64.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 20; k += 5) {  // k = 1, 6, 11, 16
+        SpMMMicrokernelTester()
+          .mr(32)
+          .nr(1)
+          .m(64)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(67)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm_x4);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_ARM_X4, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -8563,6 +9342,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_X86_X4, output_stride) {  // Runs the kernel with output_stride(11) while m is 8.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 20; k += 5) {  // k = 1, 6, 11, 16
+        SpMMMicrokernelTester()
+          .mr(4)
+          .nr(1)
+          .m(8)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(11)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_4x1__wasmsimd_x86_x4);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_X86_X4, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -8744,6 +9539,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_X86_X4, output_stride) {  // Runs the kernel with output_stride(19) while m is 16.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 20; k += 5) {  // k = 1, 6, 11, 16
+        SpMMMicrokernelTester()
+          .mr(8)
+          .nr(1)
+          .m(16)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(19)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_8x1__wasmsimd_x86_x4);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_X86_X4, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -8925,6 +9736,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_X86_X4, output_stride) {  // Runs the kernel with output_stride(37) while m is 32.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 20; k += 5) {  // k = 1, 6, 11, 16
+        SpMMMicrokernelTester()
+          .mr(16)
+          .nr(1)
+          .m(32)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(37)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_16x1__wasmsimd_x86_x4);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_X86_X4, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -9106,6 +9933,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_X86_X4, output_stride) {  // Runs the kernel with output_stride(67) while m is 64.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 20; k += 5) {  // k = 1, 6, 11, 16
+        SpMMMicrokernelTester()
+          .mr(32)
+          .nr(1)
+          .m(64)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(67)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86_x4);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_X86_X4, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -9261,6 +10104,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_ARM_PIPELINED, output_stride) {  // Runs the kernel with output_stride(11) while m is 8.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(4)
+          .nr(1)
+          .m(8)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(11)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_4x1__wasmsimd_arm_pipelined);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_ARM_PIPELINED, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 5; k += 2) {
@@ -9416,6 +10275,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_ARM_PIPELINED, output_stride) {  // Runs the kernel with output_stride(19) while m is 16.
+    for (uint32_t n = 1; n < 10; n += 2) {  // n = 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k = 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(8)
+          .nr(1)
+          .m(16)  // m = 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(19)  // NOTE(review): stride semantics live in SpMMMicrokernelTester; confirm there.
+          .sparsity(0.0f)
+          .Test(xnn_f32_spmm_minmax_ukernel_8x1__wasmsimd_arm_pipelined);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_ARM_PIPELINED, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 5; k += 2) {
@@ -9571,6 +10446,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_ARM_PIPELINED, output_stride) {  // Runs the 16x1 wasmsimd_arm_pipelined SpMM microkernel with output_stride (37) larger than m (32).
+    for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k takes 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(16)
+          .nr(1)
+          .m(32)  // m is 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(37)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+          .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+          .Test(xnn_f32_spmm_minmax_ukernel_16x1__wasmsimd_arm_pipelined);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_ARM_PIPELINED, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 5; k += 2) {
@@ -9726,6 +10617,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_ARM_PIPELINED, output_stride) {  // Runs the 32x1 wasmsimd_arm_pipelined SpMM microkernel with output_stride (67) larger than m (64).
+    for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k takes 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(32)
+          .nr(1)
+          .m(64)  // m is 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(67)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+          .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+          .Test(xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm_pipelined);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_ARM_PIPELINED, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 5; k += 2) {
@@ -9881,6 +10788,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_X86_PIPELINED, output_stride) {  // Runs the 4x1 wasmsimd_x86_pipelined SpMM microkernel with output_stride (11) larger than m (8).
+    for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k takes 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(4)
+          .nr(1)
+          .m(8)  // m is 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(11)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+          .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+          .Test(xnn_f32_spmm_minmax_ukernel_4x1__wasmsimd_x86_pipelined);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_X86_PIPELINED, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 5; k += 2) {
@@ -10036,6 +10959,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_X86_PIPELINED, output_stride) {  // Runs the 8x1 wasmsimd_x86_pipelined SpMM microkernel with output_stride (19) larger than m (16).
+    for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k takes 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(8)
+          .nr(1)
+          .m(16)  // m is 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(19)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+          .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+          .Test(xnn_f32_spmm_minmax_ukernel_8x1__wasmsimd_x86_pipelined);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_X86_PIPELINED, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 5; k += 2) {
@@ -10191,6 +11130,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_X86_PIPELINED, output_stride) {  // Runs the 16x1 wasmsimd_x86_pipelined SpMM microkernel with output_stride (37) larger than m (32).
+    for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k takes 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(16)
+          .nr(1)
+          .m(32)  // m is 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(37)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+          .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+          .Test(xnn_f32_spmm_minmax_ukernel_16x1__wasmsimd_x86_pipelined);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_X86_PIPELINED, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 5; k += 2) {
@@ -10346,6 +11301,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_X86_PIPELINED, output_stride) {  // Runs the 32x1 wasmsimd_x86_pipelined SpMM microkernel with output_stride (67) larger than m (64).
+    for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 5; k += 2) {  // k takes 1, 3, 5
+        SpMMMicrokernelTester()
+          .mr(32)
+          .nr(1)
+          .m(64)  // m is 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(67)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+          .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+          .Test(xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86_pipelined);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_X86_PIPELINED, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 5; k += 2) {
@@ -10527,6 +11498,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_ARM_PIPELINED_X2, output_stride) {  // Runs the 4x1 wasmsimd_arm_pipelined_x2 SpMM microkernel with output_stride (11) larger than m (8).
+    for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k takes 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(4)
+          .nr(1)
+          .m(8)  // m is 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(11)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+          .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+          .Test(xnn_f32_spmm_minmax_ukernel_4x1__wasmsimd_arm_pipelined_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_ARM_PIPELINED_X2, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10708,6 +11695,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_ARM_PIPELINED_X2, output_stride) {  // Runs the 8x1 wasmsimd_arm_pipelined_x2 SpMM microkernel with output_stride (19) larger than m (16).
+    for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k takes 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(8)
+          .nr(1)
+          .m(16)  // m is 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(19)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+          .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+          .Test(xnn_f32_spmm_minmax_ukernel_8x1__wasmsimd_arm_pipelined_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_ARM_PIPELINED_X2, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10889,6 +11892,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_ARM_PIPELINED_X2, output_stride) {  // Runs the 16x1 wasmsimd_arm_pipelined_x2 SpMM microkernel with output_stride (37) larger than m (32).
+    for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k takes 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(16)
+          .nr(1)
+          .m(32)  // m is 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(37)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+          .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+          .Test(xnn_f32_spmm_minmax_ukernel_16x1__wasmsimd_arm_pipelined_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_ARM_PIPELINED_X2, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -11070,6 +12089,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_ARM_PIPELINED_X2, output_stride) {  // Runs the 32x1 wasmsimd_arm_pipelined_x2 SpMM microkernel with output_stride (67) larger than m (64).
+    for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k takes 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(32)
+          .nr(1)
+          .m(64)  // m is 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(67)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+          .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+          .Test(xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm_pipelined_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_ARM_PIPELINED_X2, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -11251,6 +12286,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_X86_PIPELINED_X2, output_stride) {  // Runs the 4x1 wasmsimd_x86_pipelined_x2 SpMM microkernel with output_stride (11) larger than m (8).
+    for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k takes 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(4)
+          .nr(1)
+          .m(8)  // m is 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(11)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+          .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+          .Test(xnn_f32_spmm_minmax_ukernel_4x1__wasmsimd_x86_pipelined_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_4X1__WASMSIMD_X86_PIPELINED_X2, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -11432,6 +12483,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_X86_PIPELINED_X2, output_stride) {  // Runs the 8x1 wasmsimd_x86_pipelined_x2 SpMM microkernel with output_stride (19) larger than m (16).
+    for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k takes 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(8)
+          .nr(1)
+          .m(16)  // m is 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(19)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+          .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+          .Test(xnn_f32_spmm_minmax_ukernel_8x1__wasmsimd_x86_pipelined_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_8X1__WASMSIMD_X86_PIPELINED_X2, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -11613,6 +12680,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_X86_PIPELINED_X2, output_stride) {  // Runs the 16x1 wasmsimd_x86_pipelined_x2 SpMM microkernel with output_stride (37) larger than m (32).
+    for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k takes 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(16)
+          .nr(1)
+          .m(32)  // m is 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(37)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+          .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+          .Test(xnn_f32_spmm_minmax_ukernel_16x1__wasmsimd_x86_pipelined_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_16X1__WASMSIMD_X86_PIPELINED_X2, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -11794,6 +12877,22 @@
     }
   }
 
+  TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_X86_PIPELINED_X2, output_stride) {  // Runs the 32x1 wasmsimd_x86_pipelined_x2 SpMM microkernel with output_stride (67) larger than m (64).
+    for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+      for (size_t k = 1; k <= 10; k += 3) {  // k takes 1, 4, 7, 10
+        SpMMMicrokernelTester()
+          .mr(32)
+          .nr(1)
+          .m(64)  // m is 2 * mr
+          .n(n)
+          .k(k)
+          .output_stride(67)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+          .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+          .Test(xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86_pipelined_x2);
+      }
+    }
+  }
+
   TEST(F32_SPMM_MINMAX_32X1__WASMSIMD_X86_PIPELINED_X2, qmin) {
     for (uint32_t n = 1; n < 10; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -11948,6 +13047,22 @@
   }
 }
 
+TEST(F32_SPMM_MINMAX_1X1__SCALAR, output_stride) {  // Runs the 1x1 scalar SpMM microkernel with output_stride (5) larger than m (2).
+  for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+    for (size_t k = 1; k <= 5; k += 2) {  // k takes 1, 3, 5
+      SpMMMicrokernelTester()
+        .mr(1)
+        .nr(1)
+        .m(2)  // m is 2 * mr
+        .n(n)
+        .k(k)
+        .output_stride(5)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+        .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+        .Test(xnn_f32_spmm_minmax_ukernel_1x1__scalar, SpMMMicrokernelTester::Variant::Scalar);  // passes Variant::Scalar as the second argument
+    }
+  }
+}
+
 TEST(F32_SPMM_MINMAX_1X1__SCALAR, qmin) {
   for (uint32_t n = 1; n < 10; n += 2) {
     for (size_t k = 1; k <= 5; k += 2) {
@@ -12100,6 +13215,22 @@
   }
 }
 
+TEST(F32_SPMM_MINMAX_1X1__SCALAR_PIPELINED, output_stride) {  // Runs the 1x1 scalar_pipelined SpMM microkernel with output_stride (5) larger than m (2).
+  for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+    for (size_t k = 1; k <= 5; k += 2) {  // k takes 1, 3, 5
+      SpMMMicrokernelTester()
+        .mr(1)
+        .nr(1)
+        .m(2)  // m is 2 * mr
+        .n(n)
+        .k(k)
+        .output_stride(5)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+        .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+        .Test(xnn_f32_spmm_minmax_ukernel_1x1__scalar_pipelined, SpMMMicrokernelTester::Variant::Scalar);  // passes Variant::Scalar as the second argument
+    }
+  }
+}
+
 TEST(F32_SPMM_MINMAX_1X1__SCALAR_PIPELINED, qmin) {
   for (uint32_t n = 1; n < 10; n += 2) {
     for (size_t k = 1; k <= 5; k += 2) {
@@ -12252,6 +13383,22 @@
   }
 }
 
+TEST(F32_SPMM_MINMAX_2X1__SCALAR, output_stride) {  // Runs the 2x1 scalar SpMM microkernel with output_stride (7) larger than m (4).
+  for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+    for (size_t k = 1; k <= 5; k += 2) {  // k takes 1, 3, 5
+      SpMMMicrokernelTester()
+        .mr(2)
+        .nr(1)
+        .m(4)  // m is 2 * mr
+        .n(n)
+        .k(k)
+        .output_stride(7)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+        .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+        .Test(xnn_f32_spmm_minmax_ukernel_2x1__scalar, SpMMMicrokernelTester::Variant::Scalar);  // passes Variant::Scalar as the second argument
+    }
+  }
+}
+
 TEST(F32_SPMM_MINMAX_2X1__SCALAR, qmin) {
   for (uint32_t n = 1; n < 10; n += 2) {
     for (size_t k = 1; k <= 5; k += 2) {
@@ -12404,6 +13551,22 @@
   }
 }
 
+TEST(F32_SPMM_MINMAX_2X1__SCALAR_PIPELINED, output_stride) {  // Runs the 2x1 scalar_pipelined SpMM microkernel with output_stride (7) larger than m (4).
+  for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+    for (size_t k = 1; k <= 5; k += 2) {  // k takes 1, 3, 5
+      SpMMMicrokernelTester()
+        .mr(2)
+        .nr(1)
+        .m(4)  // m is 2 * mr
+        .n(n)
+        .k(k)
+        .output_stride(7)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+        .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+        .Test(xnn_f32_spmm_minmax_ukernel_2x1__scalar_pipelined, SpMMMicrokernelTester::Variant::Scalar);  // passes Variant::Scalar as the second argument
+    }
+  }
+}
+
 TEST(F32_SPMM_MINMAX_2X1__SCALAR_PIPELINED, qmin) {
   for (uint32_t n = 1; n < 10; n += 2) {
     for (size_t k = 1; k <= 5; k += 2) {
@@ -12556,6 +13719,22 @@
   }
 }
 
+TEST(F32_SPMM_MINMAX_4X1__SCALAR, output_stride) {  // Runs the 4x1 scalar SpMM microkernel with output_stride (11) larger than m (8).
+  for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+    for (size_t k = 1; k <= 5; k += 2) {  // k takes 1, 3, 5
+      SpMMMicrokernelTester()
+        .mr(4)
+        .nr(1)
+        .m(8)  // m is 2 * mr
+        .n(n)
+        .k(k)
+        .output_stride(11)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+        .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+        .Test(xnn_f32_spmm_minmax_ukernel_4x1__scalar, SpMMMicrokernelTester::Variant::Scalar);  // passes Variant::Scalar as the second argument
+    }
+  }
+}
+
 TEST(F32_SPMM_MINMAX_4X1__SCALAR, qmin) {
   for (uint32_t n = 1; n < 10; n += 2) {
     for (size_t k = 1; k <= 5; k += 2) {
@@ -12708,6 +13887,22 @@
   }
 }
 
+TEST(F32_SPMM_MINMAX_4X1__SCALAR_PIPELINED, output_stride) {  // Runs the 4x1 scalar_pipelined SpMM microkernel with output_stride (11) larger than m (8).
+  for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+    for (size_t k = 1; k <= 5; k += 2) {  // k takes 1, 3, 5
+      SpMMMicrokernelTester()
+        .mr(4)
+        .nr(1)
+        .m(8)  // m is 2 * mr
+        .n(n)
+        .k(k)
+        .output_stride(11)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+        .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+        .Test(xnn_f32_spmm_minmax_ukernel_4x1__scalar_pipelined, SpMMMicrokernelTester::Variant::Scalar);  // passes Variant::Scalar as the second argument
+    }
+  }
+}
+
 TEST(F32_SPMM_MINMAX_4X1__SCALAR_PIPELINED, qmin) {
   for (uint32_t n = 1; n < 10; n += 2) {
     for (size_t k = 1; k <= 5; k += 2) {
@@ -12860,6 +14055,22 @@
   }
 }
 
+TEST(F32_SPMM_MINMAX_8X1__SCALAR, output_stride) {  // Runs the 8x1 scalar SpMM microkernel with output_stride (19) larger than m (16).
+  for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+    for (size_t k = 1; k <= 5; k += 2) {  // k takes 1, 3, 5
+      SpMMMicrokernelTester()
+        .mr(8)
+        .nr(1)
+        .m(16)  // m is 2 * mr
+        .n(n)
+        .k(k)
+        .output_stride(19)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+        .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+        .Test(xnn_f32_spmm_minmax_ukernel_8x1__scalar, SpMMMicrokernelTester::Variant::Scalar);  // passes Variant::Scalar as the second argument
+    }
+  }
+}
+
 TEST(F32_SPMM_MINMAX_8X1__SCALAR, qmin) {
   for (uint32_t n = 1; n < 10; n += 2) {
     for (size_t k = 1; k <= 5; k += 2) {
@@ -13012,6 +14223,22 @@
   }
 }
 
+TEST(F32_SPMM_MINMAX_8X1__SCALAR_PIPELINED, output_stride) {  // Runs the 8x1 scalar_pipelined SpMM microkernel with output_stride (19) larger than m (16).
+  for (uint32_t n = 1; n < 10; n += 2) {  // n takes 1, 3, 5, 7, 9
+    for (size_t k = 1; k <= 5; k += 2) {  // k takes 1, 3, 5
+      SpMMMicrokernelTester()
+        .mr(8)
+        .nr(1)
+        .m(16)  // m is 2 * mr
+        .n(n)
+        .k(k)
+        .output_stride(19)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+        .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+        .Test(xnn_f32_spmm_minmax_ukernel_8x1__scalar_pipelined, SpMMMicrokernelTester::Variant::Scalar);  // passes Variant::Scalar as the second argument
+    }
+  }
+}
+
 TEST(F32_SPMM_MINMAX_8X1__SCALAR_PIPELINED, qmin) {
   for (uint32_t n = 1; n < 10; n += 2) {
     for (size_t k = 1; k <= 5; k += 2) {
@@ -13206,6 +14433,22 @@
   }
 }
 
+TEST(F32_SPMM_MINMAX_8X2__SCALAR, output_stride) {  // Runs the 8x2 scalar SpMM microkernel with output_stride (19) larger than m (16).
+  for (uint32_t n = 1; n < 10; n += 3) {  // n takes 1, 4, 7
+    for (size_t k = 1; k <= 5; k += 2) {  // k takes 1, 3, 5
+      SpMMMicrokernelTester()
+        .mr(8)
+        .nr(2)
+        .m(16)  // m is 2 * mr
+        .n(n)
+        .k(k)
+        .output_stride(19)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+        .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+        .Test(xnn_f32_spmm_minmax_ukernel_8x2__scalar, SpMMMicrokernelTester::Variant::Scalar);  // passes Variant::Scalar as the second argument
+    }
+  }
+}
+
 TEST(F32_SPMM_MINMAX_8X2__SCALAR, qmin) {
   for (uint32_t n = 1; n < 10; n += 3) {
     for (size_t k = 1; k <= 5; k += 2) {
@@ -13400,6 +14643,22 @@
   }
 }
 
+TEST(F32_SPMM_MINMAX_8X4__SCALAR, output_stride) {  // Runs the 8x4 scalar SpMM microkernel with output_stride (19) larger than m (16).
+  for (uint32_t n = 1; n < 20; n += 5) {  // n takes 1, 6, 11, 16
+    for (size_t k = 1; k <= 5; k += 2) {  // k takes 1, 3, 5
+      SpMMMicrokernelTester()
+        .mr(8)
+        .nr(4)
+        .m(16)  // m is 2 * mr
+        .n(n)
+        .k(k)
+        .output_stride(19)  // exceeds m; NOTE(review): presumably the tester lays out and checks output with this stride -- confirm in SpMMMicrokernelTester
+        .sparsity(0.0f)  // NOTE(review): presumably means no weights are zeroed out -- confirm in the tester
+        .Test(xnn_f32_spmm_minmax_ukernel_8x4__scalar, SpMMMicrokernelTester::Variant::Scalar);  // passes Variant::Scalar as the second argument
+    }
+  }
+}
+
 TEST(F32_SPMM_MINMAX_8X4__SCALAR, qmin) {
   for (uint32_t n = 1; n < 20; n += 5) {
     for (size_t k = 1; k <= 5; k += 2) {