Add a 16x1 SSE f32-SpMM kernel, which is faster than the existing 8x1 kernel.
Add 4x1, 8x1, 16x1 PSIMD f32-SpMM kernels.

PiperOrigin-RevId: 315574229
diff --git a/BUILD.bazel b/BUILD.bazel
index afde8af..b9bf34a 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -517,6 +517,9 @@
     "src/f32-igemm/gen/6x8-minmax-psimd-splat.c",
     "src/f32-igemm/gen/6x8s4-minmax-psimd.c",
     "src/f32-maxpool/9p8x-minmax-psimd-c4.c",
+    "src/f32-spmm/gen/4x1-minmax-psimd.c",
+    "src/f32-spmm/gen/8x1-minmax-psimd.c",
+    "src/f32-spmm/gen/16x1-minmax-psimd.c",
     "src/f32-pavgpool/9p8x-minmax-psimd-c4.c",
     "src/f32-pavgpool/9x-minmax-psimd-c4.c",
     "src/f32-ppmm/gen/4x8-minmax-psimd.c",
@@ -1221,6 +1224,7 @@
     "src/f32-rmax/sse.c",
     "src/f32-spmm/gen/4x1-minmax-sse.c",
     "src/f32-spmm/gen/8x1-minmax-sse.c",
+    "src/f32-spmm/gen/16x1-minmax-sse.c",
     "src/f32-vbinary/gen/vadd-minmax-sse-x4.c",
     "src/f32-vbinary/gen/vadd-minmax-sse-x8.c",
     "src/f32-vbinary/gen/vaddc-minmax-sse-x4.c",