Add 16x1 SSE f32-SpMM kernels, which are faster than the existing 8x1 kernel.
Add 4x1, 8x1, 16x1 PSIMD f32-SpMM kernels.

PiperOrigin-RevId: 315574229
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 84dc5c6..4ff59f1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -529,6 +529,9 @@
   src/f32-prelu/gen/psimd-2x4.c
   src/f32-prelu/gen/psimd-2x8.c
   src/f32-rmax/psimd.c
+  src/f32-spmm/gen/4x1-minmax-psimd.c
+  src/f32-spmm/gen/8x1-minmax-psimd.c
+  src/f32-spmm/gen/16x1-minmax-psimd.c
   src/f32-vbinary/gen/vadd-minmax-psimd-x4.c
   src/f32-vbinary/gen/vadd-minmax-psimd-x8.c
   src/f32-vbinary/gen/vaddc-minmax-psimd-x4.c
@@ -1219,6 +1222,7 @@
   src/f32-rmax/sse.c
   src/f32-spmm/gen/4x1-minmax-sse.c
   src/f32-spmm/gen/8x1-minmax-sse.c
+  src/f32-spmm/gen/16x1-minmax-sse.c
   src/f32-vbinary/gen/vadd-minmax-sse-x4.c
   src/f32-vbinary/gen/vadd-minmax-sse-x8.c
   src/f32-vbinary/gen/vaddc-minmax-sse-x4.c