C2 QS8 microkernel using mull then mlal with KC loop of 16

PiperOrigin-RevId: 355524975
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3dda359..8a73686 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -772,10 +772,10 @@
   src/f32-hswish/gen/hswish-neon-x4.c
   src/f32-hswish/gen/hswish-neon-x8.c
   src/f32-hswish/gen/hswish-neon-x16.c
-  src/f32-ibilinear/gen/neon-c4.c
-  src/f32-ibilinear/gen/neon-c8.c
   src/f32-ibilinear-chw/gen/neon-p4.c
   src/f32-ibilinear-chw/gen/neon-p8.c
+  src/f32-ibilinear/gen/neon-c4.c
+  src/f32-ibilinear/gen/neon-c8.c
   src/f32-igemm/gen/1x8-minmax-neon-dup-ld64.c
   src/f32-igemm/gen/1x8-minmax-neon-lane-ld64.c
   src/f32-igemm/gen/1x8s4-minmax-neon.c
@@ -949,35 +949,43 @@
   src/qs8-gavgpool/gen/7x-minmax-neon-c32-acc2.c
   src/qs8-gemm/gen/1x8-minmax-neon-mlal-lane.c
   src/qs8-gemm/gen/1x8-minmax-neon-mull-addw-dup.c
+  src/qs8-gemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/1x8c8-minmax-neon-mull-padal.c
   src/qs8-gemm/gen/1x16-minmax-neon-mlal-lane.c
   src/qs8-gemm/gen/1x16-minmax-neon-mull-addw-dup.c
+  src/qs8-gemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/1x16c8-minmax-neon-mull-padal.c
   src/qs8-gemm/gen/2x8-minmax-neon-mlal-lane.c
   src/qs8-gemm/gen/2x8-minmax-neon-mull-addw-dup.c
+  src/qs8-gemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/2x8c8-minmax-neon-mull-padal.c
   src/qs8-gemm/gen/2x16-minmax-neon-mlal-lane.c
   src/qs8-gemm/gen/2x16-minmax-neon-mull-addw-dup.c
+  src/qs8-gemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/2x16c8-minmax-neon-mull-padal.c
   src/qs8-gemm/gen/3x8-minmax-neon-mlal-lane.c
   src/qs8-gemm/gen/3x8-minmax-neon-mull-addw-dup.c
+  src/qs8-gemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/3x8c8-minmax-neon-mull-padal.c
   src/qs8-gemm/gen/3x16-minmax-neon-mlal-lane.c
   src/qs8-gemm/gen/3x16-minmax-neon-mull-addw-dup.c
+  src/qs8-gemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/3x16c8-minmax-neon-mull-padal.c
   src/qs8-gemm/gen/4x8-minmax-neon-mlal-lane.c
   src/qs8-gemm/gen/4x8-minmax-neon-mull-addw-dup.c
+  src/qs8-gemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/4x8c8-minmax-neon-mull-padal.c
   src/qs8-gemm/gen/4x16-minmax-neon-mlal-lane.c
   src/qs8-gemm/gen/4x16-minmax-neon-mull-addw-dup.c
+  src/qs8-gemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
-  src/qs8-gemm/gen/1x8c8-minmax-neon-mull-padal.c
-  src/qs8-gemm/gen/2x8c8-minmax-neon-mull-padal.c
-  src/qs8-gemm/gen/3x8c8-minmax-neon-mull-padal.c
-  src/qs8-gemm/gen/4x8c8-minmax-neon-mull-padal.c
-  src/qs8-gemm/gen/1x16c8-minmax-neon-mull-padal.c
-  src/qs8-gemm/gen/2x16c8-minmax-neon-mull-padal.c
-  src/qs8-gemm/gen/3x16c8-minmax-neon-mull-padal.c
   src/qs8-gemm/gen/4x16c8-minmax-neon-mull-padal.c
   src/qs8-igemm/gen/1x8-minmax-neon-mlal-lane.c
   src/qs8-igemm/gen/1x16-minmax-neon-mlal-lane.c
@@ -1058,10 +1066,10 @@
   src/f32-gemm/gen/6x8-minmax-neonfma-dup-ld128.c
   src/f32-gemm/gen/6x8s4-minmax-neonfma.c
   src/f32-gemm/gen/8x8s4-minmax-neonfma.c
-  src/f32-ibilinear/gen/neonfma-c4.c
-  src/f32-ibilinear/gen/neonfma-c8.c
   src/f32-ibilinear-chw/gen/neonfma-p4.c
   src/f32-ibilinear-chw/gen/neonfma-p8.c
+  src/f32-ibilinear/gen/neonfma-c4.c
+  src/f32-ibilinear/gen/neonfma-c8.c
   src/f32-igemm/gen/1x8-minmax-neonfma-dup-ld64.c
   src/f32-igemm/gen/1x8s4-minmax-neonfma.c
   src/f32-igemm/gen/4x8-minmax-neonfma-dup-ld64.c