QS8 C4S2 Neon GEMM/IGEMM microkernels

PiperOrigin-RevId: 409348021
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 869892c..d9802cd 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1510,21 +1510,25 @@
   src/qc8-gemm/gen/1x8c2-minmax-fp32-neon-mlal-padal-dup.c
   src/qc8-gemm/gen/1x8c2s4-minmax-fp32-neon-mlal-padal.c
   src/qc8-gemm/gen/1x8c4-minmax-fp32-neon-mlal-padal-dup.c
+  src/qc8-gemm/gen/1x8c4s2-minmax-fp32-neon-mlal-padal.c
   src/qc8-gemm/gen/1x8c8-minmax-fp32-neon-mlal-padal.c
   src/qc8-gemm/gen/1x16-minmax-fp32-neon-mlal-lane.c
   src/qc8-gemm/gen/2x8c2-minmax-fp32-neon-mlal-padal-dup.c
   src/qc8-gemm/gen/2x8c2s4-minmax-fp32-neon-mlal-padal.c
   src/qc8-gemm/gen/2x8c4-minmax-fp32-neon-mlal-padal-dup.c
+  src/qc8-gemm/gen/2x8c4s2-minmax-fp32-neon-mlal-padal.c
   src/qc8-gemm/gen/2x8c8-minmax-fp32-neon-mlal-padal.c
   src/qc8-gemm/gen/4x16-minmax-fp32-neon-mlal-lane.c
   src/qc8-igemm/gen/1x8c2-minmax-fp32-neon-mlal-padal-dup.c
   src/qc8-igemm/gen/1x8c2s4-minmax-fp32-neon-mlal-padal.c
   src/qc8-igemm/gen/1x8c4-minmax-fp32-neon-mlal-padal-dup.c
+  src/qc8-igemm/gen/1x8c4s2-minmax-fp32-neon-mlal-padal.c
   src/qc8-igemm/gen/1x8c8-minmax-fp32-neon-mlal-padal.c
   src/qc8-igemm/gen/1x16-minmax-fp32-neon-mlal-lane.c
   src/qc8-igemm/gen/2x8c2-minmax-fp32-neon-mlal-padal-dup.c
   src/qc8-igemm/gen/2x8c2s4-minmax-fp32-neon-mlal-padal.c
   src/qc8-igemm/gen/2x8c4-minmax-fp32-neon-mlal-padal-dup.c
+  src/qc8-igemm/gen/2x8c4s2-minmax-fp32-neon-mlal-padal.c
   src/qc8-igemm/gen/2x8c8-minmax-fp32-neon-mlal-padal.c
   src/qc8-igemm/gen/4x16-minmax-fp32-neon-mlal-lane.c
   src/qs8-dwconv/gen/up8x9-minmax-fp32-neon-mul16.c
@@ -1584,6 +1588,9 @@
   src/qs8-gemm/gen/1x8c4-minmax-fp32-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/1x8c4-minmax-rndnu-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/1x8c4-minmax-rndnu-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/1x8c4s2-minmax-fp32-neon-mlal-padal.c
+  src/qs8-gemm/gen/1x8c4s2-minmax-rndnu-neon-mlal-padal.c
+  src/qs8-gemm/gen/1x8c4s2-minmax-rndnu-neon-mull-padal.c
   src/qs8-gemm/gen/1x8c8-minmax-fp32-neon-mlal-padal.c
   src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c
   src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c
@@ -1600,6 +1607,8 @@
   src/qs8-gemm/gen/1x16c2s4-minmax-rndnu-neon-mull-padal.c
   src/qs8-gemm/gen/1x16c4-minmax-rndnu-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/1x16c4-minmax-rndnu-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/1x16c4s2-minmax-rndnu-neon-mlal-padal.c
+  src/qs8-gemm/gen/1x16c4s2-minmax-rndnu-neon-mull-padal.c
   src/qs8-gemm/gen/1x16c8-minmax-gemmlowp-neon-mlal-padal.c
   src/qs8-gemm/gen/1x16c8-minmax-gemmlowp-neon-mull-padal.c
   src/qs8-gemm/gen/1x16c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -1615,6 +1624,9 @@
   src/qs8-gemm/gen/2x8c4-minmax-fp32-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/2x8c4-minmax-rndnu-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/2x8c4-minmax-rndnu-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/2x8c4s2-minmax-fp32-neon-mlal-padal.c
+  src/qs8-gemm/gen/2x8c4s2-minmax-rndnu-neon-mlal-padal.c
+  src/qs8-gemm/gen/2x8c4s2-minmax-rndnu-neon-mull-padal.c
   src/qs8-gemm/gen/2x8c8-minmax-fp32-neon-mlal-padal.c
   src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c
   src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c
@@ -1628,6 +1640,8 @@
   src/qs8-gemm/gen/2x16c2s4-minmax-rndnu-neon-mull-padal.c
   src/qs8-gemm/gen/2x16c4-minmax-rndnu-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/2x16c4-minmax-rndnu-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/2x16c4s2-minmax-rndnu-neon-mlal-padal.c
+  src/qs8-gemm/gen/2x16c4s2-minmax-rndnu-neon-mull-padal.c
   src/qs8-gemm/gen/2x16c8-minmax-gemmlowp-neon-mlal-padal.c
   src/qs8-gemm/gen/2x16c8-minmax-gemmlowp-neon-mull-padal.c
   src/qs8-gemm/gen/2x16c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -1639,6 +1653,8 @@
   src/qs8-gemm/gen/3x8c2s4-minmax-rndnu-neon-mull-padal.c
   src/qs8-gemm/gen/3x8c4-minmax-rndnu-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/3x8c4-minmax-rndnu-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/3x8c4s2-minmax-rndnu-neon-mlal-padal.c
+  src/qs8-gemm/gen/3x8c4s2-minmax-rndnu-neon-mull-padal.c
   src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-neon-mlal-padal.c
   src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-neon-mull-padal.c
   src/qs8-gemm/gen/3x8c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -1650,6 +1666,8 @@
   src/qs8-gemm/gen/3x16c2s4-minmax-rndnu-neon-mull-padal.c
   src/qs8-gemm/gen/3x16c4-minmax-rndnu-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/3x16c4-minmax-rndnu-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/3x16c4s2-minmax-rndnu-neon-mlal-padal.c
+  src/qs8-gemm/gen/3x16c4s2-minmax-rndnu-neon-mull-padal.c
   src/qs8-gemm/gen/3x16c8-minmax-gemmlowp-neon-mlal-padal.c
   src/qs8-gemm/gen/3x16c8-minmax-gemmlowp-neon-mull-padal.c
   src/qs8-gemm/gen/3x16c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -1661,6 +1679,8 @@
   src/qs8-gemm/gen/4x8c2s4-minmax-rndnu-neon-mull-padal.c
   src/qs8-gemm/gen/4x8c4-minmax-rndnu-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/4x8c4-minmax-rndnu-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/4x8c4s2-minmax-rndnu-neon-mlal-padal.c
+  src/qs8-gemm/gen/4x8c4s2-minmax-rndnu-neon-mull-padal.c
   src/qs8-gemm/gen/4x8c8-minmax-gemmlowp-neon-mlal-padal.c
   src/qs8-gemm/gen/4x8c8-minmax-gemmlowp-neon-mull-padal.c
   src/qs8-gemm/gen/4x8c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -1675,6 +1695,8 @@
   src/qs8-gemm/gen/4x16c2s4-minmax-rndnu-neon-mull-padal.c
   src/qs8-gemm/gen/4x16c4-minmax-rndnu-neon-mlal-padal-dup.c
   src/qs8-gemm/gen/4x16c4-minmax-rndnu-neon-mull-padal-dup.c
+  src/qs8-gemm/gen/4x16c4s2-minmax-rndnu-neon-mlal-padal.c
+  src/qs8-gemm/gen/4x16c4s2-minmax-rndnu-neon-mull-padal.c
   src/qs8-gemm/gen/4x16c8-minmax-gemmlowp-neon-mlal-padal.c
   src/qs8-gemm/gen/4x16c8-minmax-gemmlowp-neon-mull-padal.c
   src/qs8-gemm/gen/4x16c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -1693,6 +1715,9 @@
   src/qs8-igemm/gen/1x8c4-minmax-fp32-neon-mlal-padal-dup.c
   src/qs8-igemm/gen/1x8c4-minmax-rndnu-neon-mlal-padal-dup.c
   src/qs8-igemm/gen/1x8c4-minmax-rndnu-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/1x8c4s2-minmax-fp32-neon-mlal-padal.c
+  src/qs8-igemm/gen/1x8c4s2-minmax-rndnu-neon-mlal-padal.c
+  src/qs8-igemm/gen/1x8c4s2-minmax-rndnu-neon-mull-padal.c
   src/qs8-igemm/gen/1x8c8-minmax-fp32-neon-mlal-padal.c
   src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c
   src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c
@@ -1709,6 +1734,8 @@
   src/qs8-igemm/gen/1x16c2s4-minmax-rndnu-neon-mull-padal.c
   src/qs8-igemm/gen/1x16c4-minmax-rndnu-neon-mlal-padal-dup.c
   src/qs8-igemm/gen/1x16c4-minmax-rndnu-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/1x16c4s2-minmax-rndnu-neon-mlal-padal.c
+  src/qs8-igemm/gen/1x16c4s2-minmax-rndnu-neon-mull-padal.c
   src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mlal-padal.c
   src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mull-padal.c
   src/qs8-igemm/gen/1x16c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -1724,6 +1751,9 @@
   src/qs8-igemm/gen/2x8c4-minmax-fp32-neon-mlal-padal-dup.c
   src/qs8-igemm/gen/2x8c4-minmax-rndnu-neon-mlal-padal-dup.c
   src/qs8-igemm/gen/2x8c4-minmax-rndnu-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/2x8c4s2-minmax-fp32-neon-mlal-padal.c
+  src/qs8-igemm/gen/2x8c4s2-minmax-rndnu-neon-mlal-padal.c
+  src/qs8-igemm/gen/2x8c4s2-minmax-rndnu-neon-mull-padal.c
   src/qs8-igemm/gen/2x8c8-minmax-fp32-neon-mlal-padal.c
   src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c
   src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c
@@ -1737,6 +1767,8 @@
   src/qs8-igemm/gen/2x16c2s4-minmax-rndnu-neon-mull-padal.c
   src/qs8-igemm/gen/2x16c4-minmax-rndnu-neon-mlal-padal-dup.c
   src/qs8-igemm/gen/2x16c4-minmax-rndnu-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/2x16c4s2-minmax-rndnu-neon-mlal-padal.c
+  src/qs8-igemm/gen/2x16c4s2-minmax-rndnu-neon-mull-padal.c
   src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mlal-padal.c
   src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mull-padal.c
   src/qs8-igemm/gen/2x16c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -1748,6 +1780,8 @@
   src/qs8-igemm/gen/3x8c2s4-minmax-rndnu-neon-mull-padal.c
   src/qs8-igemm/gen/3x8c4-minmax-rndnu-neon-mlal-padal-dup.c
   src/qs8-igemm/gen/3x8c4-minmax-rndnu-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/3x8c4s2-minmax-rndnu-neon-mlal-padal.c
+  src/qs8-igemm/gen/3x8c4s2-minmax-rndnu-neon-mull-padal.c
   src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mlal-padal.c
   src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mull-padal.c
   src/qs8-igemm/gen/3x8c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -1759,6 +1793,8 @@
   src/qs8-igemm/gen/3x16c2s4-minmax-rndnu-neon-mull-padal.c
   src/qs8-igemm/gen/3x16c4-minmax-rndnu-neon-mlal-padal-dup.c
   src/qs8-igemm/gen/3x16c4-minmax-rndnu-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/3x16c4s2-minmax-rndnu-neon-mlal-padal.c
+  src/qs8-igemm/gen/3x16c4s2-minmax-rndnu-neon-mull-padal.c
   src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mlal-padal.c
   src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mull-padal.c
   src/qs8-igemm/gen/3x16c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -1770,6 +1806,8 @@
   src/qs8-igemm/gen/4x8c2s4-minmax-rndnu-neon-mull-padal.c
   src/qs8-igemm/gen/4x8c4-minmax-rndnu-neon-mlal-padal-dup.c
   src/qs8-igemm/gen/4x8c4-minmax-rndnu-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/4x8c4s2-minmax-rndnu-neon-mlal-padal.c
+  src/qs8-igemm/gen/4x8c4s2-minmax-rndnu-neon-mull-padal.c
   src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mlal-padal.c
   src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mull-padal.c
   src/qs8-igemm/gen/4x8c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -1784,6 +1822,8 @@
   src/qs8-igemm/gen/4x16c2s4-minmax-rndnu-neon-mull-padal.c
   src/qs8-igemm/gen/4x16c4-minmax-rndnu-neon-mlal-padal-dup.c
   src/qs8-igemm/gen/4x16c4-minmax-rndnu-neon-mull-padal-dup.c
+  src/qs8-igemm/gen/4x16c4s2-minmax-rndnu-neon-mlal-padal.c
+  src/qs8-igemm/gen/4x16c4s2-minmax-rndnu-neon-mull-padal.c
   src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mlal-padal.c
   src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mull-padal.c
   src/qs8-igemm/gen/4x16c16-minmax-gemmlowp-neon-mlal-padal.c
@@ -2338,21 +2378,25 @@
   src/qc8-gemm/gen/1x8c2-minmax-fp32-neonv8-mlal-padal-dup.c
   src/qc8-gemm/gen/1x8c2s4-minmax-fp32-neonv8-mlal-padal.c
   src/qc8-gemm/gen/1x8c4-minmax-fp32-neonv8-mlal-padal-dup.c
+  src/qc8-gemm/gen/1x8c4s2-minmax-fp32-neonv8-mlal-padal.c
   src/qc8-gemm/gen/1x8c8-minmax-fp32-neonv8-mlal-padal.c
   src/qc8-gemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c
   src/qc8-gemm/gen/2x8c2-minmax-fp32-neonv8-mlal-padal-dup.c
   src/qc8-gemm/gen/2x8c2s4-minmax-fp32-neonv8-mlal-padal.c
   src/qc8-gemm/gen/2x8c4-minmax-fp32-neonv8-mlal-padal-dup.c
+  src/qc8-gemm/gen/2x8c4s2-minmax-fp32-neonv8-mlal-padal.c
   src/qc8-gemm/gen/2x8c8-minmax-fp32-neonv8-mlal-padal.c
   src/qc8-gemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c
   src/qc8-igemm/gen/1x8c2-minmax-fp32-neonv8-mlal-padal-dup.c
   src/qc8-igemm/gen/1x8c2s4-minmax-fp32-neonv8-mlal-padal.c
   src/qc8-igemm/gen/1x8c4-minmax-fp32-neonv8-mlal-padal-dup.c
+  src/qc8-igemm/gen/1x8c4s2-minmax-fp32-neonv8-mlal-padal.c
   src/qc8-igemm/gen/1x8c8-minmax-fp32-neonv8-mlal-padal.c
   src/qc8-igemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c
   src/qc8-igemm/gen/2x8c2-minmax-fp32-neonv8-mlal-padal-dup.c
   src/qc8-igemm/gen/2x8c2s4-minmax-fp32-neonv8-mlal-padal.c
   src/qc8-igemm/gen/2x8c4-minmax-fp32-neonv8-mlal-padal-dup.c
+  src/qc8-igemm/gen/2x8c4s2-minmax-fp32-neonv8-mlal-padal.c
   src/qc8-igemm/gen/2x8c8-minmax-fp32-neonv8-mlal-padal.c
   src/qc8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c
   src/qs8-dwconv/gen/up8x9-minmax-fp32-neonv8-mul16.c
@@ -2366,21 +2410,25 @@
   src/qs8-gemm/gen/1x8c2-minmax-fp32-neonv8-mlal-padal-dup.c
   src/qs8-gemm/gen/1x8c2s4-minmax-fp32-neonv8-mlal-padal.c
   src/qs8-gemm/gen/1x8c4-minmax-fp32-neonv8-mlal-padal-dup.c
+  src/qs8-gemm/gen/1x8c4s2-minmax-fp32-neonv8-mlal-padal.c
   src/qs8-gemm/gen/1x8c8-minmax-fp32-neonv8-mlal-padal.c
   src/qs8-gemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c
   src/qs8-gemm/gen/2x8c2-minmax-fp32-neonv8-mlal-padal-dup.c
   src/qs8-gemm/gen/2x8c2s4-minmax-fp32-neonv8-mlal-padal.c
   src/qs8-gemm/gen/2x8c4-minmax-fp32-neonv8-mlal-padal-dup.c
+  src/qs8-gemm/gen/2x8c4s2-minmax-fp32-neonv8-mlal-padal.c
   src/qs8-gemm/gen/2x8c8-minmax-fp32-neonv8-mlal-padal.c
   src/qs8-gemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c
   src/qs8-igemm/gen/1x8c2-minmax-fp32-neonv8-mlal-padal-dup.c
   src/qs8-igemm/gen/1x8c2s4-minmax-fp32-neonv8-mlal-padal.c
   src/qs8-igemm/gen/1x8c4-minmax-fp32-neonv8-mlal-padal-dup.c
+  src/qs8-igemm/gen/1x8c4s2-minmax-fp32-neonv8-mlal-padal.c
   src/qs8-igemm/gen/1x8c8-minmax-fp32-neonv8-mlal-padal.c
   src/qs8-igemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c
   src/qs8-igemm/gen/2x8c2-minmax-fp32-neonv8-mlal-padal-dup.c
   src/qs8-igemm/gen/2x8c2s4-minmax-fp32-neonv8-mlal-padal.c
   src/qs8-igemm/gen/2x8c4-minmax-fp32-neonv8-mlal-padal-dup.c
+  src/qs8-igemm/gen/2x8c4s2-minmax-fp32-neonv8-mlal-padal.c
   src/qs8-igemm/gen/2x8c8-minmax-fp32-neonv8-mlal-padal.c
   src/qs8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c
   src/qs8-vmul/gen/minmax-fp32-neonv8-ld64-x8.c
@@ -3543,11 +3591,11 @@
 
 SET(PROD_AVX_MICROKERNEL_SRCS
   src/f16-f32-vcvt/gen/vcvt-avx-int16-x16.c
-  src/f32-f16-vcvt/gen/vcvt-avx-x24.c
   src/f32-dwconv/gen/up8x25-minmax-avx.c
   src/f32-dwconv/gen/up16x3-minmax-avx.c
   src/f32-dwconv/gen/up16x4-minmax-avx.c
   src/f32-dwconv/gen/up16x9-minmax-avx.c
+  src/f32-f16-vcvt/gen/vcvt-avx-x24.c
   src/f32-gemm/gen/1x16-minmax-avx-broadcast.c
   src/f32-gemm/gen/5x16-minmax-avx-broadcast.c
   src/f32-igemm/gen/1x16-minmax-avx-broadcast.c