Add tile sizes for the QU8 NEON MLAL-lane GEMM/IGEMM microkernels.

- For aarch32, 2x16 is 1.5x faster than 4x8 with intrinsics.
- Only non-prefetch versions are added.
- Only RNDNU requantization variants are added; fp32 variants are not.

PiperOrigin-RevId: 420719509
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 31eff26..7db3eaa 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -262,14 +262,14 @@
   src/jit/memory.c)
 
 SET(JIT_AARCH32_SRCS
+  src/f32-gemm/4x8-aarch32-neon-cortex-a7.cc
   src/f32-gemm/4x8-aarch32-neon-cortex-a53.cc
   src/f32-gemm/4x8-aarch32-neon-cortex-a55.cc
-  src/f32-gemm/4x8-aarch32-neon-cortex-a7.cc
   src/f32-gemm/4x8-aarch32-neon-cortex-a75.cc
   src/f32-gemm/4x8-aarch32-neon-ld64.cc
+  src/f32-igemm/4x8-aarch32-neon-cortex-a7.cc
   src/f32-igemm/4x8-aarch32-neon-cortex-a53.cc
   src/f32-igemm/4x8-aarch32-neon-cortex-a55.cc
-  src/f32-igemm/4x8-aarch32-neon-cortex-a7.cc
   src/f32-igemm/4x8-aarch32-neon-cortex-a75.cc
   src/f32-igemm/4x8-aarch32-neon-ld64.cc
   src/qc8-gemm/4x8-fp32-aarch32-neonv8-mlal-lane-ld64.cc
@@ -1374,10 +1374,12 @@
   src/qu8-gavgpool/7x-minmax-neon-c8.c
   src/qu8-gemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c
   src/qu8-gemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c
+  src/qu8-gemm/gen/2x16-minmax-rndnu-neon-mlal-lane.c
   src/qu8-gemm/gen/4x8-minmax-rndnu-neon-mlal-lane.c
   src/qu8-gemm/gen/4x16-minmax-rndnu-neon-mlal-lane.c
   src/qu8-igemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c
   src/qu8-igemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c
+  src/qu8-igemm/gen/2x16-minmax-rndnu-neon-mlal-lane.c
   src/qu8-igemm/gen/4x8-minmax-rndnu-neon-mlal-lane.c
   src/qu8-igemm/gen/4x16-minmax-rndnu-neon-mlal-lane.c
   src/qu8-vadd/gen/minmax-neon-ld64-x16.c
@@ -2342,18 +2344,30 @@
   src/qu8-gemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c
   src/qu8-gemm/gen/1x16-minmax-fp32-neon-mlal-lane.c
   src/qu8-gemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c
+  src/qu8-gemm/gen/2x8-minmax-rndnu-neon-mlal-lane.c
+  src/qu8-gemm/gen/2x16-minmax-rndnu-neon-mlal-lane.c
+  src/qu8-gemm/gen/3x8-minmax-rndnu-neon-mlal-lane.c
+  src/qu8-gemm/gen/3x16-minmax-rndnu-neon-mlal-lane.c
   src/qu8-gemm/gen/4x8-minmax-fp32-neon-mlal-lane.c
   src/qu8-gemm/gen/4x8-minmax-rndnu-neon-mlal-lane.c
   src/qu8-gemm/gen/4x16-minmax-fp32-neon-mlal-lane.c
   src/qu8-gemm/gen/4x16-minmax-rndnu-neon-mlal-lane.c
+  src/qu8-gemm/gen/6x8-minmax-rndnu-neon-mlal-lane.c
+  src/qu8-gemm/gen/6x16-minmax-rndnu-neon-mlal-lane.c
   src/qu8-igemm/gen/1x8-minmax-fp32-neon-mlal-lane.c
   src/qu8-igemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c
   src/qu8-igemm/gen/1x16-minmax-fp32-neon-mlal-lane.c
   src/qu8-igemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c
+  src/qu8-igemm/gen/2x8-minmax-rndnu-neon-mlal-lane.c
+  src/qu8-igemm/gen/2x16-minmax-rndnu-neon-mlal-lane.c
+  src/qu8-igemm/gen/3x8-minmax-rndnu-neon-mlal-lane.c
+  src/qu8-igemm/gen/3x16-minmax-rndnu-neon-mlal-lane.c
   src/qu8-igemm/gen/4x8-minmax-fp32-neon-mlal-lane.c
   src/qu8-igemm/gen/4x8-minmax-rndnu-neon-mlal-lane.c
   src/qu8-igemm/gen/4x16-minmax-fp32-neon-mlal-lane.c
   src/qu8-igemm/gen/4x16-minmax-rndnu-neon-mlal-lane.c
+  src/qu8-igemm/gen/6x8-minmax-rndnu-neon-mlal-lane.c
+  src/qu8-igemm/gen/6x16-minmax-rndnu-neon-mlal-lane.c
   src/qu8-requantization/fp32-neon.c
   src/qu8-requantization/gemmlowp-neon.c
   src/qu8-requantization/rndna-neon.c
@@ -4876,23 +4890,23 @@
 
 SET(ALL_AVX2_MICROKERNEL_SRCS
   src/f16-gemm/gen/1x8-minmax-avx2-broadcast.c
-  src/f16-gemm/gen/4x8-minmax-avx2-broadcast.c
-  src/f16-gemm/gen/5x8-minmax-avx2-broadcast.c
-  src/f16-gemm/gen/6x8-minmax-avx2-broadcast.c
-  src/f16-gemm/gen/7x8-minmax-avx2-broadcast.c
   src/f16-gemm/gen/1x16-minmax-avx2-broadcast.c
   src/f16-gemm/gen/3x16-minmax-avx2-broadcast.c
+  src/f16-gemm/gen/4x8-minmax-avx2-broadcast.c
   src/f16-gemm/gen/4x16-minmax-avx2-broadcast.c
+  src/f16-gemm/gen/5x8-minmax-avx2-broadcast.c
   src/f16-gemm/gen/5x16-minmax-avx2-broadcast.c
+  src/f16-gemm/gen/6x8-minmax-avx2-broadcast.c
+  src/f16-gemm/gen/7x8-minmax-avx2-broadcast.c
   src/f16-igemm/gen/1x8-minmax-avx2-broadcast.c
-  src/f16-igemm/gen/4x8-minmax-avx2-broadcast.c
-  src/f16-igemm/gen/5x8-minmax-avx2-broadcast.c
-  src/f16-igemm/gen/6x8-minmax-avx2-broadcast.c
-  src/f16-igemm/gen/7x8-minmax-avx2-broadcast.c
   src/f16-igemm/gen/1x16-minmax-avx2-broadcast.c
   src/f16-igemm/gen/3x16-minmax-avx2-broadcast.c
+  src/f16-igemm/gen/4x8-minmax-avx2-broadcast.c
   src/f16-igemm/gen/4x16-minmax-avx2-broadcast.c
+  src/f16-igemm/gen/5x8-minmax-avx2-broadcast.c
   src/f16-igemm/gen/5x16-minmax-avx2-broadcast.c
+  src/f16-igemm/gen/6x8-minmax-avx2-broadcast.c
+  src/f16-igemm/gen/7x8-minmax-avx2-broadcast.c
   src/f32-qs8-vcvt/gen/vcvt-avx2-x16.c
   src/f32-qs8-vcvt/gen/vcvt-avx2-x32.c
   src/f32-qs8-vcvt/gen/vcvt-avx2-x48.c