Generate more tile sizes for QU8 gemm/igemm

- Enable 4x8 for 64-bit and 2x8 for 32-bit
- Generate QU8 2x8, 3x8, 5x8, 2x16, 3x16 and 5x16 GEMM and IGEMM microkernels
- QU8 uses twice as many registers, so try different tile sizes that may perform better
- Add the new tile sizes to the GEMM bench and tests.

PiperOrigin-RevId: 390299198
diff --git a/BUILD.bazel b/BUILD.bazel
index c3b141e..a069792 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -3123,13 +3123,11 @@
     "src/qs8-igemm/gen/4x8c4-minmax-rndnu-neondot.c",
     "src/qs8-igemm/gen/4x16c4-minmax-rndnu-neondot.c",
     "src/qu8-gemm/gen/1x8c4-minmax-rndnu-neondot.c",
-    "src/qu8-gemm/gen/1x16c4-minmax-rndnu-neondot.c",
+    "src/qu8-gemm/gen/2x8c4-minmax-rndnu-neondot.c",
     "src/qu8-gemm/gen/4x8c4-minmax-rndnu-neondot.c",
-    "src/qu8-gemm/gen/4x16c4-minmax-rndnu-neondot.c",
     "src/qu8-igemm/gen/1x8c4-minmax-rndnu-neondot.c",
-    "src/qu8-igemm/gen/1x16c4-minmax-rndnu-neondot.c",
+    "src/qu8-igemm/gen/2x8c4-minmax-rndnu-neondot.c",
     "src/qu8-igemm/gen/4x8c4-minmax-rndnu-neondot.c",
-    "src/qu8-igemm/gen/4x16c4-minmax-rndnu-neondot.c",
 ]
 
 ALL_NEONDOT_MICROKERNEL_SRCS = [
@@ -3171,16 +3169,28 @@
     "src/qs8-igemm/gen/8x16c4-minmax-rndnu-neondot.c",
     "src/qu8-gemm/gen/1x8c4-minmax-rndnu-neondot.c",
     "src/qu8-gemm/gen/1x16c4-minmax-rndnu-neondot.c",
+    "src/qu8-gemm/gen/2x8c4-minmax-rndnu-neondot.c",
+    "src/qu8-gemm/gen/2x16c4-minmax-rndnu-neondot.c",
+    "src/qu8-gemm/gen/3x8c4-minmax-rndnu-neondot.c",
+    "src/qu8-gemm/gen/3x16c4-minmax-rndnu-neondot.c",
     "src/qu8-gemm/gen/4x8c4-minmax-rndnu-neondot.c",
     "src/qu8-gemm/gen/4x16c4-minmax-rndnu-neondot.c",
+    "src/qu8-gemm/gen/5x8c4-minmax-rndnu-neondot.c",
+    "src/qu8-gemm/gen/5x16c4-minmax-rndnu-neondot.c",
     "src/qu8-gemm/gen/6x8c4-minmax-rndnu-neondot.c",
     "src/qu8-gemm/gen/6x16c4-minmax-rndnu-neondot.c",
     "src/qu8-gemm/gen/8x8c4-minmax-rndnu-neondot.c",
     "src/qu8-gemm/gen/8x16c4-minmax-rndnu-neondot.c",
     "src/qu8-igemm/gen/1x8c4-minmax-rndnu-neondot.c",
     "src/qu8-igemm/gen/1x16c4-minmax-rndnu-neondot.c",
+    "src/qu8-igemm/gen/2x8c4-minmax-rndnu-neondot.c",
+    "src/qu8-igemm/gen/2x16c4-minmax-rndnu-neondot.c",
+    "src/qu8-igemm/gen/3x8c4-minmax-rndnu-neondot.c",
+    "src/qu8-igemm/gen/3x16c4-minmax-rndnu-neondot.c",
     "src/qu8-igemm/gen/4x8c4-minmax-rndnu-neondot.c",
     "src/qu8-igemm/gen/4x16c4-minmax-rndnu-neondot.c",
+    "src/qu8-igemm/gen/5x8c4-minmax-rndnu-neondot.c",
+    "src/qu8-igemm/gen/5x16c4-minmax-rndnu-neondot.c",
     "src/qu8-igemm/gen/6x8c4-minmax-rndnu-neondot.c",
     "src/qu8-igemm/gen/6x16c4-minmax-rndnu-neondot.c",
     "src/qu8-igemm/gen/8x8c4-minmax-rndnu-neondot.c",