Enable 4x16 QU8 dot product microkernels

- for ARM64 use 4x16.  Was 4x8.  20 accumulators.
- for ARM32 use 2x16.  Was 2x8.  10 accumulators.

PiperOrigin-RevId: 391423667
diff --git a/BUILD.bazel b/BUILD.bazel
index ab5a41f..946b743 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -3146,12 +3146,12 @@
     "src/qs8-igemm/gen/1x16c4-minmax-rndnu-neondot.c",
     "src/qs8-igemm/gen/4x8c4-minmax-rndnu-neondot.c",
     "src/qs8-igemm/gen/4x16c4-minmax-rndnu-neondot.c",
-    "src/qu8-gemm/gen/1x8c4-minmax-rndnu-neondot.c",
-    "src/qu8-gemm/gen/2x8c4-minmax-rndnu-neondot.c",
-    "src/qu8-gemm/gen/4x8c4-minmax-rndnu-neondot.c",
-    "src/qu8-igemm/gen/1x8c4-minmax-rndnu-neondot.c",
-    "src/qu8-igemm/gen/2x8c4-minmax-rndnu-neondot.c",
-    "src/qu8-igemm/gen/4x8c4-minmax-rndnu-neondot.c",
+    "src/qu8-gemm/gen/1x16c4-minmax-rndnu-neondot.c",
+    "src/qu8-gemm/gen/2x16c4-minmax-rndnu-neondot.c",
+    "src/qu8-gemm/gen/4x16c4-minmax-rndnu-neondot.c",
+    "src/qu8-igemm/gen/1x16c4-minmax-rndnu-neondot.c",
+    "src/qu8-igemm/gen/2x16c4-minmax-rndnu-neondot.c",
+    "src/qu8-igemm/gen/4x16c4-minmax-rndnu-neondot.c",
 ]
 
 ALL_NEONDOT_MICROKERNEL_SRCS = [