QU8 C4 NEON Dot Product GEMM/IGEMM microkernels for Cortex A55r1

- adapted from QS8 4x16 microkernel to QU8 4x8
- 2 dot products per vector.  A * W and A * zero_point.
- unsigned dot products with 2 sets of accumulators.
- subtract zero point accumulators from accumulators outside loop.
- 4x8 GEMM and IGEMM.
- RNDNU, FP32 and GEMMLOWP support but only RNDNU used

PiperOrigin-RevId: 393028276
diff --git a/BUILD.bazel b/BUILD.bazel
index 715c655..0ba70ba 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -5647,12 +5647,14 @@
     "src/qs8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S",
     "src/qs8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld64.S",
     "src/qs8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S",
+    "src/qu8-gemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-cortex-a55.S",
     "src/qu8-gemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-ld64.S",
     "src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S",
     "src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a75.S",
     "src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
     "src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a75.S",
     "src/qu8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S",
+    "src/qu8-igemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-cortex-a55.S",
     "src/qu8-igemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-ld64.S",
     "src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S",
     "src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a75.S",