QU8 4x16C4 NEON Dot Product GEMM/IGEMM microkernels for Cortex A55r1

- adapted from QS8 4x16 microkernel
- UDOT per MR for zero point
- subtract zero point accumulators from accumulators outside loop.
- 4x16 GEMM and IGEMM.
- RNDNU, FP32 and GEMMLOWP support but only RNDNU used

PiperOrigin-RevId: 394143891
diff --git a/BUILD.bazel b/BUILD.bazel
index 19cae49..e99b867 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1823,36 +1823,36 @@
     "src/qs8-gavgpool/gen/7x-minmax-wasmsimd-c24-acc2.c",
     "src/qs8-gemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
     "src/qs8-gemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-wasmsimd-dot16x2.c",
     "src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
     "src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld64.c",
     "src/qs8-gemm/gen/1x4c8-minmax-fp32-wasmsimd-mul16-ld128.c",
-    "src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-wasmsimd-dot16x2.c",
     "src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c",
     "src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-wasmsimd-mul16.c",
     "src/qs8-gemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
     "src/qs8-gemm/gen/2x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-wasmsimd-dot16x2.c",
     "src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
     "src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-mul16-ld64.c",
     "src/qs8-gemm/gen/2x4c8-minmax-fp32-wasmsimd-mul16-ld128.c",
-    "src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-wasmsimd-dot16x2.c",
     "src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c",
     "src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-wasmsimd-mul16.c",
     "src/qs8-gemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
     "src/qs8-gemm/gen/3x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-wasmsimd-dot16x2.c",
     "src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
     "src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
     "src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-mul16-ld64.c",
     "src/qs8-gemm/gen/3x4c8-minmax-fp32-wasmsimd-mul16-ld128.c",
-    "src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-wasmsimd-dot16x2.c",
     "src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c",
     "src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-wasmsimd-mul16.c",
     "src/qs8-gemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
     "src/qs8-gemm/gen/4x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
+    "src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-wasmsimd-dot16x2.c",
     "src/qs8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld64.c",
     "src/qs8-gemm/gen/4x4c8-minmax-fp32-wasmsimd-dot16x2-ld128.c",
-    "src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-wasmsimd-dot16x2.c",
     "src/qs8-gemm/gen/4x4c8-xw-minmax-fp32-wasmsimd-dot16x2.c",
     "src/qs8-igemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld64.c",
     "src/qs8-igemm/gen/1x4c2-minmax-fp32-wasmsimd-dot16x2-ld128.c",
@@ -5767,6 +5767,7 @@
     "src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a75.S",
     "src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
     "src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a75.S",
+    "src/qu8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S",
     "src/qu8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S",
     "src/qu8-igemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-cortex-a55.S",
     "src/qu8-igemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-ld64.S",
@@ -5774,6 +5775,7 @@
     "src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a75.S",
     "src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
     "src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a75.S",
+    "src/qu8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S",
     "src/qu8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S",
 ]