QC8 4x8 dot-product GEMM AArch32 microkernel for Cortex-A55

- FP32 quantization
- Reduce the stack push by 16 bytes, which improves the QS8 GEMM as well as the QC8 GEMM.

PiperOrigin-RevId: 424004581
diff --git a/BUILD.bazel b/BUILD.bazel
index 2e7e083..9238d23 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -7052,6 +7052,7 @@
     "src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-ld64.S",
     "src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-ld64.S",
     "src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-ld64.S",
+    "src/qc8-gemm/gen/4x8c4-minmax-fp32-aarch32-neondot-cortex-a55.S",
     "src/qc8-gemm/gen/4x8c4-minmax-fp32-aarch32-neondot-ld64.S",
     "src/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-ld64.S",
     "src/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-ld64.S",