QC8 4x8 dot product GEMM AArch32 microkernel for Cortex-A55

- FP32 quantization
- Reduce the stack push by 16 bytes, improving QS8 as well as QC8 GEMM.

PiperOrigin-RevId: 424004581
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3920fb5..ba8859c 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5786,6 +5786,7 @@
   src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-ld64.S
   src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-ld64.S
   src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-ld64.S
+  src/qc8-gemm/gen/4x8c4-minmax-fp32-aarch32-neondot-cortex-a55.S
   src/qc8-gemm/gen/4x8c4-minmax-fp32-aarch32-neondot-ld64.S
   src/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-ld64.S
   src/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-ld64.S