QS8 4x8 lane GEMM AArch32 microkernel for Cortex A53

- q4 and q5 for weights. Additional push of d12/d13.
- kc (r2) and lr pushed onto stack.
- load weights and expand in previous block

PiperOrigin-RevId: 425813466
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f29b72e..f73b573 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2503,19 +2503,13 @@
   src/u8-maxpool/9p8x-minmax-neon-c16.c
   src/u8-rmax/neon.c
   src/u8-vclamp/neon-x64.c
+  src/x8-transpose/gen/16x16-reuse-dec-zip-neon.c
+  src/x8-transpose/gen/16x16-reuse-mov-zip-neon.c
+  src/x8-transpose/gen/16x16-reuse-switch-zip-neon.c
   src/x8-zip/x2-neon.c
   src/x8-zip/x3-neon.c
   src/x8-zip/x4-neon.c
   src/x8-zip/xm-neon.c
-  src/x32-packx/x4-neon-st4.c
-  src/x32-unpool/neon.c
-  src/x32-zip/x2-neon.c
-  src/x32-zip/x3-neon.c
-  src/x32-zip/x4-neon.c
-  src/x32-zip/xm-neon.c
-  src/x8-transpose/gen/16x16-reuse-dec-zip-neon.c
-  src/x8-transpose/gen/16x16-reuse-mov-zip-neon.c
-  src/x8-transpose/gen/16x16-reuse-switch-zip-neon.c
   src/x16-transpose/gen/8x8-multi-dec-zip-neon.c
   src/x16-transpose/gen/8x8-multi-mov-zip-neon.c
   src/x16-transpose/gen/8x8-multi-switch-zip-neon.c
@@ -2523,6 +2517,7 @@
   src/x16-transpose/gen/8x8-reuse-mov-zip-neon.c
   src/x16-transpose/gen/8x8-reuse-multi-zip-neon.c
   src/x16-transpose/gen/8x8-reuse-switch-zip-neon.c
+  src/x32-packx/x4-neon-st4.c
   src/x32-transpose/gen/4x4-multi-dec-zip-neon.c
   src/x32-transpose/gen/4x4-multi-mov-zip-neon.c
   src/x32-transpose/gen/4x4-multi-multi-zip-neon.c
@@ -2531,6 +2526,11 @@
   src/x32-transpose/gen/4x4-reuse-mov-zip-neon.c
   src/x32-transpose/gen/4x4-reuse-multi-zip-neon.c
   src/x32-transpose/gen/4x4-reuse-switch-zip-neon.c
+  src/x32-unpool/neon.c
+  src/x32-zip/x2-neon.c
+  src/x32-zip/x3-neon.c
+  src/x32-zip/x4-neon.c
+  src/x32-zip/xm-neon.c
   src/xx-fill/neon-x64.c
   src/xx-pad/neon.c)
 
@@ -5805,7 +5805,9 @@
   src/f32-igemm/gen/4x8-minmax-aarch32-neon-prfm-cortex-a75.S
   src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-ld64.S
   src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neon-mlal-lane-prfm-ld64.S
+  src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-cortex-a53.S
   src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-ld64.S
+  src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-cortex-a53.S
   src/qc8-gemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-ld64.S
   src/qc8-gemm/gen/4x8c4-minmax-fp32-aarch32-neondot-cortex-a55.S
   src/qc8-gemm/gen/4x8c4-minmax-fp32-aarch32-neondot-ld64.S
@@ -5815,7 +5817,9 @@
   src/qc8-igemm/gen/4x8-minmax-fp32-aarch32-neonv8-mlal-lane-prfm-ld64.S
   src/qc8-igemm/gen/4x8c4-minmax-fp32-aarch32-neondot-cortex-a55.S
   src/qc8-igemm/gen/4x8c4-minmax-fp32-aarch32-neondot-ld64.S
+  src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S
   src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S
+  src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S
   src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S
   src/qs8-gemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-cortex-a55.S
   src/qs8-gemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-ld64.S
@@ -5823,7 +5827,9 @@
   src/qs8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S
   src/qs8-igemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-cortex-a55.S
   src/qs8-igemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-ld64.S
+  src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-cortex-a53.S
   src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S
+  src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-cortex-a53.S
   src/qu8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S
   src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S
   src/qu8-igemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S)