Add 4x16 QS8 microkernels for Cortex-A53

- GEMM and IGEMM variants
- With and without prefetch (prfm)
- Also reorder entries in existing BUILD.bazel source lists

PiperOrigin-RevId: 374297582
diff --git a/BUILD.bazel b/BUILD.bazel
index 48ba91b..3279af0 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -504,20 +504,20 @@
     "src/qs8-gavgpool/gen/7x-minmax-scalar-c2.c",
     "src/qs8-gavgpool/gen/7x-minmax-scalar-c4.c",
     "src/qs8-gemm/gen/1x2-minmax-scalar.c",
-    "src/qs8-gemm/gen/2x2-minmax-scalar.c",
-    "src/qs8-gemm/gen/3x2-minmax-scalar.c",
-    "src/qs8-gemm/gen/4x2-minmax-scalar.c",
     "src/qs8-gemm/gen/1x4-minmax-scalar.c",
+    "src/qs8-gemm/gen/2x2-minmax-scalar.c",
     "src/qs8-gemm/gen/2x4-minmax-scalar.c",
+    "src/qs8-gemm/gen/3x2-minmax-scalar.c",
     "src/qs8-gemm/gen/3x4-minmax-scalar.c",
+    "src/qs8-gemm/gen/4x2-minmax-scalar.c",
     "src/qs8-gemm/gen/4x4-minmax-scalar.c",
     "src/qs8-igemm/gen/1x2-minmax-scalar.c",
-    "src/qs8-igemm/gen/2x2-minmax-scalar.c",
-    "src/qs8-igemm/gen/3x2-minmax-scalar.c",
-    "src/qs8-igemm/gen/4x2-minmax-scalar.c",
     "src/qs8-igemm/gen/1x4-minmax-scalar.c",
+    "src/qs8-igemm/gen/2x2-minmax-scalar.c",
     "src/qs8-igemm/gen/2x4-minmax-scalar.c",
+    "src/qs8-igemm/gen/3x2-minmax-scalar.c",
     "src/qs8-igemm/gen/3x4-minmax-scalar.c",
+    "src/qs8-igemm/gen/4x2-minmax-scalar.c",
     "src/qs8-igemm/gen/4x4-minmax-scalar.c",
     "src/qs8-requantization/fp32-scalar-lrintf.c",
     "src/qs8-requantization/fp32-scalar-magic.c",
@@ -748,13 +748,13 @@
     "src/f32-vlrelu/gen/vlrelu-wasm-x1.c",
     "src/f32-vlrelu/gen/vlrelu-wasm-x2.c",
     "src/f32-vlrelu/gen/vlrelu-wasm-x4.c",
+    "src/f32-vmulcaddc/gen/c1-minmax-wasm-2x.c",
+    "src/f32-vmulcaddc/gen/c2-minmax-wasm-2x.c",
+    "src/f32-vmulcaddc/gen/c4-minmax-wasm-2x.c",
     "src/f32-vrelu/gen/vrelu-wasm-x1.c",
     "src/f32-vrelu/gen/vrelu-wasm-x2.c",
     "src/f32-vrelu/gen/vrelu-wasm-x4.c",
     "src/f32-vrelu/gen/vrelu-wasm-x8.c",
-    "src/f32-vmulcaddc/gen/c1-minmax-wasm-2x.c",
-    "src/f32-vmulcaddc/gen/c2-minmax-wasm-2x.c",
-    "src/f32-vmulcaddc/gen/c4-minmax-wasm-2x.c",
 ]
 
 WASMSIMD_UKERNELS = [
@@ -2326,10 +2326,10 @@
     "src/f16-vclamp/gen/vclamp-neonfp16arith-x16.c",
     "src/f16-vhswish/gen/vhswish-neonfp16arith-x8.c",
     "src/f16-vhswish/gen/vhswish-neonfp16arith-x16.c",
-    "src/f16-vrelu/gen/vrelu-neonfp16arith-x8.c",
-    "src/f16-vrelu/gen/vrelu-neonfp16arith-x16.c",
     "src/f16-vmulcaddc/gen/c8-minmax-neonfp16arith-2x.c",
     "src/f16-vmulcaddc/gen/c16-minmax-neonfp16arith-2x.c",
+    "src/f16-vrelu/gen/vrelu-neonfp16arith-x8.c",
+    "src/f16-vrelu/gen/vrelu-neonfp16arith-x16.c",
 ]
 
 NEONDOT_UKERNELS = [
@@ -3768,25 +3768,29 @@
     "src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S",
     "src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S",
     "src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S",
-    "src/qs8-gemm/gen/1x8c8-aarch64-neon-mlal-padal.S",
-    "src/qs8-gemm/gen/1x8c8-aarch64-neon-mlal-padal-prfm.S",
     "src/qs8-gemm/gen/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S",
     "src/qs8-gemm/gen/1x8c8-aarch64-neon-mlal-padal-prfm-cortex-a53.S",
+    "src/qs8-gemm/gen/1x8c8-aarch64-neon-mlal-padal-prfm.S",
+    "src/qs8-gemm/gen/1x8c8-aarch64-neon-mlal-padal.S",
     "src/qs8-gemm/gen/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S",
     "src/qs8-gemm/gen/2x8c8-aarch64-neon-mlal-padal-prfm-cortex-a53.S",
     "src/qs8-gemm/gen/2x8c8-aarch64-neon-mlal-padal-prfm.S",
     "src/qs8-gemm/gen/2x8c8-aarch64-neon-mlal-padal.S",
+    "src/qs8-gemm/gen/4x16-aarch64-neon-mlal-lane-cortex-a53.S",
+    "src/qs8-gemm/gen/4x16-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
     "src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S",
     "src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S",
     "src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S",
-    "src/qs8-igemm/gen/1x8c8-aarch64-neon-mlal-padal.S",
-    "src/qs8-igemm/gen/1x8c8-aarch64-neon-mlal-padal-prfm.S",
     "src/qs8-igemm/gen/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S",
     "src/qs8-igemm/gen/1x8c8-aarch64-neon-mlal-padal-prfm-cortex-a53.S",
+    "src/qs8-igemm/gen/1x8c8-aarch64-neon-mlal-padal-prfm.S",
+    "src/qs8-igemm/gen/1x8c8-aarch64-neon-mlal-padal.S",
     "src/qs8-igemm/gen/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S",
     "src/qs8-igemm/gen/2x8c8-aarch64-neon-mlal-padal-prfm-cortex-a53.S",
     "src/qs8-igemm/gen/2x8c8-aarch64-neon-mlal-padal-prfm.S",
     "src/qs8-igemm/gen/2x8c8-aarch64-neon-mlal-padal.S",
+    "src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-cortex-a53.S",
+    "src/qs8-igemm/gen/4x16-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
 ]
 
 INTERNAL_MICROKERNEL_HDRS = [