QC8 NEON lane microkernels

- Instantiate templates for all microkernel sizes of the lane variant (1x8 through 6x16, with and without prfm).
- The 1x8 microkernel is needed before the 4x8 assembly can be initialized.

PiperOrigin-RevId: 419763947
diff --git a/BUILD.bazel b/BUILD.bazel
index 084f5ea..241bd97 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -349,14 +349,14 @@
     "src/f32-vunary/gen/vabs-scalar-x4.c",
     "src/f32-vunary/gen/vneg-scalar-x4.c",
     "src/f32-vunary/gen/vsqr-scalar-x4.c",
-    "src/qc8-dwconv/gen/up2x9-minmax-fp32-scalar-imagic.c",
     "src/qc8-dwconv/gen/up1x25-minmax-fp32-scalar-imagic.c",
+    "src/qc8-dwconv/gen/up2x9-minmax-fp32-scalar-imagic.c",
     "src/qc8-gemm/gen/1x2-minmax-fp32-scalar-imagic.c",
     "src/qc8-gemm/gen/2x2-minmax-fp32-scalar-imagic.c",
     "src/qc8-igemm/gen/1x2-minmax-fp32-scalar-imagic.c",
     "src/qc8-igemm/gen/2x2-minmax-fp32-scalar-imagic.c",
-    "src/qs8-dwconv/gen/up2x9-minmax-fp32-scalar-imagic.c",
     "src/qs8-dwconv/gen/up1x25-minmax-fp32-scalar-imagic.c",
+    "src/qs8-dwconv/gen/up2x9-minmax-fp32-scalar-imagic.c",
     "src/qs8-f32-vcvt/gen/vcvt-scalar-x1.c",
     "src/qs8-gavgpool/gen/7p7x-minmax-scalar-c4.c",
     "src/qs8-gavgpool/gen/7x-minmax-scalar-c4.c",
@@ -370,8 +370,8 @@
     "src/qs8-vmulc/gen/minmax-fp32-scalar-x4.c",
     "src/qu8-avgpool/9p8x-minmax-scalar-c1.c",
     "src/qu8-avgpool/9x-minmax-scalar-c1.c",
-    "src/qu8-dwconv/gen/up2x9-minmax-fp32-scalar-imagic.c",
     "src/qu8-dwconv/gen/up1x25-minmax-fp32-scalar-imagic.c",
+    "src/qu8-dwconv/gen/up2x9-minmax-fp32-scalar-imagic.c",
     "src/qu8-f32-vcvt/gen/vcvt-scalar-x1.c",
     "src/qu8-gavgpool/7p7x-minmax-scalar-c1.c",
     "src/qu8-gavgpool/7x-minmax-scalar-c1.c",
@@ -2943,6 +2943,8 @@
     "src/qc8-dwconv/gen/up24x25-minmax-fp32-neon-mul16.c",
     "src/qc8-dwconv/gen/up32x9-minmax-fp32-neon-mul16.c",
     "src/qc8-dwconv/gen/up32x25-minmax-fp32-neon-mul16.c",
+    "src/qc8-gemm/gen/1x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "src/qc8-gemm/gen/1x8-minmax-fp32-neon-mlal-lane.c",
     "src/qc8-gemm/gen/1x8c2-minmax-fp32-neon-mlal-dup.c",
     "src/qc8-gemm/gen/1x8c2-minmax-fp32-neon-mlal-ld1r.c",
     "src/qc8-gemm/gen/1x8c2-minmax-fp32-neon-mlal-ld2r.c",
@@ -2953,7 +2955,10 @@
     "src/qc8-gemm/gen/1x8c4-minmax-fp32-neon-mlal-ld2r.c",
     "src/qc8-gemm/gen/1x8c4s2-minmax-fp32-neon-mlal.c",
     "src/qc8-gemm/gen/1x8c8-minmax-fp32-neon-mlal.c",
+    "src/qc8-gemm/gen/1x16-minmax-fp32-neon-mlal-lane-prfm.c",
     "src/qc8-gemm/gen/1x16-minmax-fp32-neon-mlal-lane.c",
+    "src/qc8-gemm/gen/2x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "src/qc8-gemm/gen/2x8-minmax-fp32-neon-mlal-lane.c",
     "src/qc8-gemm/gen/2x8c2-minmax-fp32-neon-mlal-dup.c",
     "src/qc8-gemm/gen/2x8c2-minmax-fp32-neon-mlal-ld1r.c",
     "src/qc8-gemm/gen/2x8c2-minmax-fp32-neon-mlal-ld2r.c",
@@ -2964,7 +2969,22 @@
     "src/qc8-gemm/gen/2x8c4-minmax-fp32-neon-mlal-ld2r.c",
     "src/qc8-gemm/gen/2x8c4s2-minmax-fp32-neon-mlal.c",
     "src/qc8-gemm/gen/2x8c8-minmax-fp32-neon-mlal.c",
+    "src/qc8-gemm/gen/2x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "src/qc8-gemm/gen/2x16-minmax-fp32-neon-mlal-lane.c",
+    "src/qc8-gemm/gen/3x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "src/qc8-gemm/gen/3x8-minmax-fp32-neon-mlal-lane.c",
+    "src/qc8-gemm/gen/3x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "src/qc8-gemm/gen/3x16-minmax-fp32-neon-mlal-lane.c",
+    "src/qc8-gemm/gen/4x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "src/qc8-gemm/gen/4x8-minmax-fp32-neon-mlal-lane.c",
+    "src/qc8-gemm/gen/4x16-minmax-fp32-neon-mlal-lane-prfm.c",
     "src/qc8-gemm/gen/4x16-minmax-fp32-neon-mlal-lane.c",
+    "src/qc8-gemm/gen/6x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "src/qc8-gemm/gen/6x8-minmax-fp32-neon-mlal-lane.c",
+    "src/qc8-gemm/gen/6x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "src/qc8-gemm/gen/6x16-minmax-fp32-neon-mlal-lane.c",
+    "src/qc8-igemm/gen/1x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "src/qc8-igemm/gen/1x8-minmax-fp32-neon-mlal-lane.c",
     "src/qc8-igemm/gen/1x8c2-minmax-fp32-neon-mlal-dup.c",
     "src/qc8-igemm/gen/1x8c2-minmax-fp32-neon-mlal-ld1r.c",
     "src/qc8-igemm/gen/1x8c2-minmax-fp32-neon-mlal-ld2r.c",
@@ -2975,7 +2995,10 @@
     "src/qc8-igemm/gen/1x8c4-minmax-fp32-neon-mlal-ld2r.c",
     "src/qc8-igemm/gen/1x8c4s2-minmax-fp32-neon-mlal.c",
     "src/qc8-igemm/gen/1x8c8-minmax-fp32-neon-mlal.c",
+    "src/qc8-igemm/gen/1x16-minmax-fp32-neon-mlal-lane-prfm.c",
     "src/qc8-igemm/gen/1x16-minmax-fp32-neon-mlal-lane.c",
+    "src/qc8-igemm/gen/2x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "src/qc8-igemm/gen/2x8-minmax-fp32-neon-mlal-lane.c",
     "src/qc8-igemm/gen/2x8c2-minmax-fp32-neon-mlal-dup.c",
     "src/qc8-igemm/gen/2x8c2-minmax-fp32-neon-mlal-ld1r.c",
     "src/qc8-igemm/gen/2x8c2-minmax-fp32-neon-mlal-ld2r.c",
@@ -2986,7 +3009,20 @@
     "src/qc8-igemm/gen/2x8c4-minmax-fp32-neon-mlal-ld2r.c",
     "src/qc8-igemm/gen/2x8c4s2-minmax-fp32-neon-mlal.c",
     "src/qc8-igemm/gen/2x8c8-minmax-fp32-neon-mlal.c",
+    "src/qc8-igemm/gen/2x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "src/qc8-igemm/gen/2x16-minmax-fp32-neon-mlal-lane.c",
+    "src/qc8-igemm/gen/3x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "src/qc8-igemm/gen/3x8-minmax-fp32-neon-mlal-lane.c",
+    "src/qc8-igemm/gen/3x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "src/qc8-igemm/gen/3x16-minmax-fp32-neon-mlal-lane.c",
+    "src/qc8-igemm/gen/4x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "src/qc8-igemm/gen/4x8-minmax-fp32-neon-mlal-lane.c",
+    "src/qc8-igemm/gen/4x16-minmax-fp32-neon-mlal-lane-prfm.c",
     "src/qc8-igemm/gen/4x16-minmax-fp32-neon-mlal-lane.c",
+    "src/qc8-igemm/gen/6x8-minmax-fp32-neon-mlal-lane-prfm.c",
+    "src/qc8-igemm/gen/6x8-minmax-fp32-neon-mlal-lane.c",
+    "src/qc8-igemm/gen/6x16-minmax-fp32-neon-mlal-lane-prfm.c",
+    "src/qc8-igemm/gen/6x16-minmax-fp32-neon-mlal-lane.c",
     "src/qs8-dwconv/gen/up8x9-minmax-fp32-neon-mul16.c",
     "src/qs8-dwconv/gen/up8x9-minmax-rndnu-neon-mla8-ld64.c",
     "src/qs8-dwconv/gen/up8x9-minmax-rndnu-neon-mul8-ld64.c",
@@ -4041,6 +4077,8 @@
     "src/qc8-dwconv/gen/up24x25-minmax-fp32-neonv8-mul16.c",
     "src/qc8-dwconv/gen/up32x9-minmax-fp32-neonv8-mul16.c",
     "src/qc8-dwconv/gen/up32x25-minmax-fp32-neonv8-mul16.c",
+    "src/qc8-gemm/gen/1x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "src/qc8-gemm/gen/1x8-minmax-fp32-neonv8-mlal-lane.c",
     "src/qc8-gemm/gen/1x8c2-minmax-fp32-neonv8-mlal-dup.c",
     "src/qc8-gemm/gen/1x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
     "src/qc8-gemm/gen/1x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
@@ -4051,7 +4089,10 @@
     "src/qc8-gemm/gen/1x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
     "src/qc8-gemm/gen/1x8c4s2-minmax-fp32-neonv8-mlal.c",
     "src/qc8-gemm/gen/1x8c8-minmax-fp32-neonv8-mlal.c",
+    "src/qc8-gemm/gen/1x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
     "src/qc8-gemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c",
+    "src/qc8-gemm/gen/2x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "src/qc8-gemm/gen/2x8-minmax-fp32-neonv8-mlal-lane.c",
     "src/qc8-gemm/gen/2x8c2-minmax-fp32-neonv8-mlal-dup.c",
     "src/qc8-gemm/gen/2x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
     "src/qc8-gemm/gen/2x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
@@ -4062,7 +4103,22 @@
     "src/qc8-gemm/gen/2x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
     "src/qc8-gemm/gen/2x8c4s2-minmax-fp32-neonv8-mlal.c",
     "src/qc8-gemm/gen/2x8c8-minmax-fp32-neonv8-mlal.c",
+    "src/qc8-gemm/gen/2x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "src/qc8-gemm/gen/2x16-minmax-fp32-neonv8-mlal-lane.c",
+    "src/qc8-gemm/gen/3x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "src/qc8-gemm/gen/3x8-minmax-fp32-neonv8-mlal-lane.c",
+    "src/qc8-gemm/gen/3x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "src/qc8-gemm/gen/3x16-minmax-fp32-neonv8-mlal-lane.c",
+    "src/qc8-gemm/gen/4x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "src/qc8-gemm/gen/4x8-minmax-fp32-neonv8-mlal-lane.c",
+    "src/qc8-gemm/gen/4x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
     "src/qc8-gemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c",
+    "src/qc8-gemm/gen/6x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "src/qc8-gemm/gen/6x8-minmax-fp32-neonv8-mlal-lane.c",
+    "src/qc8-gemm/gen/6x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "src/qc8-gemm/gen/6x16-minmax-fp32-neonv8-mlal-lane.c",
+    "src/qc8-igemm/gen/1x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "src/qc8-igemm/gen/1x8-minmax-fp32-neonv8-mlal-lane.c",
     "src/qc8-igemm/gen/1x8c2-minmax-fp32-neonv8-mlal-dup.c",
     "src/qc8-igemm/gen/1x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
     "src/qc8-igemm/gen/1x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
@@ -4073,7 +4129,10 @@
     "src/qc8-igemm/gen/1x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
     "src/qc8-igemm/gen/1x8c4s2-minmax-fp32-neonv8-mlal.c",
     "src/qc8-igemm/gen/1x8c8-minmax-fp32-neonv8-mlal.c",
+    "src/qc8-igemm/gen/1x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
     "src/qc8-igemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c",
+    "src/qc8-igemm/gen/2x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "src/qc8-igemm/gen/2x8-minmax-fp32-neonv8-mlal-lane.c",
     "src/qc8-igemm/gen/2x8c2-minmax-fp32-neonv8-mlal-dup.c",
     "src/qc8-igemm/gen/2x8c2-minmax-fp32-neonv8-mlal-ld1r.c",
     "src/qc8-igemm/gen/2x8c2-minmax-fp32-neonv8-mlal-ld2r.c",
@@ -4084,7 +4143,20 @@
     "src/qc8-igemm/gen/2x8c4-minmax-fp32-neonv8-mlal-ld2r.c",
     "src/qc8-igemm/gen/2x8c4s2-minmax-fp32-neonv8-mlal.c",
     "src/qc8-igemm/gen/2x8c8-minmax-fp32-neonv8-mlal.c",
+    "src/qc8-igemm/gen/2x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "src/qc8-igemm/gen/2x16-minmax-fp32-neonv8-mlal-lane.c",
+    "src/qc8-igemm/gen/3x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "src/qc8-igemm/gen/3x8-minmax-fp32-neonv8-mlal-lane.c",
+    "src/qc8-igemm/gen/3x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "src/qc8-igemm/gen/3x16-minmax-fp32-neonv8-mlal-lane.c",
+    "src/qc8-igemm/gen/4x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "src/qc8-igemm/gen/4x8-minmax-fp32-neonv8-mlal-lane.c",
+    "src/qc8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
     "src/qc8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c",
+    "src/qc8-igemm/gen/6x8-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "src/qc8-igemm/gen/6x8-minmax-fp32-neonv8-mlal-lane.c",
+    "src/qc8-igemm/gen/6x16-minmax-fp32-neonv8-mlal-lane-prfm.c",
+    "src/qc8-igemm/gen/6x16-minmax-fp32-neonv8-mlal-lane.c",
     "src/qs8-dwconv/gen/up8x9-minmax-fp32-neonv8-mul16.c",
     "src/qs8-dwconv/gen/up8x25-minmax-fp32-neonv8-mul16.c",
     "src/qs8-dwconv/gen/up16x9-minmax-fp32-neonv8-mul16.c",