Add PRFM (prefetch) variants of the QS8 2x8c8 AArch64 NEON MLAL GEMM and IGEMM microkernels.

PiperOrigin-RevId: 368311965
diff --git a/BUILD.bazel b/BUILD.bazel
index 2b8d528..fabe13a 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -2376,10 +2376,10 @@
     "src/f32-gemm/gen/5x8s4-minmax-sse.c",
     "src/f32-hswish/gen/hswish-sse-x4.c",
     "src/f32-hswish/gen/hswish-sse-x8.c",
-    "src/f32-ibilinear/gen/sse-c4.c",
-    "src/f32-ibilinear/gen/sse-c8.c",
     "src/f32-ibilinear-chw/gen/sse-p4.c",
     "src/f32-ibilinear-chw/gen/sse-p8.c",
+    "src/f32-ibilinear/gen/sse-c4.c",
+    "src/f32-ibilinear/gen/sse-c8.c",
     "src/f32-igemm/gen/1x8-minmax-sse-dup.c",
     "src/f32-igemm/gen/1x8-minmax-sse-load1.c",
     "src/f32-igemm/gen/1x8s4-minmax-sse.c",
@@ -2725,10 +2725,10 @@
     "src/math/roundu-sse41.c",
     "src/math/roundz-sse41.c",
     "src/qs8-dwconv/gen/up8x9-minmax-sse41-mul16.c",
-    "src/qs8-dwconv/gen/up16x9-minmax-sse41-mul16.c",
-    "src/qs8-dwconv/gen/up24x9-minmax-sse41-mul16.c",
     "src/qs8-dwconv/gen/up8x9-minmax-sse41-mul32.c",
+    "src/qs8-dwconv/gen/up16x9-minmax-sse41-mul16.c",
     "src/qs8-dwconv/gen/up16x9-minmax-sse41-mul32.c",
+    "src/qs8-dwconv/gen/up24x9-minmax-sse41-mul16.c",
     "src/qs8-dwconv/gen/up24x9-minmax-sse41-mul32.c",
     "src/qs8-gavgpool/gen/7p7x-minmax-sse41-c8-acc2.c",
     "src/qs8-gavgpool/gen/7p7x-minmax-sse41-c16-acc2.c",
@@ -2932,11 +2932,19 @@
     "src/f32-vunary/gen/vneg-avx-x16.c",
     "src/f32-vunary/gen/vsqr-avx-x8.c",
     "src/f32-vunary/gen/vsqr-avx-x16.c",
+    "src/math/exp-avx-rr2-p5.c",
+    "src/math/expm1minus-avx-rr2-lut4-p4-perm.c",
+    "src/math/expm1minus-avx-rr2-lut16-p3.c",
+    "src/math/expm1minus-avx-rr2-p6.c",
+    "src/math/sigmoid-avx-rr2-lut64-p2-div.c",
+    "src/math/sigmoid-avx-rr2-p5-div.c",
+    "src/math/sigmoid-avx-rr2-p5-nr1.c",
+    "src/math/sigmoid-avx-rr2-p5-nr2.c",
     "src/qs8-dwconv/gen/up8x9-minmax-avx-mul16.c",
-    "src/qs8-dwconv/gen/up16x9-minmax-avx-mul16.c",
-    "src/qs8-dwconv/gen/up24x9-minmax-avx-mul16.c",
     "src/qs8-dwconv/gen/up8x9-minmax-avx-mul32.c",
+    "src/qs8-dwconv/gen/up16x9-minmax-avx-mul16.c",
     "src/qs8-dwconv/gen/up16x9-minmax-avx-mul32.c",
+    "src/qs8-dwconv/gen/up24x9-minmax-avx-mul16.c",
     "src/qs8-dwconv/gen/up24x9-minmax-avx-mul32.c",
     "src/qs8-gemm/gen/1x4c2-minmax-avx-ld64.c",
     "src/qs8-gemm/gen/1x4c2-minmax-avx-ld128.c",
@@ -2989,14 +2997,6 @@
     "src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x16.c",
     "src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x24.c",
     "src/qs8-vaddc/gen/minmax-avx-mul32-ld32-x32.c",
-    "src/math/exp-avx-rr2-p5.c",
-    "src/math/expm1minus-avx-rr2-lut4-p4-perm.c",
-    "src/math/expm1minus-avx-rr2-lut16-p3.c",
-    "src/math/expm1minus-avx-rr2-p6.c",
-    "src/math/sigmoid-avx-rr2-lut64-p2-div.c",
-    "src/math/sigmoid-avx-rr2-p5-div.c",
-    "src/math/sigmoid-avx-rr2-p5-nr1.c",
-    "src/math/sigmoid-avx-rr2-p5-nr2.c",
 ]
 
 XOP_UKERNELS = [
@@ -3667,20 +3667,22 @@
     "src/f32-igemm/gen/5x8-minmax-aarch64-neonfma-cortex-a75.S",
     "src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-cortex-a57.S",
     "src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-cortex-a75.S",
+    "src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S",
     "src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S",
     "src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S",
-    "src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S",
     "src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S",
-    "src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S",
     "src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S",
     "src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S",
     "src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S",
     "src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S",
+    "src/qs8-gemm/gen/2x8c8-aarch64-neon-mlal-padal-prfm.S",
+    "src/qs8-gemm/gen/2x8c8-aarch64-neon-mlal-padal.S",
     "src/qs8-igemm/1x8c8-aarch64-neon-mlal-padal.S",
-    "src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S",
     "src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S",
     "src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S",
     "src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S",
+    "src/qs8-igemm/gen/2x8c8-aarch64-neon-mlal-padal-prfm.S",
+    "src/qs8-igemm/gen/2x8c8-aarch64-neon-mlal-padal.S",
 ]
 
 INTERNAL_MICROKERNEL_HDRS = [