QS8/QC8 DWCONV NEON MUL8/MLA8 microkernels using 128-bit loads

PiperOrigin-RevId: 386522151
diff --git a/BUILD.bazel b/BUILD.bazel
index 27ab07c..c784f40 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1929,17 +1929,21 @@
     "src/math/sqrt-neon-nr1rsqrts.c",
     "src/math/sqrt-neon-nr2rsqrts.c",
     "src/math/sqrt-neon-nr3rsqrts.c",
-    "src/qc8-dwconv/gen/up8x9-minmax-fp32-neon-mla8.c",
-    "src/qc8-dwconv/gen/up8x9-minmax-fp32-neon-mul8.c",
+    "src/qc8-dwconv/gen/up8x9-minmax-fp32-neon-mla8-ld64.c",
+    "src/qc8-dwconv/gen/up8x9-minmax-fp32-neon-mul8-ld64.c",
     "src/qc8-dwconv/gen/up8x9-minmax-fp32-neon-mul16.c",
-    "src/qc8-dwconv/gen/up8x25-minmax-fp32-neon-mla8.c",
-    "src/qc8-dwconv/gen/up8x25-minmax-fp32-neon-mul8.c",
+    "src/qc8-dwconv/gen/up8x25-minmax-fp32-neon-mla8-ld64.c",
+    "src/qc8-dwconv/gen/up8x25-minmax-fp32-neon-mul8-ld64.c",
     "src/qc8-dwconv/gen/up8x25-minmax-fp32-neon-mul16.c",
-    "src/qc8-dwconv/gen/up16x9-minmax-fp32-neon-mla8.c",
-    "src/qc8-dwconv/gen/up16x9-minmax-fp32-neon-mul8.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-neon-mla8-ld64.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-neon-mla8-ld128.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-neon-mul8-ld64.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-neon-mul8-ld128.c",
     "src/qc8-dwconv/gen/up16x9-minmax-fp32-neon-mul16.c",
-    "src/qc8-dwconv/gen/up16x25-minmax-fp32-neon-mla8.c",
-    "src/qc8-dwconv/gen/up16x25-minmax-fp32-neon-mul8.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-neon-mla8-ld64.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-neon-mla8-ld128.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-neon-mul8-ld64.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-neon-mul8-ld128.c",
     "src/qc8-dwconv/gen/up16x25-minmax-fp32-neon-mul16.c",
     "src/qc8-dwconv/gen/up24x9-minmax-fp32-neon-mul16.c",
     "src/qc8-dwconv/gen/up24x25-minmax-fp32-neon-mul16.c",
@@ -1959,23 +1963,27 @@
     "src/qc8-igemm/gen/4x16-minmax-fp32-neon-mlal-lane.c",
     "src/qs8-dwconv/gen/up8x9-minmax-fp32-neon-mul16.c",
     "src/qs8-dwconv/gen/up8x9-minmax-gemmlowp-neon-mul16.c",
-    "src/qs8-dwconv/gen/up8x9-minmax-rndnu-neon-mla8.c",
-    "src/qs8-dwconv/gen/up8x9-minmax-rndnu-neon-mul8.c",
+    "src/qs8-dwconv/gen/up8x9-minmax-rndnu-neon-mla8-ld64.c",
+    "src/qs8-dwconv/gen/up8x9-minmax-rndnu-neon-mul8-ld64.c",
     "src/qs8-dwconv/gen/up8x9-minmax-rndnu-neon-mul16.c",
     "src/qs8-dwconv/gen/up8x25-minmax-fp32-neon-mul16.c",
     "src/qs8-dwconv/gen/up8x25-minmax-gemmlowp-neon-mul16.c",
-    "src/qs8-dwconv/gen/up8x25-minmax-rndnu-neon-mla8.c",
-    "src/qs8-dwconv/gen/up8x25-minmax-rndnu-neon-mul8.c",
+    "src/qs8-dwconv/gen/up8x25-minmax-rndnu-neon-mla8-ld64.c",
+    "src/qs8-dwconv/gen/up8x25-minmax-rndnu-neon-mul8-ld64.c",
     "src/qs8-dwconv/gen/up8x25-minmax-rndnu-neon-mul16.c",
     "src/qs8-dwconv/gen/up16x9-minmax-fp32-neon-mul16.c",
     "src/qs8-dwconv/gen/up16x9-minmax-gemmlowp-neon-mul16.c",
-    "src/qs8-dwconv/gen/up16x9-minmax-rndnu-neon-mla8.c",
-    "src/qs8-dwconv/gen/up16x9-minmax-rndnu-neon-mul8.c",
+    "src/qs8-dwconv/gen/up16x9-minmax-rndnu-neon-mla8-ld64.c",
+    "src/qs8-dwconv/gen/up16x9-minmax-rndnu-neon-mla8-ld128.c",
+    "src/qs8-dwconv/gen/up16x9-minmax-rndnu-neon-mul8-ld64.c",
+    "src/qs8-dwconv/gen/up16x9-minmax-rndnu-neon-mul8-ld128.c",
     "src/qs8-dwconv/gen/up16x9-minmax-rndnu-neon-mul16.c",
     "src/qs8-dwconv/gen/up16x25-minmax-fp32-neon-mul16.c",
     "src/qs8-dwconv/gen/up16x25-minmax-gemmlowp-neon-mul16.c",
-    "src/qs8-dwconv/gen/up16x25-minmax-rndnu-neon-mla8.c",
-    "src/qs8-dwconv/gen/up16x25-minmax-rndnu-neon-mul8.c",
+    "src/qs8-dwconv/gen/up16x25-minmax-rndnu-neon-mla8-ld64.c",
+    "src/qs8-dwconv/gen/up16x25-minmax-rndnu-neon-mla8-ld128.c",
+    "src/qs8-dwconv/gen/up16x25-minmax-rndnu-neon-mul8-ld64.c",
+    "src/qs8-dwconv/gen/up16x25-minmax-rndnu-neon-mul8-ld128.c",
     "src/qs8-dwconv/gen/up16x25-minmax-rndnu-neon-mul16.c",
     "src/qs8-dwconv/gen/up24x9-minmax-fp32-neon-mul16.c",
     "src/qs8-dwconv/gen/up24x9-minmax-gemmlowp-neon-mul16.c",
@@ -2541,17 +2549,21 @@
     "src/math/roundne-neonv8.c",
     "src/math/roundu-neonv8.c",
     "src/math/roundz-neonv8.c",
-    "src/qc8-dwconv/gen/up8x9-minmax-fp32-neonv8-mla8.c",
-    "src/qc8-dwconv/gen/up8x9-minmax-fp32-neonv8-mul8.c",
+    "src/qc8-dwconv/gen/up8x9-minmax-fp32-neonv8-mla8-ld64.c",
+    "src/qc8-dwconv/gen/up8x9-minmax-fp32-neonv8-mul8-ld64.c",
     "src/qc8-dwconv/gen/up8x9-minmax-fp32-neonv8-mul16.c",
-    "src/qc8-dwconv/gen/up8x25-minmax-fp32-neonv8-mla8.c",
-    "src/qc8-dwconv/gen/up8x25-minmax-fp32-neonv8-mul8.c",
+    "src/qc8-dwconv/gen/up8x25-minmax-fp32-neonv8-mla8-ld64.c",
+    "src/qc8-dwconv/gen/up8x25-minmax-fp32-neonv8-mul8-ld64.c",
     "src/qc8-dwconv/gen/up8x25-minmax-fp32-neonv8-mul16.c",
-    "src/qc8-dwconv/gen/up16x9-minmax-fp32-neonv8-mla8.c",
-    "src/qc8-dwconv/gen/up16x9-minmax-fp32-neonv8-mul8.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-neonv8-mla8-ld64.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-neonv8-mla8-ld128.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-neonv8-mul8-ld64.c",
+    "src/qc8-dwconv/gen/up16x9-minmax-fp32-neonv8-mul8-ld128.c",
     "src/qc8-dwconv/gen/up16x9-minmax-fp32-neonv8-mul16.c",
-    "src/qc8-dwconv/gen/up16x25-minmax-fp32-neonv8-mla8.c",
-    "src/qc8-dwconv/gen/up16x25-minmax-fp32-neonv8-mul8.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-neonv8-mla8-ld64.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-neonv8-mla8-ld128.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-neonv8-mul8-ld64.c",
+    "src/qc8-dwconv/gen/up16x25-minmax-fp32-neonv8-mul8-ld128.c",
     "src/qc8-dwconv/gen/up16x25-minmax-fp32-neonv8-mul16.c",
     "src/qc8-dwconv/gen/up24x9-minmax-fp32-neonv8-mul16.c",
     "src/qc8-dwconv/gen/up24x25-minmax-fp32-neonv8-mul16.c",