6x8 ld128 GEMM microkernels

12 new kernels
dup and lane
neon and neonfma
GEMM, GEMMINC and IGEMM

PiperOrigin-RevId: 282661111
diff --git a/BUILD.bazel b/BUILD.bazel
index 7f2b614..7262d7b 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -290,10 +290,12 @@
     "src/f32-gemm/gen/4x8-neon-lane-ld64.c",
     "src/f32-gemm/gen/5x8-neon-lane-ld64.c",
     "src/f32-gemm/gen/6x8-neon-lane-ld64.c",
+    "src/f32-gemm/gen/6x8-neon-lane-ld128.c",
     "src/f32-gemm/gen/1x8-neon-dup-ld64.c",
     "src/f32-gemm/gen/4x8-neon-dup-ld128.c",
     "src/f32-gemm/gen/4x8-neon-dup-ld64.c",
     "src/f32-gemm/gen/6x8-neon-dup-ld64.c",
+    "src/f32-gemm/gen/6x8-neon-dup-ld128.c",
     "src/f32-gemm/gen/1x8s4-neon.c",
     "src/f32-gemm/gen/4x8s4-neon.c",
     "src/f32-gemm/gen/6x8s4-neon.c",
@@ -303,10 +305,12 @@
     "src/f32-gemm/gen-inc/4x8-neon-lane-ld64.c",
     "src/f32-gemm/gen-inc/5x8-neon-lane-ld64.c",
     "src/f32-gemm/gen-inc/6x8-neon-lane-ld64.c",
+    "src/f32-gemm/gen-inc/6x8-neon-lane-ld128.c",
     "src/f32-gemm/gen-inc/1x8-neon-dup-ld64.c",
     "src/f32-gemm/gen-inc/4x8-neon-dup-ld128.c",
     "src/f32-gemm/gen-inc/4x8-neon-dup-ld64.c",
     "src/f32-gemm/gen-inc/6x8-neon-dup-ld64.c",
+    "src/f32-gemm/gen-inc/6x8-neon-dup-ld128.c",
     "src/f32-gemm/gen-inc/1x8s4-neon.c",
     "src/f32-gemm/gen-inc/4x8s4-neon.c",
     "src/f32-gemm/gen-inc/6x8s4-neon.c",
@@ -318,10 +322,12 @@
     "src/f32-igemm/gen/4x8-neon-lane-ld128.c",
     "src/f32-igemm/gen/4x8-neon-lane-ld64.c",
     "src/f32-igemm/gen/6x8-neon-lane-ld64.c",
+    "src/f32-igemm/gen/6x8-neon-lane-ld128.c",
     "src/f32-igemm/gen/1x8-neon-dup-ld64.c",
     "src/f32-igemm/gen/4x8-neon-dup-ld128.c",
     "src/f32-igemm/gen/4x8-neon-dup-ld64.c",
     "src/f32-igemm/gen/6x8-neon-dup-ld64.c",
+    "src/f32-igemm/gen/6x8-neon-dup-ld128.c",
     "src/f32-igemm/gen/1x8s4-neon.c",
     "src/f32-igemm/gen/4x8s4-neon.c",
     "src/f32-igemm/gen/6x8s4-neon.c",
@@ -382,6 +388,7 @@
     "src/f32-igemm/gen/4x8-neonfma-dup-ld128.c",
     "src/f32-igemm/gen/4x8-neonfma-dup-ld64.c",
     "src/f32-igemm/gen/6x8-neonfma-dup-ld64.c",
+    "src/f32-igemm/gen/6x8-neonfma-dup-ld128.c",
     "src/f32-igemm/gen/1x8s4-neonfma.c",
     "src/f32-igemm/gen/4x8s4-neonfma.c",
     "src/f32-igemm/gen/6x8s4-neonfma.c",
@@ -394,6 +401,7 @@
     "src/f32-gemm/gen/4x8-neonfma-dup-ld128.c",
     "src/f32-gemm/gen/4x8-neonfma-dup-ld64.c",
     "src/f32-gemm/gen/6x8-neonfma-dup-ld64.c",
+    "src/f32-gemm/gen/6x8-neonfma-dup-ld128.c",
     "src/f32-gemm/gen/1x8s4-neonfma.c",
     "src/f32-gemm/gen/4x8s4-neonfma.c",
     "src/f32-gemm/gen/6x8s4-neonfma.c",
@@ -402,6 +410,7 @@
     "src/f32-gemm/gen-inc/4x8-neonfma-dup-ld128.c",
     "src/f32-gemm/gen-inc/4x8-neonfma-dup-ld64.c",
     "src/f32-gemm/gen-inc/6x8-neonfma-dup-ld64.c",
+    "src/f32-gemm/gen-inc/6x8-neonfma-dup-ld128.c",
     "src/f32-gemm/gen-inc/1x8s4-neonfma.c",
     "src/f32-gemm/gen-inc/4x8s4-neonfma.c",
     "src/f32-gemm/gen-inc/6x8s4-neonfma.c",
@@ -429,17 +438,20 @@
     "src/f32-gemm/gen/4x8-neonfma-lane-ld64.c",
     "src/f32-gemm/gen/5x8-neonfma-lane-ld64.c",
     "src/f32-gemm/gen/6x8-neonfma-lane-ld64.c",
+    "src/f32-gemm/gen/6x8-neonfma-lane-ld128.c",
     "src/f32-gemm/gen-inc/1x8-neonfma-lane-ld64.c",
     "src/f32-gemm/gen-inc/4x8-neonfma-lane-ld128.c",
     "src/f32-gemm/gen-inc/4x8-neonfma-lane-ld64.c",
     "src/f32-gemm/gen-inc/5x8-neonfma-lane-ld64.c",
     "src/f32-gemm/gen-inc/6x8-neonfma-lane-ld64.c",
+    "src/f32-gemm/gen-inc/6x8-neonfma-lane-ld128.c",
     "src/f32-igemm/gen/1x8-neonfma-lane-ld64.c",
     "src/f32-igemm/gen/4x2-neonfma-lane-ld64.c",
     "src/f32-igemm/gen/4x4-neonfma-lane-ld64.c",
     "src/f32-igemm/gen/4x8-neonfma-lane-ld128.c",
     "src/f32-igemm/gen/4x8-neonfma-lane-ld64.c",
     "src/f32-igemm/gen/6x8-neonfma-lane-ld64.c",
+    "src/f32-igemm/gen/6x8-neonfma-lane-ld128.c",
     "src/f32-conv-hwc/3x3s2p1c3x4-neonfma-2x2.c",
     "src/f32-conv-hwc/3x3s2p1c3x8-neonfma-2x2.c",
     "src/f32-conv-hwc2spchw/3x3s2p1c3x4-neonfma-2x2.c",