FP32 LD128 IGEMM for Cortex-X1
- Add 4x8 and 6x8 variants
- ld128 main loop
- e2e benchmark
- init 6x8 ld128 for Cortex-X1

PiperOrigin-RevId: 380701356
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a209fdf..ee2d3f2 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3379,11 +3379,13 @@
   src/f32-igemm/gen/1x8-minmax-aarch64-neonfma-prfm-cortex-a75.S
   src/f32-igemm/gen/4x8-minmax-aarch64-neonfma-cortex-a75.S
   src/f32-igemm/gen/4x8-minmax-aarch64-neonfma-ld64.S
+  src/f32-igemm/gen/4x8-minmax-aarch64-neonfma-ld128.S
   src/f32-igemm/gen/4x8-minmax-aarch64-neonfma-prfm-cortex-a75.S
   src/f32-igemm/gen/5x8-minmax-aarch64-neonfma-cortex-a75.S
   src/f32-igemm/gen/5x8-minmax-aarch64-neonfma-prfm-cortex-a75.S
   src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-cortex-a75.S
   src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-ld64.S
+  src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-ld128.S
   src/f32-igemm/gen/6x8-minmax-aarch64-neonfma-prfm-cortex-a75.S
   src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
   src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S