Neon shuffle GEMM and IGEMM kernels.

Mongoose M1 is 7.1% faster on mobilenet_v2
Mongoose M2 is 6.5% faster on mobilenet_v2

PiperOrigin-RevId: 281623279
diff --git a/src/init.c b/src/init.c
index 083b978..eb8220b 100644
--- a/src/init.c
+++ b/src/init.c
@@ -324,8 +324,6 @@
           break;
         case cpuinfo_uarch_cortex_a75:
         case cpuinfo_uarch_cortex_a76:
-        case cpuinfo_uarch_mongoose_m1:
-        case cpuinfo_uarch_mongoose_m2:
         case cpuinfo_uarch_meerkat_m3:
         case (cpuinfo_uarch_meerkat_m3 + 1):
           xnn_params.f32.gemm = (struct gemm_parameters) {
@@ -337,6 +335,20 @@
             .nr = 8,
           };
           break;
+
+        case cpuinfo_uarch_mongoose_m1:
+        case cpuinfo_uarch_mongoose_m2:
+          xnn_params.f32.gemm = (struct gemm_parameters) {
+            .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8s4__neonfma,
+            .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8s4__neonfma,
+            .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8s4__neonfma,
+            .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8s4__neonfma,
+            .mr = 6,
+            .nr = 8,
+            .log2_sr = 2,
+          };
+          break;
+
         case cpuinfo_uarch_cortex_a53:
         case cpuinfo_uarch_cortex_a55:
           xnn_params.f32.gemm = (struct gemm_parameters) {