Add NEON shuffle GEMM and IGEMM kernels.

M1 is 7.1% faster on mobilenet_v2
M2 is 6.5% faster on mobilenet_v2

PiperOrigin-RevId: 281623279
diff --git a/test/f32-gemminc.yaml b/test/f32-gemminc.yaml
index 4e52c3d..ce7ea69 100644
--- a/test/f32-gemminc.yaml
+++ b/test/f32-gemminc.yaml
@@ -75,6 +75,14 @@
   k-block: 2
 - name: xnn_f32_gemminc_ukernel_6x8__neon_ld64
   k-block: 2
+- name: xnn_f32_gemminc_ukernel_1x8s4__neon
+  k-block: 4
+- name: xnn_f32_gemminc_ukernel_4x8s4__neon
+  k-block: 4
+- name: xnn_f32_gemminc_ukernel_6x8s4__neon
+  k-block: 4
+- name: xnn_f32_gemminc_ukernel_8x8s4__neon
+  k-block: 4
 - name: xnn_f32_gemminc_ukernel_1x8__neonfma_ld64
   k-block: 2
 - name: xnn_f32_gemminc_ukernel_4x8__neonfma_ld64
@@ -85,6 +93,14 @@
   k-block: 2
 - name: xnn_f32_gemminc_ukernel_6x8__neonfma_ld64
   k-block: 2
+- name: xnn_f32_gemminc_ukernel_1x8s4__neonfma
+  k-block: 4
+- name: xnn_f32_gemminc_ukernel_4x8s4__neonfma
+  k-block: 4
+- name: xnn_f32_gemminc_ukernel_6x8s4__neonfma
+  k-block: 4
+- name: xnn_f32_gemminc_ukernel_8x8s4__neonfma
+  k-block: 4
 - name: xnn_f32_gemminc_ukernel_1x8__sse_load1
   k-block: 1
 - name: xnn_f32_gemminc_ukernel_4x8__sse_load1