Add Neon shuffle GEMM and IGEMM kernels.

M1 is 7.1% faster on mobilenet_v2
M2 is 6.5% faster on mobilenet_v2

PiperOrigin-RevId: 281623279
diff --git a/scripts/generate-f32-igemm.sh b/scripts/generate-f32-igemm.sh
index f11bbb9..48d7744 100755
--- a/scripts/generate-f32-igemm.sh
+++ b/scripts/generate-f32-igemm.sh
@@ -26,6 +26,15 @@
 ### MRx2 micro-kernels
 tools/xngen src/f32-igemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2 -D FMA=0 -o src/f32-igemm/4x2-neon-ld64.c
 tools/xngen src/f32-igemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2 -D FMA=1 -o src/f32-igemm/4x2-neonfma-ld64.c
+### LOAD4+PERMUTE micro-kernels
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=1 -D NR=8  -D FMA=0 -o src/f32-igemm/1x8s4-neon.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=1 -D NR=8  -D FMA=1 -o src/f32-igemm/1x8s4-neonfma.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=4 -D NR=8  -D FMA=0 -o src/f32-igemm/4x8s4-neon.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=4 -D NR=8  -D FMA=1 -o src/f32-igemm/4x8s4-neonfma.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=6 -D NR=8  -D FMA=0 -o src/f32-igemm/6x8s4-neon.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=6 -D NR=8  -D FMA=1 -o src/f32-igemm/6x8s4-neonfma.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=8 -D NR=8  -D FMA=0 -o src/f32-igemm/8x8s4-neon.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=8 -D NR=8  -D FMA=1 -o src/f32-igemm/8x8s4-neonfma.c
 
 #################################### PSIMD ####################################
 ### LOAD1+BROADCAST micro-kernels