Move generated micro-kernels into gen/ and gen-inc/ subdirectories

PiperOrigin-RevId: 282322486
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 675d500..e1f2e73 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -169,86 +169,86 @@
   src/f32-argmaxpool/9x-scalar-c1.c
   src/f32-avgpool/mp9p8q-scalar.c
   src/f32-avgpool/up9-scalar.c
-  src/f32-bilinear/scalar-c1.c
-  src/f32-bilinear/scalar-c2.c
-  src/f32-bilinear/scalar-c4.c
+  src/f32-bilinear/gen/scalar-c1.c
+  src/f32-bilinear/gen/scalar-c2.c
+  src/f32-bilinear/gen/scalar-c4.c
   src/f32-clamp/scalar.c
   src/f32-conv-hwc2spchw/3x3s2p1c3x4-scalar-1x1.c
   src/f32-dwconv-spchw/3x3p1-scalar.c
   src/f32-dwconv-spchw/3x3s2p1-scalar.c
   src/f32-dwconv-spchw/5x5p2-scalar.c
   src/f32-dwconv-spchw/5x5s2p2-scalar.c
-  src/f32-dwconv/up1x25-scalar-acc2.c
-  src/f32-dwconv/up1x25-scalar.c
-  src/f32-dwconv/up1x4-scalar-acc2.c
-  src/f32-dwconv/up1x4-scalar.c
-  src/f32-dwconv/up1x9-scalar-acc2.c
-  src/f32-dwconv/up1x9-scalar.c
-  src/f32-dwconv/up2x25-scalar-acc2.c
-  src/f32-dwconv/up2x25-scalar.c
-  src/f32-dwconv/up2x4-scalar-acc2.c
-  src/f32-dwconv/up2x4-scalar.c
-  src/f32-dwconv/up2x9-scalar-acc2.c
-  src/f32-dwconv/up2x9-scalar.c
+  src/f32-dwconv/gen/up1x25-scalar-acc2.c
+  src/f32-dwconv/gen/up1x25-scalar.c
+  src/f32-dwconv/gen/up1x4-scalar-acc2.c
+  src/f32-dwconv/gen/up1x4-scalar.c
+  src/f32-dwconv/gen/up1x9-scalar-acc2.c
+  src/f32-dwconv/gen/up1x9-scalar.c
+  src/f32-dwconv/gen/up2x25-scalar-acc2.c
+  src/f32-dwconv/gen/up2x25-scalar.c
+  src/f32-dwconv/gen/up2x4-scalar-acc2.c
+  src/f32-dwconv/gen/up2x4-scalar.c
+  src/f32-dwconv/gen/up2x9-scalar-acc2.c
+  src/f32-dwconv/gen/up2x9-scalar.c
   src/f32-gavgpool-spchw/scalar-x1.c
   src/f32-gavgpool/mp7p7q-scalar.c
   src/f32-gavgpool/up7-scalar.c
-  src/f32-gemm/1x4-scalar.c
-  src/f32-gemm/2x4-scalar.c
-  src/f32-gemm/4x2-scalar.c
-  src/f32-gemm/4x4-scalar.c
-  src/f32-gemminc/1x4-scalar.c
-  src/f32-gemminc/2x4-scalar.c
-  src/f32-gemminc/4x4-scalar.c
+  src/f32-gemm/gen/1x4-scalar.c
+  src/f32-gemm/gen/2x4-scalar.c
+  src/f32-gemm/gen/4x2-scalar.c
+  src/f32-gemm/gen/4x4-scalar.c
+  src/f32-gemm/gen-inc/1x4-scalar.c
+  src/f32-gemm/gen-inc/2x4-scalar.c
+  src/f32-gemm/gen-inc/4x4-scalar.c
   src/f32-hswish/scalar.c
-  src/f32-igemm/1x4-scalar.c
-  src/f32-igemm/2x4-scalar.c
-  src/f32-igemm/4x2-scalar.c
-  src/f32-igemm/4x4-scalar.c
+  src/f32-igemm/gen/1x4-scalar.c
+  src/f32-igemm/gen/2x4-scalar.c
+  src/f32-igemm/gen/4x2-scalar.c
+  src/f32-igemm/gen/4x4-scalar.c
   src/f32-maxpool/9p8x-scalar-c1.c
   src/f32-pavgpool/mp9p8q-scalar.c
   src/f32-pavgpool/up9-scalar.c
-  src/f32-ppmm/2x4-scalar.c
-  src/f32-ppmm/3x3-scalar.c
-  src/f32-ppmm/4x2-scalar.c
-  src/f32-ppmm/4x4-scalar.c
-  src/f32-prelu/scalar-2x1.c
-  src/f32-prelu/scalar-2x4.c
+  src/f32-ppmm/gen/2x4-scalar.c
+  src/f32-ppmm/gen/3x3-scalar.c
+  src/f32-ppmm/gen/4x2-scalar.c
+  src/f32-ppmm/gen/4x4-scalar.c
+  src/f32-prelu/gen/scalar-2x1.c
+  src/f32-prelu/gen/scalar-2x4.c
   src/f32-rmax/scalar.c
-  src/f32-spmm/1x1-scalar-pipelined.c
-  src/f32-spmm/1x1-scalar.c
-  src/f32-spmm/2x1-scalar-pipelined.c
-  src/f32-spmm/2x1-scalar.c
-  src/f32-spmm/4x1-scalar-pipelined.c
-  src/f32-spmm/4x1-scalar.c
-  src/f32-spmm/8x1-scalar-pipelined.c
-  src/f32-spmm/8x1-scalar.c
-  src/f32-spmm/8x2-scalar.c
-  src/f32-spmm/8x4-scalar.c
-  src/f32-vbinary/vadd-scalar-x1.c
-  src/f32-vbinary/vadd-scalar-x2.c
-  src/f32-vbinary/vadd-scalar-x4.c
-  src/f32-vbinary/vaddc-scalar-x1.c
-  src/f32-vbinary/vaddc-scalar-x2.c
-  src/f32-vbinary/vaddc-scalar-x4.c
-  src/f32-vbinary/vmul-scalar-x1.c
-  src/f32-vbinary/vmul-scalar-x2.c
-  src/f32-vbinary/vmul-scalar-x4.c
-  src/f32-vbinary/vmulc-scalar-x1.c
-  src/f32-vbinary/vmulc-scalar-x2.c
-  src/f32-vbinary/vmulc-scalar-x4.c
-  src/f32-vbinary/vrsubc-scalar-x1.c
-  src/f32-vbinary/vrsubc-scalar-x2.c
-  src/f32-vbinary/vrsubc-scalar-x4.c
-  src/f32-vbinary/vsub-scalar-x1.c
-  src/f32-vbinary/vsub-scalar-x2.c
-  src/f32-vbinary/vsub-scalar-x4.c
-  src/f32-vbinary/vsubc-scalar-x1.c
-  src/f32-vbinary/vsubc-scalar-x2.c
-  src/f32-vbinary/vsubc-scalar-x4.c
-  src/f32-vmulcaddc/c1-scalar-2x.c
-  src/f32-vmulcaddc/c2-scalar-2x.c
-  src/f32-vmulcaddc/c4-scalar-2x.c
+  src/f32-spmm/gen/1x1-scalar-pipelined.c
+  src/f32-spmm/gen/1x1-scalar.c
+  src/f32-spmm/gen/2x1-scalar-pipelined.c
+  src/f32-spmm/gen/2x1-scalar.c
+  src/f32-spmm/gen/4x1-scalar-pipelined.c
+  src/f32-spmm/gen/4x1-scalar.c
+  src/f32-spmm/gen/8x1-scalar-pipelined.c
+  src/f32-spmm/gen/8x1-scalar.c
+  src/f32-spmm/gen/8x2-scalar.c
+  src/f32-spmm/gen/8x4-scalar.c
+  src/f32-vbinary/gen/vadd-scalar-x1.c
+  src/f32-vbinary/gen/vadd-scalar-x2.c
+  src/f32-vbinary/gen/vadd-scalar-x4.c
+  src/f32-vbinary/gen/vaddc-scalar-x1.c
+  src/f32-vbinary/gen/vaddc-scalar-x2.c
+  src/f32-vbinary/gen/vaddc-scalar-x4.c
+  src/f32-vbinary/gen/vmul-scalar-x1.c
+  src/f32-vbinary/gen/vmul-scalar-x2.c
+  src/f32-vbinary/gen/vmul-scalar-x4.c
+  src/f32-vbinary/gen/vmulc-scalar-x1.c
+  src/f32-vbinary/gen/vmulc-scalar-x2.c
+  src/f32-vbinary/gen/vmulc-scalar-x4.c
+  src/f32-vbinary/gen/vrsubc-scalar-x1.c
+  src/f32-vbinary/gen/vrsubc-scalar-x2.c
+  src/f32-vbinary/gen/vrsubc-scalar-x4.c
+  src/f32-vbinary/gen/vsub-scalar-x1.c
+  src/f32-vbinary/gen/vsub-scalar-x2.c
+  src/f32-vbinary/gen/vsub-scalar-x4.c
+  src/f32-vbinary/gen/vsubc-scalar-x1.c
+  src/f32-vbinary/gen/vsubc-scalar-x2.c
+  src/f32-vbinary/gen/vsubc-scalar-x4.c
+  src/f32-vmulcaddc/gen/c1-scalar-2x.c
+  src/f32-vmulcaddc/gen/c2-scalar-2x.c
+  src/f32-vmulcaddc/gen/c4-scalar-2x.c
   src/q8-avgpool/mp9p8q-scalar.c
   src/q8-avgpool/up9-scalar.c
   src/q8-dwconv/up1x9-scalar.c
@@ -282,74 +282,74 @@
   src/f32-argmaxpool/9x-psimd-c4.c
   src/f32-avgpool/mp9p8q-psimd.c
   src/f32-avgpool/up9-psimd.c
-  src/f32-bilinear/psimd-c4.c
-  src/f32-bilinear/psimd-c8.c
+  src/f32-bilinear/gen/psimd-c4.c
+  src/f32-bilinear/gen/psimd-c8.c
   src/f32-clamp/psimd.c
-  src/f32-dwconv/up4x25-psimd-acc2.c
-  src/f32-dwconv/up4x25-psimd.c
-  src/f32-dwconv/up4x4-psimd-acc2.c
-  src/f32-dwconv/up4x4-psimd.c
-  src/f32-dwconv/up4x9-psimd-acc2.c
-  src/f32-dwconv/up4x9-psimd.c
-  src/f32-dwconv/up8x25-psimd-acc2.c
-  src/f32-dwconv/up8x25-psimd.c
-  src/f32-dwconv/up8x4-psimd-acc2.c
-  src/f32-dwconv/up8x4-psimd.c
-  src/f32-dwconv/up8x9-psimd-acc2.c
-  src/f32-dwconv/up8x9-psimd.c
+  src/f32-dwconv/gen/up4x25-psimd-acc2.c
+  src/f32-dwconv/gen/up4x25-psimd.c
+  src/f32-dwconv/gen/up4x4-psimd-acc2.c
+  src/f32-dwconv/gen/up4x4-psimd.c
+  src/f32-dwconv/gen/up4x9-psimd-acc2.c
+  src/f32-dwconv/gen/up4x9-psimd.c
+  src/f32-dwconv/gen/up8x25-psimd-acc2.c
+  src/f32-dwconv/gen/up8x25-psimd.c
+  src/f32-dwconv/gen/up8x4-psimd-acc2.c
+  src/f32-dwconv/gen/up8x4-psimd.c
+  src/f32-dwconv/gen/up8x9-psimd-acc2.c
+  src/f32-dwconv/gen/up8x9-psimd.c
   src/f32-gavgpool/mp7p7q-psimd.c
   src/f32-gavgpool/up7-psimd.c
-  src/f32-gemm/1x8-psimd-loadsplat.c
-  src/f32-gemm/1x8-psimd-splat.c
-  src/f32-gemm/1x8s4-psimd.c
-  src/f32-gemm/4x8-psimd-loadsplat.c
-  src/f32-gemm/4x8-psimd-splat.c
-  src/f32-gemm/4x8s4-psimd.c
-  src/f32-gemm/6x8-psimd-loadsplat.c
-  src/f32-gemm/6x8-psimd-splat.c
-  src/f32-gemm/6x8s4-psimd.c
-  src/f32-gemminc/1x8-psimd-loadsplat.c
-  src/f32-gemminc/1x8-psimd-splat.c
-  src/f32-gemminc/1x8s4-psimd.c
-  src/f32-gemminc/4x8-psimd-loadsplat.c
-  src/f32-gemminc/4x8-psimd-splat.c
-  src/f32-gemminc/4x8s4-psimd.c
-  src/f32-gemminc/6x8-psimd-loadsplat.c
-  src/f32-gemminc/6x8-psimd-splat.c
-  src/f32-gemminc/6x8s4-psimd.c
+  src/f32-gemm/gen/1x8-psimd-loadsplat.c
+  src/f32-gemm/gen/1x8-psimd-splat.c
+  src/f32-gemm/gen/1x8s4-psimd.c
+  src/f32-gemm/gen/4x8-psimd-loadsplat.c
+  src/f32-gemm/gen/4x8-psimd-splat.c
+  src/f32-gemm/gen/4x8s4-psimd.c
+  src/f32-gemm/gen/6x8-psimd-loadsplat.c
+  src/f32-gemm/gen/6x8-psimd-splat.c
+  src/f32-gemm/gen/6x8s4-psimd.c
+  src/f32-gemm/gen-inc/1x8-psimd-loadsplat.c
+  src/f32-gemm/gen-inc/1x8-psimd-splat.c
+  src/f32-gemm/gen-inc/1x8s4-psimd.c
+  src/f32-gemm/gen-inc/4x8-psimd-loadsplat.c
+  src/f32-gemm/gen-inc/4x8-psimd-splat.c
+  src/f32-gemm/gen-inc/4x8s4-psimd.c
+  src/f32-gemm/gen-inc/6x8-psimd-loadsplat.c
+  src/f32-gemm/gen-inc/6x8-psimd-splat.c
+  src/f32-gemm/gen-inc/6x8s4-psimd.c
   src/f32-hswish/psimd.c
-  src/f32-igemm/1x8-psimd-loadsplat.c
-  src/f32-igemm/1x8-psimd-splat.c
-  src/f32-igemm/1x8s4-psimd.c
-  src/f32-igemm/4x2c4-psimd.c
-  src/f32-igemm/4x8-psimd-loadsplat.c
-  src/f32-igemm/4x8-psimd-splat.c
-  src/f32-igemm/4x8s4-psimd.c
-  src/f32-igemm/6x8-psimd-loadsplat.c
-  src/f32-igemm/6x8-psimd-splat.c
-  src/f32-igemm/6x8s4-psimd.c
+  src/f32-igemm/gen/1x8-psimd-loadsplat.c
+  src/f32-igemm/gen/1x8-psimd-splat.c
+  src/f32-igemm/gen/1x8s4-psimd.c
+  src/f32-igemm/gen/4x2c4-psimd.c
+  src/f32-igemm/gen/4x8-psimd-loadsplat.c
+  src/f32-igemm/gen/4x8-psimd-splat.c
+  src/f32-igemm/gen/4x8s4-psimd.c
+  src/f32-igemm/gen/6x8-psimd-loadsplat.c
+  src/f32-igemm/gen/6x8-psimd-splat.c
+  src/f32-igemm/gen/6x8s4-psimd.c
   src/f32-maxpool/9p8x-psimd-c4.c
   src/f32-pavgpool/mp9p8q-psimd.c
   src/f32-pavgpool/up9-psimd.c
-  src/f32-ppmm/4x8-psimd.c
-  src/f32-prelu/psimd-2x4.c
-  src/f32-prelu/psimd-2x8.c
-  src/f32-vbinary/vadd-psimd-x4.c
-  src/f32-vbinary/vadd-psimd-x8.c
-  src/f32-vbinary/vaddc-psimd-x4.c
-  src/f32-vbinary/vaddc-psimd-x8.c
-  src/f32-vbinary/vmul-psimd-x4.c
-  src/f32-vbinary/vmul-psimd-x8.c
-  src/f32-vbinary/vmulc-psimd-x4.c
-  src/f32-vbinary/vmulc-psimd-x8.c
-  src/f32-vbinary/vrsubc-psimd-x4.c
-  src/f32-vbinary/vrsubc-psimd-x8.c
-  src/f32-vbinary/vsub-psimd-x4.c
-  src/f32-vbinary/vsub-psimd-x8.c
-  src/f32-vbinary/vsubc-psimd-x4.c
-  src/f32-vbinary/vsubc-psimd-x8.c
-  src/f32-vmulcaddc/c4-psimd-2x.c
-  src/f32-vmulcaddc/c8-psimd-2x.c
+  src/f32-ppmm/gen/4x8-psimd.c
+  src/f32-prelu/gen/psimd-2x4.c
+  src/f32-prelu/gen/psimd-2x8.c
+  src/f32-vbinary/gen/vadd-psimd-x4.c
+  src/f32-vbinary/gen/vadd-psimd-x8.c
+  src/f32-vbinary/gen/vaddc-psimd-x4.c
+  src/f32-vbinary/gen/vaddc-psimd-x8.c
+  src/f32-vbinary/gen/vmul-psimd-x4.c
+  src/f32-vbinary/gen/vmul-psimd-x8.c
+  src/f32-vbinary/gen/vmulc-psimd-x4.c
+  src/f32-vbinary/gen/vmulc-psimd-x8.c
+  src/f32-vbinary/gen/vrsubc-psimd-x4.c
+  src/f32-vbinary/gen/vrsubc-psimd-x8.c
+  src/f32-vbinary/gen/vsub-psimd-x4.c
+  src/f32-vbinary/gen/vsub-psimd-x8.c
+  src/f32-vbinary/gen/vsubc-psimd-x4.c
+  src/f32-vbinary/gen/vsubc-psimd-x8.c
+  src/f32-vmulcaddc/gen/c4-psimd-2x.c
+  src/f32-vmulcaddc/gen/c8-psimd-2x.c
   src/x32-packx/x4-psimd.c
   src/x32-pad/x2-psimd.c
   src/x32-unpool/psimd.c
@@ -361,82 +361,82 @@
 SET(XNNPACK_NEON_MICROKERNEL_SRCS
   src/f32-avgpool/mp9p8q-neon.c
   src/f32-avgpool/up9-neon.c
-  src/f32-bilinear/neon-c4.c
-  src/f32-bilinear/neon-c8.c
+  src/f32-bilinear/gen/neon-c4.c
+  src/f32-bilinear/gen/neon-c8.c
   src/f32-clamp/neon.c
-  src/f32-dwconv/up4x9-neon.c
-  src/f32-dwconv/up4x9-neon-acc2.c
-  src/f32-dwconv/up8x9-neon.c
-  src/f32-dwconv/up8x9-neon-acc2.c
+  src/f32-dwconv/gen/up4x9-neon.c
+  src/f32-dwconv/gen/up4x9-neon-acc2.c
+  src/f32-dwconv/gen/up8x9-neon.c
+  src/f32-dwconv/gen/up8x9-neon-acc2.c
   src/f32-gavgpool-spchw/neon-x4.c
   src/f32-gavgpool/mp7p7q-neon.c
   src/f32-gavgpool/up7-neon.c
-  src/f32-gemm/1x8-neon-lane-ld64.c
-  src/f32-gemm/4x2-neon-lane-ld64.c
-  src/f32-gemm/4x8-neon-lane-ld128.c
-  src/f32-gemm/4x8-neon-lane-ld64.c
-  src/f32-gemm/5x8-neon-lane-ld64.c
-  src/f32-gemm/6x8-neon-lane-ld64.c
-  src/f32-gemm/1x8-neon-dup-ld64.c
-  src/f32-gemm/4x8-neon-dup-ld128.c
-  src/f32-gemm/4x8-neon-dup-ld64.c
-  src/f32-gemm/6x8-neon-dup-ld64.c
-  src/f32-gemm/1x8s4-neon.c
-  src/f32-gemm/4x8s4-neon.c
-  src/f32-gemm/6x8s4-neon.c
-  src/f32-gemm/8x8s4-neon.c
-  src/f32-gemminc/1x8-neon-lane-ld64.c
-  src/f32-gemminc/4x8-neon-lane-ld128.c
-  src/f32-gemminc/4x8-neon-lane-ld64.c
-  src/f32-gemminc/5x8-neon-lane-ld64.c
-  src/f32-gemminc/6x8-neon-lane-ld64.c
-  src/f32-gemminc/1x8-neon-dup-ld64.c
-  src/f32-gemminc/4x8-neon-dup-ld128.c
-  src/f32-gemminc/4x8-neon-dup-ld64.c
-  src/f32-gemminc/6x8-neon-dup-ld64.c
-  src/f32-gemminc/1x8s4-neon.c
-  src/f32-gemminc/4x8s4-neon.c
-  src/f32-gemminc/6x8s4-neon.c
-  src/f32-gemminc/8x8s4-neon.c
+  src/f32-gemm/gen/1x8-neon-lane-ld64.c
+  src/f32-gemm/gen/4x2-neon-lane-ld64.c
+  src/f32-gemm/gen/4x8-neon-lane-ld128.c
+  src/f32-gemm/gen/4x8-neon-lane-ld64.c
+  src/f32-gemm/gen/5x8-neon-lane-ld64.c
+  src/f32-gemm/gen/6x8-neon-lane-ld64.c
+  src/f32-gemm/gen/1x8-neon-dup-ld64.c
+  src/f32-gemm/gen/4x8-neon-dup-ld128.c
+  src/f32-gemm/gen/4x8-neon-dup-ld64.c
+  src/f32-gemm/gen/6x8-neon-dup-ld64.c
+  src/f32-gemm/gen/1x8s4-neon.c
+  src/f32-gemm/gen/4x8s4-neon.c
+  src/f32-gemm/gen/6x8s4-neon.c
+  src/f32-gemm/gen/8x8s4-neon.c
+  src/f32-gemm/gen-inc/1x8-neon-lane-ld64.c
+  src/f32-gemm/gen-inc/4x8-neon-lane-ld128.c
+  src/f32-gemm/gen-inc/4x8-neon-lane-ld64.c
+  src/f32-gemm/gen-inc/5x8-neon-lane-ld64.c
+  src/f32-gemm/gen-inc/6x8-neon-lane-ld64.c
+  src/f32-gemm/gen-inc/1x8-neon-dup-ld64.c
+  src/f32-gemm/gen-inc/4x8-neon-dup-ld128.c
+  src/f32-gemm/gen-inc/4x8-neon-dup-ld64.c
+  src/f32-gemm/gen-inc/6x8-neon-dup-ld64.c
+  src/f32-gemm/gen-inc/1x8s4-neon.c
+  src/f32-gemm/gen-inc/4x8s4-neon.c
+  src/f32-gemm/gen-inc/6x8s4-neon.c
+  src/f32-gemm/gen-inc/8x8s4-neon.c
   src/f32-hswish/neon.c
-  src/f32-igemm/1x8-neon-lane-ld64.c
-  src/f32-igemm/4x2-neon-lane-ld64.c
-  src/f32-igemm/4x4-neon-lane-ld64.c
-  src/f32-igemm/4x8-neon-lane-ld128.c
-  src/f32-igemm/4x8-neon-lane-ld64.c
-  src/f32-igemm/6x8-neon-lane-ld64.c
-  src/f32-igemm/1x8-neon-dup-ld64.c
-  src/f32-igemm/4x8-neon-dup-ld128.c
-  src/f32-igemm/4x8-neon-dup-ld64.c
-  src/f32-igemm/6x8-neon-dup-ld64.c
-  src/f32-igemm/1x8s4-neon.c
-  src/f32-igemm/4x8s4-neon.c
-  src/f32-igemm/6x8s4-neon.c
-  src/f32-igemm/8x8s4-neon.c
+  src/f32-igemm/gen/1x8-neon-lane-ld64.c
+  src/f32-igemm/gen/4x2-neon-lane-ld64.c
+  src/f32-igemm/gen/4x4-neon-lane-ld64.c
+  src/f32-igemm/gen/4x8-neon-lane-ld128.c
+  src/f32-igemm/gen/4x8-neon-lane-ld64.c
+  src/f32-igemm/gen/6x8-neon-lane-ld64.c
+  src/f32-igemm/gen/1x8-neon-dup-ld64.c
+  src/f32-igemm/gen/4x8-neon-dup-ld128.c
+  src/f32-igemm/gen/4x8-neon-dup-ld64.c
+  src/f32-igemm/gen/6x8-neon-dup-ld64.c
+  src/f32-igemm/gen/1x8s4-neon.c
+  src/f32-igemm/gen/4x8s4-neon.c
+  src/f32-igemm/gen/6x8s4-neon.c
+  src/f32-igemm/gen/8x8s4-neon.c
   src/f32-pavgpool/mp9p8q-neon.c
   src/f32-pavgpool/up9-neon.c
-  src/f32-ppmm/4x8-neon.c
-  src/f32-ppmm/8x8-neon.c
-  src/f32-prelu/neon-2x4.c
-  src/f32-prelu/neon-2x8.c
+  src/f32-ppmm/gen/4x8-neon.c
+  src/f32-ppmm/gen/8x8-neon.c
+  src/f32-prelu/gen/neon-2x4.c
+  src/f32-prelu/gen/neon-2x8.c
   src/f32-rmax/neon.c
-  src/f32-sigmoid/neon-frac-p9-p10-nr1recps-x16.c
-  src/f32-vbinary/vadd-neon-x4.c
-  src/f32-vbinary/vadd-neon-x8.c
-  src/f32-vbinary/vaddc-neon-x4.c
-  src/f32-vbinary/vaddc-neon-x8.c
-  src/f32-vbinary/vmul-neon-x4.c
-  src/f32-vbinary/vmul-neon-x8.c
-  src/f32-vbinary/vmulc-neon-x4.c
-  src/f32-vbinary/vmulc-neon-x8.c
-  src/f32-vbinary/vrsubc-neon-x4.c
-  src/f32-vbinary/vrsubc-neon-x8.c
-  src/f32-vbinary/vsub-neon-x4.c
-  src/f32-vbinary/vsub-neon-x8.c
-  src/f32-vbinary/vsubc-neon-x4.c
-  src/f32-vbinary/vsubc-neon-x8.c
-  src/f32-vmulcaddc/c4-neon-2x.c
-  src/f32-vmulcaddc/c8-neon-2x.c
+  src/f32-sigmoid/gen/neon-frac-p9-p10-nr1recps-x16.c
+  src/f32-vbinary/gen/vadd-neon-x4.c
+  src/f32-vbinary/gen/vadd-neon-x8.c
+  src/f32-vbinary/gen/vaddc-neon-x4.c
+  src/f32-vbinary/gen/vaddc-neon-x8.c
+  src/f32-vbinary/gen/vmul-neon-x4.c
+  src/f32-vbinary/gen/vmul-neon-x8.c
+  src/f32-vbinary/gen/vmulc-neon-x4.c
+  src/f32-vbinary/gen/vmulc-neon-x8.c
+  src/f32-vbinary/gen/vrsubc-neon-x4.c
+  src/f32-vbinary/gen/vrsubc-neon-x8.c
+  src/f32-vbinary/gen/vsub-neon-x4.c
+  src/f32-vbinary/gen/vsub-neon-x8.c
+  src/f32-vbinary/gen/vsubc-neon-x4.c
+  src/f32-vbinary/gen/vsubc-neon-x8.c
+  src/f32-vmulcaddc/gen/c4-neon-2x.c
+  src/f32-vmulcaddc/gen/c8-neon-2x.c
   src/q8-avgpool/mp9p8q-neon.c
   src/q8-avgpool/up9-neon.c
   src/q8-dwconv/up8x9-neon.c
@@ -462,42 +462,42 @@
   src/x8-zip/xm-neon.c)
 
 SET(XNNPACK_NEONFMA_MICROKERNEL_SRCS
-  src/f32-bilinear/neonfma-c4.c
-  src/f32-bilinear/neonfma-c8.c
-  src/f32-igemm/1x8-neonfma-dup-ld64.c
-  src/f32-igemm/4x8-neonfma-dup-ld128.c
-  src/f32-igemm/4x8-neonfma-dup-ld64.c
-  src/f32-igemm/6x8-neonfma-dup-ld64.c
-  src/f32-igemm/1x8s4-neonfma.c
-  src/f32-igemm/4x8s4-neonfma.c
-  src/f32-igemm/6x8s4-neonfma.c
-  src/f32-igemm/8x8s4-neonfma.c
-  src/f32-dwconv/up4x9-neonfma.c
-  src/f32-dwconv/up4x9-neonfma-acc2.c
-  src/f32-dwconv/up8x9-neonfma.c
-  src/f32-dwconv/up8x9-neonfma-acc2.c
-  src/f32-gemm/1x8-neonfma-dup-ld64.c
-  src/f32-gemm/4x8-neonfma-dup-ld128.c
-  src/f32-gemm/4x8-neonfma-dup-ld64.c
-  src/f32-gemm/6x8-neonfma-dup-ld64.c
-  src/f32-gemm/1x8s4-neonfma.c
-  src/f32-gemm/4x8s4-neonfma.c
-  src/f32-gemm/6x8s4-neonfma.c
-  src/f32-gemm/8x8s4-neonfma.c
-  src/f32-gemminc/1x8-neonfma-dup-ld64.c
-  src/f32-gemminc/4x8-neonfma-dup-ld128.c
-  src/f32-gemminc/4x8-neonfma-dup-ld64.c
-  src/f32-gemminc/6x8-neonfma-dup-ld64.c
-  src/f32-gemminc/1x8s4-neonfma.c
-  src/f32-gemminc/4x8s4-neonfma.c
-  src/f32-gemminc/6x8s4-neonfma.c
-  src/f32-gemminc/8x8s4-neonfma.c
+  src/f32-bilinear/gen/neonfma-c4.c
+  src/f32-bilinear/gen/neonfma-c8.c
+  src/f32-igemm/gen/1x8-neonfma-dup-ld64.c
+  src/f32-igemm/gen/4x8-neonfma-dup-ld128.c
+  src/f32-igemm/gen/4x8-neonfma-dup-ld64.c
+  src/f32-igemm/gen/6x8-neonfma-dup-ld64.c
+  src/f32-igemm/gen/1x8s4-neonfma.c
+  src/f32-igemm/gen/4x8s4-neonfma.c
+  src/f32-igemm/gen/6x8s4-neonfma.c
+  src/f32-igemm/gen/8x8s4-neonfma.c
+  src/f32-dwconv/gen/up4x9-neonfma.c
+  src/f32-dwconv/gen/up4x9-neonfma-acc2.c
+  src/f32-dwconv/gen/up8x9-neonfma.c
+  src/f32-dwconv/gen/up8x9-neonfma-acc2.c
+  src/f32-gemm/gen/1x8-neonfma-dup-ld64.c
+  src/f32-gemm/gen/4x8-neonfma-dup-ld128.c
+  src/f32-gemm/gen/4x8-neonfma-dup-ld64.c
+  src/f32-gemm/gen/6x8-neonfma-dup-ld64.c
+  src/f32-gemm/gen/1x8s4-neonfma.c
+  src/f32-gemm/gen/4x8s4-neonfma.c
+  src/f32-gemm/gen/6x8s4-neonfma.c
+  src/f32-gemm/gen/8x8s4-neonfma.c
+  src/f32-gemm/gen-inc/1x8-neonfma-dup-ld64.c
+  src/f32-gemm/gen-inc/4x8-neonfma-dup-ld128.c
+  src/f32-gemm/gen-inc/4x8-neonfma-dup-ld64.c
+  src/f32-gemm/gen-inc/6x8-neonfma-dup-ld64.c
+  src/f32-gemm/gen-inc/1x8s4-neonfma.c
+  src/f32-gemm/gen-inc/4x8s4-neonfma.c
+  src/f32-gemm/gen-inc/6x8s4-neonfma.c
+  src/f32-gemm/gen-inc/8x8s4-neonfma.c
   src/f32-hswish/neonfma.c
-  src/f32-ppmm/4x8-neonfma.c
-  src/f32-ppmm/8x8-neonfma.c
-  src/f32-sigmoid/neonfma-p5-nr2fma-x16.c
-  src/f32-vmulcaddc/c4-neonfma-2x.c
-  src/f32-vmulcaddc/c8-neonfma-2x.c
+  src/f32-ppmm/gen/4x8-neonfma.c
+  src/f32-ppmm/gen/8x8-neonfma.c
+  src/f32-sigmoid/gen/neonfma-p5-nr2fma-x16.c
+  src/f32-vmulcaddc/gen/c4-neonfma-2x.c
+  src/f32-vmulcaddc/gen/c8-neonfma-2x.c
   src/math/exp-neonfma-lut64-p2.c
   src/math/exp-neonfma-p5.c
   src/math/expminus-neonfma-p5.c
@@ -506,23 +506,23 @@
   src/math/sigmoid-neonfma-p5-nr2recps.c)
 
 SET(XNNPACK_AARCH64_NEONFMA_MICROKERNEL_SRCS
-  src/f32-gemm/1x8-neonfma-lane-ld64.c
-  src/f32-gemm/4x2-neonfma-lane-ld64.c
-  src/f32-gemm/4x8-neonfma-lane-ld128.c
-  src/f32-gemm/4x8-neonfma-lane-ld64.c
-  src/f32-gemm/5x8-neonfma-lane-ld64.c
-  src/f32-gemm/6x8-neonfma-lane-ld64.c
-  src/f32-gemminc/1x8-neonfma-lane-ld64.c
-  src/f32-gemminc/4x8-neonfma-lane-ld128.c
-  src/f32-gemminc/4x8-neonfma-lane-ld64.c
-  src/f32-gemminc/5x8-neonfma-lane-ld64.c
-  src/f32-gemminc/6x8-neonfma-lane-ld64.c
-  src/f32-igemm/1x8-neonfma-lane-ld64.c
-  src/f32-igemm/4x2-neonfma-lane-ld64.c
-  src/f32-igemm/4x4-neonfma-lane-ld64.c
-  src/f32-igemm/4x8-neonfma-lane-ld128.c
-  src/f32-igemm/4x8-neonfma-lane-ld64.c
-  src/f32-igemm/6x8-neonfma-lane-ld64.c
+  src/f32-gemm/gen/1x8-neonfma-lane-ld64.c
+  src/f32-gemm/gen/4x2-neonfma-lane-ld64.c
+  src/f32-gemm/gen/4x8-neonfma-lane-ld128.c
+  src/f32-gemm/gen/4x8-neonfma-lane-ld64.c
+  src/f32-gemm/gen/5x8-neonfma-lane-ld64.c
+  src/f32-gemm/gen/6x8-neonfma-lane-ld64.c
+  src/f32-gemm/gen-inc/1x8-neonfma-lane-ld64.c
+  src/f32-gemm/gen-inc/4x8-neonfma-lane-ld128.c
+  src/f32-gemm/gen-inc/4x8-neonfma-lane-ld64.c
+  src/f32-gemm/gen-inc/5x8-neonfma-lane-ld64.c
+  src/f32-gemm/gen-inc/6x8-neonfma-lane-ld64.c
+  src/f32-igemm/gen/1x8-neonfma-lane-ld64.c
+  src/f32-igemm/gen/4x2-neonfma-lane-ld64.c
+  src/f32-igemm/gen/4x4-neonfma-lane-ld64.c
+  src/f32-igemm/gen/4x8-neonfma-lane-ld128.c
+  src/f32-igemm/gen/4x8-neonfma-lane-ld64.c
+  src/f32-igemm/gen/6x8-neonfma-lane-ld64.c
   src/f32-conv-hwc/3x3s2p1c3x4-neonfma-2x2.c
   src/f32-conv-hwc/3x3s2p1c3x8-neonfma-2x2.c
   src/f32-conv-hwc2spchw/3x3s2p1c3x4-neonfma-2x2.c
@@ -530,107 +530,107 @@
   src/f32-dwconv-spchw/5x5p2-neonfma.c
   src/f32-dwconv-spchw/3x3s2p1-neonfma.c
   src/f32-dwconv-spchw/5x5s2p2-neonfma.c
-  src/f32-spmm/12x1-neonfma.c
-  src/f32-spmm/12x2-neonfma.c
-  src/f32-spmm/12x4-neonfma.c
-  src/f32-spmm/16x1-neonfma-pipelined.c
-  src/f32-spmm/16x1-neonfma-unroll2.c
-  src/f32-spmm/16x1-neonfma.c
-  src/f32-spmm/16x2-neonfma.c
-  src/f32-spmm/16x4-neonfma.c
-  src/f32-spmm/4x1-neonfma-pipelined.c
-  src/f32-spmm/4x1-neonfma-unroll2.c
-  src/f32-spmm/4x1-neonfma.c
-  src/f32-spmm/4x2-neonfma.c
-  src/f32-spmm/4x4-neonfma.c
-  src/f32-spmm/8x1-neonfma-pipelined.c
-  src/f32-spmm/8x1-neonfma-unroll2.c
-  src/f32-spmm/8x1-neonfma.c
-  src/f32-spmm/8x2-neonfma.c
-  src/f32-spmm/8x4-neonfma.c
+  src/f32-spmm/gen/12x1-neonfma.c
+  src/f32-spmm/gen/12x2-neonfma.c
+  src/f32-spmm/gen/12x4-neonfma.c
+  src/f32-spmm/gen/16x1-neonfma-pipelined.c
+  src/f32-spmm/gen/16x1-neonfma-unroll2.c
+  src/f32-spmm/gen/16x1-neonfma.c
+  src/f32-spmm/gen/16x2-neonfma.c
+  src/f32-spmm/gen/16x4-neonfma.c
+  src/f32-spmm/gen/4x1-neonfma-pipelined.c
+  src/f32-spmm/gen/4x1-neonfma-unroll2.c
+  src/f32-spmm/gen/4x1-neonfma.c
+  src/f32-spmm/gen/4x2-neonfma.c
+  src/f32-spmm/gen/4x4-neonfma.c
+  src/f32-spmm/gen/8x1-neonfma-pipelined.c
+  src/f32-spmm/gen/8x1-neonfma-unroll2.c
+  src/f32-spmm/gen/8x1-neonfma.c
+  src/f32-spmm/gen/8x2-neonfma.c
+  src/f32-spmm/gen/8x4-neonfma.c
   src/math/sigmoid-neonfma-p5-div.c)
 
 SET(XNNPACK_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS
-  src/f16-gemm/4x8-neonfp16arith-ld64.c
-  src/f16-gemm/6x8-neonfp16arith-ld64.c
-  src/f16-gemm/8x8-neonfp16arith-ld64.c)
+  src/f16-gemm/gen/4x8-neonfp16arith-ld64.c
+  src/f16-gemm/gen/6x8-neonfp16arith-ld64.c
+  src/f16-gemm/gen/8x8-neonfp16arith-ld64.c)
 
 SET(XNNPACK_SSE_MICROKERNEL_SRCS
   src/f32-avgpool/mp9p8q-sse.c
   src/f32-avgpool/up9-sse.c
-  src/f32-bilinear/sse-c4.c
-  src/f32-bilinear/sse-c8.c
+  src/f32-bilinear/gen/sse-c4.c
+  src/f32-bilinear/gen/sse-c8.c
   src/f32-clamp/sse.c
   src/f32-dwconv-spchw/3x3p1-sse.c
   src/f32-dwconv-spchw/3x3s2p1-sse.c
-  src/f32-dwconv/up4x25-sse-acc2.c
-  src/f32-dwconv/up4x25-sse.c
-  src/f32-dwconv/up4x4-sse-acc2.c
-  src/f32-dwconv/up4x4-sse.c
-  src/f32-dwconv/up4x9-sse-acc2.c
-  src/f32-dwconv/up4x9-sse.c
-  src/f32-dwconv/up8x25-sse-acc2.c
-  src/f32-dwconv/up8x25-sse.c
-  src/f32-dwconv/up8x4-sse-acc2.c
-  src/f32-dwconv/up8x4-sse.c
-  src/f32-dwconv/up8x9-sse-acc2.c
-  src/f32-dwconv/up8x9-sse.c
+  src/f32-dwconv/gen/up4x25-sse-acc2.c
+  src/f32-dwconv/gen/up4x25-sse.c
+  src/f32-dwconv/gen/up4x4-sse-acc2.c
+  src/f32-dwconv/gen/up4x4-sse.c
+  src/f32-dwconv/gen/up4x9-sse-acc2.c
+  src/f32-dwconv/gen/up4x9-sse.c
+  src/f32-dwconv/gen/up8x25-sse-acc2.c
+  src/f32-dwconv/gen/up8x25-sse.c
+  src/f32-dwconv/gen/up8x4-sse-acc2.c
+  src/f32-dwconv/gen/up8x4-sse.c
+  src/f32-dwconv/gen/up8x9-sse-acc2.c
+  src/f32-dwconv/gen/up8x9-sse.c
   src/f32-gavgpool-spchw/sse-x4.c
   src/f32-gavgpool/mp7p7q-sse.c
   src/f32-gavgpool/up7-sse.c
-  src/f32-gemm/1x8-sse-dup.c
-  src/f32-gemm/1x8-sse-load1.c
-  src/f32-gemm/1x8s4-sse.c
-  src/f32-gemm/4x8-sse-dup.c
-  src/f32-gemm/4x8-sse-load1.c
-  src/f32-gemm/4x8s4-sse.c
-  src/f32-gemminc/1x8-sse-dup.c
-  src/f32-gemminc/1x8-sse-load1.c
-  src/f32-gemminc/1x8s4-sse.c
-  src/f32-gemminc/4x8-sse-dup.c
-  src/f32-gemminc/4x8-sse-load1.c
-  src/f32-gemminc/4x8s4-sse.c
+  src/f32-gemm/gen/1x8-sse-dup.c
+  src/f32-gemm/gen/1x8-sse-load1.c
+  src/f32-gemm/gen/1x8s4-sse.c
+  src/f32-gemm/gen/4x8-sse-dup.c
+  src/f32-gemm/gen/4x8-sse-load1.c
+  src/f32-gemm/gen/4x8s4-sse.c
+  src/f32-gemm/gen-inc/1x8-sse-dup.c
+  src/f32-gemm/gen-inc/1x8-sse-load1.c
+  src/f32-gemm/gen-inc/1x8s4-sse.c
+  src/f32-gemm/gen-inc/4x8-sse-dup.c
+  src/f32-gemm/gen-inc/4x8-sse-load1.c
+  src/f32-gemm/gen-inc/4x8s4-sse.c
   src/f32-hswish/sse.c
-  src/f32-igemm/1x8-sse-dup.c
-  src/f32-igemm/1x8-sse-load1.c
-  src/f32-igemm/1x8s4-sse.c
-  src/f32-igemm/4x2c4-sse.c
-  src/f32-igemm/4x8-sse-dup.c
-  src/f32-igemm/4x8-sse-load1.c
-  src/f32-igemm/4x8s4-sse.c
+  src/f32-igemm/gen/1x8-sse-dup.c
+  src/f32-igemm/gen/1x8-sse-load1.c
+  src/f32-igemm/gen/1x8s4-sse.c
+  src/f32-igemm/gen/4x2c4-sse.c
+  src/f32-igemm/gen/4x8-sse-dup.c
+  src/f32-igemm/gen/4x8-sse-load1.c
+  src/f32-igemm/gen/4x8s4-sse.c
   src/f32-maxpool/9p8x-sse-c4.c
   src/f32-pavgpool/mp9p8q-sse.c
   src/f32-pavgpool/up9-sse.c
-  src/f32-ppmm/4x8-sse.c
+  src/f32-ppmm/gen/4x8-sse.c
   src/f32-rmax/sse.c
-  src/f32-spmm/4x1-sse.c
-  src/f32-spmm/8x1-sse.c
-  src/f32-vbinary/vadd-sse-x4.c
-  src/f32-vbinary/vadd-sse-x8.c
-  src/f32-vbinary/vaddc-sse-x4.c
-  src/f32-vbinary/vaddc-sse-x8.c
-  src/f32-vbinary/vmul-sse-x4.c
-  src/f32-vbinary/vmul-sse-x8.c
-  src/f32-vbinary/vmulc-sse-x4.c
-  src/f32-vbinary/vmulc-sse-x8.c
-  src/f32-vbinary/vrsubc-sse-x4.c
-  src/f32-vbinary/vrsubc-sse-x8.c
-  src/f32-vbinary/vsub-sse-x4.c
-  src/f32-vbinary/vsub-sse-x8.c
-  src/f32-vbinary/vsubc-sse-x4.c
-  src/f32-vbinary/vsubc-sse-x8.c
-  src/f32-vmulcaddc/c4-sse-2x.c
-  src/f32-vmulcaddc/c8-sse-2x.c
+  src/f32-spmm/gen/4x1-sse.c
+  src/f32-spmm/gen/8x1-sse.c
+  src/f32-vbinary/gen/vadd-sse-x4.c
+  src/f32-vbinary/gen/vadd-sse-x8.c
+  src/f32-vbinary/gen/vaddc-sse-x4.c
+  src/f32-vbinary/gen/vaddc-sse-x8.c
+  src/f32-vbinary/gen/vmul-sse-x4.c
+  src/f32-vbinary/gen/vmul-sse-x8.c
+  src/f32-vbinary/gen/vmulc-sse-x4.c
+  src/f32-vbinary/gen/vmulc-sse-x8.c
+  src/f32-vbinary/gen/vrsubc-sse-x4.c
+  src/f32-vbinary/gen/vrsubc-sse-x8.c
+  src/f32-vbinary/gen/vsub-sse-x4.c
+  src/f32-vbinary/gen/vsub-sse-x8.c
+  src/f32-vbinary/gen/vsubc-sse-x4.c
+  src/f32-vbinary/gen/vsubc-sse-x8.c
+  src/f32-vmulcaddc/gen/c4-sse-2x.c
+  src/f32-vmulcaddc/gen/c8-sse-2x.c
   src/x32-packx/x4-sse.c)
 
 SET(XNNPACK_SSE2_MICROKERNEL_SRCS
   src/f32-argmaxpool/4x-sse2-c4.c
   src/f32-argmaxpool/9p8x-sse2-c4.c
   src/f32-argmaxpool/9x-sse2-c4.c
-  src/f32-prelu/sse2-2x4.c
-  src/f32-prelu/sse2-2x8.c
-  src/f32-sigmoid/sse2-p5-div-x8.c
-  src/f32-sigmoid/sse2-p5-div-x16.c
+  src/f32-prelu/gen/sse2-2x4.c
+  src/f32-prelu/gen/sse2-2x8.c
+  src/f32-sigmoid/gen/sse2-p5-div-x8.c
+  src/f32-sigmoid/gen/sse2-p5-div-x16.c
   src/q8-avgpool/mp9p8q-sse2.c
   src/q8-avgpool/up9-sse2.c
   src/q8-igemm/4x4c2-sse2.c
@@ -657,73 +657,73 @@
   src/math/sigmoid-sse2-p5-div.c)
 
 SET(XNNPACK_SSE41_MICROKERNEL_SRCS
-  src/f32-prelu/sse41-2x4.c
-  src/f32-prelu/sse41-2x8.c
-  src/f32-sigmoid/sse41-p5-div-x8.c
-  src/f32-sigmoid/sse41-p5-div-x16.c)
+  src/f32-prelu/gen/sse41-2x4.c
+  src/f32-prelu/gen/sse41-2x8.c
+  src/f32-sigmoid/gen/sse41-p5-div-x8.c
+  src/f32-sigmoid/gen/sse41-p5-div-x16.c)
 
 SET(XNNPACK_AVX_MICROKERNEL_SRCS
-  src/f32-dwconv/up16x4-avx-acc2.c
-  src/f32-dwconv/up16x4-avx.c
-  src/f32-dwconv/up8x4-avx-acc2.c
-  src/f32-dwconv/up8x4-avx.c
-  src/f32-dwconv/up16x9-avx-acc2.c
-  src/f32-dwconv/up16x9-avx.c
-  src/f32-dwconv/up8x9-avx-acc2.c
-  src/f32-dwconv/up8x9-avx.c
-  src/f32-dwconv/up16x25-avx-acc2.c
-  src/f32-dwconv/up16x25-avx.c
-  src/f32-dwconv/up8x25-avx-acc2.c
-  src/f32-dwconv/up8x25-avx.c
-  src/f32-gemm/1x8-avx-broadcast.c
-  src/f32-gemm/4x8-avx-broadcast.c
-  src/f32-gemm/5x8-avx-broadcast.c
-  src/f32-gemm/6x8-avx-broadcast.c
-  src/f32-gemm/7x8-avx-broadcast.c
-  src/f32-gemminc/1x8-avx-broadcast.c
-  src/f32-gemminc/4x8-avx-broadcast.c
-  src/f32-gemminc/5x8-avx-broadcast.c
-  src/f32-gemminc/6x8-avx-broadcast.c
-  src/f32-gemminc/7x8-avx-broadcast.c
-  src/f32-igemm/1x8-avx-broadcast.c
-  src/f32-igemm/4x8-avx-broadcast.c
-  src/f32-igemm/5x8-avx-broadcast.c
-  src/f32-igemm/6x8-avx-broadcast.c
-  src/f32-igemm/7x8-avx-broadcast.c
+  src/f32-dwconv/gen/up16x4-avx-acc2.c
+  src/f32-dwconv/gen/up16x4-avx.c
+  src/f32-dwconv/gen/up8x4-avx-acc2.c
+  src/f32-dwconv/gen/up8x4-avx.c
+  src/f32-dwconv/gen/up16x9-avx-acc2.c
+  src/f32-dwconv/gen/up16x9-avx.c
+  src/f32-dwconv/gen/up8x9-avx-acc2.c
+  src/f32-dwconv/gen/up8x9-avx.c
+  src/f32-dwconv/gen/up16x25-avx-acc2.c
+  src/f32-dwconv/gen/up16x25-avx.c
+  src/f32-dwconv/gen/up8x25-avx-acc2.c
+  src/f32-dwconv/gen/up8x25-avx.c
+  src/f32-gemm/gen/1x8-avx-broadcast.c
+  src/f32-gemm/gen/4x8-avx-broadcast.c
+  src/f32-gemm/gen/5x8-avx-broadcast.c
+  src/f32-gemm/gen/6x8-avx-broadcast.c
+  src/f32-gemm/gen/7x8-avx-broadcast.c
+  src/f32-gemm/gen-inc/1x8-avx-broadcast.c
+  src/f32-gemm/gen-inc/4x8-avx-broadcast.c
+  src/f32-gemm/gen-inc/5x8-avx-broadcast.c
+  src/f32-gemm/gen-inc/6x8-avx-broadcast.c
+  src/f32-gemm/gen-inc/7x8-avx-broadcast.c
+  src/f32-igemm/gen/1x8-avx-broadcast.c
+  src/f32-igemm/gen/4x8-avx-broadcast.c
+  src/f32-igemm/gen/5x8-avx-broadcast.c
+  src/f32-igemm/gen/6x8-avx-broadcast.c
+  src/f32-igemm/gen/7x8-avx-broadcast.c
   src/f32-rmax/avx.c
   src/f32-vscale/avx-unroll32.c)
 
 SET(XNNPACK_FMA3_MICROKERNEL_SRCS
-  src/f32-dwconv/up16x4-fma3-acc2.c
-  src/f32-dwconv/up16x4-fma3.c
-  src/f32-dwconv/up8x4-fma3-acc2.c
-  src/f32-dwconv/up8x4-fma3.c
-  src/f32-dwconv/up16x9-fma3-acc2.c
-  src/f32-dwconv/up16x9-fma3.c
-  src/f32-dwconv/up8x9-fma3-acc2.c
-  src/f32-dwconv/up8x9-fma3.c
-  src/f32-dwconv/up16x25-fma3-acc2.c
-  src/f32-dwconv/up16x25-fma3.c
-  src/f32-dwconv/up8x25-fma3-acc2.c
-  src/f32-dwconv/up8x25-fma3.c
-  src/f32-gemm/1x8-fma3-broadcast.c
-  src/f32-gemm/4x8-fma3-broadcast.c
-  src/f32-gemm/5x8-fma3-broadcast.c
-  src/f32-gemm/6x8-fma3-broadcast.c
-  src/f32-gemm/7x8-fma3-broadcast.c
-  src/f32-gemm/8x8-fma3-broadcast.c
-  src/f32-gemminc/1x8-fma3-broadcast.c
-  src/f32-gemminc/4x8-fma3-broadcast.c
-  src/f32-gemminc/5x8-fma3-broadcast.c
-  src/f32-gemminc/6x8-fma3-broadcast.c
-  src/f32-gemminc/7x8-fma3-broadcast.c
-  src/f32-gemminc/8x8-fma3-broadcast.c
-  src/f32-igemm/1x8-fma3-broadcast.c
-  src/f32-igemm/4x8-fma3-broadcast.c
-  src/f32-igemm/5x8-fma3-broadcast.c
-  src/f32-igemm/6x8-fma3-broadcast.c
-  src/f32-igemm/7x8-fma3-broadcast.c
-  src/f32-igemm/8x8-fma3-broadcast.c)
+  src/f32-dwconv/gen/up16x4-fma3-acc2.c
+  src/f32-dwconv/gen/up16x4-fma3.c
+  src/f32-dwconv/gen/up8x4-fma3-acc2.c
+  src/f32-dwconv/gen/up8x4-fma3.c
+  src/f32-dwconv/gen/up16x9-fma3-acc2.c
+  src/f32-dwconv/gen/up16x9-fma3.c
+  src/f32-dwconv/gen/up8x9-fma3-acc2.c
+  src/f32-dwconv/gen/up8x9-fma3.c
+  src/f32-dwconv/gen/up16x25-fma3-acc2.c
+  src/f32-dwconv/gen/up16x25-fma3.c
+  src/f32-dwconv/gen/up8x25-fma3-acc2.c
+  src/f32-dwconv/gen/up8x25-fma3.c
+  src/f32-gemm/gen/1x8-fma3-broadcast.c
+  src/f32-gemm/gen/4x8-fma3-broadcast.c
+  src/f32-gemm/gen/5x8-fma3-broadcast.c
+  src/f32-gemm/gen/6x8-fma3-broadcast.c
+  src/f32-gemm/gen/7x8-fma3-broadcast.c
+  src/f32-gemm/gen/8x8-fma3-broadcast.c
+  src/f32-gemm/gen-inc/1x8-fma3-broadcast.c
+  src/f32-gemm/gen-inc/4x8-fma3-broadcast.c
+  src/f32-gemm/gen-inc/5x8-fma3-broadcast.c
+  src/f32-gemm/gen-inc/6x8-fma3-broadcast.c
+  src/f32-gemm/gen-inc/7x8-fma3-broadcast.c
+  src/f32-gemm/gen-inc/8x8-fma3-broadcast.c
+  src/f32-igemm/gen/1x8-fma3-broadcast.c
+  src/f32-igemm/gen/4x8-fma3-broadcast.c
+  src/f32-igemm/gen/5x8-fma3-broadcast.c
+  src/f32-igemm/gen/6x8-fma3-broadcast.c
+  src/f32-igemm/gen/7x8-fma3-broadcast.c
+  src/f32-igemm/gen/8x8-fma3-broadcast.c)
 
 SET(XNNPACK_AVX2_MICROKERNEL_SRCS
   src/f32-raddexpminusmax/avx2-p5-unroll64.c
@@ -757,40 +757,40 @@
 SET(XNNPACK_AARCH64_ASM_MICROKERNEL_SRCS
   src/f32-dwconv/up4x9-aarch64-neonfma-cortex-a55.S
   src/f32-dwconv/up4x9-aarch64-neonfma.S
-  src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S
-  src/f32-gemm/1x8-aarch64-neonfma-cortex-a53.S
-  src/f32-gemm/1x8-aarch64-neonfma-cortex-a57.S
-  src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S
-  src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S
-  src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S
-  src/f32-gemm/4x8-aarch64-neonfma-cortex-a57.S
-  src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S
-  src/f32-gemm/4x8-aarch64-neonfma-ld128.S
-  src/f32-gemm/4x8-aarch64-neonfma-ld64.S
-  src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S
-  src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S
-  src/f32-gemm/6x8-aarch64-neonfma-cortex-a57.S
-  src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S
-  src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S
-  src/f32-gemm/6x8-aarch64-neonfma-ld128.S
-  src/f32-gemm/6x8-aarch64-neonfma-ld64.S
-  src/f32-gemminc/1x12-aarch64-neonfma-cortex-a53.S
-  src/f32-gemminc/1x8-aarch64-neonfma-cortex-a53.S
-  src/f32-gemminc/1x8-aarch64-neonfma-cortex-a57.S
-  src/f32-gemminc/1x8-aarch64-neonfma-cortex-a75.S
-  src/f32-gemminc/4x12-aarch64-neonfma-cortex-a53.S
-  src/f32-gemminc/4x8-aarch64-neonfma-cortex-a53.S
-  src/f32-gemminc/4x8-aarch64-neonfma-cortex-a57.S
-  src/f32-gemminc/4x8-aarch64-neonfma-cortex-a75.S
-  src/f32-gemminc/4x8-aarch64-neonfma-ld128.S
-  src/f32-gemminc/4x8-aarch64-neonfma-ld64.S
-  src/f32-gemminc/5x8-aarch64-neonfma-cortex-a75.S
-  src/f32-gemminc/6x8-aarch64-neonfma-cortex-a53.S
-  src/f32-gemminc/6x8-aarch64-neonfma-cortex-a57.S
-  src/f32-gemminc/6x8-aarch64-neonfma-cortex-a73.S
-  src/f32-gemminc/6x8-aarch64-neonfma-cortex-a75.S
-  src/f32-gemminc/6x8-aarch64-neonfma-ld128.S
-  src/f32-gemminc/6x8-aarch64-neonfma-ld64.S
+  src/f32-gemm/gen/1x12-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a57.S
+  src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a75.S
+  src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a57.S
+  src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a75.S
+  src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S
+  src/f32-gemm/gen/4x8-aarch64-neonfma-ld64.S
+  src/f32-gemm/gen/5x8-aarch64-neonfma-cortex-a75.S
+  src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a57.S
+  src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a73.S
+  src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a75.S
+  src/f32-gemm/gen/6x8-aarch64-neonfma-ld128.S
+  src/f32-gemm/gen/6x8-aarch64-neonfma-ld64.S
+  src/f32-gemm/gen-inc/1x12-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a57.S
+  src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a75.S
+  src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a57.S
+  src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a75.S
+  src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld128.S
+  src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld64.S
+  src/f32-gemm/gen-inc/5x8-aarch64-neonfma-cortex-a75.S
+  src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a57.S
+  src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a73.S
+  src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a75.S
+  src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S
+  src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld64.S
   src/f32-igemm/1x12-aarch64-neonfma-cortex-a53.S
   src/f32-igemm/1x8-aarch64-neonfma-cortex-a53.S
   src/f32-igemm/1x8-aarch64-neonfma-cortex-a57.S