Move generated micro-kernels into gen/ subdirectories (f32-gemminc moves to f32-gemm/gen-inc/)

PiperOrigin-RevId: 282322486
diff --git a/BUILD.bazel b/BUILD.bazel
index f4acfe6..a56cd80 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -79,86 +79,86 @@
     "src/f32-argmaxpool/9x-scalar-c1.c",
     "src/f32-avgpool/mp9p8q-scalar.c",
     "src/f32-avgpool/up9-scalar.c",
-    "src/f32-bilinear/scalar-c1.c",
-    "src/f32-bilinear/scalar-c2.c",
-    "src/f32-bilinear/scalar-c4.c",
+    "src/f32-bilinear/gen/scalar-c1.c",
+    "src/f32-bilinear/gen/scalar-c2.c",
+    "src/f32-bilinear/gen/scalar-c4.c",
     "src/f32-clamp/scalar.c",
     "src/f32-conv-hwc2spchw/3x3s2p1c3x4-scalar-1x1.c",
     "src/f32-dwconv-spchw/3x3p1-scalar.c",
     "src/f32-dwconv-spchw/3x3s2p1-scalar.c",
     "src/f32-dwconv-spchw/5x5p2-scalar.c",
     "src/f32-dwconv-spchw/5x5s2p2-scalar.c",
-    "src/f32-dwconv/up1x25-scalar-acc2.c",
-    "src/f32-dwconv/up1x25-scalar.c",
-    "src/f32-dwconv/up1x4-scalar-acc2.c",
-    "src/f32-dwconv/up1x4-scalar.c",
-    "src/f32-dwconv/up1x9-scalar-acc2.c",
-    "src/f32-dwconv/up1x9-scalar.c",
-    "src/f32-dwconv/up2x25-scalar-acc2.c",
-    "src/f32-dwconv/up2x25-scalar.c",
-    "src/f32-dwconv/up2x4-scalar-acc2.c",
-    "src/f32-dwconv/up2x4-scalar.c",
-    "src/f32-dwconv/up2x9-scalar-acc2.c",
-    "src/f32-dwconv/up2x9-scalar.c",
+    "src/f32-dwconv/gen/up1x25-scalar-acc2.c",
+    "src/f32-dwconv/gen/up1x25-scalar.c",
+    "src/f32-dwconv/gen/up1x4-scalar-acc2.c",
+    "src/f32-dwconv/gen/up1x4-scalar.c",
+    "src/f32-dwconv/gen/up1x9-scalar-acc2.c",
+    "src/f32-dwconv/gen/up1x9-scalar.c",
+    "src/f32-dwconv/gen/up2x25-scalar-acc2.c",
+    "src/f32-dwconv/gen/up2x25-scalar.c",
+    "src/f32-dwconv/gen/up2x4-scalar-acc2.c",
+    "src/f32-dwconv/gen/up2x4-scalar.c",
+    "src/f32-dwconv/gen/up2x9-scalar-acc2.c",
+    "src/f32-dwconv/gen/up2x9-scalar.c",
     "src/f32-gavgpool-spchw/scalar-x1.c",
     "src/f32-gavgpool/mp7p7q-scalar.c",
     "src/f32-gavgpool/up7-scalar.c",
-    "src/f32-gemm/1x4-scalar.c",
-    "src/f32-gemm/2x4-scalar.c",
-    "src/f32-gemm/4x2-scalar.c",
-    "src/f32-gemm/4x4-scalar.c",
-    "src/f32-gemminc/1x4-scalar.c",
-    "src/f32-gemminc/2x4-scalar.c",
-    "src/f32-gemminc/4x4-scalar.c",
+    "src/f32-gemm/gen-inc/1x4-scalar.c",
+    "src/f32-gemm/gen-inc/2x4-scalar.c",
+    "src/f32-gemm/gen-inc/4x4-scalar.c",
+    "src/f32-gemm/gen/1x4-scalar.c",
+    "src/f32-gemm/gen/2x4-scalar.c",
+    "src/f32-gemm/gen/4x2-scalar.c",
+    "src/f32-gemm/gen/4x4-scalar.c",
     "src/f32-hswish/scalar.c",
-    "src/f32-igemm/1x4-scalar.c",
-    "src/f32-igemm/2x4-scalar.c",
-    "src/f32-igemm/4x2-scalar.c",
-    "src/f32-igemm/4x4-scalar.c",
+    "src/f32-igemm/gen/1x4-scalar.c",
+    "src/f32-igemm/gen/2x4-scalar.c",
+    "src/f32-igemm/gen/4x2-scalar.c",
+    "src/f32-igemm/gen/4x4-scalar.c",
     "src/f32-maxpool/9p8x-scalar-c1.c",
     "src/f32-pavgpool/mp9p8q-scalar.c",
     "src/f32-pavgpool/up9-scalar.c",
-    "src/f32-ppmm/2x4-scalar.c",
-    "src/f32-ppmm/3x3-scalar.c",
-    "src/f32-ppmm/4x2-scalar.c",
-    "src/f32-ppmm/4x4-scalar.c",
-    "src/f32-prelu/scalar-2x1.c",
-    "src/f32-prelu/scalar-2x4.c",
+    "src/f32-ppmm/gen/2x4-scalar.c",
+    "src/f32-ppmm/gen/3x3-scalar.c",
+    "src/f32-ppmm/gen/4x2-scalar.c",
+    "src/f32-ppmm/gen/4x4-scalar.c",
+    "src/f32-prelu/gen/scalar-2x1.c",
+    "src/f32-prelu/gen/scalar-2x4.c",
     "src/f32-rmax/scalar.c",
-    "src/f32-spmm/1x1-scalar-pipelined.c",
-    "src/f32-spmm/1x1-scalar.c",
-    "src/f32-spmm/2x1-scalar-pipelined.c",
-    "src/f32-spmm/2x1-scalar.c",
-    "src/f32-spmm/4x1-scalar-pipelined.c",
-    "src/f32-spmm/4x1-scalar.c",
-    "src/f32-spmm/8x1-scalar-pipelined.c",
-    "src/f32-spmm/8x1-scalar.c",
-    "src/f32-spmm/8x2-scalar.c",
-    "src/f32-spmm/8x4-scalar.c",
-    "src/f32-vbinary/vadd-scalar-x1.c",
-    "src/f32-vbinary/vadd-scalar-x2.c",
-    "src/f32-vbinary/vadd-scalar-x4.c",
-    "src/f32-vbinary/vaddc-scalar-x1.c",
-    "src/f32-vbinary/vaddc-scalar-x2.c",
-    "src/f32-vbinary/vaddc-scalar-x4.c",
-    "src/f32-vbinary/vmul-scalar-x1.c",
-    "src/f32-vbinary/vmul-scalar-x2.c",
-    "src/f32-vbinary/vmul-scalar-x4.c",
-    "src/f32-vbinary/vmulc-scalar-x1.c",
-    "src/f32-vbinary/vmulc-scalar-x2.c",
-    "src/f32-vbinary/vmulc-scalar-x4.c",
-    "src/f32-vbinary/vrsubc-scalar-x1.c",
-    "src/f32-vbinary/vrsubc-scalar-x2.c",
-    "src/f32-vbinary/vrsubc-scalar-x4.c",
-    "src/f32-vbinary/vsub-scalar-x1.c",
-    "src/f32-vbinary/vsub-scalar-x2.c",
-    "src/f32-vbinary/vsub-scalar-x4.c",
-    "src/f32-vbinary/vsubc-scalar-x1.c",
-    "src/f32-vbinary/vsubc-scalar-x2.c",
-    "src/f32-vbinary/vsubc-scalar-x4.c",
-    "src/f32-vmulcaddc/c1-scalar-2x.c",
-    "src/f32-vmulcaddc/c2-scalar-2x.c",
-    "src/f32-vmulcaddc/c4-scalar-2x.c",
+    "src/f32-spmm/gen/1x1-scalar-pipelined.c",
+    "src/f32-spmm/gen/1x1-scalar.c",
+    "src/f32-spmm/gen/2x1-scalar-pipelined.c",
+    "src/f32-spmm/gen/2x1-scalar.c",
+    "src/f32-spmm/gen/4x1-scalar-pipelined.c",
+    "src/f32-spmm/gen/4x1-scalar.c",
+    "src/f32-spmm/gen/8x1-scalar-pipelined.c",
+    "src/f32-spmm/gen/8x1-scalar.c",
+    "src/f32-spmm/gen/8x2-scalar.c",
+    "src/f32-spmm/gen/8x4-scalar.c",
+    "src/f32-vbinary/gen/vadd-scalar-x1.c",
+    "src/f32-vbinary/gen/vadd-scalar-x2.c",
+    "src/f32-vbinary/gen/vadd-scalar-x4.c",
+    "src/f32-vbinary/gen/vaddc-scalar-x1.c",
+    "src/f32-vbinary/gen/vaddc-scalar-x2.c",
+    "src/f32-vbinary/gen/vaddc-scalar-x4.c",
+    "src/f32-vbinary/gen/vmul-scalar-x1.c",
+    "src/f32-vbinary/gen/vmul-scalar-x2.c",
+    "src/f32-vbinary/gen/vmul-scalar-x4.c",
+    "src/f32-vbinary/gen/vmulc-scalar-x1.c",
+    "src/f32-vbinary/gen/vmulc-scalar-x2.c",
+    "src/f32-vbinary/gen/vmulc-scalar-x4.c",
+    "src/f32-vbinary/gen/vrsubc-scalar-x1.c",
+    "src/f32-vbinary/gen/vrsubc-scalar-x2.c",
+    "src/f32-vbinary/gen/vrsubc-scalar-x4.c",
+    "src/f32-vbinary/gen/vsub-scalar-x1.c",
+    "src/f32-vbinary/gen/vsub-scalar-x2.c",
+    "src/f32-vbinary/gen/vsub-scalar-x4.c",
+    "src/f32-vbinary/gen/vsubc-scalar-x1.c",
+    "src/f32-vbinary/gen/vsubc-scalar-x2.c",
+    "src/f32-vbinary/gen/vsubc-scalar-x4.c",
+    "src/f32-vmulcaddc/gen/c1-scalar-2x.c",
+    "src/f32-vmulcaddc/gen/c2-scalar-2x.c",
+    "src/f32-vmulcaddc/gen/c4-scalar-2x.c",
     "src/q8-avgpool/mp9p8q-scalar.c",
     "src/q8-avgpool/up9-scalar.c",
     "src/q8-dwconv/up1x9-scalar.c",
@@ -193,74 +193,74 @@
     "src/f32-argmaxpool/9x-psimd-c4.c",
     "src/f32-avgpool/mp9p8q-psimd.c",
     "src/f32-avgpool/up9-psimd.c",
-    "src/f32-bilinear/psimd-c4.c",
-    "src/f32-bilinear/psimd-c8.c",
+    "src/f32-bilinear/gen/psimd-c4.c",
+    "src/f32-bilinear/gen/psimd-c8.c",
     "src/f32-clamp/psimd.c",
-    "src/f32-dwconv/up4x25-psimd-acc2.c",
-    "src/f32-dwconv/up4x25-psimd.c",
-    "src/f32-dwconv/up4x4-psimd-acc2.c",
-    "src/f32-dwconv/up4x4-psimd.c",
-    "src/f32-dwconv/up4x9-psimd-acc2.c",
-    "src/f32-dwconv/up4x9-psimd.c",
-    "src/f32-dwconv/up8x25-psimd-acc2.c",
-    "src/f32-dwconv/up8x25-psimd.c",
-    "src/f32-dwconv/up8x4-psimd-acc2.c",
-    "src/f32-dwconv/up8x4-psimd.c",
-    "src/f32-dwconv/up8x9-psimd-acc2.c",
-    "src/f32-dwconv/up8x9-psimd.c",
+    "src/f32-dwconv/gen/up4x25-psimd-acc2.c",
+    "src/f32-dwconv/gen/up4x25-psimd.c",
+    "src/f32-dwconv/gen/up4x4-psimd-acc2.c",
+    "src/f32-dwconv/gen/up4x4-psimd.c",
+    "src/f32-dwconv/gen/up4x9-psimd-acc2.c",
+    "src/f32-dwconv/gen/up4x9-psimd.c",
+    "src/f32-dwconv/gen/up8x25-psimd-acc2.c",
+    "src/f32-dwconv/gen/up8x25-psimd.c",
+    "src/f32-dwconv/gen/up8x4-psimd-acc2.c",
+    "src/f32-dwconv/gen/up8x4-psimd.c",
+    "src/f32-dwconv/gen/up8x9-psimd-acc2.c",
+    "src/f32-dwconv/gen/up8x9-psimd.c",
     "src/f32-gavgpool/mp7p7q-psimd.c",
     "src/f32-gavgpool/up7-psimd.c",
-    "src/f32-gemm/1x8-psimd-loadsplat.c",
-    "src/f32-gemm/1x8-psimd-splat.c",
-    "src/f32-gemm/1x8s4-psimd.c",
-    "src/f32-gemm/4x8-psimd-loadsplat.c",
-    "src/f32-gemm/4x8-psimd-splat.c",
-    "src/f32-gemm/4x8s4-psimd.c",
-    "src/f32-gemm/6x8-psimd-loadsplat.c",
-    "src/f32-gemm/6x8-psimd-splat.c",
-    "src/f32-gemm/6x8s4-psimd.c",
-    "src/f32-gemminc/1x8-psimd-loadsplat.c",
-    "src/f32-gemminc/1x8-psimd-splat.c",
-    "src/f32-gemminc/1x8s4-psimd.c",
-    "src/f32-gemminc/4x8-psimd-loadsplat.c",
-    "src/f32-gemminc/4x8-psimd-splat.c",
-    "src/f32-gemminc/4x8s4-psimd.c",
-    "src/f32-gemminc/6x8-psimd-loadsplat.c",
-    "src/f32-gemminc/6x8-psimd-splat.c",
-    "src/f32-gemminc/6x8s4-psimd.c",
+    "src/f32-gemm/gen/1x8-psimd-loadsplat.c",
+    "src/f32-gemm/gen/1x8-psimd-splat.c",
+    "src/f32-gemm/gen/1x8s4-psimd.c",
+    "src/f32-gemm/gen/4x8-psimd-loadsplat.c",
+    "src/f32-gemm/gen/4x8-psimd-splat.c",
+    "src/f32-gemm/gen/4x8s4-psimd.c",
+    "src/f32-gemm/gen/6x8-psimd-loadsplat.c",
+    "src/f32-gemm/gen/6x8-psimd-splat.c",
+    "src/f32-gemm/gen/6x8s4-psimd.c",
+    "src/f32-gemm/gen-inc/1x8-psimd-loadsplat.c",
+    "src/f32-gemm/gen-inc/1x8-psimd-splat.c",
+    "src/f32-gemm/gen-inc/1x8s4-psimd.c",
+    "src/f32-gemm/gen-inc/4x8-psimd-loadsplat.c",
+    "src/f32-gemm/gen-inc/4x8-psimd-splat.c",
+    "src/f32-gemm/gen-inc/4x8s4-psimd.c",
+    "src/f32-gemm/gen-inc/6x8-psimd-loadsplat.c",
+    "src/f32-gemm/gen-inc/6x8-psimd-splat.c",
+    "src/f32-gemm/gen-inc/6x8s4-psimd.c",
     "src/f32-hswish/psimd.c",
-    "src/f32-igemm/1x8-psimd-loadsplat.c",
-    "src/f32-igemm/1x8-psimd-splat.c",
-    "src/f32-igemm/1x8s4-psimd.c",
-    "src/f32-igemm/4x2c4-psimd.c",
-    "src/f32-igemm/4x8-psimd-loadsplat.c",
-    "src/f32-igemm/4x8-psimd-splat.c",
-    "src/f32-igemm/4x8s4-psimd.c",
-    "src/f32-igemm/6x8-psimd-loadsplat.c",
-    "src/f32-igemm/6x8-psimd-splat.c",
-    "src/f32-igemm/6x8s4-psimd.c",
+    "src/f32-igemm/gen/1x8-psimd-loadsplat.c",
+    "src/f32-igemm/gen/1x8-psimd-splat.c",
+    "src/f32-igemm/gen/1x8s4-psimd.c",
+    "src/f32-igemm/gen/4x2c4-psimd.c",
+    "src/f32-igemm/gen/4x8-psimd-loadsplat.c",
+    "src/f32-igemm/gen/4x8-psimd-splat.c",
+    "src/f32-igemm/gen/4x8s4-psimd.c",
+    "src/f32-igemm/gen/6x8-psimd-loadsplat.c",
+    "src/f32-igemm/gen/6x8-psimd-splat.c",
+    "src/f32-igemm/gen/6x8s4-psimd.c",
     "src/f32-maxpool/9p8x-psimd-c4.c",
     "src/f32-pavgpool/mp9p8q-psimd.c",
     "src/f32-pavgpool/up9-psimd.c",
-    "src/f32-ppmm/4x8-psimd.c",
-    "src/f32-prelu/psimd-2x4.c",
-    "src/f32-prelu/psimd-2x8.c",
-    "src/f32-vbinary/vadd-psimd-x4.c",
-    "src/f32-vbinary/vadd-psimd-x8.c",
-    "src/f32-vbinary/vaddc-psimd-x4.c",
-    "src/f32-vbinary/vaddc-psimd-x8.c",
-    "src/f32-vbinary/vmul-psimd-x4.c",
-    "src/f32-vbinary/vmul-psimd-x8.c",
-    "src/f32-vbinary/vmulc-psimd-x4.c",
-    "src/f32-vbinary/vmulc-psimd-x8.c",
-    "src/f32-vbinary/vrsubc-psimd-x4.c",
-    "src/f32-vbinary/vrsubc-psimd-x8.c",
-    "src/f32-vbinary/vsub-psimd-x4.c",
-    "src/f32-vbinary/vsub-psimd-x8.c",
-    "src/f32-vbinary/vsubc-psimd-x4.c",
-    "src/f32-vbinary/vsubc-psimd-x8.c",
-    "src/f32-vmulcaddc/c4-psimd-2x.c",
-    "src/f32-vmulcaddc/c8-psimd-2x.c",
+    "src/f32-ppmm/gen/4x8-psimd.c",
+    "src/f32-prelu/gen/psimd-2x4.c",
+    "src/f32-prelu/gen/psimd-2x8.c",
+    "src/f32-vbinary/gen/vadd-psimd-x4.c",
+    "src/f32-vbinary/gen/vadd-psimd-x8.c",
+    "src/f32-vbinary/gen/vaddc-psimd-x4.c",
+    "src/f32-vbinary/gen/vaddc-psimd-x8.c",
+    "src/f32-vbinary/gen/vmul-psimd-x4.c",
+    "src/f32-vbinary/gen/vmul-psimd-x8.c",
+    "src/f32-vbinary/gen/vmulc-psimd-x4.c",
+    "src/f32-vbinary/gen/vmulc-psimd-x8.c",
+    "src/f32-vbinary/gen/vrsubc-psimd-x4.c",
+    "src/f32-vbinary/gen/vrsubc-psimd-x8.c",
+    "src/f32-vbinary/gen/vsub-psimd-x4.c",
+    "src/f32-vbinary/gen/vsub-psimd-x8.c",
+    "src/f32-vbinary/gen/vsubc-psimd-x4.c",
+    "src/f32-vbinary/gen/vsubc-psimd-x8.c",
+    "src/f32-vmulcaddc/gen/c4-psimd-2x.c",
+    "src/f32-vmulcaddc/gen/c8-psimd-2x.c",
     "src/x32-packx/x4-psimd.c",
     "src/x32-pad/x2-psimd.c",
     "src/x32-unpool/psimd.c",
@@ -274,82 +274,82 @@
 NEON_UKERNELS = [
     "src/f32-avgpool/mp9p8q-neon.c",
     "src/f32-avgpool/up9-neon.c",
-    "src/f32-bilinear/neon-c4.c",
-    "src/f32-bilinear/neon-c8.c",
+    "src/f32-bilinear/gen/neon-c4.c",
+    "src/f32-bilinear/gen/neon-c8.c",
     "src/f32-clamp/neon.c",
-    "src/f32-dwconv/up4x9-neon.c",
-    "src/f32-dwconv/up4x9-neon-acc2.c",
-    "src/f32-dwconv/up8x9-neon.c",
-    "src/f32-dwconv/up8x9-neon-acc2.c",
+    "src/f32-dwconv/gen/up4x9-neon.c",
+    "src/f32-dwconv/gen/up4x9-neon-acc2.c",
+    "src/f32-dwconv/gen/up8x9-neon.c",
+    "src/f32-dwconv/gen/up8x9-neon-acc2.c",
     "src/f32-gavgpool-spchw/neon-x4.c",
     "src/f32-gavgpool/mp7p7q-neon.c",
     "src/f32-gavgpool/up7-neon.c",
-    "src/f32-gemm/1x8-neon-lane-ld64.c",
-    "src/f32-gemm/4x2-neon-lane-ld64.c",
-    "src/f32-gemm/4x8-neon-lane-ld128.c",
-    "src/f32-gemm/4x8-neon-lane-ld64.c",
-    "src/f32-gemm/5x8-neon-lane-ld64.c",
-    "src/f32-gemm/6x8-neon-lane-ld64.c",
-    "src/f32-gemm/1x8-neon-dup-ld64.c",
-    "src/f32-gemm/4x8-neon-dup-ld128.c",
-    "src/f32-gemm/4x8-neon-dup-ld64.c",
-    "src/f32-gemm/6x8-neon-dup-ld64.c",
-    "src/f32-gemm/1x8s4-neon.c",
-    "src/f32-gemm/4x8s4-neon.c",
-    "src/f32-gemm/6x8s4-neon.c",
-    "src/f32-gemm/8x8s4-neon.c",
-    "src/f32-gemminc/1x8-neon-lane-ld64.c",
-    "src/f32-gemminc/4x8-neon-lane-ld128.c",
-    "src/f32-gemminc/4x8-neon-lane-ld64.c",
-    "src/f32-gemminc/5x8-neon-lane-ld64.c",
-    "src/f32-gemminc/6x8-neon-lane-ld64.c",
-    "src/f32-gemminc/1x8-neon-dup-ld64.c",
-    "src/f32-gemminc/4x8-neon-dup-ld128.c",
-    "src/f32-gemminc/4x8-neon-dup-ld64.c",
-    "src/f32-gemminc/6x8-neon-dup-ld64.c",
-    "src/f32-gemminc/1x8s4-neon.c",
-    "src/f32-gemminc/4x8s4-neon.c",
-    "src/f32-gemminc/6x8s4-neon.c",
-    "src/f32-gemminc/8x8s4-neon.c",
+    "src/f32-gemm/gen/1x8-neon-lane-ld64.c",
+    "src/f32-gemm/gen/4x2-neon-lane-ld64.c",
+    "src/f32-gemm/gen/4x8-neon-lane-ld128.c",
+    "src/f32-gemm/gen/4x8-neon-lane-ld64.c",
+    "src/f32-gemm/gen/5x8-neon-lane-ld64.c",
+    "src/f32-gemm/gen/6x8-neon-lane-ld64.c",
+    "src/f32-gemm/gen/1x8-neon-dup-ld64.c",
+    "src/f32-gemm/gen/4x8-neon-dup-ld128.c",
+    "src/f32-gemm/gen/4x8-neon-dup-ld64.c",
+    "src/f32-gemm/gen/6x8-neon-dup-ld64.c",
+    "src/f32-gemm/gen/1x8s4-neon.c",
+    "src/f32-gemm/gen/4x8s4-neon.c",
+    "src/f32-gemm/gen/6x8s4-neon.c",
+    "src/f32-gemm/gen/8x8s4-neon.c",
+    "src/f32-gemm/gen-inc/1x8-neon-lane-ld64.c",
+    "src/f32-gemm/gen-inc/4x8-neon-lane-ld128.c",
+    "src/f32-gemm/gen-inc/4x8-neon-lane-ld64.c",
+    "src/f32-gemm/gen-inc/5x8-neon-lane-ld64.c",
+    "src/f32-gemm/gen-inc/6x8-neon-lane-ld64.c",
+    "src/f32-gemm/gen-inc/1x8-neon-dup-ld64.c",
+    "src/f32-gemm/gen-inc/4x8-neon-dup-ld128.c",
+    "src/f32-gemm/gen-inc/4x8-neon-dup-ld64.c",
+    "src/f32-gemm/gen-inc/6x8-neon-dup-ld64.c",
+    "src/f32-gemm/gen-inc/1x8s4-neon.c",
+    "src/f32-gemm/gen-inc/4x8s4-neon.c",
+    "src/f32-gemm/gen-inc/6x8s4-neon.c",
+    "src/f32-gemm/gen-inc/8x8s4-neon.c",
     "src/f32-hswish/neon.c",
-    "src/f32-igemm/1x8-neon-lane-ld64.c",
-    "src/f32-igemm/4x2-neon-lane-ld64.c",
-    "src/f32-igemm/4x4-neon-lane-ld64.c",
-    "src/f32-igemm/4x8-neon-lane-ld128.c",
-    "src/f32-igemm/4x8-neon-lane-ld64.c",
-    "src/f32-igemm/6x8-neon-lane-ld64.c",
-    "src/f32-igemm/1x8-neon-dup-ld64.c",
-    "src/f32-igemm/4x8-neon-dup-ld128.c",
-    "src/f32-igemm/4x8-neon-dup-ld64.c",
-    "src/f32-igemm/6x8-neon-dup-ld64.c",
-    "src/f32-igemm/1x8s4-neon.c",
-    "src/f32-igemm/4x8s4-neon.c",
-    "src/f32-igemm/6x8s4-neon.c",
-    "src/f32-igemm/8x8s4-neon.c",
+    "src/f32-igemm/gen/1x8-neon-lane-ld64.c",
+    "src/f32-igemm/gen/4x2-neon-lane-ld64.c",
+    "src/f32-igemm/gen/4x4-neon-lane-ld64.c",
+    "src/f32-igemm/gen/4x8-neon-lane-ld128.c",
+    "src/f32-igemm/gen/4x8-neon-lane-ld64.c",
+    "src/f32-igemm/gen/6x8-neon-lane-ld64.c",
+    "src/f32-igemm/gen/1x8-neon-dup-ld64.c",
+    "src/f32-igemm/gen/4x8-neon-dup-ld128.c",
+    "src/f32-igemm/gen/4x8-neon-dup-ld64.c",
+    "src/f32-igemm/gen/6x8-neon-dup-ld64.c",
+    "src/f32-igemm/gen/1x8s4-neon.c",
+    "src/f32-igemm/gen/4x8s4-neon.c",
+    "src/f32-igemm/gen/6x8s4-neon.c",
+    "src/f32-igemm/gen/8x8s4-neon.c",
     "src/f32-pavgpool/mp9p8q-neon.c",
     "src/f32-pavgpool/up9-neon.c",
-    "src/f32-ppmm/4x8-neon.c",
-    "src/f32-ppmm/8x8-neon.c",
-    "src/f32-prelu/neon-2x4.c",
-    "src/f32-prelu/neon-2x8.c",
+    "src/f32-ppmm/gen/4x8-neon.c",
+    "src/f32-ppmm/gen/8x8-neon.c",
+    "src/f32-prelu/gen/neon-2x4.c",
+    "src/f32-prelu/gen/neon-2x8.c",
     "src/f32-rmax/neon.c",
-    "src/f32-sigmoid/neon-frac-p9-p10-nr1recps-x16.c",
-    "src/f32-vbinary/vadd-neon-x4.c",
-    "src/f32-vbinary/vadd-neon-x8.c",
-    "src/f32-vbinary/vaddc-neon-x4.c",
-    "src/f32-vbinary/vaddc-neon-x8.c",
-    "src/f32-vbinary/vmul-neon-x4.c",
-    "src/f32-vbinary/vmul-neon-x8.c",
-    "src/f32-vbinary/vmulc-neon-x4.c",
-    "src/f32-vbinary/vmulc-neon-x8.c",
-    "src/f32-vbinary/vrsubc-neon-x4.c",
-    "src/f32-vbinary/vrsubc-neon-x8.c",
-    "src/f32-vbinary/vsub-neon-x4.c",
-    "src/f32-vbinary/vsub-neon-x8.c",
-    "src/f32-vbinary/vsubc-neon-x4.c",
-    "src/f32-vbinary/vsubc-neon-x8.c",
-    "src/f32-vmulcaddc/c4-neon-2x.c",
-    "src/f32-vmulcaddc/c8-neon-2x.c",
+    "src/f32-sigmoid/gen/neon-frac-p9-p10-nr1recps-x16.c",
+    "src/f32-vbinary/gen/vadd-neon-x4.c",
+    "src/f32-vbinary/gen/vadd-neon-x8.c",
+    "src/f32-vbinary/gen/vaddc-neon-x4.c",
+    "src/f32-vbinary/gen/vaddc-neon-x8.c",
+    "src/f32-vbinary/gen/vmul-neon-x4.c",
+    "src/f32-vbinary/gen/vmul-neon-x8.c",
+    "src/f32-vbinary/gen/vmulc-neon-x4.c",
+    "src/f32-vbinary/gen/vmulc-neon-x8.c",
+    "src/f32-vbinary/gen/vrsubc-neon-x4.c",
+    "src/f32-vbinary/gen/vrsubc-neon-x8.c",
+    "src/f32-vbinary/gen/vsub-neon-x4.c",
+    "src/f32-vbinary/gen/vsub-neon-x8.c",
+    "src/f32-vbinary/gen/vsubc-neon-x4.c",
+    "src/f32-vbinary/gen/vsubc-neon-x8.c",
+    "src/f32-vmulcaddc/gen/c4-neon-2x.c",
+    "src/f32-vmulcaddc/gen/c8-neon-2x.c",
     "src/q8-avgpool/mp9p8q-neon.c",
     "src/q8-avgpool/up9-neon.c",
     "src/q8-dwconv/up8x9-neon.c",
@@ -376,42 +376,42 @@
 ]
 
 NEONFMA_UKERNELS = [
-    "src/f32-bilinear/neonfma-c4.c",
-    "src/f32-bilinear/neonfma-c8.c",
-    "src/f32-igemm/1x8-neonfma-dup-ld64.c",
-    "src/f32-igemm/4x8-neonfma-dup-ld128.c",
-    "src/f32-igemm/4x8-neonfma-dup-ld64.c",
-    "src/f32-igemm/6x8-neonfma-dup-ld64.c",
-    "src/f32-igemm/1x8s4-neonfma.c",
-    "src/f32-igemm/4x8s4-neonfma.c",
-    "src/f32-igemm/6x8s4-neonfma.c",
-    "src/f32-igemm/8x8s4-neonfma.c",
-    "src/f32-dwconv/up4x9-neonfma.c",
-    "src/f32-dwconv/up4x9-neonfma-acc2.c",
-    "src/f32-dwconv/up8x9-neonfma.c",
-    "src/f32-dwconv/up8x9-neonfma-acc2.c",
-    "src/f32-gemm/1x8-neonfma-dup-ld64.c",
-    "src/f32-gemm/4x8-neonfma-dup-ld128.c",
-    "src/f32-gemm/4x8-neonfma-dup-ld64.c",
-    "src/f32-gemm/6x8-neonfma-dup-ld64.c",
-    "src/f32-gemm/1x8s4-neonfma.c",
-    "src/f32-gemm/4x8s4-neonfma.c",
-    "src/f32-gemm/6x8s4-neonfma.c",
-    "src/f32-gemm/8x8s4-neonfma.c",
-    "src/f32-gemminc/1x8-neonfma-dup-ld64.c",
-    "src/f32-gemminc/4x8-neonfma-dup-ld128.c",
-    "src/f32-gemminc/4x8-neonfma-dup-ld64.c",
-    "src/f32-gemminc/6x8-neonfma-dup-ld64.c",
-    "src/f32-gemminc/1x8s4-neonfma.c",
-    "src/f32-gemminc/4x8s4-neonfma.c",
-    "src/f32-gemminc/6x8s4-neonfma.c",
-    "src/f32-gemminc/8x8s4-neonfma.c",
+    "src/f32-bilinear/gen/neonfma-c4.c",
+    "src/f32-bilinear/gen/neonfma-c8.c",
+    "src/f32-igemm/gen/1x8-neonfma-dup-ld64.c",
+    "src/f32-igemm/gen/4x8-neonfma-dup-ld128.c",
+    "src/f32-igemm/gen/4x8-neonfma-dup-ld64.c",
+    "src/f32-igemm/gen/6x8-neonfma-dup-ld64.c",
+    "src/f32-igemm/gen/1x8s4-neonfma.c",
+    "src/f32-igemm/gen/4x8s4-neonfma.c",
+    "src/f32-igemm/gen/6x8s4-neonfma.c",
+    "src/f32-igemm/gen/8x8s4-neonfma.c",
+    "src/f32-dwconv/gen/up4x9-neonfma.c",
+    "src/f32-dwconv/gen/up4x9-neonfma-acc2.c",
+    "src/f32-dwconv/gen/up8x9-neonfma.c",
+    "src/f32-dwconv/gen/up8x9-neonfma-acc2.c",
+    "src/f32-gemm/gen/1x8-neonfma-dup-ld64.c",
+    "src/f32-gemm/gen/4x8-neonfma-dup-ld128.c",
+    "src/f32-gemm/gen/4x8-neonfma-dup-ld64.c",
+    "src/f32-gemm/gen/6x8-neonfma-dup-ld64.c",
+    "src/f32-gemm/gen/1x8s4-neonfma.c",
+    "src/f32-gemm/gen/4x8s4-neonfma.c",
+    "src/f32-gemm/gen/6x8s4-neonfma.c",
+    "src/f32-gemm/gen/8x8s4-neonfma.c",
+    "src/f32-gemm/gen-inc/1x8-neonfma-dup-ld64.c",
+    "src/f32-gemm/gen-inc/4x8-neonfma-dup-ld128.c",
+    "src/f32-gemm/gen-inc/4x8-neonfma-dup-ld64.c",
+    "src/f32-gemm/gen-inc/6x8-neonfma-dup-ld64.c",
+    "src/f32-gemm/gen-inc/1x8s4-neonfma.c",
+    "src/f32-gemm/gen-inc/4x8s4-neonfma.c",
+    "src/f32-gemm/gen-inc/6x8s4-neonfma.c",
+    "src/f32-gemm/gen-inc/8x8s4-neonfma.c",
     "src/f32-hswish/neonfma.c",
-    "src/f32-ppmm/4x8-neonfma.c",
-    "src/f32-ppmm/8x8-neonfma.c",
-    "src/f32-sigmoid/neonfma-p5-nr2fma-x16.c",
-    "src/f32-vmulcaddc/c4-neonfma-2x.c",
-    "src/f32-vmulcaddc/c8-neonfma-2x.c",
+    "src/f32-ppmm/gen/4x8-neonfma.c",
+    "src/f32-ppmm/gen/8x8-neonfma.c",
+    "src/f32-sigmoid/gen/neonfma-p5-nr2fma-x16.c",
+    "src/f32-vmulcaddc/gen/c4-neonfma-2x.c",
+    "src/f32-vmulcaddc/gen/c8-neonfma-2x.c",
     "src/math/exp-neonfma-lut64-p2.c",
     "src/math/exp-neonfma-p5.c",
     "src/math/expminus-neonfma-p5.c",
@@ -421,23 +421,23 @@
 ]
 
 AARCH64_NEONFMA_UKERNELS = [
-    "src/f32-gemm/1x8-neonfma-lane-ld64.c",
-    "src/f32-gemm/4x2-neonfma-lane-ld64.c",
-    "src/f32-gemm/4x8-neonfma-lane-ld128.c",
-    "src/f32-gemm/4x8-neonfma-lane-ld64.c",
-    "src/f32-gemm/5x8-neonfma-lane-ld64.c",
-    "src/f32-gemm/6x8-neonfma-lane-ld64.c",
-    "src/f32-gemminc/1x8-neonfma-lane-ld64.c",
-    "src/f32-gemminc/4x8-neonfma-lane-ld128.c",
-    "src/f32-gemminc/4x8-neonfma-lane-ld64.c",
-    "src/f32-gemminc/5x8-neonfma-lane-ld64.c",
-    "src/f32-gemminc/6x8-neonfma-lane-ld64.c",
-    "src/f32-igemm/1x8-neonfma-lane-ld64.c",
-    "src/f32-igemm/4x2-neonfma-lane-ld64.c",
-    "src/f32-igemm/4x4-neonfma-lane-ld64.c",
-    "src/f32-igemm/4x8-neonfma-lane-ld128.c",
-    "src/f32-igemm/4x8-neonfma-lane-ld64.c",
-    "src/f32-igemm/6x8-neonfma-lane-ld64.c",
+    "src/f32-gemm/gen/1x8-neonfma-lane-ld64.c",
+    "src/f32-gemm/gen/4x2-neonfma-lane-ld64.c",
+    "src/f32-gemm/gen/4x8-neonfma-lane-ld128.c",
+    "src/f32-gemm/gen/4x8-neonfma-lane-ld64.c",
+    "src/f32-gemm/gen/5x8-neonfma-lane-ld64.c",
+    "src/f32-gemm/gen/6x8-neonfma-lane-ld64.c",
+    "src/f32-gemm/gen-inc/1x8-neonfma-lane-ld64.c",
+    "src/f32-gemm/gen-inc/4x8-neonfma-lane-ld128.c",
+    "src/f32-gemm/gen-inc/4x8-neonfma-lane-ld64.c",
+    "src/f32-gemm/gen-inc/5x8-neonfma-lane-ld64.c",
+    "src/f32-gemm/gen-inc/6x8-neonfma-lane-ld64.c",
+    "src/f32-igemm/gen/1x8-neonfma-lane-ld64.c",
+    "src/f32-igemm/gen/4x2-neonfma-lane-ld64.c",
+    "src/f32-igemm/gen/4x4-neonfma-lane-ld64.c",
+    "src/f32-igemm/gen/4x8-neonfma-lane-ld128.c",
+    "src/f32-igemm/gen/4x8-neonfma-lane-ld64.c",
+    "src/f32-igemm/gen/6x8-neonfma-lane-ld64.c",
     "src/f32-conv-hwc/3x3s2p1c3x4-neonfma-2x2.c",
     "src/f32-conv-hwc/3x3s2p1c3x8-neonfma-2x2.c",
     "src/f32-conv-hwc2spchw/3x3s2p1c3x4-neonfma-2x2.c",
@@ -445,99 +445,99 @@
     "src/f32-dwconv-spchw/5x5p2-neonfma.c",
     "src/f32-dwconv-spchw/3x3s2p1-neonfma.c",
     "src/f32-dwconv-spchw/5x5s2p2-neonfma.c",
-    "src/f32-spmm/12x1-neonfma.c",
-    "src/f32-spmm/12x2-neonfma.c",
-    "src/f32-spmm/12x4-neonfma.c",
-    "src/f32-spmm/16x1-neonfma-pipelined.c",
-    "src/f32-spmm/16x1-neonfma-unroll2.c",
-    "src/f32-spmm/16x1-neonfma.c",
-    "src/f32-spmm/16x2-neonfma.c",
-    "src/f32-spmm/16x4-neonfma.c",
-    "src/f32-spmm/4x1-neonfma-pipelined.c",
-    "src/f32-spmm/4x1-neonfma-unroll2.c",
-    "src/f32-spmm/4x1-neonfma.c",
-    "src/f32-spmm/4x2-neonfma.c",
-    "src/f32-spmm/4x4-neonfma.c",
-    "src/f32-spmm/8x1-neonfma-pipelined.c",
-    "src/f32-spmm/8x1-neonfma-unroll2.c",
-    "src/f32-spmm/8x1-neonfma.c",
-    "src/f32-spmm/8x2-neonfma.c",
-    "src/f32-spmm/8x4-neonfma.c",
+    "src/f32-spmm/gen/12x1-neonfma.c",
+    "src/f32-spmm/gen/12x2-neonfma.c",
+    "src/f32-spmm/gen/12x4-neonfma.c",
+    "src/f32-spmm/gen/16x1-neonfma-pipelined.c",
+    "src/f32-spmm/gen/16x1-neonfma-unroll2.c",
+    "src/f32-spmm/gen/16x1-neonfma.c",
+    "src/f32-spmm/gen/16x2-neonfma.c",
+    "src/f32-spmm/gen/16x4-neonfma.c",
+    "src/f32-spmm/gen/4x1-neonfma-pipelined.c",
+    "src/f32-spmm/gen/4x1-neonfma-unroll2.c",
+    "src/f32-spmm/gen/4x1-neonfma.c",
+    "src/f32-spmm/gen/4x2-neonfma.c",
+    "src/f32-spmm/gen/4x4-neonfma.c",
+    "src/f32-spmm/gen/8x1-neonfma-pipelined.c",
+    "src/f32-spmm/gen/8x1-neonfma-unroll2.c",
+    "src/f32-spmm/gen/8x1-neonfma.c",
+    "src/f32-spmm/gen/8x2-neonfma.c",
+    "src/f32-spmm/gen/8x4-neonfma.c",
     "src/math/sigmoid-neonfma-p5-div.c",
 ]
 
 AARCH64_NEONFP16ARITH_UKERNELS = [
-    "src/f16-gemm/4x8-neonfp16arith-ld64.c",
-    "src/f16-gemm/6x8-neonfp16arith-ld64.c",
-    "src/f16-gemm/8x8-neonfp16arith-ld64.c",
+    "src/f16-gemm/gen/4x8-neonfp16arith-ld64.c",
+    "src/f16-gemm/gen/6x8-neonfp16arith-ld64.c",
+    "src/f16-gemm/gen/8x8-neonfp16arith-ld64.c",
 ]
 
 SSE_UKERNELS = [
     "src/f32-avgpool/mp9p8q-sse.c",
     "src/f32-avgpool/up9-sse.c",
-    "src/f32-bilinear/sse-c4.c",
-    "src/f32-bilinear/sse-c8.c",
+    "src/f32-bilinear/gen/sse-c4.c",
+    "src/f32-bilinear/gen/sse-c8.c",
     "src/f32-clamp/sse.c",
     "src/f32-dwconv-spchw/3x3p1-sse.c",
     "src/f32-dwconv-spchw/3x3s2p1-sse.c",
-    "src/f32-dwconv/up4x25-sse-acc2.c",
-    "src/f32-dwconv/up4x25-sse.c",
-    "src/f32-dwconv/up4x4-sse-acc2.c",
-    "src/f32-dwconv/up4x4-sse.c",
-    "src/f32-dwconv/up4x9-sse-acc2.c",
-    "src/f32-dwconv/up4x9-sse.c",
-    "src/f32-dwconv/up8x25-sse-acc2.c",
-    "src/f32-dwconv/up8x25-sse.c",
-    "src/f32-dwconv/up8x4-sse-acc2.c",
-    "src/f32-dwconv/up8x4-sse.c",
-    "src/f32-dwconv/up8x9-sse-acc2.c",
-    "src/f32-dwconv/up8x9-sse.c",
+    "src/f32-dwconv/gen/up4x25-sse-acc2.c",
+    "src/f32-dwconv/gen/up4x25-sse.c",
+    "src/f32-dwconv/gen/up4x4-sse-acc2.c",
+    "src/f32-dwconv/gen/up4x4-sse.c",
+    "src/f32-dwconv/gen/up4x9-sse-acc2.c",
+    "src/f32-dwconv/gen/up4x9-sse.c",
+    "src/f32-dwconv/gen/up8x25-sse-acc2.c",
+    "src/f32-dwconv/gen/up8x25-sse.c",
+    "src/f32-dwconv/gen/up8x4-sse-acc2.c",
+    "src/f32-dwconv/gen/up8x4-sse.c",
+    "src/f32-dwconv/gen/up8x9-sse-acc2.c",
+    "src/f32-dwconv/gen/up8x9-sse.c",
     "src/f32-gavgpool-spchw/sse-x4.c",
     "src/f32-gavgpool/mp7p7q-sse.c",
     "src/f32-gavgpool/up7-sse.c",
-    "src/f32-gemm/1x8-sse-dup.c",
-    "src/f32-gemm/1x8-sse-load1.c",
-    "src/f32-gemm/1x8s4-sse.c",
-    "src/f32-gemm/4x8-sse-dup.c",
-    "src/f32-gemm/4x8-sse-load1.c",
-    "src/f32-gemm/4x8s4-sse.c",
-    "src/f32-gemminc/1x8-sse-dup.c",
-    "src/f32-gemminc/1x8-sse-load1.c",
-    "src/f32-gemminc/1x8s4-sse.c",
-    "src/f32-gemminc/4x8-sse-dup.c",
-    "src/f32-gemminc/4x8-sse-load1.c",
-    "src/f32-gemminc/4x8s4-sse.c",
+    "src/f32-gemm/gen/1x8-sse-dup.c",
+    "src/f32-gemm/gen/1x8-sse-load1.c",
+    "src/f32-gemm/gen/1x8s4-sse.c",
+    "src/f32-gemm/gen/4x8-sse-dup.c",
+    "src/f32-gemm/gen/4x8-sse-load1.c",
+    "src/f32-gemm/gen/4x8s4-sse.c",
+    "src/f32-gemm/gen-inc/1x8-sse-dup.c",
+    "src/f32-gemm/gen-inc/1x8-sse-load1.c",
+    "src/f32-gemm/gen-inc/1x8s4-sse.c",
+    "src/f32-gemm/gen-inc/4x8-sse-dup.c",
+    "src/f32-gemm/gen-inc/4x8-sse-load1.c",
+    "src/f32-gemm/gen-inc/4x8s4-sse.c",
     "src/f32-hswish/sse.c",
-    "src/f32-igemm/1x8-sse-dup.c",
-    "src/f32-igemm/1x8-sse-load1.c",
-    "src/f32-igemm/1x8s4-sse.c",
-    "src/f32-igemm/4x2c4-sse.c",
-    "src/f32-igemm/4x8-sse-dup.c",
-    "src/f32-igemm/4x8-sse-load1.c",
-    "src/f32-igemm/4x8s4-sse.c",
+    "src/f32-igemm/gen/1x8-sse-dup.c",
+    "src/f32-igemm/gen/1x8-sse-load1.c",
+    "src/f32-igemm/gen/1x8s4-sse.c",
+    "src/f32-igemm/gen/4x2c4-sse.c",
+    "src/f32-igemm/gen/4x8-sse-dup.c",
+    "src/f32-igemm/gen/4x8-sse-load1.c",
+    "src/f32-igemm/gen/4x8s4-sse.c",
     "src/f32-maxpool/9p8x-sse-c4.c",
     "src/f32-pavgpool/mp9p8q-sse.c",
     "src/f32-pavgpool/up9-sse.c",
-    "src/f32-ppmm/4x8-sse.c",
+    "src/f32-ppmm/gen/4x8-sse.c",
     "src/f32-rmax/sse.c",
-    "src/f32-spmm/4x1-sse.c",
-    "src/f32-spmm/8x1-sse.c",
-    "src/f32-vbinary/vadd-sse-x4.c",
-    "src/f32-vbinary/vadd-sse-x8.c",
-    "src/f32-vbinary/vaddc-sse-x4.c",
-    "src/f32-vbinary/vaddc-sse-x8.c",
-    "src/f32-vbinary/vmul-sse-x4.c",
-    "src/f32-vbinary/vmul-sse-x8.c",
-    "src/f32-vbinary/vmulc-sse-x4.c",
-    "src/f32-vbinary/vmulc-sse-x8.c",
-    "src/f32-vbinary/vrsubc-sse-x4.c",
-    "src/f32-vbinary/vrsubc-sse-x8.c",
-    "src/f32-vbinary/vsub-sse-x4.c",
-    "src/f32-vbinary/vsub-sse-x8.c",
-    "src/f32-vbinary/vsubc-sse-x4.c",
-    "src/f32-vbinary/vsubc-sse-x8.c",
-    "src/f32-vmulcaddc/c4-sse-2x.c",
-    "src/f32-vmulcaddc/c8-sse-2x.c",
+    "src/f32-spmm/gen/4x1-sse.c",
+    "src/f32-spmm/gen/8x1-sse.c",
+    "src/f32-vbinary/gen/vadd-sse-x4.c",
+    "src/f32-vbinary/gen/vadd-sse-x8.c",
+    "src/f32-vbinary/gen/vaddc-sse-x4.c",
+    "src/f32-vbinary/gen/vaddc-sse-x8.c",
+    "src/f32-vbinary/gen/vmul-sse-x4.c",
+    "src/f32-vbinary/gen/vmul-sse-x8.c",
+    "src/f32-vbinary/gen/vmulc-sse-x4.c",
+    "src/f32-vbinary/gen/vmulc-sse-x8.c",
+    "src/f32-vbinary/gen/vrsubc-sse-x4.c",
+    "src/f32-vbinary/gen/vrsubc-sse-x8.c",
+    "src/f32-vbinary/gen/vsub-sse-x4.c",
+    "src/f32-vbinary/gen/vsub-sse-x8.c",
+    "src/f32-vbinary/gen/vsubc-sse-x4.c",
+    "src/f32-vbinary/gen/vsubc-sse-x8.c",
+    "src/f32-vmulcaddc/gen/c4-sse-2x.c",
+    "src/f32-vmulcaddc/gen/c8-sse-2x.c",
     "src/x32-packx/x4-sse.c",
 ]
 
@@ -545,10 +545,10 @@
     "src/f32-argmaxpool/9p8x-sse2-c4.c",
     "src/f32-argmaxpool/4x-sse2-c4.c",
     "src/f32-argmaxpool/9x-sse2-c4.c",
-    "src/f32-prelu/sse2-2x4.c",
-    "src/f32-prelu/sse2-2x8.c",
-    "src/f32-sigmoid/sse2-p5-div-x8.c",
-    "src/f32-sigmoid/sse2-p5-div-x16.c",
+    "src/f32-prelu/gen/sse2-2x4.c",
+    "src/f32-prelu/gen/sse2-2x8.c",
+    "src/f32-sigmoid/gen/sse2-p5-div-x8.c",
+    "src/f32-sigmoid/gen/sse2-p5-div-x16.c",
     "src/q8-avgpool/mp9p8q-sse2.c",
     "src/q8-avgpool/up9-sse2.c",
     "src/q8-igemm/4x4c2-sse2.c",
@@ -576,75 +576,75 @@
 ]
 
 SSE41_UKERNELS = [
-    "src/f32-prelu/sse41-2x4.c",
-    "src/f32-prelu/sse41-2x8.c",
-    "src/f32-sigmoid/sse41-p5-div-x8.c",
-    "src/f32-sigmoid/sse41-p5-div-x16.c",
+    "src/f32-prelu/gen/sse41-2x4.c",
+    "src/f32-prelu/gen/sse41-2x8.c",
+    "src/f32-sigmoid/gen/sse41-p5-div-x8.c",
+    "src/f32-sigmoid/gen/sse41-p5-div-x16.c",
 ]
 
 AVX_UKERNELS = [
-    "src/f32-dwconv/up16x4-avx-acc2.c",
-    "src/f32-dwconv/up16x4-avx.c",
-    "src/f32-dwconv/up8x4-avx-acc2.c",
-    "src/f32-dwconv/up8x4-avx.c",
-    "src/f32-dwconv/up16x9-avx-acc2.c",
-    "src/f32-dwconv/up16x9-avx.c",
-    "src/f32-dwconv/up8x9-avx-acc2.c",
-    "src/f32-dwconv/up8x9-avx.c",
-    "src/f32-dwconv/up16x25-avx-acc2.c",
-    "src/f32-dwconv/up16x25-avx.c",
-    "src/f32-dwconv/up8x25-avx-acc2.c",
-    "src/f32-dwconv/up8x25-avx.c",
-    "src/f32-gemm/1x8-avx-broadcast.c",
-    "src/f32-gemm/4x8-avx-broadcast.c",
-    "src/f32-gemm/5x8-avx-broadcast.c",
-    "src/f32-gemm/6x8-avx-broadcast.c",
-    "src/f32-gemm/7x8-avx-broadcast.c",
-    "src/f32-gemminc/1x8-avx-broadcast.c",
-    "src/f32-gemminc/4x8-avx-broadcast.c",
-    "src/f32-gemminc/5x8-avx-broadcast.c",
-    "src/f32-gemminc/6x8-avx-broadcast.c",
-    "src/f32-gemminc/7x8-avx-broadcast.c",
-    "src/f32-igemm/1x8-avx-broadcast.c",
-    "src/f32-igemm/4x8-avx-broadcast.c",
-    "src/f32-igemm/5x8-avx-broadcast.c",
-    "src/f32-igemm/6x8-avx-broadcast.c",
-    "src/f32-igemm/7x8-avx-broadcast.c",
+    "src/f32-dwconv/gen/up16x4-avx-acc2.c",
+    "src/f32-dwconv/gen/up16x4-avx.c",
+    "src/f32-dwconv/gen/up8x4-avx-acc2.c",
+    "src/f32-dwconv/gen/up8x4-avx.c",
+    "src/f32-dwconv/gen/up16x9-avx-acc2.c",
+    "src/f32-dwconv/gen/up16x9-avx.c",
+    "src/f32-dwconv/gen/up8x9-avx-acc2.c",
+    "src/f32-dwconv/gen/up8x9-avx.c",
+    "src/f32-dwconv/gen/up16x25-avx-acc2.c",
+    "src/f32-dwconv/gen/up16x25-avx.c",
+    "src/f32-dwconv/gen/up8x25-avx-acc2.c",
+    "src/f32-dwconv/gen/up8x25-avx.c",
+    "src/f32-gemm/gen/1x8-avx-broadcast.c",
+    "src/f32-gemm/gen/4x8-avx-broadcast.c",
+    "src/f32-gemm/gen/5x8-avx-broadcast.c",
+    "src/f32-gemm/gen/6x8-avx-broadcast.c",
+    "src/f32-gemm/gen/7x8-avx-broadcast.c",
+    "src/f32-gemm/gen-inc/1x8-avx-broadcast.c",
+    "src/f32-gemm/gen-inc/4x8-avx-broadcast.c",
+    "src/f32-gemm/gen-inc/5x8-avx-broadcast.c",
+    "src/f32-gemm/gen-inc/6x8-avx-broadcast.c",
+    "src/f32-gemm/gen-inc/7x8-avx-broadcast.c",
+    "src/f32-igemm/gen/1x8-avx-broadcast.c",
+    "src/f32-igemm/gen/4x8-avx-broadcast.c",
+    "src/f32-igemm/gen/5x8-avx-broadcast.c",
+    "src/f32-igemm/gen/6x8-avx-broadcast.c",
+    "src/f32-igemm/gen/7x8-avx-broadcast.c",
     "src/f32-rmax/avx.c",
     "src/f32-vscale/avx-unroll32.c",
 ]
 
 FMA3_UKERNELS = [
-    "src/f32-dwconv/up16x4-fma3-acc2.c",
-    "src/f32-dwconv/up16x4-fma3.c",
-    "src/f32-dwconv/up8x4-fma3-acc2.c",
-    "src/f32-dwconv/up8x4-fma3.c",
-    "src/f32-dwconv/up16x9-fma3-acc2.c",
-    "src/f32-dwconv/up16x9-fma3.c",
-    "src/f32-dwconv/up8x9-fma3-acc2.c",
-    "src/f32-dwconv/up8x9-fma3.c",
-    "src/f32-dwconv/up16x25-fma3-acc2.c",
-    "src/f32-dwconv/up16x25-fma3.c",
-    "src/f32-dwconv/up8x25-fma3-acc2.c",
-    "src/f32-dwconv/up8x25-fma3.c",
-    "src/f32-gemm/1x8-fma3-broadcast.c",
-    "src/f32-gemm/4x8-fma3-broadcast.c",
-    "src/f32-gemm/5x8-fma3-broadcast.c",
-    "src/f32-gemm/6x8-fma3-broadcast.c",
-    "src/f32-gemm/7x8-fma3-broadcast.c",
-    "src/f32-gemm/8x8-fma3-broadcast.c",
-    "src/f32-gemminc/1x8-fma3-broadcast.c",
-    "src/f32-gemminc/4x8-fma3-broadcast.c",
-    "src/f32-gemminc/5x8-fma3-broadcast.c",
-    "src/f32-gemminc/6x8-fma3-broadcast.c",
-    "src/f32-gemminc/7x8-fma3-broadcast.c",
-    "src/f32-gemminc/8x8-fma3-broadcast.c",
-    "src/f32-igemm/1x8-fma3-broadcast.c",
-    "src/f32-igemm/4x8-fma3-broadcast.c",
-    "src/f32-igemm/5x8-fma3-broadcast.c",
-    "src/f32-igemm/6x8-fma3-broadcast.c",
-    "src/f32-igemm/7x8-fma3-broadcast.c",
-    "src/f32-igemm/8x8-fma3-broadcast.c",
+    "src/f32-dwconv/gen/up16x4-fma3-acc2.c",
+    "src/f32-dwconv/gen/up16x4-fma3.c",
+    "src/f32-dwconv/gen/up8x4-fma3-acc2.c",
+    "src/f32-dwconv/gen/up8x4-fma3.c",
+    "src/f32-dwconv/gen/up16x9-fma3-acc2.c",
+    "src/f32-dwconv/gen/up16x9-fma3.c",
+    "src/f32-dwconv/gen/up8x9-fma3-acc2.c",
+    "src/f32-dwconv/gen/up8x9-fma3.c",
+    "src/f32-dwconv/gen/up16x25-fma3-acc2.c",
+    "src/f32-dwconv/gen/up16x25-fma3.c",
+    "src/f32-dwconv/gen/up8x25-fma3-acc2.c",
+    "src/f32-dwconv/gen/up8x25-fma3.c",
+    "src/f32-gemm/gen/1x8-fma3-broadcast.c",
+    "src/f32-gemm/gen/4x8-fma3-broadcast.c",
+    "src/f32-gemm/gen/5x8-fma3-broadcast.c",
+    "src/f32-gemm/gen/6x8-fma3-broadcast.c",
+    "src/f32-gemm/gen/7x8-fma3-broadcast.c",
+    "src/f32-gemm/gen/8x8-fma3-broadcast.c",
+    "src/f32-gemm/gen-inc/1x8-fma3-broadcast.c",
+    "src/f32-gemm/gen-inc/4x8-fma3-broadcast.c",
+    "src/f32-gemm/gen-inc/5x8-fma3-broadcast.c",
+    "src/f32-gemm/gen-inc/6x8-fma3-broadcast.c",
+    "src/f32-gemm/gen-inc/7x8-fma3-broadcast.c",
+    "src/f32-gemm/gen-inc/8x8-fma3-broadcast.c",
+    "src/f32-igemm/gen/1x8-fma3-broadcast.c",
+    "src/f32-igemm/gen/4x8-fma3-broadcast.c",
+    "src/f32-igemm/gen/5x8-fma3-broadcast.c",
+    "src/f32-igemm/gen/6x8-fma3-broadcast.c",
+    "src/f32-igemm/gen/7x8-fma3-broadcast.c",
+    "src/f32-igemm/gen/8x8-fma3-broadcast.c",
 ]
 
 AVX2_UKERNELS = [
@@ -682,40 +682,40 @@
 AARCH64_ASM_UKERNELS = [
     "src/f32-dwconv/up4x9-aarch64-neonfma-cortex-a55.S",
     "src/f32-dwconv/up4x9-aarch64-neonfma.S",
-    "src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S",
-    "src/f32-gemm/1x8-aarch64-neonfma-cortex-a53.S",
-    "src/f32-gemm/1x8-aarch64-neonfma-cortex-a57.S",
-    "src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S",
-    "src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S",
-    "src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S",
-    "src/f32-gemm/4x8-aarch64-neonfma-cortex-a57.S",
-    "src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S",
-    "src/f32-gemm/4x8-aarch64-neonfma-ld128.S",
-    "src/f32-gemm/4x8-aarch64-neonfma-ld64.S",
-    "src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S",
-    "src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S",
-    "src/f32-gemm/6x8-aarch64-neonfma-cortex-a57.S",
-    "src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S",
-    "src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S",
-    "src/f32-gemm/6x8-aarch64-neonfma-ld128.S",
-    "src/f32-gemm/6x8-aarch64-neonfma-ld64.S",
-    "src/f32-gemminc/1x12-aarch64-neonfma-cortex-a53.S",
-    "src/f32-gemminc/1x8-aarch64-neonfma-cortex-a53.S",
-    "src/f32-gemminc/1x8-aarch64-neonfma-cortex-a57.S",
-    "src/f32-gemminc/1x8-aarch64-neonfma-cortex-a75.S",
-    "src/f32-gemminc/4x12-aarch64-neonfma-cortex-a53.S",
-    "src/f32-gemminc/4x8-aarch64-neonfma-cortex-a53.S",
-    "src/f32-gemminc/4x8-aarch64-neonfma-cortex-a57.S",
-    "src/f32-gemminc/4x8-aarch64-neonfma-cortex-a75.S",
-    "src/f32-gemminc/4x8-aarch64-neonfma-ld128.S",
-    "src/f32-gemminc/4x8-aarch64-neonfma-ld64.S",
-    "src/f32-gemminc/5x8-aarch64-neonfma-cortex-a75.S",
-    "src/f32-gemminc/6x8-aarch64-neonfma-cortex-a53.S",
-    "src/f32-gemminc/6x8-aarch64-neonfma-cortex-a57.S",
-    "src/f32-gemminc/6x8-aarch64-neonfma-cortex-a73.S",
-    "src/f32-gemminc/6x8-aarch64-neonfma-cortex-a75.S",
-    "src/f32-gemminc/6x8-aarch64-neonfma-ld128.S",
-    "src/f32-gemminc/6x8-aarch64-neonfma-ld64.S",
+    "src/f32-gemm/gen/1x12-aarch64-neonfma-cortex-a53.S",
+    "src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a53.S",
+    "src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a57.S",
+    "src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a75.S",
+    "src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S",
+    "src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a53.S",
+    "src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a57.S",
+    "src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a75.S",
+    "src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S",
+    "src/f32-gemm/gen/4x8-aarch64-neonfma-ld64.S",
+    "src/f32-gemm/gen/5x8-aarch64-neonfma-cortex-a75.S",
+    "src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a53.S",
+    "src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a57.S",
+    "src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a73.S",
+    "src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a75.S",
+    "src/f32-gemm/gen/6x8-aarch64-neonfma-ld128.S",
+    "src/f32-gemm/gen/6x8-aarch64-neonfma-ld64.S",
+    "src/f32-gemm/gen-inc/1x12-aarch64-neonfma-cortex-a53.S",
+    "src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a53.S",
+    "src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a57.S",
+    "src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a75.S",
+    "src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S",
+    "src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a53.S",
+    "src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a57.S",
+    "src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a75.S",
+    "src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld128.S",
+    "src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld64.S",
+    "src/f32-gemm/gen-inc/5x8-aarch64-neonfma-cortex-a75.S",
+    "src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a53.S",
+    "src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a57.S",
+    "src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a73.S",
+    "src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a75.S",
+    "src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S",
+    "src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld64.S",
     "src/f32-igemm/1x12-aarch64-neonfma-cortex-a53.S",
     "src/f32-igemm/1x8-aarch64-neonfma-cortex-a53.S",
     "src/f32-igemm/1x8-aarch64-neonfma-cortex-a57.S",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 675d500..e1f2e73 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -169,86 +169,86 @@
   src/f32-argmaxpool/9x-scalar-c1.c
   src/f32-avgpool/mp9p8q-scalar.c
   src/f32-avgpool/up9-scalar.c
-  src/f32-bilinear/scalar-c1.c
-  src/f32-bilinear/scalar-c2.c
-  src/f32-bilinear/scalar-c4.c
+  src/f32-bilinear/gen/scalar-c1.c
+  src/f32-bilinear/gen/scalar-c2.c
+  src/f32-bilinear/gen/scalar-c4.c
   src/f32-clamp/scalar.c
   src/f32-conv-hwc2spchw/3x3s2p1c3x4-scalar-1x1.c
   src/f32-dwconv-spchw/3x3p1-scalar.c
   src/f32-dwconv-spchw/3x3s2p1-scalar.c
   src/f32-dwconv-spchw/5x5p2-scalar.c
   src/f32-dwconv-spchw/5x5s2p2-scalar.c
-  src/f32-dwconv/up1x25-scalar-acc2.c
-  src/f32-dwconv/up1x25-scalar.c
-  src/f32-dwconv/up1x4-scalar-acc2.c
-  src/f32-dwconv/up1x4-scalar.c
-  src/f32-dwconv/up1x9-scalar-acc2.c
-  src/f32-dwconv/up1x9-scalar.c
-  src/f32-dwconv/up2x25-scalar-acc2.c
-  src/f32-dwconv/up2x25-scalar.c
-  src/f32-dwconv/up2x4-scalar-acc2.c
-  src/f32-dwconv/up2x4-scalar.c
-  src/f32-dwconv/up2x9-scalar-acc2.c
-  src/f32-dwconv/up2x9-scalar.c
+  src/f32-dwconv/gen/up1x25-scalar-acc2.c
+  src/f32-dwconv/gen/up1x25-scalar.c
+  src/f32-dwconv/gen/up1x4-scalar-acc2.c
+  src/f32-dwconv/gen/up1x4-scalar.c
+  src/f32-dwconv/gen/up1x9-scalar-acc2.c
+  src/f32-dwconv/gen/up1x9-scalar.c
+  src/f32-dwconv/gen/up2x25-scalar-acc2.c
+  src/f32-dwconv/gen/up2x25-scalar.c
+  src/f32-dwconv/gen/up2x4-scalar-acc2.c
+  src/f32-dwconv/gen/up2x4-scalar.c
+  src/f32-dwconv/gen/up2x9-scalar-acc2.c
+  src/f32-dwconv/gen/up2x9-scalar.c
   src/f32-gavgpool-spchw/scalar-x1.c
   src/f32-gavgpool/mp7p7q-scalar.c
   src/f32-gavgpool/up7-scalar.c
-  src/f32-gemm/1x4-scalar.c
-  src/f32-gemm/2x4-scalar.c
-  src/f32-gemm/4x2-scalar.c
-  src/f32-gemm/4x4-scalar.c
-  src/f32-gemminc/1x4-scalar.c
-  src/f32-gemminc/2x4-scalar.c
-  src/f32-gemminc/4x4-scalar.c
+  src/f32-gemm/gen/1x4-scalar.c
+  src/f32-gemm/gen/2x4-scalar.c
+  src/f32-gemm/gen/4x2-scalar.c
+  src/f32-gemm/gen/4x4-scalar.c
+  src/f32-gemm/gen-inc/1x4-scalar.c
+  src/f32-gemm/gen-inc/2x4-scalar.c
+  src/f32-gemm/gen-inc/4x4-scalar.c
   src/f32-hswish/scalar.c
-  src/f32-igemm/1x4-scalar.c
-  src/f32-igemm/2x4-scalar.c
-  src/f32-igemm/4x2-scalar.c
-  src/f32-igemm/4x4-scalar.c
+  src/f32-igemm/gen/1x4-scalar.c
+  src/f32-igemm/gen/2x4-scalar.c
+  src/f32-igemm/gen/4x2-scalar.c
+  src/f32-igemm/gen/4x4-scalar.c
   src/f32-maxpool/9p8x-scalar-c1.c
   src/f32-pavgpool/mp9p8q-scalar.c
   src/f32-pavgpool/up9-scalar.c
-  src/f32-ppmm/2x4-scalar.c
-  src/f32-ppmm/3x3-scalar.c
-  src/f32-ppmm/4x2-scalar.c
-  src/f32-ppmm/4x4-scalar.c
-  src/f32-prelu/scalar-2x1.c
-  src/f32-prelu/scalar-2x4.c
+  src/f32-ppmm/gen/2x4-scalar.c
+  src/f32-ppmm/gen/3x3-scalar.c
+  src/f32-ppmm/gen/4x2-scalar.c
+  src/f32-ppmm/gen/4x4-scalar.c
+  src/f32-prelu/gen/scalar-2x1.c
+  src/f32-prelu/gen/scalar-2x4.c
   src/f32-rmax/scalar.c
-  src/f32-spmm/1x1-scalar-pipelined.c
-  src/f32-spmm/1x1-scalar.c
-  src/f32-spmm/2x1-scalar-pipelined.c
-  src/f32-spmm/2x1-scalar.c
-  src/f32-spmm/4x1-scalar-pipelined.c
-  src/f32-spmm/4x1-scalar.c
-  src/f32-spmm/8x1-scalar-pipelined.c
-  src/f32-spmm/8x1-scalar.c
-  src/f32-spmm/8x2-scalar.c
-  src/f32-spmm/8x4-scalar.c
-  src/f32-vbinary/vadd-scalar-x1.c
-  src/f32-vbinary/vadd-scalar-x2.c
-  src/f32-vbinary/vadd-scalar-x4.c
-  src/f32-vbinary/vaddc-scalar-x1.c
-  src/f32-vbinary/vaddc-scalar-x2.c
-  src/f32-vbinary/vaddc-scalar-x4.c
-  src/f32-vbinary/vmul-scalar-x1.c
-  src/f32-vbinary/vmul-scalar-x2.c
-  src/f32-vbinary/vmul-scalar-x4.c
-  src/f32-vbinary/vmulc-scalar-x1.c
-  src/f32-vbinary/vmulc-scalar-x2.c
-  src/f32-vbinary/vmulc-scalar-x4.c
-  src/f32-vbinary/vrsubc-scalar-x1.c
-  src/f32-vbinary/vrsubc-scalar-x2.c
-  src/f32-vbinary/vrsubc-scalar-x4.c
-  src/f32-vbinary/vsub-scalar-x1.c
-  src/f32-vbinary/vsub-scalar-x2.c
-  src/f32-vbinary/vsub-scalar-x4.c
-  src/f32-vbinary/vsubc-scalar-x1.c
-  src/f32-vbinary/vsubc-scalar-x2.c
-  src/f32-vbinary/vsubc-scalar-x4.c
-  src/f32-vmulcaddc/c1-scalar-2x.c
-  src/f32-vmulcaddc/c2-scalar-2x.c
-  src/f32-vmulcaddc/c4-scalar-2x.c
+  src/f32-spmm/gen/1x1-scalar-pipelined.c
+  src/f32-spmm/gen/1x1-scalar.c
+  src/f32-spmm/gen/2x1-scalar-pipelined.c
+  src/f32-spmm/gen/2x1-scalar.c
+  src/f32-spmm/gen/4x1-scalar-pipelined.c
+  src/f32-spmm/gen/4x1-scalar.c
+  src/f32-spmm/gen/8x1-scalar-pipelined.c
+  src/f32-spmm/gen/8x1-scalar.c
+  src/f32-spmm/gen/8x2-scalar.c
+  src/f32-spmm/gen/8x4-scalar.c
+  src/f32-vbinary/gen/vadd-scalar-x1.c
+  src/f32-vbinary/gen/vadd-scalar-x2.c
+  src/f32-vbinary/gen/vadd-scalar-x4.c
+  src/f32-vbinary/gen/vaddc-scalar-x1.c
+  src/f32-vbinary/gen/vaddc-scalar-x2.c
+  src/f32-vbinary/gen/vaddc-scalar-x4.c
+  src/f32-vbinary/gen/vmul-scalar-x1.c
+  src/f32-vbinary/gen/vmul-scalar-x2.c
+  src/f32-vbinary/gen/vmul-scalar-x4.c
+  src/f32-vbinary/gen/vmulc-scalar-x1.c
+  src/f32-vbinary/gen/vmulc-scalar-x2.c
+  src/f32-vbinary/gen/vmulc-scalar-x4.c
+  src/f32-vbinary/gen/vrsubc-scalar-x1.c
+  src/f32-vbinary/gen/vrsubc-scalar-x2.c
+  src/f32-vbinary/gen/vrsubc-scalar-x4.c
+  src/f32-vbinary/gen/vsub-scalar-x1.c
+  src/f32-vbinary/gen/vsub-scalar-x2.c
+  src/f32-vbinary/gen/vsub-scalar-x4.c
+  src/f32-vbinary/gen/vsubc-scalar-x1.c
+  src/f32-vbinary/gen/vsubc-scalar-x2.c
+  src/f32-vbinary/gen/vsubc-scalar-x4.c
+  src/f32-vmulcaddc/gen/c1-scalar-2x.c
+  src/f32-vmulcaddc/gen/c2-scalar-2x.c
+  src/f32-vmulcaddc/gen/c4-scalar-2x.c
   src/q8-avgpool/mp9p8q-scalar.c
   src/q8-avgpool/up9-scalar.c
   src/q8-dwconv/up1x9-scalar.c
@@ -282,74 +282,74 @@
   src/f32-argmaxpool/9x-psimd-c4.c
   src/f32-avgpool/mp9p8q-psimd.c
   src/f32-avgpool/up9-psimd.c
-  src/f32-bilinear/psimd-c4.c
-  src/f32-bilinear/psimd-c8.c
+  src/f32-bilinear/gen/psimd-c4.c
+  src/f32-bilinear/gen/psimd-c8.c
   src/f32-clamp/psimd.c
-  src/f32-dwconv/up4x25-psimd-acc2.c
-  src/f32-dwconv/up4x25-psimd.c
-  src/f32-dwconv/up4x4-psimd-acc2.c
-  src/f32-dwconv/up4x4-psimd.c
-  src/f32-dwconv/up4x9-psimd-acc2.c
-  src/f32-dwconv/up4x9-psimd.c
-  src/f32-dwconv/up8x25-psimd-acc2.c
-  src/f32-dwconv/up8x25-psimd.c
-  src/f32-dwconv/up8x4-psimd-acc2.c
-  src/f32-dwconv/up8x4-psimd.c
-  src/f32-dwconv/up8x9-psimd-acc2.c
-  src/f32-dwconv/up8x9-psimd.c
+  src/f32-dwconv/gen/up4x25-psimd-acc2.c
+  src/f32-dwconv/gen/up4x25-psimd.c
+  src/f32-dwconv/gen/up4x4-psimd-acc2.c
+  src/f32-dwconv/gen/up4x4-psimd.c
+  src/f32-dwconv/gen/up4x9-psimd-acc2.c
+  src/f32-dwconv/gen/up4x9-psimd.c
+  src/f32-dwconv/gen/up8x25-psimd-acc2.c
+  src/f32-dwconv/gen/up8x25-psimd.c
+  src/f32-dwconv/gen/up8x4-psimd-acc2.c
+  src/f32-dwconv/gen/up8x4-psimd.c
+  src/f32-dwconv/gen/up8x9-psimd-acc2.c
+  src/f32-dwconv/gen/up8x9-psimd.c
   src/f32-gavgpool/mp7p7q-psimd.c
   src/f32-gavgpool/up7-psimd.c
-  src/f32-gemm/1x8-psimd-loadsplat.c
-  src/f32-gemm/1x8-psimd-splat.c
-  src/f32-gemm/1x8s4-psimd.c
-  src/f32-gemm/4x8-psimd-loadsplat.c
-  src/f32-gemm/4x8-psimd-splat.c
-  src/f32-gemm/4x8s4-psimd.c
-  src/f32-gemm/6x8-psimd-loadsplat.c
-  src/f32-gemm/6x8-psimd-splat.c
-  src/f32-gemm/6x8s4-psimd.c
-  src/f32-gemminc/1x8-psimd-loadsplat.c
-  src/f32-gemminc/1x8-psimd-splat.c
-  src/f32-gemminc/1x8s4-psimd.c
-  src/f32-gemminc/4x8-psimd-loadsplat.c
-  src/f32-gemminc/4x8-psimd-splat.c
-  src/f32-gemminc/4x8s4-psimd.c
-  src/f32-gemminc/6x8-psimd-loadsplat.c
-  src/f32-gemminc/6x8-psimd-splat.c
-  src/f32-gemminc/6x8s4-psimd.c
+  src/f32-gemm/gen/1x8-psimd-loadsplat.c
+  src/f32-gemm/gen/1x8-psimd-splat.c
+  src/f32-gemm/gen/1x8s4-psimd.c
+  src/f32-gemm/gen/4x8-psimd-loadsplat.c
+  src/f32-gemm/gen/4x8-psimd-splat.c
+  src/f32-gemm/gen/4x8s4-psimd.c
+  src/f32-gemm/gen/6x8-psimd-loadsplat.c
+  src/f32-gemm/gen/6x8-psimd-splat.c
+  src/f32-gemm/gen/6x8s4-psimd.c
+  src/f32-gemm/gen-inc/1x8-psimd-loadsplat.c
+  src/f32-gemm/gen-inc/1x8-psimd-splat.c
+  src/f32-gemm/gen-inc/1x8s4-psimd.c
+  src/f32-gemm/gen-inc/4x8-psimd-loadsplat.c
+  src/f32-gemm/gen-inc/4x8-psimd-splat.c
+  src/f32-gemm/gen-inc/4x8s4-psimd.c
+  src/f32-gemm/gen-inc/6x8-psimd-loadsplat.c
+  src/f32-gemm/gen-inc/6x8-psimd-splat.c
+  src/f32-gemm/gen-inc/6x8s4-psimd.c
   src/f32-hswish/psimd.c
-  src/f32-igemm/1x8-psimd-loadsplat.c
-  src/f32-igemm/1x8-psimd-splat.c
-  src/f32-igemm/1x8s4-psimd.c
-  src/f32-igemm/4x2c4-psimd.c
-  src/f32-igemm/4x8-psimd-loadsplat.c
-  src/f32-igemm/4x8-psimd-splat.c
-  src/f32-igemm/4x8s4-psimd.c
-  src/f32-igemm/6x8-psimd-loadsplat.c
-  src/f32-igemm/6x8-psimd-splat.c
-  src/f32-igemm/6x8s4-psimd.c
+  src/f32-igemm/gen/1x8-psimd-loadsplat.c
+  src/f32-igemm/gen/1x8-psimd-splat.c
+  src/f32-igemm/gen/1x8s4-psimd.c
+  src/f32-igemm/gen/4x2c4-psimd.c
+  src/f32-igemm/gen/4x8-psimd-loadsplat.c
+  src/f32-igemm/gen/4x8-psimd-splat.c
+  src/f32-igemm/gen/4x8s4-psimd.c
+  src/f32-igemm/gen/6x8-psimd-loadsplat.c
+  src/f32-igemm/gen/6x8-psimd-splat.c
+  src/f32-igemm/gen/6x8s4-psimd.c
   src/f32-maxpool/9p8x-psimd-c4.c
   src/f32-pavgpool/mp9p8q-psimd.c
   src/f32-pavgpool/up9-psimd.c
-  src/f32-ppmm/4x8-psimd.c
-  src/f32-prelu/psimd-2x4.c
-  src/f32-prelu/psimd-2x8.c
-  src/f32-vbinary/vadd-psimd-x4.c
-  src/f32-vbinary/vadd-psimd-x8.c
-  src/f32-vbinary/vaddc-psimd-x4.c
-  src/f32-vbinary/vaddc-psimd-x8.c
-  src/f32-vbinary/vmul-psimd-x4.c
-  src/f32-vbinary/vmul-psimd-x8.c
-  src/f32-vbinary/vmulc-psimd-x4.c
-  src/f32-vbinary/vmulc-psimd-x8.c
-  src/f32-vbinary/vrsubc-psimd-x4.c
-  src/f32-vbinary/vrsubc-psimd-x8.c
-  src/f32-vbinary/vsub-psimd-x4.c
-  src/f32-vbinary/vsub-psimd-x8.c
-  src/f32-vbinary/vsubc-psimd-x4.c
-  src/f32-vbinary/vsubc-psimd-x8.c
-  src/f32-vmulcaddc/c4-psimd-2x.c
-  src/f32-vmulcaddc/c8-psimd-2x.c
+  src/f32-ppmm/gen/4x8-psimd.c
+  src/f32-prelu/gen/psimd-2x4.c
+  src/f32-prelu/gen/psimd-2x8.c
+  src/f32-vbinary/gen/vadd-psimd-x4.c
+  src/f32-vbinary/gen/vadd-psimd-x8.c
+  src/f32-vbinary/gen/vaddc-psimd-x4.c
+  src/f32-vbinary/gen/vaddc-psimd-x8.c
+  src/f32-vbinary/gen/vmul-psimd-x4.c
+  src/f32-vbinary/gen/vmul-psimd-x8.c
+  src/f32-vbinary/gen/vmulc-psimd-x4.c
+  src/f32-vbinary/gen/vmulc-psimd-x8.c
+  src/f32-vbinary/gen/vrsubc-psimd-x4.c
+  src/f32-vbinary/gen/vrsubc-psimd-x8.c
+  src/f32-vbinary/gen/vsub-psimd-x4.c
+  src/f32-vbinary/gen/vsub-psimd-x8.c
+  src/f32-vbinary/gen/vsubc-psimd-x4.c
+  src/f32-vbinary/gen/vsubc-psimd-x8.c
+  src/f32-vmulcaddc/gen/c4-psimd-2x.c
+  src/f32-vmulcaddc/gen/c8-psimd-2x.c
   src/x32-packx/x4-psimd.c
   src/x32-pad/x2-psimd.c
   src/x32-unpool/psimd.c
@@ -361,82 +361,82 @@
 SET(XNNPACK_NEON_MICROKERNEL_SRCS
   src/f32-avgpool/mp9p8q-neon.c
   src/f32-avgpool/up9-neon.c
-  src/f32-bilinear/neon-c4.c
-  src/f32-bilinear/neon-c8.c
+  src/f32-bilinear/gen/neon-c4.c
+  src/f32-bilinear/gen/neon-c8.c
   src/f32-clamp/neon.c
-  src/f32-dwconv/up4x9-neon.c
-  src/f32-dwconv/up4x9-neon-acc2.c
-  src/f32-dwconv/up8x9-neon.c
-  src/f32-dwconv/up8x9-neon-acc2.c
+  src/f32-dwconv/gen/up4x9-neon.c
+  src/f32-dwconv/gen/up4x9-neon-acc2.c
+  src/f32-dwconv/gen/up8x9-neon.c
+  src/f32-dwconv/gen/up8x9-neon-acc2.c
   src/f32-gavgpool-spchw/neon-x4.c
   src/f32-gavgpool/mp7p7q-neon.c
   src/f32-gavgpool/up7-neon.c
-  src/f32-gemm/1x8-neon-lane-ld64.c
-  src/f32-gemm/4x2-neon-lane-ld64.c
-  src/f32-gemm/4x8-neon-lane-ld128.c
-  src/f32-gemm/4x8-neon-lane-ld64.c
-  src/f32-gemm/5x8-neon-lane-ld64.c
-  src/f32-gemm/6x8-neon-lane-ld64.c
-  src/f32-gemm/1x8-neon-dup-ld64.c
-  src/f32-gemm/4x8-neon-dup-ld128.c
-  src/f32-gemm/4x8-neon-dup-ld64.c
-  src/f32-gemm/6x8-neon-dup-ld64.c
-  src/f32-gemm/1x8s4-neon.c
-  src/f32-gemm/4x8s4-neon.c
-  src/f32-gemm/6x8s4-neon.c
-  src/f32-gemm/8x8s4-neon.c
-  src/f32-gemminc/1x8-neon-lane-ld64.c
-  src/f32-gemminc/4x8-neon-lane-ld128.c
-  src/f32-gemminc/4x8-neon-lane-ld64.c
-  src/f32-gemminc/5x8-neon-lane-ld64.c
-  src/f32-gemminc/6x8-neon-lane-ld64.c
-  src/f32-gemminc/1x8-neon-dup-ld64.c
-  src/f32-gemminc/4x8-neon-dup-ld128.c
-  src/f32-gemminc/4x8-neon-dup-ld64.c
-  src/f32-gemminc/6x8-neon-dup-ld64.c
-  src/f32-gemminc/1x8s4-neon.c
-  src/f32-gemminc/4x8s4-neon.c
-  src/f32-gemminc/6x8s4-neon.c
-  src/f32-gemminc/8x8s4-neon.c
+  src/f32-gemm/gen/1x8-neon-lane-ld64.c
+  src/f32-gemm/gen/4x2-neon-lane-ld64.c
+  src/f32-gemm/gen/4x8-neon-lane-ld128.c
+  src/f32-gemm/gen/4x8-neon-lane-ld64.c
+  src/f32-gemm/gen/5x8-neon-lane-ld64.c
+  src/f32-gemm/gen/6x8-neon-lane-ld64.c
+  src/f32-gemm/gen/1x8-neon-dup-ld64.c
+  src/f32-gemm/gen/4x8-neon-dup-ld128.c
+  src/f32-gemm/gen/4x8-neon-dup-ld64.c
+  src/f32-gemm/gen/6x8-neon-dup-ld64.c
+  src/f32-gemm/gen/1x8s4-neon.c
+  src/f32-gemm/gen/4x8s4-neon.c
+  src/f32-gemm/gen/6x8s4-neon.c
+  src/f32-gemm/gen/8x8s4-neon.c
+  src/f32-gemm/gen-inc/1x8-neon-lane-ld64.c
+  src/f32-gemm/gen-inc/4x8-neon-lane-ld128.c
+  src/f32-gemm/gen-inc/4x8-neon-lane-ld64.c
+  src/f32-gemm/gen-inc/5x8-neon-lane-ld64.c
+  src/f32-gemm/gen-inc/6x8-neon-lane-ld64.c
+  src/f32-gemm/gen-inc/1x8-neon-dup-ld64.c
+  src/f32-gemm/gen-inc/4x8-neon-dup-ld128.c
+  src/f32-gemm/gen-inc/4x8-neon-dup-ld64.c
+  src/f32-gemm/gen-inc/6x8-neon-dup-ld64.c
+  src/f32-gemm/gen-inc/1x8s4-neon.c
+  src/f32-gemm/gen-inc/4x8s4-neon.c
+  src/f32-gemm/gen-inc/6x8s4-neon.c
+  src/f32-gemm/gen-inc/8x8s4-neon.c
   src/f32-hswish/neon.c
-  src/f32-igemm/1x8-neon-lane-ld64.c
-  src/f32-igemm/4x2-neon-lane-ld64.c
-  src/f32-igemm/4x4-neon-lane-ld64.c
-  src/f32-igemm/4x8-neon-lane-ld128.c
-  src/f32-igemm/4x8-neon-lane-ld64.c
-  src/f32-igemm/6x8-neon-lane-ld64.c
-  src/f32-igemm/1x8-neon-dup-ld64.c
-  src/f32-igemm/4x8-neon-dup-ld128.c
-  src/f32-igemm/4x8-neon-dup-ld64.c
-  src/f32-igemm/6x8-neon-dup-ld64.c
-  src/f32-igemm/1x8s4-neon.c
-  src/f32-igemm/4x8s4-neon.c
-  src/f32-igemm/6x8s4-neon.c
-  src/f32-igemm/8x8s4-neon.c
+  src/f32-igemm/gen/1x8-neon-lane-ld64.c
+  src/f32-igemm/gen/4x2-neon-lane-ld64.c
+  src/f32-igemm/gen/4x4-neon-lane-ld64.c
+  src/f32-igemm/gen/4x8-neon-lane-ld128.c
+  src/f32-igemm/gen/4x8-neon-lane-ld64.c
+  src/f32-igemm/gen/6x8-neon-lane-ld64.c
+  src/f32-igemm/gen/1x8-neon-dup-ld64.c
+  src/f32-igemm/gen/4x8-neon-dup-ld128.c
+  src/f32-igemm/gen/4x8-neon-dup-ld64.c
+  src/f32-igemm/gen/6x8-neon-dup-ld64.c
+  src/f32-igemm/gen/1x8s4-neon.c
+  src/f32-igemm/gen/4x8s4-neon.c
+  src/f32-igemm/gen/6x8s4-neon.c
+  src/f32-igemm/gen/8x8s4-neon.c
   src/f32-pavgpool/mp9p8q-neon.c
   src/f32-pavgpool/up9-neon.c
-  src/f32-ppmm/4x8-neon.c
-  src/f32-ppmm/8x8-neon.c
-  src/f32-prelu/neon-2x4.c
-  src/f32-prelu/neon-2x8.c
+  src/f32-ppmm/gen/4x8-neon.c
+  src/f32-ppmm/gen/8x8-neon.c
+  src/f32-prelu/gen/neon-2x4.c
+  src/f32-prelu/gen/neon-2x8.c
   src/f32-rmax/neon.c
-  src/f32-sigmoid/neon-frac-p9-p10-nr1recps-x16.c
-  src/f32-vbinary/vadd-neon-x4.c
-  src/f32-vbinary/vadd-neon-x8.c
-  src/f32-vbinary/vaddc-neon-x4.c
-  src/f32-vbinary/vaddc-neon-x8.c
-  src/f32-vbinary/vmul-neon-x4.c
-  src/f32-vbinary/vmul-neon-x8.c
-  src/f32-vbinary/vmulc-neon-x4.c
-  src/f32-vbinary/vmulc-neon-x8.c
-  src/f32-vbinary/vrsubc-neon-x4.c
-  src/f32-vbinary/vrsubc-neon-x8.c
-  src/f32-vbinary/vsub-neon-x4.c
-  src/f32-vbinary/vsub-neon-x8.c
-  src/f32-vbinary/vsubc-neon-x4.c
-  src/f32-vbinary/vsubc-neon-x8.c
-  src/f32-vmulcaddc/c4-neon-2x.c
-  src/f32-vmulcaddc/c8-neon-2x.c
+  src/f32-sigmoid/gen/neon-frac-p9-p10-nr1recps-x16.c
+  src/f32-vbinary/gen/vadd-neon-x4.c
+  src/f32-vbinary/gen/vadd-neon-x8.c
+  src/f32-vbinary/gen/vaddc-neon-x4.c
+  src/f32-vbinary/gen/vaddc-neon-x8.c
+  src/f32-vbinary/gen/vmul-neon-x4.c
+  src/f32-vbinary/gen/vmul-neon-x8.c
+  src/f32-vbinary/gen/vmulc-neon-x4.c
+  src/f32-vbinary/gen/vmulc-neon-x8.c
+  src/f32-vbinary/gen/vrsubc-neon-x4.c
+  src/f32-vbinary/gen/vrsubc-neon-x8.c
+  src/f32-vbinary/gen/vsub-neon-x4.c
+  src/f32-vbinary/gen/vsub-neon-x8.c
+  src/f32-vbinary/gen/vsubc-neon-x4.c
+  src/f32-vbinary/gen/vsubc-neon-x8.c
+  src/f32-vmulcaddc/gen/c4-neon-2x.c
+  src/f32-vmulcaddc/gen/c8-neon-2x.c
   src/q8-avgpool/mp9p8q-neon.c
   src/q8-avgpool/up9-neon.c
   src/q8-dwconv/up8x9-neon.c
@@ -462,42 +462,42 @@
   src/x8-zip/xm-neon.c)
 
 SET(XNNPACK_NEONFMA_MICROKERNEL_SRCS
-  src/f32-bilinear/neonfma-c4.c
-  src/f32-bilinear/neonfma-c8.c
-  src/f32-igemm/1x8-neonfma-dup-ld64.c
-  src/f32-igemm/4x8-neonfma-dup-ld128.c
-  src/f32-igemm/4x8-neonfma-dup-ld64.c
-  src/f32-igemm/6x8-neonfma-dup-ld64.c
-  src/f32-igemm/1x8s4-neonfma.c
-  src/f32-igemm/4x8s4-neonfma.c
-  src/f32-igemm/6x8s4-neonfma.c
-  src/f32-igemm/8x8s4-neonfma.c
-  src/f32-dwconv/up4x9-neonfma.c
-  src/f32-dwconv/up4x9-neonfma-acc2.c
-  src/f32-dwconv/up8x9-neonfma.c
-  src/f32-dwconv/up8x9-neonfma-acc2.c
-  src/f32-gemm/1x8-neonfma-dup-ld64.c
-  src/f32-gemm/4x8-neonfma-dup-ld128.c
-  src/f32-gemm/4x8-neonfma-dup-ld64.c
-  src/f32-gemm/6x8-neonfma-dup-ld64.c
-  src/f32-gemm/1x8s4-neonfma.c
-  src/f32-gemm/4x8s4-neonfma.c
-  src/f32-gemm/6x8s4-neonfma.c
-  src/f32-gemm/8x8s4-neonfma.c
-  src/f32-gemminc/1x8-neonfma-dup-ld64.c
-  src/f32-gemminc/4x8-neonfma-dup-ld128.c
-  src/f32-gemminc/4x8-neonfma-dup-ld64.c
-  src/f32-gemminc/6x8-neonfma-dup-ld64.c
-  src/f32-gemminc/1x8s4-neonfma.c
-  src/f32-gemminc/4x8s4-neonfma.c
-  src/f32-gemminc/6x8s4-neonfma.c
-  src/f32-gemminc/8x8s4-neonfma.c
+  src/f32-bilinear/gen/neonfma-c4.c
+  src/f32-bilinear/gen/neonfma-c8.c
+  src/f32-igemm/gen/1x8-neonfma-dup-ld64.c
+  src/f32-igemm/gen/4x8-neonfma-dup-ld128.c
+  src/f32-igemm/gen/4x8-neonfma-dup-ld64.c
+  src/f32-igemm/gen/6x8-neonfma-dup-ld64.c
+  src/f32-igemm/gen/1x8s4-neonfma.c
+  src/f32-igemm/gen/4x8s4-neonfma.c
+  src/f32-igemm/gen/6x8s4-neonfma.c
+  src/f32-igemm/gen/8x8s4-neonfma.c
+  src/f32-dwconv/gen/up4x9-neonfma.c
+  src/f32-dwconv/gen/up4x9-neonfma-acc2.c
+  src/f32-dwconv/gen/up8x9-neonfma.c
+  src/f32-dwconv/gen/up8x9-neonfma-acc2.c
+  src/f32-gemm/gen/1x8-neonfma-dup-ld64.c
+  src/f32-gemm/gen/4x8-neonfma-dup-ld128.c
+  src/f32-gemm/gen/4x8-neonfma-dup-ld64.c
+  src/f32-gemm/gen/6x8-neonfma-dup-ld64.c
+  src/f32-gemm/gen/1x8s4-neonfma.c
+  src/f32-gemm/gen/4x8s4-neonfma.c
+  src/f32-gemm/gen/6x8s4-neonfma.c
+  src/f32-gemm/gen/8x8s4-neonfma.c
+  src/f32-gemm/gen-inc/1x8-neonfma-dup-ld64.c
+  src/f32-gemm/gen-inc/4x8-neonfma-dup-ld128.c
+  src/f32-gemm/gen-inc/4x8-neonfma-dup-ld64.c
+  src/f32-gemm/gen-inc/6x8-neonfma-dup-ld64.c
+  src/f32-gemm/gen-inc/1x8s4-neonfma.c
+  src/f32-gemm/gen-inc/4x8s4-neonfma.c
+  src/f32-gemm/gen-inc/6x8s4-neonfma.c
+  src/f32-gemm/gen-inc/8x8s4-neonfma.c
   src/f32-hswish/neonfma.c
-  src/f32-ppmm/4x8-neonfma.c
-  src/f32-ppmm/8x8-neonfma.c
-  src/f32-sigmoid/neonfma-p5-nr2fma-x16.c
-  src/f32-vmulcaddc/c4-neonfma-2x.c
-  src/f32-vmulcaddc/c8-neonfma-2x.c
+  src/f32-ppmm/gen/4x8-neonfma.c
+  src/f32-ppmm/gen/8x8-neonfma.c
+  src/f32-sigmoid/gen/neonfma-p5-nr2fma-x16.c
+  src/f32-vmulcaddc/gen/c4-neonfma-2x.c
+  src/f32-vmulcaddc/gen/c8-neonfma-2x.c
   src/math/exp-neonfma-lut64-p2.c
   src/math/exp-neonfma-p5.c
   src/math/expminus-neonfma-p5.c
@@ -506,23 +506,23 @@
   src/math/sigmoid-neonfma-p5-nr2recps.c)
 
 SET(XNNPACK_AARCH64_NEONFMA_MICROKERNEL_SRCS
-  src/f32-gemm/1x8-neonfma-lane-ld64.c
-  src/f32-gemm/4x2-neonfma-lane-ld64.c
-  src/f32-gemm/4x8-neonfma-lane-ld128.c
-  src/f32-gemm/4x8-neonfma-lane-ld64.c
-  src/f32-gemm/5x8-neonfma-lane-ld64.c
-  src/f32-gemm/6x8-neonfma-lane-ld64.c
-  src/f32-gemminc/1x8-neonfma-lane-ld64.c
-  src/f32-gemminc/4x8-neonfma-lane-ld128.c
-  src/f32-gemminc/4x8-neonfma-lane-ld64.c
-  src/f32-gemminc/5x8-neonfma-lane-ld64.c
-  src/f32-gemminc/6x8-neonfma-lane-ld64.c
-  src/f32-igemm/1x8-neonfma-lane-ld64.c
-  src/f32-igemm/4x2-neonfma-lane-ld64.c
-  src/f32-igemm/4x4-neonfma-lane-ld64.c
-  src/f32-igemm/4x8-neonfma-lane-ld128.c
-  src/f32-igemm/4x8-neonfma-lane-ld64.c
-  src/f32-igemm/6x8-neonfma-lane-ld64.c
+  src/f32-gemm/gen/1x8-neonfma-lane-ld64.c
+  src/f32-gemm/gen/4x2-neonfma-lane-ld64.c
+  src/f32-gemm/gen/4x8-neonfma-lane-ld128.c
+  src/f32-gemm/gen/4x8-neonfma-lane-ld64.c
+  src/f32-gemm/gen/5x8-neonfma-lane-ld64.c
+  src/f32-gemm/gen/6x8-neonfma-lane-ld64.c
+  src/f32-gemm/gen-inc/1x8-neonfma-lane-ld64.c
+  src/f32-gemm/gen-inc/4x8-neonfma-lane-ld128.c
+  src/f32-gemm/gen-inc/4x8-neonfma-lane-ld64.c
+  src/f32-gemm/gen-inc/5x8-neonfma-lane-ld64.c
+  src/f32-gemm/gen-inc/6x8-neonfma-lane-ld64.c
+  src/f32-igemm/gen/1x8-neonfma-lane-ld64.c
+  src/f32-igemm/gen/4x2-neonfma-lane-ld64.c
+  src/f32-igemm/gen/4x4-neonfma-lane-ld64.c
+  src/f32-igemm/gen/4x8-neonfma-lane-ld128.c
+  src/f32-igemm/gen/4x8-neonfma-lane-ld64.c
+  src/f32-igemm/gen/6x8-neonfma-lane-ld64.c
   src/f32-conv-hwc/3x3s2p1c3x4-neonfma-2x2.c
   src/f32-conv-hwc/3x3s2p1c3x8-neonfma-2x2.c
   src/f32-conv-hwc2spchw/3x3s2p1c3x4-neonfma-2x2.c
@@ -530,107 +530,107 @@
   src/f32-dwconv-spchw/5x5p2-neonfma.c
   src/f32-dwconv-spchw/3x3s2p1-neonfma.c
   src/f32-dwconv-spchw/5x5s2p2-neonfma.c
-  src/f32-spmm/12x1-neonfma.c
-  src/f32-spmm/12x2-neonfma.c
-  src/f32-spmm/12x4-neonfma.c
-  src/f32-spmm/16x1-neonfma-pipelined.c
-  src/f32-spmm/16x1-neonfma-unroll2.c
-  src/f32-spmm/16x1-neonfma.c
-  src/f32-spmm/16x2-neonfma.c
-  src/f32-spmm/16x4-neonfma.c
-  src/f32-spmm/4x1-neonfma-pipelined.c
-  src/f32-spmm/4x1-neonfma-unroll2.c
-  src/f32-spmm/4x1-neonfma.c
-  src/f32-spmm/4x2-neonfma.c
-  src/f32-spmm/4x4-neonfma.c
-  src/f32-spmm/8x1-neonfma-pipelined.c
-  src/f32-spmm/8x1-neonfma-unroll2.c
-  src/f32-spmm/8x1-neonfma.c
-  src/f32-spmm/8x2-neonfma.c
-  src/f32-spmm/8x4-neonfma.c
+  src/f32-spmm/gen/12x1-neonfma.c
+  src/f32-spmm/gen/12x2-neonfma.c
+  src/f32-spmm/gen/12x4-neonfma.c
+  src/f32-spmm/gen/16x1-neonfma-pipelined.c
+  src/f32-spmm/gen/16x1-neonfma-unroll2.c
+  src/f32-spmm/gen/16x1-neonfma.c
+  src/f32-spmm/gen/16x2-neonfma.c
+  src/f32-spmm/gen/16x4-neonfma.c
+  src/f32-spmm/gen/4x1-neonfma-pipelined.c
+  src/f32-spmm/gen/4x1-neonfma-unroll2.c
+  src/f32-spmm/gen/4x1-neonfma.c
+  src/f32-spmm/gen/4x2-neonfma.c
+  src/f32-spmm/gen/4x4-neonfma.c
+  src/f32-spmm/gen/8x1-neonfma-pipelined.c
+  src/f32-spmm/gen/8x1-neonfma-unroll2.c
+  src/f32-spmm/gen/8x1-neonfma.c
+  src/f32-spmm/gen/8x2-neonfma.c
+  src/f32-spmm/gen/8x4-neonfma.c
   src/math/sigmoid-neonfma-p5-div.c)
 
 SET(XNNPACK_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS
-  src/f16-gemm/4x8-neonfp16arith-ld64.c
-  src/f16-gemm/6x8-neonfp16arith-ld64.c
-  src/f16-gemm/8x8-neonfp16arith-ld64.c)
+  src/f16-gemm/gen/4x8-neonfp16arith-ld64.c
+  src/f16-gemm/gen/6x8-neonfp16arith-ld64.c
+  src/f16-gemm/gen/8x8-neonfp16arith-ld64.c)
 
 SET(XNNPACK_SSE_MICROKERNEL_SRCS
   src/f32-avgpool/mp9p8q-sse.c
   src/f32-avgpool/up9-sse.c
-  src/f32-bilinear/sse-c4.c
-  src/f32-bilinear/sse-c8.c
+  src/f32-bilinear/gen/sse-c4.c
+  src/f32-bilinear/gen/sse-c8.c
   src/f32-clamp/sse.c
   src/f32-dwconv-spchw/3x3p1-sse.c
   src/f32-dwconv-spchw/3x3s2p1-sse.c
-  src/f32-dwconv/up4x25-sse-acc2.c
-  src/f32-dwconv/up4x25-sse.c
-  src/f32-dwconv/up4x4-sse-acc2.c
-  src/f32-dwconv/up4x4-sse.c
-  src/f32-dwconv/up4x9-sse-acc2.c
-  src/f32-dwconv/up4x9-sse.c
-  src/f32-dwconv/up8x25-sse-acc2.c
-  src/f32-dwconv/up8x25-sse.c
-  src/f32-dwconv/up8x4-sse-acc2.c
-  src/f32-dwconv/up8x4-sse.c
-  src/f32-dwconv/up8x9-sse-acc2.c
-  src/f32-dwconv/up8x9-sse.c
+  src/f32-dwconv/gen/up4x25-sse-acc2.c
+  src/f32-dwconv/gen/up4x25-sse.c
+  src/f32-dwconv/gen/up4x4-sse-acc2.c
+  src/f32-dwconv/gen/up4x4-sse.c
+  src/f32-dwconv/gen/up4x9-sse-acc2.c
+  src/f32-dwconv/gen/up4x9-sse.c
+  src/f32-dwconv/gen/up8x25-sse-acc2.c
+  src/f32-dwconv/gen/up8x25-sse.c
+  src/f32-dwconv/gen/up8x4-sse-acc2.c
+  src/f32-dwconv/gen/up8x4-sse.c
+  src/f32-dwconv/gen/up8x9-sse-acc2.c
+  src/f32-dwconv/gen/up8x9-sse.c
   src/f32-gavgpool-spchw/sse-x4.c
   src/f32-gavgpool/mp7p7q-sse.c
   src/f32-gavgpool/up7-sse.c
-  src/f32-gemm/1x8-sse-dup.c
-  src/f32-gemm/1x8-sse-load1.c
-  src/f32-gemm/1x8s4-sse.c
-  src/f32-gemm/4x8-sse-dup.c
-  src/f32-gemm/4x8-sse-load1.c
-  src/f32-gemm/4x8s4-sse.c
-  src/f32-gemminc/1x8-sse-dup.c
-  src/f32-gemminc/1x8-sse-load1.c
-  src/f32-gemminc/1x8s4-sse.c
-  src/f32-gemminc/4x8-sse-dup.c
-  src/f32-gemminc/4x8-sse-load1.c
-  src/f32-gemminc/4x8s4-sse.c
+  src/f32-gemm/gen/1x8-sse-dup.c
+  src/f32-gemm/gen/1x8-sse-load1.c
+  src/f32-gemm/gen/1x8s4-sse.c
+  src/f32-gemm/gen/4x8-sse-dup.c
+  src/f32-gemm/gen/4x8-sse-load1.c
+  src/f32-gemm/gen/4x8s4-sse.c
+  src/f32-gemm/gen-inc/1x8-sse-dup.c
+  src/f32-gemm/gen-inc/1x8-sse-load1.c
+  src/f32-gemm/gen-inc/1x8s4-sse.c
+  src/f32-gemm/gen-inc/4x8-sse-dup.c
+  src/f32-gemm/gen-inc/4x8-sse-load1.c
+  src/f32-gemm/gen-inc/4x8s4-sse.c
   src/f32-hswish/sse.c
-  src/f32-igemm/1x8-sse-dup.c
-  src/f32-igemm/1x8-sse-load1.c
-  src/f32-igemm/1x8s4-sse.c
-  src/f32-igemm/4x2c4-sse.c
-  src/f32-igemm/4x8-sse-dup.c
-  src/f32-igemm/4x8-sse-load1.c
-  src/f32-igemm/4x8s4-sse.c
+  src/f32-igemm/gen/1x8-sse-dup.c
+  src/f32-igemm/gen/1x8-sse-load1.c
+  src/f32-igemm/gen/1x8s4-sse.c
+  src/f32-igemm/gen/4x2c4-sse.c
+  src/f32-igemm/gen/4x8-sse-dup.c
+  src/f32-igemm/gen/4x8-sse-load1.c
+  src/f32-igemm/gen/4x8s4-sse.c
   src/f32-maxpool/9p8x-sse-c4.c
   src/f32-pavgpool/mp9p8q-sse.c
   src/f32-pavgpool/up9-sse.c
-  src/f32-ppmm/4x8-sse.c
+  src/f32-ppmm/gen/4x8-sse.c
   src/f32-rmax/sse.c
-  src/f32-spmm/4x1-sse.c
-  src/f32-spmm/8x1-sse.c
-  src/f32-vbinary/vadd-sse-x4.c
-  src/f32-vbinary/vadd-sse-x8.c
-  src/f32-vbinary/vaddc-sse-x4.c
-  src/f32-vbinary/vaddc-sse-x8.c
-  src/f32-vbinary/vmul-sse-x4.c
-  src/f32-vbinary/vmul-sse-x8.c
-  src/f32-vbinary/vmulc-sse-x4.c
-  src/f32-vbinary/vmulc-sse-x8.c
-  src/f32-vbinary/vrsubc-sse-x4.c
-  src/f32-vbinary/vrsubc-sse-x8.c
-  src/f32-vbinary/vsub-sse-x4.c
-  src/f32-vbinary/vsub-sse-x8.c
-  src/f32-vbinary/vsubc-sse-x4.c
-  src/f32-vbinary/vsubc-sse-x8.c
-  src/f32-vmulcaddc/c4-sse-2x.c
-  src/f32-vmulcaddc/c8-sse-2x.c
+  src/f32-spmm/gen/4x1-sse.c
+  src/f32-spmm/gen/8x1-sse.c
+  src/f32-vbinary/gen/vadd-sse-x4.c
+  src/f32-vbinary/gen/vadd-sse-x8.c
+  src/f32-vbinary/gen/vaddc-sse-x4.c
+  src/f32-vbinary/gen/vaddc-sse-x8.c
+  src/f32-vbinary/gen/vmul-sse-x4.c
+  src/f32-vbinary/gen/vmul-sse-x8.c
+  src/f32-vbinary/gen/vmulc-sse-x4.c
+  src/f32-vbinary/gen/vmulc-sse-x8.c
+  src/f32-vbinary/gen/vrsubc-sse-x4.c
+  src/f32-vbinary/gen/vrsubc-sse-x8.c
+  src/f32-vbinary/gen/vsub-sse-x4.c
+  src/f32-vbinary/gen/vsub-sse-x8.c
+  src/f32-vbinary/gen/vsubc-sse-x4.c
+  src/f32-vbinary/gen/vsubc-sse-x8.c
+  src/f32-vmulcaddc/gen/c4-sse-2x.c
+  src/f32-vmulcaddc/gen/c8-sse-2x.c
   src/x32-packx/x4-sse.c)
 
 SET(XNNPACK_SSE2_MICROKERNEL_SRCS
   src/f32-argmaxpool/4x-sse2-c4.c
   src/f32-argmaxpool/9p8x-sse2-c4.c
   src/f32-argmaxpool/9x-sse2-c4.c
-  src/f32-prelu/sse2-2x4.c
-  src/f32-prelu/sse2-2x8.c
-  src/f32-sigmoid/sse2-p5-div-x8.c
-  src/f32-sigmoid/sse2-p5-div-x16.c
+  src/f32-prelu/gen/sse2-2x4.c
+  src/f32-prelu/gen/sse2-2x8.c
+  src/f32-sigmoid/gen/sse2-p5-div-x8.c
+  src/f32-sigmoid/gen/sse2-p5-div-x16.c
   src/q8-avgpool/mp9p8q-sse2.c
   src/q8-avgpool/up9-sse2.c
   src/q8-igemm/4x4c2-sse2.c
@@ -657,73 +657,73 @@
   src/math/sigmoid-sse2-p5-div.c)
 
 SET(XNNPACK_SSE41_MICROKERNEL_SRCS
-  src/f32-prelu/sse41-2x4.c
-  src/f32-prelu/sse41-2x8.c
-  src/f32-sigmoid/sse41-p5-div-x8.c
-  src/f32-sigmoid/sse41-p5-div-x16.c)
+  src/f32-prelu/gen/sse41-2x4.c
+  src/f32-prelu/gen/sse41-2x8.c
+  src/f32-sigmoid/gen/sse41-p5-div-x8.c
+  src/f32-sigmoid/gen/sse41-p5-div-x16.c)
 
 SET(XNNPACK_AVX_MICROKERNEL_SRCS
-  src/f32-dwconv/up16x4-avx-acc2.c
-  src/f32-dwconv/up16x4-avx.c
-  src/f32-dwconv/up8x4-avx-acc2.c
-  src/f32-dwconv/up8x4-avx.c
-  src/f32-dwconv/up16x9-avx-acc2.c
-  src/f32-dwconv/up16x9-avx.c
-  src/f32-dwconv/up8x9-avx-acc2.c
-  src/f32-dwconv/up8x9-avx.c
-  src/f32-dwconv/up16x25-avx-acc2.c
-  src/f32-dwconv/up16x25-avx.c
-  src/f32-dwconv/up8x25-avx-acc2.c
-  src/f32-dwconv/up8x25-avx.c
-  src/f32-gemm/1x8-avx-broadcast.c
-  src/f32-gemm/4x8-avx-broadcast.c
-  src/f32-gemm/5x8-avx-broadcast.c
-  src/f32-gemm/6x8-avx-broadcast.c
-  src/f32-gemm/7x8-avx-broadcast.c
-  src/f32-gemminc/1x8-avx-broadcast.c
-  src/f32-gemminc/4x8-avx-broadcast.c
-  src/f32-gemminc/5x8-avx-broadcast.c
-  src/f32-gemminc/6x8-avx-broadcast.c
-  src/f32-gemminc/7x8-avx-broadcast.c
-  src/f32-igemm/1x8-avx-broadcast.c
-  src/f32-igemm/4x8-avx-broadcast.c
-  src/f32-igemm/5x8-avx-broadcast.c
-  src/f32-igemm/6x8-avx-broadcast.c
-  src/f32-igemm/7x8-avx-broadcast.c
+  src/f32-dwconv/gen/up16x4-avx-acc2.c
+  src/f32-dwconv/gen/up16x4-avx.c
+  src/f32-dwconv/gen/up8x4-avx-acc2.c
+  src/f32-dwconv/gen/up8x4-avx.c
+  src/f32-dwconv/gen/up16x9-avx-acc2.c
+  src/f32-dwconv/gen/up16x9-avx.c
+  src/f32-dwconv/gen/up8x9-avx-acc2.c
+  src/f32-dwconv/gen/up8x9-avx.c
+  src/f32-dwconv/gen/up16x25-avx-acc2.c
+  src/f32-dwconv/gen/up16x25-avx.c
+  src/f32-dwconv/gen/up8x25-avx-acc2.c
+  src/f32-dwconv/gen/up8x25-avx.c
+  src/f32-gemm/gen/1x8-avx-broadcast.c
+  src/f32-gemm/gen/4x8-avx-broadcast.c
+  src/f32-gemm/gen/5x8-avx-broadcast.c
+  src/f32-gemm/gen/6x8-avx-broadcast.c
+  src/f32-gemm/gen/7x8-avx-broadcast.c
+  src/f32-gemm/gen-inc/1x8-avx-broadcast.c
+  src/f32-gemm/gen-inc/4x8-avx-broadcast.c
+  src/f32-gemm/gen-inc/5x8-avx-broadcast.c
+  src/f32-gemm/gen-inc/6x8-avx-broadcast.c
+  src/f32-gemm/gen-inc/7x8-avx-broadcast.c
+  src/f32-igemm/gen/1x8-avx-broadcast.c
+  src/f32-igemm/gen/4x8-avx-broadcast.c
+  src/f32-igemm/gen/5x8-avx-broadcast.c
+  src/f32-igemm/gen/6x8-avx-broadcast.c
+  src/f32-igemm/gen/7x8-avx-broadcast.c
   src/f32-rmax/avx.c
   src/f32-vscale/avx-unroll32.c)
 
 SET(XNNPACK_FMA3_MICROKERNEL_SRCS
-  src/f32-dwconv/up16x4-fma3-acc2.c
-  src/f32-dwconv/up16x4-fma3.c
-  src/f32-dwconv/up8x4-fma3-acc2.c
-  src/f32-dwconv/up8x4-fma3.c
-  src/f32-dwconv/up16x9-fma3-acc2.c
-  src/f32-dwconv/up16x9-fma3.c
-  src/f32-dwconv/up8x9-fma3-acc2.c
-  src/f32-dwconv/up8x9-fma3.c
-  src/f32-dwconv/up16x25-fma3-acc2.c
-  src/f32-dwconv/up16x25-fma3.c
-  src/f32-dwconv/up8x25-fma3-acc2.c
-  src/f32-dwconv/up8x25-fma3.c
-  src/f32-gemm/1x8-fma3-broadcast.c
-  src/f32-gemm/4x8-fma3-broadcast.c
-  src/f32-gemm/5x8-fma3-broadcast.c
-  src/f32-gemm/6x8-fma3-broadcast.c
-  src/f32-gemm/7x8-fma3-broadcast.c
-  src/f32-gemm/8x8-fma3-broadcast.c
-  src/f32-gemminc/1x8-fma3-broadcast.c
-  src/f32-gemminc/4x8-fma3-broadcast.c
-  src/f32-gemminc/5x8-fma3-broadcast.c
-  src/f32-gemminc/6x8-fma3-broadcast.c
-  src/f32-gemminc/7x8-fma3-broadcast.c
-  src/f32-gemminc/8x8-fma3-broadcast.c
-  src/f32-igemm/1x8-fma3-broadcast.c
-  src/f32-igemm/4x8-fma3-broadcast.c
-  src/f32-igemm/5x8-fma3-broadcast.c
-  src/f32-igemm/6x8-fma3-broadcast.c
-  src/f32-igemm/7x8-fma3-broadcast.c
-  src/f32-igemm/8x8-fma3-broadcast.c)
+  src/f32-dwconv/gen/up16x4-fma3-acc2.c
+  src/f32-dwconv/gen/up16x4-fma3.c
+  src/f32-dwconv/gen/up8x4-fma3-acc2.c
+  src/f32-dwconv/gen/up8x4-fma3.c
+  src/f32-dwconv/gen/up16x9-fma3-acc2.c
+  src/f32-dwconv/gen/up16x9-fma3.c
+  src/f32-dwconv/gen/up8x9-fma3-acc2.c
+  src/f32-dwconv/gen/up8x9-fma3.c
+  src/f32-dwconv/gen/up16x25-fma3-acc2.c
+  src/f32-dwconv/gen/up16x25-fma3.c
+  src/f32-dwconv/gen/up8x25-fma3-acc2.c
+  src/f32-dwconv/gen/up8x25-fma3.c
+  src/f32-gemm/gen/1x8-fma3-broadcast.c
+  src/f32-gemm/gen/4x8-fma3-broadcast.c
+  src/f32-gemm/gen/5x8-fma3-broadcast.c
+  src/f32-gemm/gen/6x8-fma3-broadcast.c
+  src/f32-gemm/gen/7x8-fma3-broadcast.c
+  src/f32-gemm/gen/8x8-fma3-broadcast.c
+  src/f32-gemm/gen-inc/1x8-fma3-broadcast.c
+  src/f32-gemm/gen-inc/4x8-fma3-broadcast.c
+  src/f32-gemm/gen-inc/5x8-fma3-broadcast.c
+  src/f32-gemm/gen-inc/6x8-fma3-broadcast.c
+  src/f32-gemm/gen-inc/7x8-fma3-broadcast.c
+  src/f32-gemm/gen-inc/8x8-fma3-broadcast.c
+  src/f32-igemm/gen/1x8-fma3-broadcast.c
+  src/f32-igemm/gen/4x8-fma3-broadcast.c
+  src/f32-igemm/gen/5x8-fma3-broadcast.c
+  src/f32-igemm/gen/6x8-fma3-broadcast.c
+  src/f32-igemm/gen/7x8-fma3-broadcast.c
+  src/f32-igemm/gen/8x8-fma3-broadcast.c)
 
 SET(XNNPACK_AVX2_MICROKERNEL_SRCS
   src/f32-raddexpminusmax/avx2-p5-unroll64.c
@@ -757,40 +757,40 @@
 SET(XNNPACK_AARCH64_ASM_MICROKERNEL_SRCS
   src/f32-dwconv/up4x9-aarch64-neonfma-cortex-a55.S
   src/f32-dwconv/up4x9-aarch64-neonfma.S
-  src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S
-  src/f32-gemm/1x8-aarch64-neonfma-cortex-a53.S
-  src/f32-gemm/1x8-aarch64-neonfma-cortex-a57.S
-  src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S
-  src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S
-  src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S
-  src/f32-gemm/4x8-aarch64-neonfma-cortex-a57.S
-  src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S
-  src/f32-gemm/4x8-aarch64-neonfma-ld128.S
-  src/f32-gemm/4x8-aarch64-neonfma-ld64.S
-  src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S
-  src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S
-  src/f32-gemm/6x8-aarch64-neonfma-cortex-a57.S
-  src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S
-  src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S
-  src/f32-gemm/6x8-aarch64-neonfma-ld128.S
-  src/f32-gemm/6x8-aarch64-neonfma-ld64.S
-  src/f32-gemminc/1x12-aarch64-neonfma-cortex-a53.S
-  src/f32-gemminc/1x8-aarch64-neonfma-cortex-a53.S
-  src/f32-gemminc/1x8-aarch64-neonfma-cortex-a57.S
-  src/f32-gemminc/1x8-aarch64-neonfma-cortex-a75.S
-  src/f32-gemminc/4x12-aarch64-neonfma-cortex-a53.S
-  src/f32-gemminc/4x8-aarch64-neonfma-cortex-a53.S
-  src/f32-gemminc/4x8-aarch64-neonfma-cortex-a57.S
-  src/f32-gemminc/4x8-aarch64-neonfma-cortex-a75.S
-  src/f32-gemminc/4x8-aarch64-neonfma-ld128.S
-  src/f32-gemminc/4x8-aarch64-neonfma-ld64.S
-  src/f32-gemminc/5x8-aarch64-neonfma-cortex-a75.S
-  src/f32-gemminc/6x8-aarch64-neonfma-cortex-a53.S
-  src/f32-gemminc/6x8-aarch64-neonfma-cortex-a57.S
-  src/f32-gemminc/6x8-aarch64-neonfma-cortex-a73.S
-  src/f32-gemminc/6x8-aarch64-neonfma-cortex-a75.S
-  src/f32-gemminc/6x8-aarch64-neonfma-ld128.S
-  src/f32-gemminc/6x8-aarch64-neonfma-ld64.S
+  src/f32-gemm/gen/1x12-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a57.S
+  src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a75.S
+  src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a57.S
+  src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a75.S
+  src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S
+  src/f32-gemm/gen/4x8-aarch64-neonfma-ld64.S
+  src/f32-gemm/gen/5x8-aarch64-neonfma-cortex-a75.S
+  src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a57.S
+  src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a73.S
+  src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a75.S
+  src/f32-gemm/gen/6x8-aarch64-neonfma-ld128.S
+  src/f32-gemm/gen/6x8-aarch64-neonfma-ld64.S
+  src/f32-gemm/gen-inc/1x12-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a57.S
+  src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a75.S
+  src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a57.S
+  src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a75.S
+  src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld128.S
+  src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld64.S
+  src/f32-gemm/gen-inc/5x8-aarch64-neonfma-cortex-a75.S
+  src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a53.S
+  src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a57.S
+  src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a73.S
+  src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a75.S
+  src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S
+  src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld64.S
   src/f32-igemm/1x12-aarch64-neonfma-cortex-a53.S
   src/f32-igemm/1x8-aarch64-neonfma-cortex-a53.S
   src/f32-igemm/1x8-aarch64-neonfma-cortex-a57.S
diff --git a/scripts/generate-f16-gemm.sh b/scripts/generate-f16-gemm.sh
index 0a00557..5ea5ba5 100755
--- a/scripts/generate-f16-gemm.sh
+++ b/scripts/generate-f16-gemm.sh
@@ -6,10 +6,9 @@
 
 ########################## ARM NEON with FP16 compute #########################
 ### LD64 micro-kernels
-tools/xngen src/f16-gemm/neonfp16arith-ld64.c.in -D MR=4 -D NR=8 -o src/f16-gemm/4x8-neonfp16arith-ld64.c
-tools/xngen src/f16-gemm/neonfp16arith-ld64.c.in -D MR=6 -D NR=8 -o src/f16-gemm/6x8-neonfp16arith-ld64.c
-tools/xngen src/f16-gemm/neonfp16arith-ld64.c.in -D MR=8 -D NR=8 -o src/f16-gemm/8x8-neonfp16arith-ld64.c
-
+tools/xngen src/f16-gemm/neonfp16arith-ld64.c.in -D MR=4 -D NR=8 -o src/f16-gemm/gen/4x8-neonfp16arith-ld64.c
+tools/xngen src/f16-gemm/neonfp16arith-ld64.c.in -D MR=6 -D NR=8 -o src/f16-gemm/gen/6x8-neonfp16arith-ld64.c
+tools/xngen src/f16-gemm/neonfp16arith-ld64.c.in -D MR=8 -D NR=8 -o src/f16-gemm/gen/8x8-neonfp16arith-ld64.c
 
 ################################## Unit tests #################################
 tools/generate-gemm-test.py --spec test/f16-gemm.yaml --output test/f16-gemm.cc
diff --git a/scripts/generate-f32-bilinear.sh b/scripts/generate-f32-bilinear.sh
index 769022b..75fe719 100755
--- a/scripts/generate-f32-bilinear.sh
+++ b/scripts/generate-f32-bilinear.sh
@@ -5,24 +5,24 @@
 # LICENSE file in the root directory of this source tree.
 
 #################################### Scalar ###################################
-tools/xngen src/f32-bilinear/scalar.c.in -D CHANNEL_TILE=1 -D PIXEL_TILE=1 -o src/f32-bilinear/scalar-c1.c
-tools/xngen src/f32-bilinear/scalar.c.in -D CHANNEL_TILE=2 -D PIXEL_TILE=1 -o src/f32-bilinear/scalar-c2.c
-tools/xngen src/f32-bilinear/scalar.c.in -D CHANNEL_TILE=4 -D PIXEL_TILE=1 -o src/f32-bilinear/scalar-c4.c
+tools/xngen src/f32-bilinear/scalar.c.in -D CHANNEL_TILE=1 -D PIXEL_TILE=1 -o src/f32-bilinear/gen/scalar-c1.c
+tools/xngen src/f32-bilinear/scalar.c.in -D CHANNEL_TILE=2 -D PIXEL_TILE=1 -o src/f32-bilinear/gen/scalar-c2.c
+tools/xngen src/f32-bilinear/scalar.c.in -D CHANNEL_TILE=4 -D PIXEL_TILE=1 -o src/f32-bilinear/gen/scalar-c4.c
 
 ################################### ARM NEON ##################################
-tools/xngen src/f32-bilinear/neon.c.in -D CHANNEL_TILE=4 -D PIXEL_TILE=1 -D FMA=0 -o src/f32-bilinear/neon-c4.c
-tools/xngen src/f32-bilinear/neon.c.in -D CHANNEL_TILE=8 -D PIXEL_TILE=1 -D FMA=0 -o src/f32-bilinear/neon-c8.c
+tools/xngen src/f32-bilinear/neon.c.in -D CHANNEL_TILE=4 -D PIXEL_TILE=1 -D FMA=0 -o src/f32-bilinear/gen/neon-c4.c
+tools/xngen src/f32-bilinear/neon.c.in -D CHANNEL_TILE=8 -D PIXEL_TILE=1 -D FMA=0 -o src/f32-bilinear/gen/neon-c8.c
 
-tools/xngen src/f32-bilinear/neon.c.in -D CHANNEL_TILE=4 -D PIXEL_TILE=1 -D FMA=1 -o src/f32-bilinear/neonfma-c4.c
-tools/xngen src/f32-bilinear/neon.c.in -D CHANNEL_TILE=8 -D PIXEL_TILE=1 -D FMA=1 -o src/f32-bilinear/neonfma-c8.c
+tools/xngen src/f32-bilinear/neon.c.in -D CHANNEL_TILE=4 -D PIXEL_TILE=1 -D FMA=1 -o src/f32-bilinear/gen/neonfma-c4.c
+tools/xngen src/f32-bilinear/neon.c.in -D CHANNEL_TILE=8 -D PIXEL_TILE=1 -D FMA=1 -o src/f32-bilinear/gen/neonfma-c8.c
 
 #################################### PSIMD ####################################
-tools/xngen src/f32-bilinear/psimd.c.in -D CHANNEL_TILE=4 -D PIXEL_TILE=1 -o src/f32-bilinear/psimd-c4.c
-tools/xngen src/f32-bilinear/psimd.c.in -D CHANNEL_TILE=8 -D PIXEL_TILE=1 -o src/f32-bilinear/psimd-c8.c
+tools/xngen src/f32-bilinear/psimd.c.in -D CHANNEL_TILE=4 -D PIXEL_TILE=1 -o src/f32-bilinear/gen/psimd-c4.c
+tools/xngen src/f32-bilinear/psimd.c.in -D CHANNEL_TILE=8 -D PIXEL_TILE=1 -o src/f32-bilinear/gen/psimd-c8.c
 
 ################################### x86 SSE ###################################
-tools/xngen src/f32-bilinear/sse.c.in -D CHANNEL_TILE=4 -D PIXEL_TILE=1 -o src/f32-bilinear/sse-c4.c
-tools/xngen src/f32-bilinear/sse.c.in -D CHANNEL_TILE=8 -D PIXEL_TILE=1 -o src/f32-bilinear/sse-c8.c
+tools/xngen src/f32-bilinear/sse.c.in -D CHANNEL_TILE=4 -D PIXEL_TILE=1 -o src/f32-bilinear/gen/sse-c4.c
+tools/xngen src/f32-bilinear/sse.c.in -D CHANNEL_TILE=8 -D PIXEL_TILE=1 -o src/f32-bilinear/gen/sse-c8.c
 
 ################################## Unit tests #################################
 tools/generate-bilinear-test.py --spec test/f32-bilinear.yaml --output test/f32-bilinear.cc
diff --git a/scripts/generate-f32-dwconv.sh b/scripts/generate-f32-dwconv.sh
index a9c34c1..2ed64c5 100755
--- a/scripts/generate-f32-dwconv.sh
+++ b/scripts/generate-f32-dwconv.sh
@@ -5,94 +5,94 @@
 # LICENSE file in the root directory of this source tree.
 
 ################################### ARM NEON ##################################
-tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-dwconv/up4x9-neon.c
-tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -D FMA=0 -o src/f32-dwconv/up4x9-neon-acc2.c
-tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-dwconv/up8x9-neon.c
-tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -D FMA=0 -o src/f32-dwconv/up8x9-neon-acc2.c
+tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-dwconv/gen/up4x9-neon.c
+tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -D FMA=0 -o src/f32-dwconv/gen/up4x9-neon-acc2.c
+tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -D FMA=0 -o src/f32-dwconv/gen/up8x9-neon.c
+tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -D FMA=0 -o src/f32-dwconv/gen/up8x9-neon-acc2.c
 
-tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-dwconv/up4x9-neonfma.c
-tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -D FMA=1 -o src/f32-dwconv/up4x9-neonfma-acc2.c
-tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-dwconv/up8x9-neonfma.c
-tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -D FMA=1 -o src/f32-dwconv/up8x9-neonfma-acc2.c
+tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-dwconv/gen/up4x9-neonfma.c
+tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -D FMA=1 -o src/f32-dwconv/gen/up4x9-neonfma-acc2.c
+tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -D FMA=1 -o src/f32-dwconv/gen/up8x9-neonfma.c
+tools/xngen src/f32-dwconv/up-neon.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -D FMA=1 -o src/f32-dwconv/gen/up8x9-neonfma-acc2.c
 
 ################################### x86 SSE ###################################
-tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=4 -D ACCUMULATORS=1 -o src/f32-dwconv/up4x4-sse.c
-tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=4 -D ACCUMULATORS=2 -o src/f32-dwconv/up4x4-sse-acc2.c
-tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D ACCUMULATORS=1 -o src/f32-dwconv/up8x4-sse.c
-tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D ACCUMULATORS=2 -o src/f32-dwconv/up8x4-sse-acc2.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=4 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up4x4-sse.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=4 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up4x4-sse-acc2.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up8x4-sse.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up8x4-sse-acc2.c
 
-tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -o src/f32-dwconv/up4x9-sse.c
-tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -o src/f32-dwconv/up4x9-sse-acc2.c
-tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -o src/f32-dwconv/up8x9-sse.c
-tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -o src/f32-dwconv/up8x9-sse-acc2.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up4x9-sse.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up4x9-sse-acc2.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up8x9-sse.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up8x9-sse-acc2.c
 
-tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f32-dwconv/up4x25-sse.c
-tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f32-dwconv/up4x25-sse-acc2.c
-tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f32-dwconv/up8x25-sse.c
-tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f32-dwconv/up8x25-sse-acc2.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up4x25-sse.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up4x25-sse-acc2.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up8x25-sse.c
+tools/xngen src/f32-dwconv/up-sse.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up8x25-sse-acc2.c
 
 ################################### x86 AVX ###################################
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D FMA=0 -D ACCUMULATORS=1 -o src/f32-dwconv/up8x4-avx.c
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D FMA=0 -D ACCUMULATORS=2 -o src/f32-dwconv/up8x4-avx-acc2.c
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=4 -D FMA=0 -D ACCUMULATORS=1 -o src/f32-dwconv/up16x4-avx.c
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=4 -D FMA=0 -D ACCUMULATORS=2 -o src/f32-dwconv/up16x4-avx-acc2.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D FMA=0 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up8x4-avx.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D FMA=0 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up8x4-avx-acc2.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=4 -D FMA=0 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up16x4-avx.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=4 -D FMA=0 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up16x4-avx-acc2.c
 
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D FMA=0 -D ACCUMULATORS=1 -o src/f32-dwconv/up8x9-avx.c
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D FMA=0 -D ACCUMULATORS=2 -o src/f32-dwconv/up8x9-avx-acc2.c
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9 -D FMA=0 -D ACCUMULATORS=1 -o src/f32-dwconv/up16x9-avx.c
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9 -D FMA=0 -D ACCUMULATORS=2 -o src/f32-dwconv/up16x9-avx-acc2.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D FMA=0 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up8x9-avx.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D FMA=0 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up8x9-avx-acc2.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9 -D FMA=0 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up16x9-avx.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9 -D FMA=0 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up16x9-avx-acc2.c
 
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D FMA=0 -D ACCUMULATORS=1 -o src/f32-dwconv/up8x25-avx.c
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D FMA=0 -D ACCUMULATORS=2 -o src/f32-dwconv/up8x25-avx-acc2.c
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D FMA=0 -D ACCUMULATORS=1 -o src/f32-dwconv/up16x25-avx.c
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D FMA=0 -D ACCUMULATORS=2 -o src/f32-dwconv/up16x25-avx-acc2.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D FMA=0 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up8x25-avx.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D FMA=0 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up8x25-avx-acc2.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D FMA=0 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up16x25-avx.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D FMA=0 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up16x25-avx-acc2.c
 
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D FMA=3 -D ACCUMULATORS=1 -o src/f32-dwconv/up8x4-fma3.c
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D FMA=3 -D ACCUMULATORS=2 -o src/f32-dwconv/up8x4-fma3-acc2.c
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=4 -D FMA=3 -D ACCUMULATORS=1 -o src/f32-dwconv/up16x4-fma3.c
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=4 -D FMA=3 -D ACCUMULATORS=2 -o src/f32-dwconv/up16x4-fma3-acc2.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D FMA=3 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up8x4-fma3.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D FMA=3 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up8x4-fma3-acc2.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=4 -D FMA=3 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up16x4-fma3.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=4 -D FMA=3 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up16x4-fma3-acc2.c
 
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D FMA=3 -D ACCUMULATORS=1 -o src/f32-dwconv/up8x9-fma3.c
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D FMA=3 -D ACCUMULATORS=2 -o src/f32-dwconv/up8x9-fma3-acc2.c
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9 -D FMA=3 -D ACCUMULATORS=1 -o src/f32-dwconv/up16x9-fma3.c
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9 -D FMA=3 -D ACCUMULATORS=2 -o src/f32-dwconv/up16x9-fma3-acc2.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D FMA=3 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up8x9-fma3.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D FMA=3 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up8x9-fma3-acc2.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9 -D FMA=3 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up16x9-fma3.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=9 -D FMA=3 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up16x9-fma3-acc2.c
 
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D FMA=3 -D ACCUMULATORS=1 -o src/f32-dwconv/up8x25-fma3.c
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D FMA=3 -D ACCUMULATORS=2 -o src/f32-dwconv/up8x25-fma3-acc2.c
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D FMA=3 -D ACCUMULATORS=1 -o src/f32-dwconv/up16x25-fma3.c
-tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D FMA=3 -D ACCUMULATORS=2 -o src/f32-dwconv/up16x25-fma3-acc2.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D FMA=3 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up8x25-fma3.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D FMA=3 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up8x25-fma3-acc2.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D FMA=3 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up16x25-fma3.c
+tools/xngen src/f32-dwconv/up-avx.c.in -D CHANNEL_TILE=16 -D KERNEL_TILE=25 -D FMA=3 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up16x25-fma3-acc2.c
 
 #################################### PSIMD ####################################
-tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=4 -D ACCUMULATORS=1 -o src/f32-dwconv/up4x4-psimd.c
-tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=4 -D ACCUMULATORS=2 -o src/f32-dwconv/up4x4-psimd-acc2.c
-tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D ACCUMULATORS=1 -o src/f32-dwconv/up8x4-psimd.c
-tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D ACCUMULATORS=2 -o src/f32-dwconv/up8x4-psimd-acc2.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=4 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up4x4-psimd.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=4 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up4x4-psimd-acc2.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up8x4-psimd.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=4 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up8x4-psimd-acc2.c
 
-tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -o src/f32-dwconv/up4x9-psimd.c
-tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -o src/f32-dwconv/up4x9-psimd-acc2.c
-tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -o src/f32-dwconv/up8x9-psimd.c
-tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -o src/f32-dwconv/up8x9-psimd-acc2.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up4x9-psimd.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up4x9-psimd-acc2.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up8x9-psimd.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=9 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up8x9-psimd-acc2.c
 
-tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f32-dwconv/up4x25-psimd.c
-tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f32-dwconv/up4x25-psimd-acc2.c
-tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f32-dwconv/up8x25-psimd.c
-tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f32-dwconv/up8x25-psimd-acc2.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up4x25-psimd.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=4 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up4x25-psimd-acc2.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up8x25-psimd.c
+tools/xngen src/f32-dwconv/up-psimd.c.in -D CHANNEL_TILE=8 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up8x25-psimd-acc2.c
 
 #################################### Scalar ###################################
-tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=1 -D KERNEL_TILE=4  -D ACCUMULATORS=1 -o src/f32-dwconv/up1x4-scalar.c
-tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=1 -D KERNEL_TILE=4  -D ACCUMULATORS=2 -o src/f32-dwconv/up1x4-scalar-acc2.c
-tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=2 -D KERNEL_TILE=4  -D ACCUMULATORS=1 -o src/f32-dwconv/up2x4-scalar.c
-tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=2 -D KERNEL_TILE=4  -D ACCUMULATORS=2 -o src/f32-dwconv/up2x4-scalar-acc2.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=1 -D KERNEL_TILE=4  -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up1x4-scalar.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=1 -D KERNEL_TILE=4  -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up1x4-scalar-acc2.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=2 -D KERNEL_TILE=4  -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up2x4-scalar.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=2 -D KERNEL_TILE=4  -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up2x4-scalar-acc2.c
 
-tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=1 -D KERNEL_TILE=9  -D ACCUMULATORS=1 -o src/f32-dwconv/up1x9-scalar.c
-tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=1 -D KERNEL_TILE=9  -D ACCUMULATORS=2 -o src/f32-dwconv/up1x9-scalar-acc2.c
-tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=2 -D KERNEL_TILE=9  -D ACCUMULATORS=1 -o src/f32-dwconv/up2x9-scalar.c
-tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=2 -D KERNEL_TILE=9  -D ACCUMULATORS=2 -o src/f32-dwconv/up2x9-scalar-acc2.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=1 -D KERNEL_TILE=9  -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up1x9-scalar.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=1 -D KERNEL_TILE=9  -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up1x9-scalar-acc2.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=2 -D KERNEL_TILE=9  -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up2x9-scalar.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=2 -D KERNEL_TILE=9  -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up2x9-scalar-acc2.c
 
-tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=1 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f32-dwconv/up1x25-scalar.c
-tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=1 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f32-dwconv/up1x25-scalar-acc2.c
-tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=2 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f32-dwconv/up2x25-scalar.c
-tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=2 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f32-dwconv/up2x25-scalar-acc2.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=1 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up1x25-scalar.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=1 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up1x25-scalar-acc2.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=2 -D KERNEL_TILE=25 -D ACCUMULATORS=1 -o src/f32-dwconv/gen/up2x25-scalar.c
+tools/xngen src/f32-dwconv/up-scalar.c.in -D CHANNEL_TILE=2 -D KERNEL_TILE=25 -D ACCUMULATORS=2 -o src/f32-dwconv/gen/up2x25-scalar-acc2.c
 
 ################################## Unit tests #################################
 tools/generate-dwconv-test.py --spec test/f32-dwconv.yaml --output test/f32-dwconv.cc
diff --git a/scripts/generate-f32-gemm.sh b/scripts/generate-f32-gemm.sh
index 566a03f..388c74e 100755
--- a/scripts/generate-f32-gemm.sh
+++ b/scripts/generate-f32-gemm.sh
@@ -5,214 +5,214 @@
 # LICENSE file in the root directory of this source tree.
 
 #################################### Scalar ###################################
-tools/xngen src/f32-gemm/scalar.c.in -D MR=1 -D NR=4 -D INC=0 -o src/f32-gemm/1x4-scalar.c
-tools/xngen src/f32-gemm/scalar.c.in -D MR=1 -D NR=4 -D INC=1 -o src/f32-gemminc/1x4-scalar.c
+tools/xngen src/f32-gemm/scalar.c.in -D MR=1 -D NR=4 -D INC=0 -o src/f32-gemm/gen/1x4-scalar.c
+tools/xngen src/f32-gemm/scalar.c.in -D MR=1 -D NR=4 -D INC=1 -o src/f32-gemm/gen-inc/1x4-scalar.c
 
-tools/xngen src/f32-gemm/scalar.c.in -D MR=2 -D NR=4 -D INC=0 -o src/f32-gemm/2x4-scalar.c
-tools/xngen src/f32-gemm/scalar.c.in -D MR=2 -D NR=4 -D INC=1 -o src/f32-gemminc/2x4-scalar.c
+tools/xngen src/f32-gemm/scalar.c.in -D MR=2 -D NR=4 -D INC=0 -o src/f32-gemm/gen/2x4-scalar.c
+tools/xngen src/f32-gemm/scalar.c.in -D MR=2 -D NR=4 -D INC=1 -o src/f32-gemm/gen-inc/2x4-scalar.c
 
-tools/xngen src/f32-gemm/scalar.c.in -D MR=4 -D NR=2 -D INC=0 -o src/f32-gemm/4x2-scalar.c
+tools/xngen src/f32-gemm/scalar.c.in -D MR=4 -D NR=2 -D INC=0 -o src/f32-gemm/gen/4x2-scalar.c
 
-tools/xngen src/f32-gemm/scalar.c.in -D MR=4 -D NR=4 -D INC=0 -o src/f32-gemm/4x4-scalar.c
-tools/xngen src/f32-gemm/scalar.c.in -D MR=4 -D NR=4 -D INC=1 -o src/f32-gemminc/4x4-scalar.c
+tools/xngen src/f32-gemm/scalar.c.in -D MR=4 -D NR=4 -D INC=0 -o src/f32-gemm/gen/4x4-scalar.c
+tools/xngen src/f32-gemm/scalar.c.in -D MR=4 -D NR=4 -D INC=1 -o src/f32-gemm/gen-inc/4x4-scalar.c
 
 ############################### AArch64 assembly ##############################
-tools/xngen src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in -D INC=0 -o src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S
-tools/xngen src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in -D INC=1 -o src/f32-gemminc/1x12-aarch64-neonfma-cortex-a53.S
+tools/xngen src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in -D INC=0 -o src/f32-gemm/gen/1x12-aarch64-neonfma-cortex-a53.S
+tools/xngen src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in -D INC=1 -o src/f32-gemm/gen-inc/1x12-aarch64-neonfma-cortex-a53.S
 
-tools/xngen src/f32-gemm/1x8-aarch64-neonfma-cortex-a53.S.in  -D INC=0 -o src/f32-gemm/1x8-aarch64-neonfma-cortex-a53.S
-tools/xngen src/f32-gemm/1x8-aarch64-neonfma-cortex-a53.S.in  -D INC=1 -o src/f32-gemminc/1x8-aarch64-neonfma-cortex-a53.S
+tools/xngen src/f32-gemm/1x8-aarch64-neonfma-cortex-a53.S.in  -D INC=0 -o src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a53.S
+tools/xngen src/f32-gemm/1x8-aarch64-neonfma-cortex-a53.S.in  -D INC=1 -o src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a53.S
 
-tools/xngen src/f32-gemm/1x8-aarch64-neonfma-cortex-a57.S.in  -D INC=0 -o src/f32-gemm/1x8-aarch64-neonfma-cortex-a57.S
-tools/xngen src/f32-gemm/1x8-aarch64-neonfma-cortex-a57.S.in  -D INC=1 -o src/f32-gemminc/1x8-aarch64-neonfma-cortex-a57.S
+tools/xngen src/f32-gemm/1x8-aarch64-neonfma-cortex-a57.S.in  -D INC=0 -o src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a57.S
+tools/xngen src/f32-gemm/1x8-aarch64-neonfma-cortex-a57.S.in  -D INC=1 -o src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a57.S
 
-tools/xngen src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S.in  -D INC=0 -o src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S
-tools/xngen src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S.in  -D INC=1 -o src/f32-gemminc/1x8-aarch64-neonfma-cortex-a75.S
+tools/xngen src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S.in  -D INC=0 -o src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a75.S
+tools/xngen src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S.in  -D INC=1 -o src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a75.S
 
-tools/xngen src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in -D INC=0 -o src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S
-tools/xngen src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in -D INC=1 -o src/f32-gemminc/4x12-aarch64-neonfma-cortex-a53.S
+tools/xngen src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in -D INC=0 -o src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S
+tools/xngen src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S.in -D INC=1 -o src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S
 
-tools/xngen src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in  -D INC=0 -o src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S
-tools/xngen src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in  -D INC=1 -o src/f32-gemminc/4x8-aarch64-neonfma-cortex-a53.S
+tools/xngen src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in  -D INC=0 -o src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a53.S
+tools/xngen src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in  -D INC=1 -o src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a53.S
 
-tools/xngen src/f32-gemm/4x8-aarch64-neonfma-cortex-a57.S.in  -D INC=0 -o src/f32-gemm/4x8-aarch64-neonfma-cortex-a57.S
-tools/xngen src/f32-gemm/4x8-aarch64-neonfma-cortex-a57.S.in  -D INC=1 -o src/f32-gemminc/4x8-aarch64-neonfma-cortex-a57.S
+tools/xngen src/f32-gemm/4x8-aarch64-neonfma-cortex-a57.S.in  -D INC=0 -o src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a57.S
+tools/xngen src/f32-gemm/4x8-aarch64-neonfma-cortex-a57.S.in  -D INC=1 -o src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a57.S
 
-tools/xngen src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in  -D INC=0 -o src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S
-tools/xngen src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in  -D INC=1 -o src/f32-gemminc/4x8-aarch64-neonfma-cortex-a75.S
+tools/xngen src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in  -D INC=0 -o src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a75.S
+tools/xngen src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in  -D INC=1 -o src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a75.S
 
-tools/xngen src/f32-gemm/4x8-aarch64-neonfma-ld128.S.in       -D INC=0 -o src/f32-gemm/4x8-aarch64-neonfma-ld128.S
-tools/xngen src/f32-gemm/4x8-aarch64-neonfma-ld128.S.in       -D INC=1 -o src/f32-gemminc/4x8-aarch64-neonfma-ld128.S
+tools/xngen src/f32-gemm/4x8-aarch64-neonfma-ld128.S.in       -D INC=0 -o src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S
+tools/xngen src/f32-gemm/4x8-aarch64-neonfma-ld128.S.in       -D INC=1 -o src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld128.S
 
-tools/xngen src/f32-gemm/4x8-aarch64-neonfma-ld64.S.in        -D INC=0 -o src/f32-gemm/4x8-aarch64-neonfma-ld64.S
-tools/xngen src/f32-gemm/4x8-aarch64-neonfma-ld64.S.in        -D INC=1 -o src/f32-gemminc/4x8-aarch64-neonfma-ld64.S
+tools/xngen src/f32-gemm/4x8-aarch64-neonfma-ld64.S.in        -D INC=0 -o src/f32-gemm/gen/4x8-aarch64-neonfma-ld64.S
+tools/xngen src/f32-gemm/4x8-aarch64-neonfma-ld64.S.in        -D INC=1 -o src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld64.S
 
-tools/xngen src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in  -D INC=0 -o src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S
-tools/xngen src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in  -D INC=1 -o src/f32-gemminc/5x8-aarch64-neonfma-cortex-a75.S
+tools/xngen src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in  -D INC=0 -o src/f32-gemm/gen/5x8-aarch64-neonfma-cortex-a75.S
+tools/xngen src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in  -D INC=1 -o src/f32-gemm/gen-inc/5x8-aarch64-neonfma-cortex-a75.S
 
-tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in  -D INC=0 -o src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S
-tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in  -D INC=1 -o src/f32-gemminc/6x8-aarch64-neonfma-cortex-a53.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in  -D INC=0 -o src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a53.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in  -D INC=1 -o src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a53.S
 
-tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a57.S.in  -D INC=0 -o src/f32-gemm/6x8-aarch64-neonfma-cortex-a57.S
-tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a57.S.in  -D INC=1 -o src/f32-gemminc/6x8-aarch64-neonfma-cortex-a57.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a57.S.in  -D INC=0 -o src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a57.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a57.S.in  -D INC=1 -o src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a57.S
 
-tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S.in  -D INC=0 -o src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S
-tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S.in  -D INC=1 -o src/f32-gemminc/6x8-aarch64-neonfma-cortex-a73.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S.in  -D INC=0 -o src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a73.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S.in  -D INC=1 -o src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a73.S
 
-tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in  -D INC=0 -o src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S
-tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in  -D INC=1 -o src/f32-gemminc/6x8-aarch64-neonfma-cortex-a75.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in  -D INC=0 -o src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a75.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in  -D INC=1 -o src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a75.S
 
-tools/xngen src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in        -D INC=0 -o src/f32-gemm/6x8-aarch64-neonfma-ld64.S
-tools/xngen src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in        -D INC=1 -o src/f32-gemminc/6x8-aarch64-neonfma-ld64.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in        -D INC=0 -o src/f32-gemm/gen/6x8-aarch64-neonfma-ld64.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in        -D INC=1 -o src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld64.S
 
-tools/xngen src/f32-gemm/6x8-aarch64-neonfma-ld128.S.in       -D INC=0 -o src/f32-gemm/6x8-aarch64-neonfma-ld128.S
-tools/xngen src/f32-gemm/6x8-aarch64-neonfma-ld128.S.in       -D INC=1 -o src/f32-gemminc/6x8-aarch64-neonfma-ld128.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-ld128.S.in       -D INC=0 -o src/f32-gemm/gen/6x8-aarch64-neonfma-ld128.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-ld128.S.in       -D INC=1 -o src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S
 
 ################################### ARM NEON ##################################
 ### LD64 micro-kernels
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=0 -D INC=0 -D DUP=0 -o src/f32-gemm/1x8-neon-lane-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=0 -D INC=1 -D DUP=0 -o src/f32-gemminc/1x8-neon-lane-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=1 -D INC=0 -D DUP=0 -o src/f32-gemm/1x8-neonfma-lane-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=1 -D INC=1 -D DUP=0 -o src/f32-gemminc/1x8-neonfma-lane-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=0 -D INC=0 -D DUP=0 -o src/f32-gemm/4x8-neon-lane-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=0 -D INC=1 -D DUP=0 -o src/f32-gemminc/4x8-neon-lane-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=1 -D INC=0 -D DUP=0 -o src/f32-gemm/4x8-neonfma-lane-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=1 -D INC=1 -D DUP=0 -o src/f32-gemminc/4x8-neonfma-lane-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=5 -D NR=8  -D FMA=0 -D INC=0 -D DUP=0 -o src/f32-gemm/5x8-neon-lane-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=5 -D NR=8  -D FMA=0 -D INC=1 -D DUP=0 -o src/f32-gemminc/5x8-neon-lane-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=5 -D NR=8  -D FMA=1 -D INC=0 -D DUP=0 -o src/f32-gemm/5x8-neonfma-lane-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=5 -D NR=8  -D FMA=1 -D INC=1 -D DUP=0 -o src/f32-gemminc/5x8-neonfma-lane-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=0 -D INC=0 -D DUP=0 -o src/f32-gemm/6x8-neon-lane-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=0 -D INC=1 -D DUP=0 -o src/f32-gemminc/6x8-neon-lane-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=1 -D INC=0 -D DUP=0 -o src/f32-gemm/6x8-neonfma-lane-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=1 -D INC=1 -D DUP=0 -o src/f32-gemminc/6x8-neonfma-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=0 -D INC=0 -D DUP=0 -o src/f32-gemm/gen/1x8-neon-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=0 -D INC=1 -D DUP=0 -o src/f32-gemm/gen-inc/1x8-neon-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=1 -D INC=0 -D DUP=0 -o src/f32-gemm/gen/1x8-neonfma-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=1 -D INC=1 -D DUP=0 -o src/f32-gemm/gen-inc/1x8-neonfma-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=0 -D INC=0 -D DUP=0 -o src/f32-gemm/gen/4x8-neon-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=0 -D INC=1 -D DUP=0 -o src/f32-gemm/gen-inc/4x8-neon-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=1 -D INC=0 -D DUP=0 -o src/f32-gemm/gen/4x8-neonfma-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=1 -D INC=1 -D DUP=0 -o src/f32-gemm/gen-inc/4x8-neonfma-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=5 -D NR=8  -D FMA=0 -D INC=0 -D DUP=0 -o src/f32-gemm/gen/5x8-neon-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=5 -D NR=8  -D FMA=0 -D INC=1 -D DUP=0 -o src/f32-gemm/gen-inc/5x8-neon-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=5 -D NR=8  -D FMA=1 -D INC=0 -D DUP=0 -o src/f32-gemm/gen/5x8-neonfma-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=5 -D NR=8  -D FMA=1 -D INC=1 -D DUP=0 -o src/f32-gemm/gen-inc/5x8-neonfma-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=0 -D INC=0 -D DUP=0 -o src/f32-gemm/gen/6x8-neon-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=0 -D INC=1 -D DUP=0 -o src/f32-gemm/gen-inc/6x8-neon-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=1 -D INC=0 -D DUP=0 -o src/f32-gemm/gen/6x8-neonfma-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=1 -D INC=1 -D DUP=0 -o src/f32-gemm/gen-inc/6x8-neonfma-lane-ld64.c
 ### LD128 micro-kernels
-tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=0 -D INC=0 -D DUP=0 -o src/f32-gemm/4x8-neon-lane-ld128.c
-tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=0 -D INC=1 -D DUP=0 -o src/f32-gemminc/4x8-neon-lane-ld128.c
-tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=1 -D INC=0 -D DUP=0 -o src/f32-gemm/4x8-neonfma-lane-ld128.c
-tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=1 -D INC=1 -D DUP=0 -o src/f32-gemminc/4x8-neonfma-lane-ld128.c
+tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=0 -D INC=0 -D DUP=0 -o src/f32-gemm/gen/4x8-neon-lane-ld128.c
+tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=0 -D INC=1 -D DUP=0 -o src/f32-gemm/gen-inc/4x8-neon-lane-ld128.c
+tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=1 -D INC=0 -D DUP=0 -o src/f32-gemm/gen/4x8-neonfma-lane-ld128.c
+tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=1 -D INC=1 -D DUP=0 -o src/f32-gemm/gen-inc/4x8-neonfma-lane-ld128.c
 ### MRx2 micro-kernels
-tools/xngen src/f32-gemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2  -D FMA=0 -D INC=0 -D DUP=0 -o src/f32-gemm/4x2-neon-lane-ld64.c
-tools/xngen src/f32-gemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2  -D FMA=1 -D INC=0 -D DUP=0 -o src/f32-gemm/4x2-neonfma-lane-ld64.c
+tools/xngen src/f32-gemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2  -D FMA=0 -D INC=0 -D DUP=0 -o src/f32-gemm/gen/4x2-neon-lane-ld64.c
+tools/xngen src/f32-gemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2  -D FMA=1 -D INC=0 -D DUP=0 -o src/f32-gemm/gen/4x2-neonfma-lane-ld64.c
 ### DUP LD64 micro-kernels
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=0 -D INC=0 -D DUP=1 -o src/f32-gemm/1x8-neon-dup-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=0 -D INC=1 -D DUP=1 -o src/f32-gemminc/1x8-neon-dup-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=1 -D INC=0 -D DUP=1 -o src/f32-gemm/1x8-neonfma-dup-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=1 -D INC=1 -D DUP=1 -o src/f32-gemminc/1x8-neonfma-dup-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=0 -D INC=0 -D DUP=1 -o src/f32-gemm/4x8-neon-dup-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=0 -D INC=1 -D DUP=1 -o src/f32-gemminc/4x8-neon-dup-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=1 -D INC=0 -D DUP=1 -o src/f32-gemm/4x8-neonfma-dup-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=1 -D INC=1 -D DUP=1 -o src/f32-gemminc/4x8-neonfma-dup-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=0 -D INC=0 -D DUP=1 -o src/f32-gemm/6x8-neon-dup-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=0 -D INC=1 -D DUP=1 -o src/f32-gemminc/6x8-neon-dup-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=1 -D INC=0 -D DUP=1 -o src/f32-gemm/6x8-neonfma-dup-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=1 -D INC=1 -D DUP=1 -o src/f32-gemminc/6x8-neonfma-dup-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=0 -D INC=0 -D DUP=1 -o src/f32-gemm/gen/1x8-neon-dup-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=0 -D INC=1 -D DUP=1 -o src/f32-gemm/gen-inc/1x8-neon-dup-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=1 -D INC=0 -D DUP=1 -o src/f32-gemm/gen/1x8-neonfma-dup-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=1 -D INC=1 -D DUP=1 -o src/f32-gemm/gen-inc/1x8-neonfma-dup-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=0 -D INC=0 -D DUP=1 -o src/f32-gemm/gen/4x8-neon-dup-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=0 -D INC=1 -D DUP=1 -o src/f32-gemm/gen-inc/4x8-neon-dup-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=1 -D INC=0 -D DUP=1 -o src/f32-gemm/gen/4x8-neonfma-dup-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=1 -D INC=1 -D DUP=1 -o src/f32-gemm/gen-inc/4x8-neonfma-dup-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=0 -D INC=0 -D DUP=1 -o src/f32-gemm/gen/6x8-neon-dup-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=0 -D INC=1 -D DUP=1 -o src/f32-gemm/gen-inc/6x8-neon-dup-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=1 -D INC=0 -D DUP=1 -o src/f32-gemm/gen/6x8-neonfma-dup-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=1 -D INC=1 -D DUP=1 -o src/f32-gemm/gen-inc/6x8-neonfma-dup-ld64.c
-### DUP LD128 midupkernels
+### DUP LD128 micro-kernels
-tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=0 -D INC=0 -D DUP=1 -o src/f32-gemm/4x8-neon-dup-ld128.c
-tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=0 -D INC=1 -D DUP=1 -o src/f32-gemminc/4x8-neon-dup-ld128.c
-tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=1 -D INC=0 -D DUP=1 -o src/f32-gemm/4x8-neonfma-dup-ld128.c
-tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=1 -D INC=1 -D DUP=1 -o src/f32-gemminc/4x8-neonfma-dup-ld128.c
+tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=0 -D INC=0 -D DUP=1 -o src/f32-gemm/gen/4x8-neon-dup-ld128.c
+tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=0 -D INC=1 -D DUP=1 -o src/f32-gemm/gen-inc/4x8-neon-dup-ld128.c
+tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=1 -D INC=0 -D DUP=1 -o src/f32-gemm/gen/4x8-neonfma-dup-ld128.c
+tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=1 -D INC=1 -D DUP=1 -o src/f32-gemm/gen-inc/4x8-neonfma-dup-ld128.c
 ### LOAD4+PERMUTE micro-kernels
-tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=1 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/1x8s4-neon.c
-tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=1 -D NR=8  -D FMA=0 -D INC=1 -o src/f32-gemminc/1x8s4-neon.c
-tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=1 -D NR=8  -D FMA=1 -D INC=0 -o src/f32-gemm/1x8s4-neonfma.c
-tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=1 -D NR=8  -D FMA=1 -D INC=1 -o src/f32-gemminc/1x8s4-neonfma.c
-tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=4 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/4x8s4-neon.c
-tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=4 -D NR=8  -D FMA=0 -D INC=1 -o src/f32-gemminc/4x8s4-neon.c
-tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=4 -D NR=8  -D FMA=1 -D INC=0 -o src/f32-gemm/4x8s4-neonfma.c
-tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=4 -D NR=8  -D FMA=1 -D INC=1 -o src/f32-gemminc/4x8s4-neonfma.c
-tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=6 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/6x8s4-neon.c
-tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=6 -D NR=8  -D FMA=0 -D INC=1 -o src/f32-gemminc/6x8s4-neon.c
-tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=6 -D NR=8  -D FMA=1 -D INC=0 -o src/f32-gemm/6x8s4-neonfma.c
-tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=6 -D NR=8  -D FMA=1 -D INC=1 -o src/f32-gemminc/6x8s4-neonfma.c
-tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=8 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/8x8s4-neon.c
-tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=8 -D NR=8  -D FMA=0 -D INC=1 -o src/f32-gemminc/8x8s4-neon.c
-tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=8 -D NR=8  -D FMA=1 -D INC=0 -o src/f32-gemm/8x8s4-neonfma.c
-tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=8 -D NR=8  -D FMA=1 -D INC=1 -o src/f32-gemminc/8x8s4-neonfma.c
+tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=1 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/gen/1x8s4-neon.c
+tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=1 -D NR=8  -D FMA=0 -D INC=1 -o src/f32-gemm/gen-inc/1x8s4-neon.c
+tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=1 -D NR=8  -D FMA=1 -D INC=0 -o src/f32-gemm/gen/1x8s4-neonfma.c
+tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=1 -D NR=8  -D FMA=1 -D INC=1 -o src/f32-gemm/gen-inc/1x8s4-neonfma.c
+tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=4 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/gen/4x8s4-neon.c
+tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=4 -D NR=8  -D FMA=0 -D INC=1 -o src/f32-gemm/gen-inc/4x8s4-neon.c
+tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=4 -D NR=8  -D FMA=1 -D INC=0 -o src/f32-gemm/gen/4x8s4-neonfma.c
+tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=4 -D NR=8  -D FMA=1 -D INC=1 -o src/f32-gemm/gen-inc/4x8s4-neonfma.c
+tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=6 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/gen/6x8s4-neon.c
+tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=6 -D NR=8  -D FMA=0 -D INC=1 -o src/f32-gemm/gen-inc/6x8s4-neon.c
+tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=6 -D NR=8  -D FMA=1 -D INC=0 -o src/f32-gemm/gen/6x8s4-neonfma.c
+tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=6 -D NR=8  -D FMA=1 -D INC=1 -o src/f32-gemm/gen-inc/6x8s4-neonfma.c
+tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=8 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/gen/8x8s4-neon.c
+tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=8 -D NR=8  -D FMA=0 -D INC=1 -o src/f32-gemm/gen-inc/8x8s4-neon.c
+tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=8 -D NR=8  -D FMA=1 -D INC=0 -o src/f32-gemm/gen/8x8s4-neonfma.c
+tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=8 -D NR=8  -D FMA=1 -D INC=1 -o src/f32-gemm/gen-inc/8x8s4-neonfma.c
 
 #################################### PSIMD ####################################
 ### LOAD1+BROADCAST micro-kernels
-tools/xngen src/f32-gemm/psimd-loadsplat.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/1x8-psimd-loadsplat.c
-tools/xngen src/f32-gemm/psimd-loadsplat.c.in -D MR=1 -D NR=8 -D INC=1 -o src/f32-gemminc/1x8-psimd-loadsplat.c
+tools/xngen src/f32-gemm/psimd-loadsplat.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/gen/1x8-psimd-loadsplat.c
+tools/xngen src/f32-gemm/psimd-loadsplat.c.in -D MR=1 -D NR=8 -D INC=1 -o src/f32-gemm/gen-inc/1x8-psimd-loadsplat.c
 
-tools/xngen src/f32-gemm/psimd-loadsplat.c.in -D MR=4 -D NR=8 -D INC=0 -o src/f32-gemm/4x8-psimd-loadsplat.c
-tools/xngen src/f32-gemm/psimd-loadsplat.c.in -D MR=4 -D NR=8 -D INC=1 -o src/f32-gemminc/4x8-psimd-loadsplat.c
+tools/xngen src/f32-gemm/psimd-loadsplat.c.in -D MR=4 -D NR=8 -D INC=0 -o src/f32-gemm/gen/4x8-psimd-loadsplat.c
+tools/xngen src/f32-gemm/psimd-loadsplat.c.in -D MR=4 -D NR=8 -D INC=1 -o src/f32-gemm/gen-inc/4x8-psimd-loadsplat.c
 
-tools/xngen src/f32-gemm/psimd-loadsplat.c.in -D MR=6 -D NR=8 -D INC=0 -o src/f32-gemm/6x8-psimd-loadsplat.c
-tools/xngen src/f32-gemm/psimd-loadsplat.c.in -D MR=6 -D NR=8 -D INC=1 -o src/f32-gemminc/6x8-psimd-loadsplat.c
+tools/xngen src/f32-gemm/psimd-loadsplat.c.in -D MR=6 -D NR=8 -D INC=0 -o src/f32-gemm/gen/6x8-psimd-loadsplat.c
+tools/xngen src/f32-gemm/psimd-loadsplat.c.in -D MR=6 -D NR=8 -D INC=1 -o src/f32-gemm/gen-inc/6x8-psimd-loadsplat.c
 ### LOAD4+DUPLICATE micro-kernels
-tools/xngen src/f32-gemm/psimd-splat.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/1x8-psimd-splat.c
-tools/xngen src/f32-gemm/psimd-splat.c.in -D MR=1 -D NR=8 -D INC=1 -o src/f32-gemminc/1x8-psimd-splat.c
+tools/xngen src/f32-gemm/psimd-splat.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/gen/1x8-psimd-splat.c
+tools/xngen src/f32-gemm/psimd-splat.c.in -D MR=1 -D NR=8 -D INC=1 -o src/f32-gemm/gen-inc/1x8-psimd-splat.c
 
-tools/xngen src/f32-gemm/psimd-splat.c.in -D MR=4 -D NR=8 -D INC=0 -o src/f32-gemm/4x8-psimd-splat.c
-tools/xngen src/f32-gemm/psimd-splat.c.in -D MR=4 -D NR=8 -D INC=1 -o src/f32-gemminc/4x8-psimd-splat.c
+tools/xngen src/f32-gemm/psimd-splat.c.in -D MR=4 -D NR=8 -D INC=0 -o src/f32-gemm/gen/4x8-psimd-splat.c
+tools/xngen src/f32-gemm/psimd-splat.c.in -D MR=4 -D NR=8 -D INC=1 -o src/f32-gemm/gen-inc/4x8-psimd-splat.c
 
-tools/xngen src/f32-gemm/psimd-splat.c.in -D MR=6 -D NR=8 -D INC=0 -o src/f32-gemm/6x8-psimd-splat.c
-tools/xngen src/f32-gemm/psimd-splat.c.in -D MR=6 -D NR=8 -D INC=1 -o src/f32-gemminc/6x8-psimd-splat.c
+tools/xngen src/f32-gemm/psimd-splat.c.in -D MR=6 -D NR=8 -D INC=0 -o src/f32-gemm/gen/6x8-psimd-splat.c
+tools/xngen src/f32-gemm/psimd-splat.c.in -D MR=6 -D NR=8 -D INC=1 -o src/f32-gemm/gen-inc/6x8-psimd-splat.c
 ### LOAD4+PERMUTE micro-kernels
-tools/xngen src/f32-gemm/psimd-s4.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/1x8s4-psimd.c
-tools/xngen src/f32-gemm/psimd-s4.c.in -D MR=1 -D NR=8 -D INC=1 -o src/f32-gemminc/1x8s4-psimd.c
+tools/xngen src/f32-gemm/psimd-s4.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/gen/1x8s4-psimd.c
+tools/xngen src/f32-gemm/psimd-s4.c.in -D MR=1 -D NR=8 -D INC=1 -o src/f32-gemm/gen-inc/1x8s4-psimd.c
 
-tools/xngen src/f32-gemm/psimd-s4.c.in -D MR=4 -D NR=8 -D INC=0 -o src/f32-gemm/4x8s4-psimd.c
-tools/xngen src/f32-gemm/psimd-s4.c.in -D MR=4 -D NR=8 -D INC=1 -o src/f32-gemminc/4x8s4-psimd.c
+tools/xngen src/f32-gemm/psimd-s4.c.in -D MR=4 -D NR=8 -D INC=0 -o src/f32-gemm/gen/4x8s4-psimd.c
+tools/xngen src/f32-gemm/psimd-s4.c.in -D MR=4 -D NR=8 -D INC=1 -o src/f32-gemm/gen-inc/4x8s4-psimd.c
 
-tools/xngen src/f32-gemm/psimd-s4.c.in -D MR=6 -D NR=8 -D INC=0 -o src/f32-gemm/6x8s4-psimd.c
-tools/xngen src/f32-gemm/psimd-s4.c.in -D MR=6 -D NR=8 -D INC=1 -o src/f32-gemminc/6x8s4-psimd.c
+tools/xngen src/f32-gemm/psimd-s4.c.in -D MR=6 -D NR=8 -D INC=0 -o src/f32-gemm/gen/6x8s4-psimd.c
+tools/xngen src/f32-gemm/psimd-s4.c.in -D MR=6 -D NR=8 -D INC=1 -o src/f32-gemm/gen-inc/6x8s4-psimd.c
 
 ################################### x86 SSE ###################################
 ### LOAD1+BROADCAST micro-kernels
-tools/xngen src/f32-gemm/sse-load1.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/1x8-sse-load1.c
-tools/xngen src/f32-gemm/sse-load1.c.in -D MR=1 -D NR=8 -D INC=1 -o src/f32-gemminc/1x8-sse-load1.c
+tools/xngen src/f32-gemm/sse-load1.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/gen/1x8-sse-load1.c
+tools/xngen src/f32-gemm/sse-load1.c.in -D MR=1 -D NR=8 -D INC=1 -o src/f32-gemm/gen-inc/1x8-sse-load1.c
 
-tools/xngen src/f32-gemm/sse-load1.c.in -D MR=4 -D NR=8 -D INC=0 -o src/f32-gemm/4x8-sse-load1.c
-tools/xngen src/f32-gemm/sse-load1.c.in -D MR=4 -D NR=8 -D INC=1 -o src/f32-gemminc/4x8-sse-load1.c
+tools/xngen src/f32-gemm/sse-load1.c.in -D MR=4 -D NR=8 -D INC=0 -o src/f32-gemm/gen/4x8-sse-load1.c
+tools/xngen src/f32-gemm/sse-load1.c.in -D MR=4 -D NR=8 -D INC=1 -o src/f32-gemm/gen-inc/4x8-sse-load1.c
 ### LOAD4+DUPLICATE micro-kernels
-tools/xngen src/f32-gemm/sse-dup.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/1x8-sse-dup.c
-tools/xngen src/f32-gemm/sse-dup.c.in -D MR=1 -D NR=8 -D INC=1 -o src/f32-gemminc/1x8-sse-dup.c
+tools/xngen src/f32-gemm/sse-dup.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/gen/1x8-sse-dup.c
+tools/xngen src/f32-gemm/sse-dup.c.in -D MR=1 -D NR=8 -D INC=1 -o src/f32-gemm/gen-inc/1x8-sse-dup.c
 
-tools/xngen src/f32-gemm/sse-dup.c.in -D MR=4 -D NR=8 -D INC=0 -o src/f32-gemm/4x8-sse-dup.c
-tools/xngen src/f32-gemm/sse-dup.c.in -D MR=4 -D NR=8 -D INC=1 -o src/f32-gemminc/4x8-sse-dup.c
+tools/xngen src/f32-gemm/sse-dup.c.in -D MR=4 -D NR=8 -D INC=0 -o src/f32-gemm/gen/4x8-sse-dup.c
+tools/xngen src/f32-gemm/sse-dup.c.in -D MR=4 -D NR=8 -D INC=1 -o src/f32-gemm/gen-inc/4x8-sse-dup.c
 ### LOAD4+PERMUTE micro-kernels
-tools/xngen src/f32-gemm/sse-shuffle.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/1x8s4-sse.c
-tools/xngen src/f32-gemm/sse-shuffle.c.in -D MR=1 -D NR=8 -D INC=1 -o src/f32-gemminc/1x8s4-sse.c
+tools/xngen src/f32-gemm/sse-shuffle.c.in -D MR=1 -D NR=8 -D INC=0 -o src/f32-gemm/gen/1x8s4-sse.c
+tools/xngen src/f32-gemm/sse-shuffle.c.in -D MR=1 -D NR=8 -D INC=1 -o src/f32-gemm/gen-inc/1x8s4-sse.c
 
-tools/xngen src/f32-gemm/sse-shuffle.c.in -D MR=4 -D NR=8 -D INC=0 -o src/f32-gemm/4x8s4-sse.c
-tools/xngen src/f32-gemm/sse-shuffle.c.in -D MR=4 -D NR=8 -D INC=1 -o src/f32-gemminc/4x8s4-sse.c
+tools/xngen src/f32-gemm/sse-shuffle.c.in -D MR=4 -D NR=8 -D INC=0 -o src/f32-gemm/gen/4x8s4-sse.c
+tools/xngen src/f32-gemm/sse-shuffle.c.in -D MR=4 -D NR=8 -D INC=1 -o src/f32-gemm/gen-inc/4x8s4-sse.c
 
 ################################### x86 AVX ###################################
 ### AVX+BROADCAST micro-kernels
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=1 -D NR=8 -D FMA=0 -D INC=0 -o src/f32-gemm/1x8-avx-broadcast.c
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=1 -D NR=8 -D FMA=0 -D INC=1 -o src/f32-gemminc/1x8-avx-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=1 -D NR=8 -D FMA=0 -D INC=0 -o src/f32-gemm/gen/1x8-avx-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=1 -D NR=8 -D FMA=0 -D INC=1 -o src/f32-gemm/gen-inc/1x8-avx-broadcast.c
 
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=4 -D NR=8 -D FMA=0 -D INC=0 -o src/f32-gemm/4x8-avx-broadcast.c
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=4 -D NR=8 -D FMA=0 -D INC=1 -o src/f32-gemminc/4x8-avx-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=4 -D NR=8 -D FMA=0 -D INC=0 -o src/f32-gemm/gen/4x8-avx-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=4 -D NR=8 -D FMA=0 -D INC=1 -o src/f32-gemm/gen-inc/4x8-avx-broadcast.c
 
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=5 -D NR=8 -D FMA=0 -D INC=0 -o src/f32-gemm/5x8-avx-broadcast.c
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=5 -D NR=8 -D FMA=0 -D INC=1 -o src/f32-gemminc/5x8-avx-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=5 -D NR=8 -D FMA=0 -D INC=0 -o src/f32-gemm/gen/5x8-avx-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=5 -D NR=8 -D FMA=0 -D INC=1 -o src/f32-gemm/gen-inc/5x8-avx-broadcast.c
 
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=6 -D NR=8 -D FMA=0 -D INC=0 -o src/f32-gemm/6x8-avx-broadcast.c
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=6 -D NR=8 -D FMA=0 -D INC=1 -o src/f32-gemminc/6x8-avx-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=6 -D NR=8 -D FMA=0 -D INC=0 -o src/f32-gemm/gen/6x8-avx-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=6 -D NR=8 -D FMA=0 -D INC=1 -o src/f32-gemm/gen-inc/6x8-avx-broadcast.c
 
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=7 -D NR=8 -D FMA=0 -D INC=0 -o src/f32-gemm/7x8-avx-broadcast.c
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=7 -D NR=8 -D FMA=0 -D INC=1 -o src/f32-gemminc/7x8-avx-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=7 -D NR=8 -D FMA=0 -D INC=0 -o src/f32-gemm/gen/7x8-avx-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=7 -D NR=8 -D FMA=0 -D INC=1 -o src/f32-gemm/gen-inc/7x8-avx-broadcast.c
 ### FMA3+BROADCAST micro-kernels
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=1 -D NR=8 -D FMA=3 -D INC=0 -o src/f32-gemm/1x8-fma3-broadcast.c
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=1 -D NR=8 -D FMA=3 -D INC=1 -o src/f32-gemminc/1x8-fma3-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=1 -D NR=8 -D FMA=3 -D INC=0 -o src/f32-gemm/gen/1x8-fma3-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=1 -D NR=8 -D FMA=3 -D INC=1 -o src/f32-gemm/gen-inc/1x8-fma3-broadcast.c
 
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=4 -D NR=8 -D FMA=3 -D INC=0 -o src/f32-gemm/4x8-fma3-broadcast.c
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=4 -D NR=8 -D FMA=3 -D INC=1 -o src/f32-gemminc/4x8-fma3-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=4 -D NR=8 -D FMA=3 -D INC=0 -o src/f32-gemm/gen/4x8-fma3-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=4 -D NR=8 -D FMA=3 -D INC=1 -o src/f32-gemm/gen-inc/4x8-fma3-broadcast.c
 
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=5 -D NR=8 -D FMA=3 -D INC=0 -o src/f32-gemm/5x8-fma3-broadcast.c
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=5 -D NR=8 -D FMA=3 -D INC=1 -o src/f32-gemminc/5x8-fma3-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=5 -D NR=8 -D FMA=3 -D INC=0 -o src/f32-gemm/gen/5x8-fma3-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=5 -D NR=8 -D FMA=3 -D INC=1 -o src/f32-gemm/gen-inc/5x8-fma3-broadcast.c
 
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=6 -D NR=8 -D FMA=3 -D INC=0 -o src/f32-gemm/6x8-fma3-broadcast.c
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=6 -D NR=8 -D FMA=3 -D INC=1 -o src/f32-gemminc/6x8-fma3-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=6 -D NR=8 -D FMA=3 -D INC=0 -o src/f32-gemm/gen/6x8-fma3-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=6 -D NR=8 -D FMA=3 -D INC=1 -o src/f32-gemm/gen-inc/6x8-fma3-broadcast.c
 
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=7 -D NR=8 -D FMA=3 -D INC=0 -o src/f32-gemm/7x8-fma3-broadcast.c
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=7 -D NR=8 -D FMA=3 -D INC=1 -o src/f32-gemminc/7x8-fma3-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=7 -D NR=8 -D FMA=3 -D INC=0 -o src/f32-gemm/gen/7x8-fma3-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=7 -D NR=8 -D FMA=3 -D INC=1 -o src/f32-gemm/gen-inc/7x8-fma3-broadcast.c
 
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=8 -D NR=8 -D FMA=3 -D INC=0 -o src/f32-gemm/8x8-fma3-broadcast.c
-tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=8 -D NR=8 -D FMA=3 -D INC=1 -o src/f32-gemminc/8x8-fma3-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=8 -D NR=8 -D FMA=3 -D INC=0 -o src/f32-gemm/gen/8x8-fma3-broadcast.c
+tools/xngen src/f32-gemm/avx-broadcast.c.in -D MR=8 -D NR=8 -D FMA=3 -D INC=1 -o src/f32-gemm/gen-inc/8x8-fma3-broadcast.c
 
 ################################## Unit tests #################################
 tools/generate-gemm-test.py --spec test/f32-gemm.yaml --output test/f32-gemm.cc
diff --git a/scripts/generate-f32-igemm.sh b/scripts/generate-f32-igemm.sh
index e90fd27..b984168 100755
--- a/scripts/generate-f32-igemm.sh
+++ b/scripts/generate-f32-igemm.sh
@@ -5,90 +5,90 @@
 # LICENSE file in the root directory of this source tree.
 
 #################################### Scalar ###################################
-tools/xngen src/f32-igemm/scalar.c.in -D MR=1 -D NR=4 -o src/f32-igemm/1x4-scalar.c
-tools/xngen src/f32-igemm/scalar.c.in -D MR=2 -D NR=4 -o src/f32-igemm/2x4-scalar.c
-tools/xngen src/f32-igemm/scalar.c.in -D MR=4 -D NR=2 -o src/f32-igemm/4x2-scalar.c
-tools/xngen src/f32-igemm/scalar.c.in -D MR=4 -D NR=4 -o src/f32-igemm/4x4-scalar.c
+tools/xngen src/f32-igemm/scalar.c.in -D MR=1 -D NR=4 -o src/f32-igemm/gen/1x4-scalar.c
+tools/xngen src/f32-igemm/scalar.c.in -D MR=2 -D NR=4 -o src/f32-igemm/gen/2x4-scalar.c
+tools/xngen src/f32-igemm/scalar.c.in -D MR=4 -D NR=2 -o src/f32-igemm/gen/4x2-scalar.c
+tools/xngen src/f32-igemm/scalar.c.in -D MR=4 -D NR=4 -o src/f32-igemm/gen/4x4-scalar.c
 
 ################################### ARM NEON ##################################
 ### LD64 micro-kernels
-tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=1 -D NR=8 -D FMA=0 -D DUP=0 -o src/f32-igemm/1x8-neon-lane-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=1 -D NR=8 -D FMA=1 -D DUP=0 -o src/f32-igemm/1x8-neonfma-lane-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=4 -D NR=4 -D FMA=0 -D DUP=0 -o src/f32-igemm/4x4-neon-lane-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=4 -D NR=4 -D FMA=1 -D DUP=0 -o src/f32-igemm/4x4-neonfma-lane-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=4 -D NR=8 -D FMA=0 -D DUP=0 -o src/f32-igemm/4x8-neon-lane-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=4 -D NR=8 -D FMA=1 -D DUP=0 -o src/f32-igemm/4x8-neonfma-lane-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=6 -D NR=8 -D FMA=0 -D DUP=0 -o src/f32-igemm/6x8-neon-lane-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=6 -D NR=8 -D FMA=1 -D DUP=0 -o src/f32-igemm/6x8-neonfma-lane-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=1 -D NR=8 -D FMA=0 -D DUP=0 -o src/f32-igemm/gen/1x8-neon-lane-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=1 -D NR=8 -D FMA=1 -D DUP=0 -o src/f32-igemm/gen/1x8-neonfma-lane-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=4 -D NR=4 -D FMA=0 -D DUP=0 -o src/f32-igemm/gen/4x4-neon-lane-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=4 -D NR=4 -D FMA=1 -D DUP=0 -o src/f32-igemm/gen/4x4-neonfma-lane-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=4 -D NR=8 -D FMA=0 -D DUP=0 -o src/f32-igemm/gen/4x8-neon-lane-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=4 -D NR=8 -D FMA=1 -D DUP=0 -o src/f32-igemm/gen/4x8-neonfma-lane-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=6 -D NR=8 -D FMA=0 -D DUP=0 -o src/f32-igemm/gen/6x8-neon-lane-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=6 -D NR=8 -D FMA=1 -D DUP=0 -o src/f32-igemm/gen/6x8-neonfma-lane-ld64.c
 ### LD128 micro-kernels
-tools/xngen src/f32-igemm/neon-ld128.c.in     -D MR=4 -D NR=8 -D FMA=0 -D DUP=0 -o src/f32-igemm/4x8-neon-lane-ld128.c
-tools/xngen src/f32-igemm/neon-ld128.c.in     -D MR=4 -D NR=8 -D FMA=1 -D DUP=0 -o src/f32-igemm/4x8-neonfma-lane-ld128.c
+tools/xngen src/f32-igemm/neon-ld128.c.in     -D MR=4 -D NR=8 -D FMA=0 -D DUP=0 -o src/f32-igemm/gen/4x8-neon-lane-ld128.c
+tools/xngen src/f32-igemm/neon-ld128.c.in     -D MR=4 -D NR=8 -D FMA=1 -D DUP=0 -o src/f32-igemm/gen/4x8-neonfma-lane-ld128.c
 ### MRx2 micro-kernels-
-tools/xngen src/f32-igemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2 -D FMA=0 -D DUP=0 -o src/f32-igemm/4x2-neon-lane-ld64.c
-tools/xngen src/f32-igemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2 -D FMA=1 -D DUP=0 -o src/f32-igemm/4x2-neonfma-lane-ld64.c
+tools/xngen src/f32-igemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2 -D FMA=0 -D DUP=0 -o src/f32-igemm/gen/4x2-neon-lane-ld64.c
+tools/xngen src/f32-igemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2 -D FMA=1 -D DUP=0 -o src/f32-igemm/gen/4x2-neonfma-lane-ld64.c
 ### DUP LD64 micro-kernels
-tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=1 -D NR=8 -D FMA=0 -D DUP=1 -o src/f32-igemm/1x8-neon-dup-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=1 -D NR=8 -D FMA=1 -D DUP=1 -o src/f32-igemm/1x8-neonfma-dup-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=4 -D NR=8 -D FMA=0 -D DUP=1 -o src/f32-igemm/4x8-neon-dup-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=4 -D NR=8 -D FMA=1 -D DUP=1 -o src/f32-igemm/4x8-neonfma-dup-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=6 -D NR=8 -D FMA=0 -D DUP=1 -o src/f32-igemm/6x8-neon-dup-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=6 -D NR=8 -D FMA=1 -D DUP=1 -o src/f32-igemm/6x8-neonfma-dup-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=1 -D NR=8 -D FMA=0 -D DUP=1 -o src/f32-igemm/gen/1x8-neon-dup-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=1 -D NR=8 -D FMA=1 -D DUP=1 -o src/f32-igemm/gen/1x8-neonfma-dup-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=4 -D NR=8 -D FMA=0 -D DUP=1 -o src/f32-igemm/gen/4x8-neon-dup-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=4 -D NR=8 -D FMA=1 -D DUP=1 -o src/f32-igemm/gen/4x8-neonfma-dup-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=6 -D NR=8 -D FMA=0 -D DUP=1 -o src/f32-igemm/gen/6x8-neon-dup-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=6 -D NR=8 -D FMA=1 -D DUP=1 -o src/f32-igemm/gen/6x8-neonfma-dup-ld64.c
 ### DUP LD128 micro-kernels
-tools/xngen src/f32-igemm/neon-ld128.c.in     -D MR=4 -D NR=8 -D FMA=0 -D DUP=1 -o src/f32-igemm/4x8-neon-dup-ld128.c
-tools/xngen src/f32-igemm/neon-ld128.c.in     -D MR=4 -D NR=8 -D FMA=1 -D DUP=1 -o src/f32-igemm/4x8-neonfma-dup-ld128.c
+tools/xngen src/f32-igemm/neon-ld128.c.in     -D MR=4 -D NR=8 -D FMA=0 -D DUP=1 -o src/f32-igemm/gen/4x8-neon-dup-ld128.c
+tools/xngen src/f32-igemm/neon-ld128.c.in     -D MR=4 -D NR=8 -D FMA=1 -D DUP=1 -o src/f32-igemm/gen/4x8-neonfma-dup-ld128.c
 ### LOAD4+PERMUTE micro-kernels
-tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=1 -D NR=8 -D FMA=0 -o src/f32-igemm/1x8s4-neon.c
-tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=1 -D NR=8 -D FMA=1 -o src/f32-igemm/1x8s4-neonfma.c
-tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=4 -D NR=8 -D FMA=0 -o src/f32-igemm/4x8s4-neon.c
-tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=4 -D NR=8 -D FMA=1 -o src/f32-igemm/4x8s4-neonfma.c
-tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=6 -D NR=8 -D FMA=0 -o src/f32-igemm/6x8s4-neon.c
-tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=6 -D NR=8 -D FMA=1 -o src/f32-igemm/6x8s4-neonfma.c
-tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=8 -D NR=8 -D FMA=0 -o src/f32-igemm/8x8s4-neon.c
-tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=8 -D NR=8 -D FMA=1 -o src/f32-igemm/8x8s4-neonfma.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=1 -D NR=8 -D FMA=0 -o src/f32-igemm/gen/1x8s4-neon.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=1 -D NR=8 -D FMA=1 -o src/f32-igemm/gen/1x8s4-neonfma.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=4 -D NR=8 -D FMA=0 -o src/f32-igemm/gen/4x8s4-neon.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=4 -D NR=8 -D FMA=1 -o src/f32-igemm/gen/4x8s4-neonfma.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=6 -D NR=8 -D FMA=0 -o src/f32-igemm/gen/6x8s4-neon.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=6 -D NR=8 -D FMA=1 -o src/f32-igemm/gen/6x8s4-neonfma.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=8 -D NR=8 -D FMA=0 -o src/f32-igemm/gen/8x8s4-neon.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=8 -D NR=8 -D FMA=1 -o src/f32-igemm/gen/8x8s4-neonfma.c
 
 #################################### PSIMD ####################################
 ### LOAD1+BROADCAST micro-kernels
-tools/xngen src/f32-igemm/psimd-loadsplat.c.in -D MR=1 -D NR=8 -o src/f32-igemm/1x8-psimd-loadsplat.c
-tools/xngen src/f32-igemm/psimd-loadsplat.c.in -D MR=4 -D NR=8 -o src/f32-igemm/4x8-psimd-loadsplat.c
-tools/xngen src/f32-igemm/psimd-loadsplat.c.in -D MR=6 -D NR=8 -o src/f32-igemm/6x8-psimd-loadsplat.c
+tools/xngen src/f32-igemm/psimd-loadsplat.c.in -D MR=1 -D NR=8 -o src/f32-igemm/gen/1x8-psimd-loadsplat.c
+tools/xngen src/f32-igemm/psimd-loadsplat.c.in -D MR=4 -D NR=8 -o src/f32-igemm/gen/4x8-psimd-loadsplat.c
+tools/xngen src/f32-igemm/psimd-loadsplat.c.in -D MR=6 -D NR=8 -o src/f32-igemm/gen/6x8-psimd-loadsplat.c
 ### LOAD4+DUPLICATE micro-kernels
-tools/xngen src/f32-igemm/psimd-splat.c.in -D MR=1 -D NR=8 -o src/f32-igemm/1x8-psimd-splat.c
-tools/xngen src/f32-igemm/psimd-splat.c.in -D MR=4 -D NR=8 -o src/f32-igemm/4x8-psimd-splat.c
-tools/xngen src/f32-igemm/psimd-splat.c.in -D MR=6 -D NR=8 -o src/f32-igemm/6x8-psimd-splat.c
+tools/xngen src/f32-igemm/psimd-splat.c.in -D MR=1 -D NR=8 -o src/f32-igemm/gen/1x8-psimd-splat.c
+tools/xngen src/f32-igemm/psimd-splat.c.in -D MR=4 -D NR=8 -o src/f32-igemm/gen/4x8-psimd-splat.c
+tools/xngen src/f32-igemm/psimd-splat.c.in -D MR=6 -D NR=8 -o src/f32-igemm/gen/6x8-psimd-splat.c
 ### LOAD4+PERMUTE micro-kernels
-tools/xngen src/f32-igemm/psimd-s4.c.in -D MR=1 -D NR=8 -o src/f32-igemm/1x8s4-psimd.c
-tools/xngen src/f32-igemm/psimd-s4.c.in -D MR=4 -D NR=8 -o src/f32-igemm/4x8s4-psimd.c
-tools/xngen src/f32-igemm/psimd-s4.c.in -D MR=6 -D NR=8 -o src/f32-igemm/6x8s4-psimd.c
+tools/xngen src/f32-igemm/psimd-s4.c.in -D MR=1 -D NR=8 -o src/f32-igemm/gen/1x8s4-psimd.c
+tools/xngen src/f32-igemm/psimd-s4.c.in -D MR=4 -D NR=8 -o src/f32-igemm/gen/4x8s4-psimd.c
+tools/xngen src/f32-igemm/psimd-s4.c.in -D MR=6 -D NR=8 -o src/f32-igemm/gen/6x8s4-psimd.c
 ### MRx2 micro-kernels
-tools/xngen src/f32-igemm/MRx2c4-psimd.c.in -D MR=4 -D NR=2 -o src/f32-igemm/4x2c4-psimd.c
+tools/xngen src/f32-igemm/MRx2c4-psimd.c.in -D MR=4 -D NR=2 -o src/f32-igemm/gen/4x2c4-psimd.c
 
 ################################### x86 SSE ###################################
 ### LOAD1+BROADCAST micro-kernels
-tools/xngen src/f32-igemm/sse-load1.c.in -D MR=1 -D NR=8 -o src/f32-igemm/1x8-sse-load1.c
-tools/xngen src/f32-igemm/sse-load1.c.in -D MR=4 -D NR=8 -o src/f32-igemm/4x8-sse-load1.c
+tools/xngen src/f32-igemm/sse-load1.c.in -D MR=1 -D NR=8 -o src/f32-igemm/gen/1x8-sse-load1.c
+tools/xngen src/f32-igemm/sse-load1.c.in -D MR=4 -D NR=8 -o src/f32-igemm/gen/4x8-sse-load1.c
 ### LOAD4+DUPLICATE micro-kernels
-tools/xngen src/f32-igemm/sse-dup.c.in -D MR=1 -D NR=8 -o src/f32-igemm/1x8-sse-dup.c
-tools/xngen src/f32-igemm/sse-dup.c.in -D MR=4 -D NR=8 -o src/f32-igemm/4x8-sse-dup.c
+tools/xngen src/f32-igemm/sse-dup.c.in -D MR=1 -D NR=8 -o src/f32-igemm/gen/1x8-sse-dup.c
+tools/xngen src/f32-igemm/sse-dup.c.in -D MR=4 -D NR=8 -o src/f32-igemm/gen/4x8-sse-dup.c
 ### LOAD4+PERMUTE micro-kernels
-tools/xngen src/f32-igemm/sse-shuffle.c.in -D MR=1 -D NR=8 -o src/f32-igemm/1x8s4-sse.c
-tools/xngen src/f32-igemm/sse-shuffle.c.in -D MR=4 -D NR=8 -o src/f32-igemm/4x8s4-sse.c
+tools/xngen src/f32-igemm/sse-shuffle.c.in -D MR=1 -D NR=8 -o src/f32-igemm/gen/1x8s4-sse.c
+tools/xngen src/f32-igemm/sse-shuffle.c.in -D MR=4 -D NR=8 -o src/f32-igemm/gen/4x8s4-sse.c
 ### MRx2 micro-kernels
-tools/xngen src/f32-igemm/MRx2c4-sse.c.in -D MR=4 -D NR=2 -o src/f32-igemm/4x2c4-sse.c
+tools/xngen src/f32-igemm/MRx2c4-sse.c.in -D MR=4 -D NR=2 -o src/f32-igemm/gen/4x2c4-sse.c
 
 ################################### x86 AVX ###################################
 ### AVX+BROADCAST micro-kernels
-tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=1 -D NR=8 -D FMA=0 -o src/f32-igemm/1x8-avx-broadcast.c
-tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=4 -D NR=8 -D FMA=0 -o src/f32-igemm/4x8-avx-broadcast.c
-tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=5 -D NR=8 -D FMA=0 -o src/f32-igemm/5x8-avx-broadcast.c
-tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=6 -D NR=8 -D FMA=0 -o src/f32-igemm/6x8-avx-broadcast.c
-tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=7 -D NR=8 -D FMA=0 -o src/f32-igemm/7x8-avx-broadcast.c
+tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=1 -D NR=8 -D FMA=0 -o src/f32-igemm/gen/1x8-avx-broadcast.c
+tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=4 -D NR=8 -D FMA=0 -o src/f32-igemm/gen/4x8-avx-broadcast.c
+tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=5 -D NR=8 -D FMA=0 -o src/f32-igemm/gen/5x8-avx-broadcast.c
+tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=6 -D NR=8 -D FMA=0 -o src/f32-igemm/gen/6x8-avx-broadcast.c
+tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=7 -D NR=8 -D FMA=0 -o src/f32-igemm/gen/7x8-avx-broadcast.c
 ### FMA3+BROADCAST micro-kernels
-tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=1 -D NR=8 -D FMA=3 -o src/f32-igemm/1x8-fma3-broadcast.c
-tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=4 -D NR=8 -D FMA=3 -o src/f32-igemm/4x8-fma3-broadcast.c
-tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=5 -D NR=8 -D FMA=3 -o src/f32-igemm/5x8-fma3-broadcast.c
-tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=6 -D NR=8 -D FMA=3 -o src/f32-igemm/6x8-fma3-broadcast.c
-tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=7 -D NR=8 -D FMA=3 -o src/f32-igemm/7x8-fma3-broadcast.c
-tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=8 -D NR=8 -D FMA=3 -o src/f32-igemm/8x8-fma3-broadcast.c
+tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=1 -D NR=8 -D FMA=3 -o src/f32-igemm/gen/1x8-fma3-broadcast.c
+tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=4 -D NR=8 -D FMA=3 -o src/f32-igemm/gen/4x8-fma3-broadcast.c
+tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=5 -D NR=8 -D FMA=3 -o src/f32-igemm/gen/5x8-fma3-broadcast.c
+tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=6 -D NR=8 -D FMA=3 -o src/f32-igemm/gen/6x8-fma3-broadcast.c
+tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=7 -D NR=8 -D FMA=3 -o src/f32-igemm/gen/7x8-fma3-broadcast.c
+tools/xngen src/f32-igemm/avx-broadcast.c.in -D MR=8 -D NR=8 -D FMA=3 -o src/f32-igemm/gen/8x8-fma3-broadcast.c
 
 ################################## Unit tests #################################
 tools/generate-gemm-test.py --spec test/f32-igemm.yaml --output test/f32-igemm.cc
diff --git a/scripts/generate-f32-ppmm.sh b/scripts/generate-f32-ppmm.sh
index f915e81..a3c51b3 100755
--- a/scripts/generate-f32-ppmm.sh
+++ b/scripts/generate-f32-ppmm.sh
@@ -5,23 +5,22 @@
 # LICENSE file in the root directory of this source tree.
 
 #################################### Scalar ###################################
-tools/xngen src/f32-ppmm/scalar.c.in -D MR=4 -D NR=4 -o src/f32-ppmm/4x4-scalar.c
-tools/xngen src/f32-ppmm/scalar.c.in -D MR=2 -D NR=4 -o src/f32-ppmm/2x4-scalar.c
-tools/xngen src/f32-ppmm/scalar.c.in -D MR=4 -D NR=2 -o src/f32-ppmm/4x2-scalar.c
-tools/xngen src/f32-ppmm/scalar.c.in -D MR=3 -D NR=3 -o src/f32-ppmm/3x3-scalar.c
+tools/xngen src/f32-ppmm/scalar.c.in -D MR=4 -D NR=4 -o src/f32-ppmm/gen/4x4-scalar.c
+tools/xngen src/f32-ppmm/scalar.c.in -D MR=2 -D NR=4 -o src/f32-ppmm/gen/2x4-scalar.c
+tools/xngen src/f32-ppmm/scalar.c.in -D MR=4 -D NR=2 -o src/f32-ppmm/gen/4x2-scalar.c
+tools/xngen src/f32-ppmm/scalar.c.in -D MR=3 -D NR=3 -o src/f32-ppmm/gen/3x3-scalar.c
 
 ################################### ARM NEON ##################################
-tools/xngen src/f32-ppmm/neon.c.in -D MR=4 -D NR=8 -D FMA=0 -o src/f32-ppmm/4x8-neon.c
-tools/xngen src/f32-ppmm/neon.c.in -D MR=4 -D NR=8 -D FMA=1 -o src/f32-ppmm/4x8-neonfma.c
-tools/xngen src/f32-ppmm/neon.c.in -D MR=8 -D NR=8 -D FMA=0 -o src/f32-ppmm/8x8-neon.c
-tools/xngen src/f32-ppmm/neon.c.in -D MR=8 -D NR=8 -D FMA=1 -o src/f32-ppmm/8x8-neonfma.c
+tools/xngen src/f32-ppmm/neon.c.in -D MR=4 -D NR=8 -D FMA=0 -o src/f32-ppmm/gen/4x8-neon.c
+tools/xngen src/f32-ppmm/neon.c.in -D MR=4 -D NR=8 -D FMA=1 -o src/f32-ppmm/gen/4x8-neonfma.c
+tools/xngen src/f32-ppmm/neon.c.in -D MR=8 -D NR=8 -D FMA=0 -o src/f32-ppmm/gen/8x8-neon.c
+tools/xngen src/f32-ppmm/neon.c.in -D MR=8 -D NR=8 -D FMA=1 -o src/f32-ppmm/gen/8x8-neonfma.c
 
 #################################### PSIMD ####################################
-tools/xngen src/f32-ppmm/psimd.c.in -D MR=4 -D NR=8 -o src/f32-ppmm/4x8-psimd.c
+tools/xngen src/f32-ppmm/psimd.c.in -D MR=4 -D NR=8 -o src/f32-ppmm/gen/4x8-psimd.c
 
 ################################### x86 SSE ###################################
-tools/xngen src/f32-ppmm/sse.c.in -D MR=4 -D NR=8 -o src/f32-ppmm/4x8-sse.c
-
+tools/xngen src/f32-ppmm/sse.c.in -D MR=4 -D NR=8 -o src/f32-ppmm/gen/4x8-sse.c
 
 ################################## Unit tests #################################
 tools/generate-gemm-test.py --spec test/f32-ppmm.yaml --output test/f32-ppmm.cc
diff --git a/scripts/generate-f32-prelu.sh b/scripts/generate-f32-prelu.sh
index 900152c..37e413d 100755
--- a/scripts/generate-f32-prelu.sh
+++ b/scripts/generate-f32-prelu.sh
@@ -5,22 +5,22 @@
 # LICENSE file in the root directory of this source tree.
 
 #################################### Scalar ###################################
-tools/xngen src/f32-prelu/scalar.c.in -D CHANNEL_TILE=1 -D ROW_TILE=2 -o src/f32-prelu/scalar-2x1.c
-tools/xngen src/f32-prelu/scalar.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-prelu/scalar-2x4.c
+tools/xngen src/f32-prelu/scalar.c.in -D CHANNEL_TILE=1 -D ROW_TILE=2 -o src/f32-prelu/gen/scalar-2x1.c
+tools/xngen src/f32-prelu/scalar.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-prelu/gen/scalar-2x4.c
 
 ################################### ARM NEON ##################################
-tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-prelu/neon-2x4.c
-tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -o src/f32-prelu/neon-2x8.c
+tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-prelu/gen/neon-2x4.c
+tools/xngen src/f32-prelu/neon.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -o src/f32-prelu/gen/neon-2x8.c
 
 #################################### PSIMD ####################################
-tools/xngen src/f32-prelu/psimd.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-prelu/psimd-2x4.c
-tools/xngen src/f32-prelu/psimd.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -o src/f32-prelu/psimd-2x8.c
+tools/xngen src/f32-prelu/psimd.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-prelu/gen/psimd-2x4.c
+tools/xngen src/f32-prelu/psimd.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -o src/f32-prelu/gen/psimd-2x8.c
 
 ################################### x86 SSE2 ###################################
-tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D BLEND=0 -o src/f32-prelu/sse2-2x4.c
-tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D BLEND=0 -o src/f32-prelu/sse2-2x8.c
-tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D BLEND=1 -o src/f32-prelu/sse41-2x4.c
-tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D BLEND=1 -o src/f32-prelu/sse41-2x8.c
+tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D BLEND=0 -o src/f32-prelu/gen/sse2-2x4.c
+tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D BLEND=0 -o src/f32-prelu/gen/sse2-2x8.c
+tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D BLEND=1 -o src/f32-prelu/gen/sse41-2x4.c
+tools/xngen src/f32-prelu/sse.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D BLEND=1 -o src/f32-prelu/gen/sse41-2x8.c
 
 ################################## Unit tests #################################
 tools/generate-prelu-test.py --spec test/f32-prelu.yaml --output test/f32-prelu.cc
diff --git a/scripts/generate-f32-sigmoid.sh b/scripts/generate-f32-sigmoid.sh
index 99a94ce..440239e 100755
--- a/scripts/generate-f32-sigmoid.sh
+++ b/scripts/generate-f32-sigmoid.sh
@@ -5,12 +5,14 @@
 # LICENSE file in the root directory of this source tree.
 
 ################################### ARM NEON ##################################
-tools/xngen src/f32-sigmoid/neonfma-p5-nr2fma.c.in -D BATCH_TILE=16 -o src/f32-sigmoid/neonfma-p5-nr2fma-x16.c
-tools/xngen src/f32-sigmoid/neon-frac-p9-p10-nr1recps.c.in -D BATCH_TILE=16 -o src/f32-sigmoid/neon-frac-p9-p10-nr1recps-x16.c
-tools/xngen src/f32-sigmoid/sse-p5-div.c.in -D BATCH_TILE=8 -D BLEND=0 -o src/f32-sigmoid/sse2-p5-div-x8.c
-tools/xngen src/f32-sigmoid/sse-p5-div.c.in -D BATCH_TILE=16 -D BLEND=0 -o src/f32-sigmoid/sse2-p5-div-x16.c
-tools/xngen src/f32-sigmoid/sse-p5-div.c.in -D BATCH_TILE=8 -D BLEND=1 -o src/f32-sigmoid/sse41-p5-div-x8.c
-tools/xngen src/f32-sigmoid/sse-p5-div.c.in -D BATCH_TILE=16 -D BLEND=1 -o src/f32-sigmoid/sse41-p5-div-x16.c
+tools/xngen src/f32-sigmoid/neonfma-p5-nr2fma.c.in -D BATCH_TILE=16 -o src/f32-sigmoid/gen/neonfma-p5-nr2fma-x16.c
+tools/xngen src/f32-sigmoid/neon-frac-p9-p10-nr1recps.c.in -D BATCH_TILE=16 -o src/f32-sigmoid/gen/neon-frac-p9-p10-nr1recps-x16.c
+
+################################### x86 SSE ###################################
+tools/xngen src/f32-sigmoid/sse-p5-div.c.in -D BATCH_TILE=8 -D BLEND=0 -o src/f32-sigmoid/gen/sse2-p5-div-x8.c
+tools/xngen src/f32-sigmoid/sse-p5-div.c.in -D BATCH_TILE=16 -D BLEND=0 -o src/f32-sigmoid/gen/sse2-p5-div-x16.c
+tools/xngen src/f32-sigmoid/sse-p5-div.c.in -D BATCH_TILE=8 -D BLEND=1 -o src/f32-sigmoid/gen/sse41-p5-div-x8.c
+tools/xngen src/f32-sigmoid/sse-p5-div.c.in -D BATCH_TILE=16 -D BLEND=1 -o src/f32-sigmoid/gen/sse41-p5-div-x16.c
 
 ################################## Unit tests #################################
 tools/generate-vunary-test.py --spec test/f32-sigmoid.yaml --output test/f32-sigmoid.cc
diff --git a/scripts/generate-f32-spmm.sh b/scripts/generate-f32-spmm.sh
index 1d68d0e..466f968 100755
--- a/scripts/generate-f32-spmm.sh
+++ b/scripts/generate-f32-spmm.sh
@@ -6,47 +6,46 @@
 
 #################################### Scalar ###################################
 ### Microkernels without unrolling
-tools/xngen src/f32-spmm/scalar.c.in -D MR=1 -D NR=1 -D UNROLL=1 -o src/f32-spmm/1x1-scalar.c
-tools/xngen src/f32-spmm/scalar.c.in -D MR=2 -D NR=1 -D UNROLL=1 -o src/f32-spmm/2x1-scalar.c
-tools/xngen src/f32-spmm/scalar.c.in -D MR=4 -D NR=1 -D UNROLL=1 -o src/f32-spmm/4x1-scalar.c
-tools/xngen src/f32-spmm/scalar.c.in -D MR=8 -D NR=1 -D UNROLL=1 -o src/f32-spmm/8x1-scalar.c
-tools/xngen src/f32-spmm/scalar.c.in -D MR=8 -D NR=2 -D UNROLL=1 -o src/f32-spmm/8x2-scalar.c
-tools/xngen src/f32-spmm/scalar.c.in -D MR=8 -D NR=4 -D UNROLL=1 -o src/f32-spmm/8x4-scalar.c
+tools/xngen src/f32-spmm/scalar.c.in -D MR=1 -D NR=1 -D UNROLL=1 -o src/f32-spmm/gen/1x1-scalar.c
+tools/xngen src/f32-spmm/scalar.c.in -D MR=2 -D NR=1 -D UNROLL=1 -o src/f32-spmm/gen/2x1-scalar.c
+tools/xngen src/f32-spmm/scalar.c.in -D MR=4 -D NR=1 -D UNROLL=1 -o src/f32-spmm/gen/4x1-scalar.c
+tools/xngen src/f32-spmm/scalar.c.in -D MR=8 -D NR=1 -D UNROLL=1 -o src/f32-spmm/gen/8x1-scalar.c
+tools/xngen src/f32-spmm/scalar.c.in -D MR=8 -D NR=2 -D UNROLL=1 -o src/f32-spmm/gen/8x2-scalar.c
+tools/xngen src/f32-spmm/scalar.c.in -D MR=8 -D NR=4 -D UNROLL=1 -o src/f32-spmm/gen/8x4-scalar.c
 ### Microkernels with software pipelining
-tools/xngen src/f32-spmm/scalar-pipelined.c.in -D MR=1 -D NR=1 -o src/f32-spmm/1x1-scalar-pipelined.c
-tools/xngen src/f32-spmm/scalar-pipelined.c.in -D MR=2 -D NR=1 -o src/f32-spmm/2x1-scalar-pipelined.c
-tools/xngen src/f32-spmm/scalar-pipelined.c.in -D MR=4 -D NR=1 -o src/f32-spmm/4x1-scalar-pipelined.c
-tools/xngen src/f32-spmm/scalar-pipelined.c.in -D MR=8 -D NR=1 -o src/f32-spmm/8x1-scalar-pipelined.c
+tools/xngen src/f32-spmm/scalar-pipelined.c.in -D MR=1 -D NR=1 -o src/f32-spmm/gen/1x1-scalar-pipelined.c
+tools/xngen src/f32-spmm/scalar-pipelined.c.in -D MR=2 -D NR=1 -o src/f32-spmm/gen/2x1-scalar-pipelined.c
+tools/xngen src/f32-spmm/scalar-pipelined.c.in -D MR=4 -D NR=1 -o src/f32-spmm/gen/4x1-scalar-pipelined.c
+tools/xngen src/f32-spmm/scalar-pipelined.c.in -D MR=8 -D NR=1 -o src/f32-spmm/gen/8x1-scalar-pipelined.c
 
 ################################### ARM NEON ##################################
 ### Microkernels without unrolling
-tools/xngen src/f32-spmm/neon.c.in -D MR=4 -D NR=1 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/4x1-neonfma.c
-tools/xngen src/f32-spmm/neon.c.in -D MR=8 -D NR=1 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/8x1-neonfma.c
-tools/xngen src/f32-spmm/neon.c.in -D MR=12 -D NR=1 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/12x1-neonfma.c
-tools/xngen src/f32-spmm/neon.c.in -D MR=16 -D NR=1 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/16x1-neonfma.c
+tools/xngen src/f32-spmm/neon.c.in -D MR=4 -D NR=1 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/gen/4x1-neonfma.c
+tools/xngen src/f32-spmm/neon.c.in -D MR=8 -D NR=1 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/gen/8x1-neonfma.c
+tools/xngen src/f32-spmm/neon.c.in -D MR=12 -D NR=1 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/gen/12x1-neonfma.c
+tools/xngen src/f32-spmm/neon.c.in -D MR=16 -D NR=1 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/gen/16x1-neonfma.c
 ### Microkernels with 2X unrolling
-tools/xngen src/f32-spmm/neon.c.in -D MR=4  -D NR=1 -D UNROLL=2 -D FMA=1 -o src/f32-spmm/4x1-neonfma-unroll2.c
-tools/xngen src/f32-spmm/neon.c.in -D MR=8  -D NR=1 -D UNROLL=2 -D FMA=1 -o src/f32-spmm/8x1-neonfma-unroll2.c
-tools/xngen src/f32-spmm/neon.c.in -D MR=16  -D NR=1 -D UNROLL=2 -D FMA=1 -o src/f32-spmm/16x1-neonfma-unroll2.c
+tools/xngen src/f32-spmm/neon.c.in -D MR=4  -D NR=1 -D UNROLL=2 -D FMA=1 -o src/f32-spmm/gen/4x1-neonfma-unroll2.c
+tools/xngen src/f32-spmm/neon.c.in -D MR=8  -D NR=1 -D UNROLL=2 -D FMA=1 -o src/f32-spmm/gen/8x1-neonfma-unroll2.c
+tools/xngen src/f32-spmm/neon.c.in -D MR=16  -D NR=1 -D UNROLL=2 -D FMA=1 -o src/f32-spmm/gen/16x1-neonfma-unroll2.c
 ### Microkernels for blocks of several output channels
-tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=4 -D NR=2 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/4x2-neonfma.c
-tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=8 -D NR=2 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/8x2-neonfma.c
-tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=12 -D NR=2 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/12x2-neonfma.c
-tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=16 -D NR=2 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/16x2-neonfma.c
-tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=4 -D NR=4 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/4x4-neonfma.c
-tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=8 -D NR=4 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/8x4-neonfma.c
-tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=12 -D NR=4 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/12x4-neonfma.c
-tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=16 -D NR=4 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/16x4-neonfma.c
+tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=4 -D NR=2 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/gen/4x2-neonfma.c
+tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=8 -D NR=2 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/gen/8x2-neonfma.c
+tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=12 -D NR=2 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/gen/12x2-neonfma.c
+tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=16 -D NR=2 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/gen/16x2-neonfma.c
+tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=4 -D NR=4 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/gen/4x4-neonfma.c
+tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=8 -D NR=4 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/gen/8x4-neonfma.c
+tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=12 -D NR=4 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/gen/12x4-neonfma.c
+tools/xngen src/f32-spmm/neon-blocked.c.in -D MR=16 -D NR=4 -D UNROLL=1 -D FMA=1 -o src/f32-spmm/gen/16x4-neonfma.c
 ### Microkernels with software pipelining
-tools/xngen src/f32-spmm/neon-pipelined.c.in -D MR=4 -D NR=1 -D FMA=1 -o src/f32-spmm/4x1-neonfma-pipelined.c
-tools/xngen src/f32-spmm/neon-pipelined.c.in -D MR=8 -D NR=1 -D FMA=1 -o src/f32-spmm/8x1-neonfma-pipelined.c
-tools/xngen src/f32-spmm/neon-pipelined.c.in -D MR=16 -D NR=1 -D FMA=1 -o src/f32-spmm/16x1-neonfma-pipelined.c
+tools/xngen src/f32-spmm/neon-pipelined.c.in -D MR=4 -D NR=1 -D FMA=1 -o src/f32-spmm/gen/4x1-neonfma-pipelined.c
+tools/xngen src/f32-spmm/neon-pipelined.c.in -D MR=8 -D NR=1 -D FMA=1 -o src/f32-spmm/gen/8x1-neonfma-pipelined.c
+tools/xngen src/f32-spmm/neon-pipelined.c.in -D MR=16 -D NR=1 -D FMA=1 -o src/f32-spmm/gen/16x1-neonfma-pipelined.c
 
 ################################### x86 SSE ###################################
 ### Microkernels without unrolling
-tools/xngen src/f32-spmm/sse.c.in -D MR=4 -D NR=1 -D UNROLL=1 -o src/f32-spmm/4x1-sse.c
-tools/xngen src/f32-spmm/sse.c.in -D MR=8 -D NR=1 -D UNROLL=1 -o src/f32-spmm/8x1-sse.c
-
+tools/xngen src/f32-spmm/sse.c.in -D MR=4 -D NR=1 -D UNROLL=1 -o src/f32-spmm/gen/4x1-sse.c
+tools/xngen src/f32-spmm/sse.c.in -D MR=8 -D NR=1 -D UNROLL=1 -o src/f32-spmm/gen/8x1-sse.c
 
 ################################## Unit tests #################################
 tools/generate-spmm-test.py --spec test/f32-spmm.yaml --output test/f32-spmm.cc
diff --git a/scripts/generate-f32-vbinary.sh b/scripts/generate-f32-vbinary.sh
index cb5eb27..d1ac277 100755
--- a/scripts/generate-f32-vbinary.sh
+++ b/scripts/generate-f32-vbinary.sh
@@ -5,79 +5,79 @@
 # LICENSE file in the root directory of this source tree.
 
 #################################### Scalar ###################################
-tools/xngen src/f32-binop/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=1 -o src/f32-binop/vadd-scalar-x1.c
-tools/xngen src/f32-binop/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=2 -o src/f32-binop/vadd-scalar-x2.c
-tools/xngen src/f32-binop/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=4 -o src/f32-binop/vadd-scalar-x4.c
-tools/xngen src/f32-binop/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=1 -o src/f32-binop/vmul-scalar-x1.c
-tools/xngen src/f32-binop/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=2 -o src/f32-binop/vmul-scalar-x2.c
-tools/xngen src/f32-binop/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=4 -o src/f32-binop/vmul-scalar-x4.c
-tools/xngen src/f32-binop/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=1 -o src/f32-binop/vsub-scalar-x1.c
-tools/xngen src/f32-binop/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=2 -o src/f32-binop/vsub-scalar-x2.c
-tools/xngen src/f32-binop/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=4 -o src/f32-binop/vsub-scalar-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=1 -o src/f32-vbinary/gen/vadd-scalar-x1.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=2 -o src/f32-vbinary/gen/vadd-scalar-x2.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=4 -o src/f32-vbinary/gen/vadd-scalar-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=1 -o src/f32-vbinary/gen/vmul-scalar-x1.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=2 -o src/f32-vbinary/gen/vmul-scalar-x2.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=4 -o src/f32-vbinary/gen/vmul-scalar-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=1 -o src/f32-vbinary/gen/vsub-scalar-x1.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=2 -o src/f32-vbinary/gen/vsub-scalar-x2.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=4 -o src/f32-vbinary/gen/vsub-scalar-x4.c
 
-tools/xngen src/f32-binop/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=1 -o src/f32-binop/vaddc-scalar-x1.c
-tools/xngen src/f32-binop/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=2 -o src/f32-binop/vaddc-scalar-x2.c
-tools/xngen src/f32-binop/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=4 -o src/f32-binop/vaddc-scalar-x4.c
-tools/xngen src/f32-binop/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=1 -o src/f32-binop/vmulc-scalar-x1.c
-tools/xngen src/f32-binop/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=2 -o src/f32-binop/vmulc-scalar-x2.c
-tools/xngen src/f32-binop/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=4 -o src/f32-binop/vmulc-scalar-x4.c
-tools/xngen src/f32-binop/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=1 -o src/f32-binop/vsubc-scalar-x1.c
-tools/xngen src/f32-binop/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=2 -o src/f32-binop/vsubc-scalar-x2.c
-tools/xngen src/f32-binop/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=4 -o src/f32-binop/vsubc-scalar-x4.c
-tools/xngen src/f32-binop/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=1 -o src/f32-binop/vrsubc-scalar-x1.c
-tools/xngen src/f32-binop/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=2 -o src/f32-binop/vrsubc-scalar-x2.c
-tools/xngen src/f32-binop/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=4 -o src/f32-binop/vrsubc-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=1 -o src/f32-vbinary/gen/vaddc-scalar-x1.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=2 -o src/f32-vbinary/gen/vaddc-scalar-x2.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD -D BATCH_TILE=4 -o src/f32-vbinary/gen/vaddc-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=1 -o src/f32-vbinary/gen/vmulc-scalar-x1.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=2 -o src/f32-vbinary/gen/vmulc-scalar-x2.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL -D BATCH_TILE=4 -o src/f32-vbinary/gen/vmulc-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=1 -o src/f32-vbinary/gen/vsubc-scalar-x1.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=2 -o src/f32-vbinary/gen/vsubc-scalar-x2.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB -D BATCH_TILE=4 -o src/f32-vbinary/gen/vsubc-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=1 -o src/f32-vbinary/gen/vrsubc-scalar-x1.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=2 -o src/f32-vbinary/gen/vrsubc-scalar-x2.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=4 -o src/f32-vbinary/gen/vrsubc-scalar-x4.c
 
 ################################### ARM NEON ##################################
-tools/xngen src/f32-binop/vop-neon.c.in -D OP=ADD -D BATCH_TILE=4 -o src/f32-binop/vadd-neon-x4.c
-tools/xngen src/f32-binop/vop-neon.c.in -D OP=ADD -D BATCH_TILE=8 -o src/f32-binop/vadd-neon-x8.c
-tools/xngen src/f32-binop/vop-neon.c.in -D OP=MUL -D BATCH_TILE=4 -o src/f32-binop/vmul-neon-x4.c
-tools/xngen src/f32-binop/vop-neon.c.in -D OP=MUL -D BATCH_TILE=8 -o src/f32-binop/vmul-neon-x8.c
-tools/xngen src/f32-binop/vop-neon.c.in -D OP=SUB -D BATCH_TILE=4 -o src/f32-binop/vsub-neon-x4.c
-tools/xngen src/f32-binop/vop-neon.c.in -D OP=SUB -D BATCH_TILE=8 -o src/f32-binop/vsub-neon-x8.c
+tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=ADD -D BATCH_TILE=4 -o src/f32-vbinary/gen/vadd-neon-x4.c
+tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=ADD -D BATCH_TILE=8 -o src/f32-vbinary/gen/vadd-neon-x8.c
+tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=MUL -D BATCH_TILE=4 -o src/f32-vbinary/gen/vmul-neon-x4.c
+tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=MUL -D BATCH_TILE=8 -o src/f32-vbinary/gen/vmul-neon-x8.c
+tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=SUB -D BATCH_TILE=4 -o src/f32-vbinary/gen/vsub-neon-x4.c
+tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=SUB -D BATCH_TILE=8 -o src/f32-vbinary/gen/vsub-neon-x8.c
 
-tools/xngen src/f32-binop/vopc-neon.c.in -D OP=ADD  -D BATCH_TILE=4 -o src/f32-binop/vaddc-neon-x4.c
-tools/xngen src/f32-binop/vopc-neon.c.in -D OP=ADD  -D BATCH_TILE=8 -o src/f32-binop/vaddc-neon-x8.c
-tools/xngen src/f32-binop/vopc-neon.c.in -D OP=MUL  -D BATCH_TILE=4 -o src/f32-binop/vmulc-neon-x4.c
-tools/xngen src/f32-binop/vopc-neon.c.in -D OP=MUL  -D BATCH_TILE=8 -o src/f32-binop/vmulc-neon-x8.c
-tools/xngen src/f32-binop/vopc-neon.c.in -D OP=SUB  -D BATCH_TILE=4 -o src/f32-binop/vsubc-neon-x4.c
-tools/xngen src/f32-binop/vopc-neon.c.in -D OP=SUB  -D BATCH_TILE=8 -o src/f32-binop/vsubc-neon-x8.c
-tools/xngen src/f32-binop/vopc-neon.c.in -D OP=RSUB -D BATCH_TILE=4 -o src/f32-binop/vrsubc-neon-x4.c
-tools/xngen src/f32-binop/vopc-neon.c.in -D OP=RSUB -D BATCH_TILE=8 -o src/f32-binop/vrsubc-neon-x8.c
+tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=ADD  -D BATCH_TILE=4 -o src/f32-vbinary/gen/vaddc-neon-x4.c
+tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=ADD  -D BATCH_TILE=8 -o src/f32-vbinary/gen/vaddc-neon-x8.c
+tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=MUL  -D BATCH_TILE=4 -o src/f32-vbinary/gen/vmulc-neon-x4.c
+tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=MUL  -D BATCH_TILE=8 -o src/f32-vbinary/gen/vmulc-neon-x8.c
+tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=SUB  -D BATCH_TILE=4 -o src/f32-vbinary/gen/vsubc-neon-x4.c
+tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=SUB  -D BATCH_TILE=8 -o src/f32-vbinary/gen/vsubc-neon-x8.c
+tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=RSUB -D BATCH_TILE=4 -o src/f32-vbinary/gen/vrsubc-neon-x4.c
+tools/xngen src/f32-vbinary/vopc-neon.c.in -D OP=RSUB -D BATCH_TILE=8 -o src/f32-vbinary/gen/vrsubc-neon-x8.c
 
 #################################### PSIMD ####################################
-tools/xngen src/f32-binop/vop-psimd.c.in -D OP=ADD -D BATCH_TILE=4 -o src/f32-binop/vadd-psimd-x4.c
-tools/xngen src/f32-binop/vop-psimd.c.in -D OP=ADD -D BATCH_TILE=8 -o src/f32-binop/vadd-psimd-x8.c
-tools/xngen src/f32-binop/vop-psimd.c.in -D OP=MUL -D BATCH_TILE=4 -o src/f32-binop/vmul-psimd-x4.c
-tools/xngen src/f32-binop/vop-psimd.c.in -D OP=MUL -D BATCH_TILE=8 -o src/f32-binop/vmul-psimd-x8.c
-tools/xngen src/f32-binop/vop-psimd.c.in -D OP=SUB -D BATCH_TILE=4 -o src/f32-binop/vsub-psimd-x4.c
-tools/xngen src/f32-binop/vop-psimd.c.in -D OP=SUB -D BATCH_TILE=8 -o src/f32-binop/vsub-psimd-x8.c
+tools/xngen src/f32-vbinary/vop-psimd.c.in -D OP=ADD -D BATCH_TILE=4 -o src/f32-vbinary/gen/vadd-psimd-x4.c
+tools/xngen src/f32-vbinary/vop-psimd.c.in -D OP=ADD -D BATCH_TILE=8 -o src/f32-vbinary/gen/vadd-psimd-x8.c
+tools/xngen src/f32-vbinary/vop-psimd.c.in -D OP=MUL -D BATCH_TILE=4 -o src/f32-vbinary/gen/vmul-psimd-x4.c
+tools/xngen src/f32-vbinary/vop-psimd.c.in -D OP=MUL -D BATCH_TILE=8 -o src/f32-vbinary/gen/vmul-psimd-x8.c
+tools/xngen src/f32-vbinary/vop-psimd.c.in -D OP=SUB -D BATCH_TILE=4 -o src/f32-vbinary/gen/vsub-psimd-x4.c
+tools/xngen src/f32-vbinary/vop-psimd.c.in -D OP=SUB -D BATCH_TILE=8 -o src/f32-vbinary/gen/vsub-psimd-x8.c
 
-tools/xngen src/f32-binop/vopc-psimd.c.in -D OP=ADD  -D BATCH_TILE=4 -o src/f32-binop/vaddc-psimd-x4.c
-tools/xngen src/f32-binop/vopc-psimd.c.in -D OP=ADD  -D BATCH_TILE=8 -o src/f32-binop/vaddc-psimd-x8.c
-tools/xngen src/f32-binop/vopc-psimd.c.in -D OP=MUL  -D BATCH_TILE=4 -o src/f32-binop/vmulc-psimd-x4.c
-tools/xngen src/f32-binop/vopc-psimd.c.in -D OP=MUL  -D BATCH_TILE=8 -o src/f32-binop/vmulc-psimd-x8.c
-tools/xngen src/f32-binop/vopc-psimd.c.in -D OP=SUB  -D BATCH_TILE=4 -o src/f32-binop/vsubc-psimd-x4.c
-tools/xngen src/f32-binop/vopc-psimd.c.in -D OP=SUB  -D BATCH_TILE=8 -o src/f32-binop/vsubc-psimd-x8.c
-tools/xngen src/f32-binop/vopc-psimd.c.in -D OP=RSUB -D BATCH_TILE=4 -o src/f32-binop/vrsubc-psimd-x4.c
-tools/xngen src/f32-binop/vopc-psimd.c.in -D OP=RSUB -D BATCH_TILE=8 -o src/f32-binop/vrsubc-psimd-x8.c
+tools/xngen src/f32-vbinary/vopc-psimd.c.in -D OP=ADD  -D BATCH_TILE=4 -o src/f32-vbinary/gen/vaddc-psimd-x4.c
+tools/xngen src/f32-vbinary/vopc-psimd.c.in -D OP=ADD  -D BATCH_TILE=8 -o src/f32-vbinary/gen/vaddc-psimd-x8.c
+tools/xngen src/f32-vbinary/vopc-psimd.c.in -D OP=MUL  -D BATCH_TILE=4 -o src/f32-vbinary/gen/vmulc-psimd-x4.c
+tools/xngen src/f32-vbinary/vopc-psimd.c.in -D OP=MUL  -D BATCH_TILE=8 -o src/f32-vbinary/gen/vmulc-psimd-x8.c
+tools/xngen src/f32-vbinary/vopc-psimd.c.in -D OP=SUB  -D BATCH_TILE=4 -o src/f32-vbinary/gen/vsubc-psimd-x4.c
+tools/xngen src/f32-vbinary/vopc-psimd.c.in -D OP=SUB  -D BATCH_TILE=8 -o src/f32-vbinary/gen/vsubc-psimd-x8.c
+tools/xngen src/f32-vbinary/vopc-psimd.c.in -D OP=RSUB -D BATCH_TILE=4 -o src/f32-vbinary/gen/vrsubc-psimd-x4.c
+tools/xngen src/f32-vbinary/vopc-psimd.c.in -D OP=RSUB -D BATCH_TILE=8 -o src/f32-vbinary/gen/vrsubc-psimd-x8.c
 
 ################################### x86 SSE ###################################
-tools/xngen src/f32-binop/vop-sse.c.in -D OP=ADD -D BATCH_TILE=4 -o src/f32-binop/vadd-sse-x4.c
-tools/xngen src/f32-binop/vop-sse.c.in -D OP=ADD -D BATCH_TILE=8 -o src/f32-binop/vadd-sse-x8.c
-tools/xngen src/f32-binop/vop-sse.c.in -D OP=MUL -D BATCH_TILE=4 -o src/f32-binop/vmul-sse-x4.c
-tools/xngen src/f32-binop/vop-sse.c.in -D OP=MUL -D BATCH_TILE=8 -o src/f32-binop/vmul-sse-x8.c
-tools/xngen src/f32-binop/vop-sse.c.in -D OP=SUB -D BATCH_TILE=4 -o src/f32-binop/vsub-sse-x4.c
-tools/xngen src/f32-binop/vop-sse.c.in -D OP=SUB -D BATCH_TILE=8 -o src/f32-binop/vsub-sse-x8.c
+tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=ADD -D BATCH_TILE=4 -o src/f32-vbinary/gen/vadd-sse-x4.c
+tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=ADD -D BATCH_TILE=8 -o src/f32-vbinary/gen/vadd-sse-x8.c
+tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=MUL -D BATCH_TILE=4 -o src/f32-vbinary/gen/vmul-sse-x4.c
+tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=MUL -D BATCH_TILE=8 -o src/f32-vbinary/gen/vmul-sse-x8.c
+tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=SUB -D BATCH_TILE=4 -o src/f32-vbinary/gen/vsub-sse-x4.c
+tools/xngen src/f32-vbinary/vop-sse.c.in -D OP=SUB -D BATCH_TILE=8 -o src/f32-vbinary/gen/vsub-sse-x8.c
 
-tools/xngen src/f32-binop/vopc-sse.c.in -D OP=ADD  -D BATCH_TILE=4 -o src/f32-binop/vaddc-sse-x4.c
-tools/xngen src/f32-binop/vopc-sse.c.in -D OP=ADD  -D BATCH_TILE=8 -o src/f32-binop/vaddc-sse-x8.c
-tools/xngen src/f32-binop/vopc-sse.c.in -D OP=MUL  -D BATCH_TILE=4 -o src/f32-binop/vmulc-sse-x4.c
-tools/xngen src/f32-binop/vopc-sse.c.in -D OP=MUL  -D BATCH_TILE=8 -o src/f32-binop/vmulc-sse-x8.c
-tools/xngen src/f32-binop/vopc-sse.c.in -D OP=SUB  -D BATCH_TILE=4 -o src/f32-binop/vsubc-sse-x4.c
-tools/xngen src/f32-binop/vopc-sse.c.in -D OP=SUB  -D BATCH_TILE=8 -o src/f32-binop/vsubc-sse-x8.c
-tools/xngen src/f32-binop/vopc-sse.c.in -D OP=RSUB -D BATCH_TILE=4 -o src/f32-binop/vrsubc-sse-x4.c
-tools/xngen src/f32-binop/vopc-sse.c.in -D OP=RSUB -D BATCH_TILE=8 -o src/f32-binop/vrsubc-sse-x8.c
+tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=ADD  -D BATCH_TILE=4 -o src/f32-vbinary/gen/vaddc-sse-x4.c
+tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=ADD  -D BATCH_TILE=8 -o src/f32-vbinary/gen/vaddc-sse-x8.c
+tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=MUL  -D BATCH_TILE=4 -o src/f32-vbinary/gen/vmulc-sse-x4.c
+tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=MUL  -D BATCH_TILE=8 -o src/f32-vbinary/gen/vmulc-sse-x8.c
+tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=SUB  -D BATCH_TILE=4 -o src/f32-vbinary/gen/vsubc-sse-x4.c
+tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=SUB  -D BATCH_TILE=8 -o src/f32-vbinary/gen/vsubc-sse-x8.c
+tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=RSUB -D BATCH_TILE=4 -o src/f32-vbinary/gen/vrsubc-sse-x4.c
+tools/xngen src/f32-vbinary/vopc-sse.c.in -D OP=RSUB -D BATCH_TILE=8 -o src/f32-vbinary/gen/vrsubc-sse-x8.c
 
 ################################## Unit tests #################################
 tools/generate-vbinary-test.py --spec test/f32-vadd.yaml --output test/f32-vadd.cc
diff --git a/scripts/generate-f32-vmulcaddc.sh b/scripts/generate-f32-vmulcaddc.sh
index 36cc511..f438fc6 100755
--- a/scripts/generate-f32-vmulcaddc.sh
+++ b/scripts/generate-f32-vmulcaddc.sh
@@ -5,25 +5,24 @@
 # LICENSE file in the root directory of this source tree.
 
 #################################### Scalar ###################################
-tools/xngen src/f32-vmulcaddc/scalar.c.in -D CHANNEL_TILE=1 -D ROW_TILE=2 -o src/f32-vmulcaddc/c1-scalar-2x.c
-tools/xngen src/f32-vmulcaddc/scalar.c.in -D CHANNEL_TILE=2 -D ROW_TILE=2 -o src/f32-vmulcaddc/c2-scalar-2x.c
-tools/xngen src/f32-vmulcaddc/scalar.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-vmulcaddc/c4-scalar-2x.c
+tools/xngen src/f32-vmulcaddc/scalar.c.in -D CHANNEL_TILE=1 -D ROW_TILE=2 -o src/f32-vmulcaddc/gen/c1-scalar-2x.c
+tools/xngen src/f32-vmulcaddc/scalar.c.in -D CHANNEL_TILE=2 -D ROW_TILE=2 -o src/f32-vmulcaddc/gen/c2-scalar-2x.c
+tools/xngen src/f32-vmulcaddc/scalar.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-vmulcaddc/gen/c4-scalar-2x.c
 
 ################################### ARM NEON ##################################
-tools/xngen src/f32-vmulcaddc/neon.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D FMA=0 -o src/f32-vmulcaddc/c4-neon-2x.c
-tools/xngen src/f32-vmulcaddc/neon.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D FMA=0 -o src/f32-vmulcaddc/c8-neon-2x.c
+tools/xngen src/f32-vmulcaddc/neon.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D FMA=0 -o src/f32-vmulcaddc/gen/c4-neon-2x.c
+tools/xngen src/f32-vmulcaddc/neon.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D FMA=0 -o src/f32-vmulcaddc/gen/c8-neon-2x.c
 
-tools/xngen src/f32-vmulcaddc/neon.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D FMA=1 -o src/f32-vmulcaddc/c4-neonfma-2x.c
-tools/xngen src/f32-vmulcaddc/neon.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D FMA=1 -o src/f32-vmulcaddc/c8-neonfma-2x.c
+tools/xngen src/f32-vmulcaddc/neon.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -D FMA=1 -o src/f32-vmulcaddc/gen/c4-neonfma-2x.c
+tools/xngen src/f32-vmulcaddc/neon.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -D FMA=1 -o src/f32-vmulcaddc/gen/c8-neonfma-2x.c
 
 #################################### PSIMD ####################################
-tools/xngen src/f32-vmulcaddc/psimd.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-vmulcaddc/c4-psimd-2x.c
-tools/xngen src/f32-vmulcaddc/psimd.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -o src/f32-vmulcaddc/c8-psimd-2x.c
+tools/xngen src/f32-vmulcaddc/psimd.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-vmulcaddc/gen/c4-psimd-2x.c
+tools/xngen src/f32-vmulcaddc/psimd.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -o src/f32-vmulcaddc/gen/c8-psimd-2x.c
 
 ################################### x86 SSE ###################################
-tools/xngen src/f32-vmulcaddc/sse.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-vmulcaddc/c4-sse-2x.c
-tools/xngen src/f32-vmulcaddc/sse.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -o src/f32-vmulcaddc/c8-sse-2x.c
-
+tools/xngen src/f32-vmulcaddc/sse.c.in -D CHANNEL_TILE=4 -D ROW_TILE=2 -o src/f32-vmulcaddc/gen/c4-sse-2x.c
+tools/xngen src/f32-vmulcaddc/sse.c.in -D CHANNEL_TILE=8 -D ROW_TILE=2 -o src/f32-vmulcaddc/gen/c8-sse-2x.c
 
 ################################## Unit tests #################################
 tools/generate-vmulcaddc-test.py --spec test/f32-vmulcaddc.yaml --output test/f32-vmulcaddc.cc
diff --git a/src/f16-gemm/4x8-neonfp16arith-ld64.c b/src/f16-gemm/gen/4x8-neonfp16arith-ld64.c
similarity index 100%
rename from src/f16-gemm/4x8-neonfp16arith-ld64.c
rename to src/f16-gemm/gen/4x8-neonfp16arith-ld64.c
diff --git a/src/f16-gemm/6x8-neonfp16arith-ld64.c b/src/f16-gemm/gen/6x8-neonfp16arith-ld64.c
similarity index 100%
rename from src/f16-gemm/6x8-neonfp16arith-ld64.c
rename to src/f16-gemm/gen/6x8-neonfp16arith-ld64.c
diff --git a/src/f16-gemm/8x8-neonfp16arith-ld64.c b/src/f16-gemm/gen/8x8-neonfp16arith-ld64.c
similarity index 100%
rename from src/f16-gemm/8x8-neonfp16arith-ld64.c
rename to src/f16-gemm/gen/8x8-neonfp16arith-ld64.c
diff --git a/src/f32-bilinear/neon-c4.c b/src/f32-bilinear/gen/neon-c4.c
similarity index 100%
rename from src/f32-bilinear/neon-c4.c
rename to src/f32-bilinear/gen/neon-c4.c
diff --git a/src/f32-bilinear/neon-c8.c b/src/f32-bilinear/gen/neon-c8.c
similarity index 100%
rename from src/f32-bilinear/neon-c8.c
rename to src/f32-bilinear/gen/neon-c8.c
diff --git a/src/f32-bilinear/neonfma-c4.c b/src/f32-bilinear/gen/neonfma-c4.c
similarity index 100%
rename from src/f32-bilinear/neonfma-c4.c
rename to src/f32-bilinear/gen/neonfma-c4.c
diff --git a/src/f32-bilinear/neonfma-c8.c b/src/f32-bilinear/gen/neonfma-c8.c
similarity index 100%
rename from src/f32-bilinear/neonfma-c8.c
rename to src/f32-bilinear/gen/neonfma-c8.c
diff --git a/src/f32-bilinear/psimd-c4.c b/src/f32-bilinear/gen/psimd-c4.c
similarity index 100%
rename from src/f32-bilinear/psimd-c4.c
rename to src/f32-bilinear/gen/psimd-c4.c
diff --git a/src/f32-bilinear/psimd-c8.c b/src/f32-bilinear/gen/psimd-c8.c
similarity index 100%
rename from src/f32-bilinear/psimd-c8.c
rename to src/f32-bilinear/gen/psimd-c8.c
diff --git a/src/f32-bilinear/scalar-c1.c b/src/f32-bilinear/gen/scalar-c1.c
similarity index 100%
rename from src/f32-bilinear/scalar-c1.c
rename to src/f32-bilinear/gen/scalar-c1.c
diff --git a/src/f32-bilinear/scalar-c2.c b/src/f32-bilinear/gen/scalar-c2.c
similarity index 100%
rename from src/f32-bilinear/scalar-c2.c
rename to src/f32-bilinear/gen/scalar-c2.c
diff --git a/src/f32-bilinear/scalar-c4.c b/src/f32-bilinear/gen/scalar-c4.c
similarity index 100%
rename from src/f32-bilinear/scalar-c4.c
rename to src/f32-bilinear/gen/scalar-c4.c
diff --git a/src/f32-bilinear/sse-c4.c b/src/f32-bilinear/gen/sse-c4.c
similarity index 100%
rename from src/f32-bilinear/sse-c4.c
rename to src/f32-bilinear/gen/sse-c4.c
diff --git a/src/f32-bilinear/sse-c8.c b/src/f32-bilinear/gen/sse-c8.c
similarity index 100%
rename from src/f32-bilinear/sse-c8.c
rename to src/f32-bilinear/gen/sse-c8.c
diff --git a/src/f32-dwconv/up16x25-avx-acc2.c b/src/f32-dwconv/gen/up16x25-avx-acc2.c
similarity index 100%
rename from src/f32-dwconv/up16x25-avx-acc2.c
rename to src/f32-dwconv/gen/up16x25-avx-acc2.c
diff --git a/src/f32-dwconv/up16x25-avx.c b/src/f32-dwconv/gen/up16x25-avx.c
similarity index 100%
rename from src/f32-dwconv/up16x25-avx.c
rename to src/f32-dwconv/gen/up16x25-avx.c
diff --git a/src/f32-dwconv/up16x25-fma3-acc2.c b/src/f32-dwconv/gen/up16x25-fma3-acc2.c
similarity index 100%
rename from src/f32-dwconv/up16x25-fma3-acc2.c
rename to src/f32-dwconv/gen/up16x25-fma3-acc2.c
diff --git a/src/f32-dwconv/up16x25-fma3.c b/src/f32-dwconv/gen/up16x25-fma3.c
similarity index 100%
rename from src/f32-dwconv/up16x25-fma3.c
rename to src/f32-dwconv/gen/up16x25-fma3.c
diff --git a/src/f32-dwconv/up16x4-avx-acc2.c b/src/f32-dwconv/gen/up16x4-avx-acc2.c
similarity index 100%
rename from src/f32-dwconv/up16x4-avx-acc2.c
rename to src/f32-dwconv/gen/up16x4-avx-acc2.c
diff --git a/src/f32-dwconv/up16x4-avx.c b/src/f32-dwconv/gen/up16x4-avx.c
similarity index 100%
rename from src/f32-dwconv/up16x4-avx.c
rename to src/f32-dwconv/gen/up16x4-avx.c
diff --git a/src/f32-dwconv/up16x4-fma3-acc2.c b/src/f32-dwconv/gen/up16x4-fma3-acc2.c
similarity index 100%
rename from src/f32-dwconv/up16x4-fma3-acc2.c
rename to src/f32-dwconv/gen/up16x4-fma3-acc2.c
diff --git a/src/f32-dwconv/up16x4-fma3.c b/src/f32-dwconv/gen/up16x4-fma3.c
similarity index 100%
rename from src/f32-dwconv/up16x4-fma3.c
rename to src/f32-dwconv/gen/up16x4-fma3.c
diff --git a/src/f32-dwconv/up16x9-avx-acc2.c b/src/f32-dwconv/gen/up16x9-avx-acc2.c
similarity index 100%
rename from src/f32-dwconv/up16x9-avx-acc2.c
rename to src/f32-dwconv/gen/up16x9-avx-acc2.c
diff --git a/src/f32-dwconv/up16x9-avx.c b/src/f32-dwconv/gen/up16x9-avx.c
similarity index 100%
rename from src/f32-dwconv/up16x9-avx.c
rename to src/f32-dwconv/gen/up16x9-avx.c
diff --git a/src/f32-dwconv/up16x9-fma3-acc2.c b/src/f32-dwconv/gen/up16x9-fma3-acc2.c
similarity index 100%
rename from src/f32-dwconv/up16x9-fma3-acc2.c
rename to src/f32-dwconv/gen/up16x9-fma3-acc2.c
diff --git a/src/f32-dwconv/up16x9-fma3.c b/src/f32-dwconv/gen/up16x9-fma3.c
similarity index 100%
rename from src/f32-dwconv/up16x9-fma3.c
rename to src/f32-dwconv/gen/up16x9-fma3.c
diff --git a/src/f32-dwconv/up1x25-scalar-acc2.c b/src/f32-dwconv/gen/up1x25-scalar-acc2.c
similarity index 100%
rename from src/f32-dwconv/up1x25-scalar-acc2.c
rename to src/f32-dwconv/gen/up1x25-scalar-acc2.c
diff --git a/src/f32-dwconv/up1x25-scalar.c b/src/f32-dwconv/gen/up1x25-scalar.c
similarity index 100%
rename from src/f32-dwconv/up1x25-scalar.c
rename to src/f32-dwconv/gen/up1x25-scalar.c
diff --git a/src/f32-dwconv/up1x4-scalar-acc2.c b/src/f32-dwconv/gen/up1x4-scalar-acc2.c
similarity index 100%
rename from src/f32-dwconv/up1x4-scalar-acc2.c
rename to src/f32-dwconv/gen/up1x4-scalar-acc2.c
diff --git a/src/f32-dwconv/up1x4-scalar.c b/src/f32-dwconv/gen/up1x4-scalar.c
similarity index 100%
rename from src/f32-dwconv/up1x4-scalar.c
rename to src/f32-dwconv/gen/up1x4-scalar.c
diff --git a/src/f32-dwconv/up1x9-scalar-acc2.c b/src/f32-dwconv/gen/up1x9-scalar-acc2.c
similarity index 100%
rename from src/f32-dwconv/up1x9-scalar-acc2.c
rename to src/f32-dwconv/gen/up1x9-scalar-acc2.c
diff --git a/src/f32-dwconv/up1x9-scalar.c b/src/f32-dwconv/gen/up1x9-scalar.c
similarity index 100%
rename from src/f32-dwconv/up1x9-scalar.c
rename to src/f32-dwconv/gen/up1x9-scalar.c
diff --git a/src/f32-dwconv/up2x25-scalar-acc2.c b/src/f32-dwconv/gen/up2x25-scalar-acc2.c
similarity index 100%
rename from src/f32-dwconv/up2x25-scalar-acc2.c
rename to src/f32-dwconv/gen/up2x25-scalar-acc2.c
diff --git a/src/f32-dwconv/up2x25-scalar.c b/src/f32-dwconv/gen/up2x25-scalar.c
similarity index 100%
rename from src/f32-dwconv/up2x25-scalar.c
rename to src/f32-dwconv/gen/up2x25-scalar.c
diff --git a/src/f32-dwconv/up2x4-scalar-acc2.c b/src/f32-dwconv/gen/up2x4-scalar-acc2.c
similarity index 100%
rename from src/f32-dwconv/up2x4-scalar-acc2.c
rename to src/f32-dwconv/gen/up2x4-scalar-acc2.c
diff --git a/src/f32-dwconv/up2x4-scalar.c b/src/f32-dwconv/gen/up2x4-scalar.c
similarity index 100%
rename from src/f32-dwconv/up2x4-scalar.c
rename to src/f32-dwconv/gen/up2x4-scalar.c
diff --git a/src/f32-dwconv/up2x9-scalar-acc2.c b/src/f32-dwconv/gen/up2x9-scalar-acc2.c
similarity index 100%
rename from src/f32-dwconv/up2x9-scalar-acc2.c
rename to src/f32-dwconv/gen/up2x9-scalar-acc2.c
diff --git a/src/f32-dwconv/up2x9-scalar.c b/src/f32-dwconv/gen/up2x9-scalar.c
similarity index 100%
rename from src/f32-dwconv/up2x9-scalar.c
rename to src/f32-dwconv/gen/up2x9-scalar.c
diff --git a/src/f32-dwconv/up4x25-psimd-acc2.c b/src/f32-dwconv/gen/up4x25-psimd-acc2.c
similarity index 100%
rename from src/f32-dwconv/up4x25-psimd-acc2.c
rename to src/f32-dwconv/gen/up4x25-psimd-acc2.c
diff --git a/src/f32-dwconv/up4x25-psimd.c b/src/f32-dwconv/gen/up4x25-psimd.c
similarity index 100%
rename from src/f32-dwconv/up4x25-psimd.c
rename to src/f32-dwconv/gen/up4x25-psimd.c
diff --git a/src/f32-dwconv/up4x25-sse-acc2.c b/src/f32-dwconv/gen/up4x25-sse-acc2.c
similarity index 100%
rename from src/f32-dwconv/up4x25-sse-acc2.c
rename to src/f32-dwconv/gen/up4x25-sse-acc2.c
diff --git a/src/f32-dwconv/up4x25-sse.c b/src/f32-dwconv/gen/up4x25-sse.c
similarity index 100%
rename from src/f32-dwconv/up4x25-sse.c
rename to src/f32-dwconv/gen/up4x25-sse.c
diff --git a/src/f32-dwconv/up4x4-psimd-acc2.c b/src/f32-dwconv/gen/up4x4-psimd-acc2.c
similarity index 100%
rename from src/f32-dwconv/up4x4-psimd-acc2.c
rename to src/f32-dwconv/gen/up4x4-psimd-acc2.c
diff --git a/src/f32-dwconv/up4x4-psimd.c b/src/f32-dwconv/gen/up4x4-psimd.c
similarity index 100%
rename from src/f32-dwconv/up4x4-psimd.c
rename to src/f32-dwconv/gen/up4x4-psimd.c
diff --git a/src/f32-dwconv/up4x4-sse-acc2.c b/src/f32-dwconv/gen/up4x4-sse-acc2.c
similarity index 100%
rename from src/f32-dwconv/up4x4-sse-acc2.c
rename to src/f32-dwconv/gen/up4x4-sse-acc2.c
diff --git a/src/f32-dwconv/up4x4-sse.c b/src/f32-dwconv/gen/up4x4-sse.c
similarity index 100%
rename from src/f32-dwconv/up4x4-sse.c
rename to src/f32-dwconv/gen/up4x4-sse.c
diff --git a/src/f32-dwconv/up4x9-neon-acc2.c b/src/f32-dwconv/gen/up4x9-neon-acc2.c
similarity index 100%
rename from src/f32-dwconv/up4x9-neon-acc2.c
rename to src/f32-dwconv/gen/up4x9-neon-acc2.c
diff --git a/src/f32-dwconv/up4x9-neon.c b/src/f32-dwconv/gen/up4x9-neon.c
similarity index 100%
rename from src/f32-dwconv/up4x9-neon.c
rename to src/f32-dwconv/gen/up4x9-neon.c
diff --git a/src/f32-dwconv/up4x9-neonfma-acc2.c b/src/f32-dwconv/gen/up4x9-neonfma-acc2.c
similarity index 100%
rename from src/f32-dwconv/up4x9-neonfma-acc2.c
rename to src/f32-dwconv/gen/up4x9-neonfma-acc2.c
diff --git a/src/f32-dwconv/up4x9-neonfma.c b/src/f32-dwconv/gen/up4x9-neonfma.c
similarity index 100%
rename from src/f32-dwconv/up4x9-neonfma.c
rename to src/f32-dwconv/gen/up4x9-neonfma.c
diff --git a/src/f32-dwconv/up4x9-psimd-acc2.c b/src/f32-dwconv/gen/up4x9-psimd-acc2.c
similarity index 100%
rename from src/f32-dwconv/up4x9-psimd-acc2.c
rename to src/f32-dwconv/gen/up4x9-psimd-acc2.c
diff --git a/src/f32-dwconv/up4x9-psimd.c b/src/f32-dwconv/gen/up4x9-psimd.c
similarity index 100%
rename from src/f32-dwconv/up4x9-psimd.c
rename to src/f32-dwconv/gen/up4x9-psimd.c
diff --git a/src/f32-dwconv/up4x9-sse-acc2.c b/src/f32-dwconv/gen/up4x9-sse-acc2.c
similarity index 100%
rename from src/f32-dwconv/up4x9-sse-acc2.c
rename to src/f32-dwconv/gen/up4x9-sse-acc2.c
diff --git a/src/f32-dwconv/up4x9-sse.c b/src/f32-dwconv/gen/up4x9-sse.c
similarity index 100%
rename from src/f32-dwconv/up4x9-sse.c
rename to src/f32-dwconv/gen/up4x9-sse.c
diff --git a/src/f32-dwconv/up8x25-avx-acc2.c b/src/f32-dwconv/gen/up8x25-avx-acc2.c
similarity index 100%
rename from src/f32-dwconv/up8x25-avx-acc2.c
rename to src/f32-dwconv/gen/up8x25-avx-acc2.c
diff --git a/src/f32-dwconv/up8x25-avx.c b/src/f32-dwconv/gen/up8x25-avx.c
similarity index 100%
rename from src/f32-dwconv/up8x25-avx.c
rename to src/f32-dwconv/gen/up8x25-avx.c
diff --git a/src/f32-dwconv/up8x25-fma3-acc2.c b/src/f32-dwconv/gen/up8x25-fma3-acc2.c
similarity index 100%
rename from src/f32-dwconv/up8x25-fma3-acc2.c
rename to src/f32-dwconv/gen/up8x25-fma3-acc2.c
diff --git a/src/f32-dwconv/up8x25-fma3.c b/src/f32-dwconv/gen/up8x25-fma3.c
similarity index 100%
rename from src/f32-dwconv/up8x25-fma3.c
rename to src/f32-dwconv/gen/up8x25-fma3.c
diff --git a/src/f32-dwconv/up8x25-psimd-acc2.c b/src/f32-dwconv/gen/up8x25-psimd-acc2.c
similarity index 100%
rename from src/f32-dwconv/up8x25-psimd-acc2.c
rename to src/f32-dwconv/gen/up8x25-psimd-acc2.c
diff --git a/src/f32-dwconv/up8x25-psimd.c b/src/f32-dwconv/gen/up8x25-psimd.c
similarity index 100%
rename from src/f32-dwconv/up8x25-psimd.c
rename to src/f32-dwconv/gen/up8x25-psimd.c
diff --git a/src/f32-dwconv/up8x25-sse-acc2.c b/src/f32-dwconv/gen/up8x25-sse-acc2.c
similarity index 100%
rename from src/f32-dwconv/up8x25-sse-acc2.c
rename to src/f32-dwconv/gen/up8x25-sse-acc2.c
diff --git a/src/f32-dwconv/up8x25-sse.c b/src/f32-dwconv/gen/up8x25-sse.c
similarity index 100%
rename from src/f32-dwconv/up8x25-sse.c
rename to src/f32-dwconv/gen/up8x25-sse.c
diff --git a/src/f32-dwconv/up8x4-avx-acc2.c b/src/f32-dwconv/gen/up8x4-avx-acc2.c
similarity index 100%
rename from src/f32-dwconv/up8x4-avx-acc2.c
rename to src/f32-dwconv/gen/up8x4-avx-acc2.c
diff --git a/src/f32-dwconv/up8x4-avx.c b/src/f32-dwconv/gen/up8x4-avx.c
similarity index 100%
rename from src/f32-dwconv/up8x4-avx.c
rename to src/f32-dwconv/gen/up8x4-avx.c
diff --git a/src/f32-dwconv/up8x4-fma3-acc2.c b/src/f32-dwconv/gen/up8x4-fma3-acc2.c
similarity index 100%
rename from src/f32-dwconv/up8x4-fma3-acc2.c
rename to src/f32-dwconv/gen/up8x4-fma3-acc2.c
diff --git a/src/f32-dwconv/up8x4-fma3.c b/src/f32-dwconv/gen/up8x4-fma3.c
similarity index 100%
rename from src/f32-dwconv/up8x4-fma3.c
rename to src/f32-dwconv/gen/up8x4-fma3.c
diff --git a/src/f32-dwconv/up8x4-psimd-acc2.c b/src/f32-dwconv/gen/up8x4-psimd-acc2.c
similarity index 100%
rename from src/f32-dwconv/up8x4-psimd-acc2.c
rename to src/f32-dwconv/gen/up8x4-psimd-acc2.c
diff --git a/src/f32-dwconv/up8x4-psimd.c b/src/f32-dwconv/gen/up8x4-psimd.c
similarity index 100%
rename from src/f32-dwconv/up8x4-psimd.c
rename to src/f32-dwconv/gen/up8x4-psimd.c
diff --git a/src/f32-dwconv/up8x4-sse-acc2.c b/src/f32-dwconv/gen/up8x4-sse-acc2.c
similarity index 100%
rename from src/f32-dwconv/up8x4-sse-acc2.c
rename to src/f32-dwconv/gen/up8x4-sse-acc2.c
diff --git a/src/f32-dwconv/up8x4-sse.c b/src/f32-dwconv/gen/up8x4-sse.c
similarity index 100%
rename from src/f32-dwconv/up8x4-sse.c
rename to src/f32-dwconv/gen/up8x4-sse.c
diff --git a/src/f32-dwconv/up8x9-avx-acc2.c b/src/f32-dwconv/gen/up8x9-avx-acc2.c
similarity index 100%
rename from src/f32-dwconv/up8x9-avx-acc2.c
rename to src/f32-dwconv/gen/up8x9-avx-acc2.c
diff --git a/src/f32-dwconv/up8x9-avx.c b/src/f32-dwconv/gen/up8x9-avx.c
similarity index 100%
rename from src/f32-dwconv/up8x9-avx.c
rename to src/f32-dwconv/gen/up8x9-avx.c
diff --git a/src/f32-dwconv/up8x9-fma3-acc2.c b/src/f32-dwconv/gen/up8x9-fma3-acc2.c
similarity index 100%
rename from src/f32-dwconv/up8x9-fma3-acc2.c
rename to src/f32-dwconv/gen/up8x9-fma3-acc2.c
diff --git a/src/f32-dwconv/up8x9-fma3.c b/src/f32-dwconv/gen/up8x9-fma3.c
similarity index 100%
rename from src/f32-dwconv/up8x9-fma3.c
rename to src/f32-dwconv/gen/up8x9-fma3.c
diff --git a/src/f32-dwconv/up8x9-neon-acc2.c b/src/f32-dwconv/gen/up8x9-neon-acc2.c
similarity index 100%
rename from src/f32-dwconv/up8x9-neon-acc2.c
rename to src/f32-dwconv/gen/up8x9-neon-acc2.c
diff --git a/src/f32-dwconv/up8x9-neon.c b/src/f32-dwconv/gen/up8x9-neon.c
similarity index 100%
rename from src/f32-dwconv/up8x9-neon.c
rename to src/f32-dwconv/gen/up8x9-neon.c
diff --git a/src/f32-dwconv/up8x9-neonfma-acc2.c b/src/f32-dwconv/gen/up8x9-neonfma-acc2.c
similarity index 100%
rename from src/f32-dwconv/up8x9-neonfma-acc2.c
rename to src/f32-dwconv/gen/up8x9-neonfma-acc2.c
diff --git a/src/f32-dwconv/up8x9-neonfma.c b/src/f32-dwconv/gen/up8x9-neonfma.c
similarity index 100%
rename from src/f32-dwconv/up8x9-neonfma.c
rename to src/f32-dwconv/gen/up8x9-neonfma.c
diff --git a/src/f32-dwconv/up8x9-psimd-acc2.c b/src/f32-dwconv/gen/up8x9-psimd-acc2.c
similarity index 100%
rename from src/f32-dwconv/up8x9-psimd-acc2.c
rename to src/f32-dwconv/gen/up8x9-psimd-acc2.c
diff --git a/src/f32-dwconv/up8x9-psimd.c b/src/f32-dwconv/gen/up8x9-psimd.c
similarity index 100%
rename from src/f32-dwconv/up8x9-psimd.c
rename to src/f32-dwconv/gen/up8x9-psimd.c
diff --git a/src/f32-dwconv/up8x9-sse-acc2.c b/src/f32-dwconv/gen/up8x9-sse-acc2.c
similarity index 100%
rename from src/f32-dwconv/up8x9-sse-acc2.c
rename to src/f32-dwconv/gen/up8x9-sse-acc2.c
diff --git a/src/f32-dwconv/up8x9-sse.c b/src/f32-dwconv/gen/up8x9-sse.c
similarity index 100%
rename from src/f32-dwconv/up8x9-sse.c
rename to src/f32-dwconv/gen/up8x9-sse.c
diff --git a/src/f32-gemminc/1x12-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen-inc/1x12-aarch64-neonfma-cortex-a53.S
similarity index 100%
rename from src/f32-gemminc/1x12-aarch64-neonfma-cortex-a53.S
rename to src/f32-gemm/gen-inc/1x12-aarch64-neonfma-cortex-a53.S
diff --git a/src/f32-gemminc/1x4-scalar.c b/src/f32-gemm/gen-inc/1x4-scalar.c
similarity index 100%
rename from src/f32-gemminc/1x4-scalar.c
rename to src/f32-gemm/gen-inc/1x4-scalar.c
diff --git a/src/f32-gemminc/1x8-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a53.S
similarity index 100%
rename from src/f32-gemminc/1x8-aarch64-neonfma-cortex-a53.S
rename to src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a53.S
diff --git a/src/f32-gemminc/1x8-aarch64-neonfma-cortex-a57.S b/src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a57.S
similarity index 100%
rename from src/f32-gemminc/1x8-aarch64-neonfma-cortex-a57.S
rename to src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a57.S
diff --git a/src/f32-gemminc/1x8-aarch64-neonfma-cortex-a75.S b/src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a75.S
similarity index 100%
rename from src/f32-gemminc/1x8-aarch64-neonfma-cortex-a75.S
rename to src/f32-gemm/gen-inc/1x8-aarch64-neonfma-cortex-a75.S
diff --git a/src/f32-gemminc/1x8-avx-broadcast.c b/src/f32-gemm/gen-inc/1x8-avx-broadcast.c
similarity index 100%
rename from src/f32-gemminc/1x8-avx-broadcast.c
rename to src/f32-gemm/gen-inc/1x8-avx-broadcast.c
diff --git a/src/f32-gemminc/1x8-fma3-broadcast.c b/src/f32-gemm/gen-inc/1x8-fma3-broadcast.c
similarity index 100%
rename from src/f32-gemminc/1x8-fma3-broadcast.c
rename to src/f32-gemm/gen-inc/1x8-fma3-broadcast.c
diff --git a/src/f32-gemminc/1x8-neon-dup-ld64.c b/src/f32-gemm/gen-inc/1x8-neon-dup-ld64.c
similarity index 100%
rename from src/f32-gemminc/1x8-neon-dup-ld64.c
rename to src/f32-gemm/gen-inc/1x8-neon-dup-ld64.c
diff --git a/src/f32-gemminc/1x8-neon-lane-ld64.c b/src/f32-gemm/gen-inc/1x8-neon-lane-ld64.c
similarity index 100%
rename from src/f32-gemminc/1x8-neon-lane-ld64.c
rename to src/f32-gemm/gen-inc/1x8-neon-lane-ld64.c
diff --git a/src/f32-gemminc/1x8-neonfma-dup-ld64.c b/src/f32-gemm/gen-inc/1x8-neonfma-dup-ld64.c
similarity index 100%
rename from src/f32-gemminc/1x8-neonfma-dup-ld64.c
rename to src/f32-gemm/gen-inc/1x8-neonfma-dup-ld64.c
diff --git a/src/f32-gemminc/1x8-neonfma-lane-ld64.c b/src/f32-gemm/gen-inc/1x8-neonfma-lane-ld64.c
similarity index 100%
rename from src/f32-gemminc/1x8-neonfma-lane-ld64.c
rename to src/f32-gemm/gen-inc/1x8-neonfma-lane-ld64.c
diff --git a/src/f32-gemminc/1x8-psimd-loadsplat.c b/src/f32-gemm/gen-inc/1x8-psimd-loadsplat.c
similarity index 100%
rename from src/f32-gemminc/1x8-psimd-loadsplat.c
rename to src/f32-gemm/gen-inc/1x8-psimd-loadsplat.c
diff --git a/src/f32-gemminc/1x8-psimd-splat.c b/src/f32-gemm/gen-inc/1x8-psimd-splat.c
similarity index 100%
rename from src/f32-gemminc/1x8-psimd-splat.c
rename to src/f32-gemm/gen-inc/1x8-psimd-splat.c
diff --git a/src/f32-gemminc/1x8-sse-dup.c b/src/f32-gemm/gen-inc/1x8-sse-dup.c
similarity index 100%
rename from src/f32-gemminc/1x8-sse-dup.c
rename to src/f32-gemm/gen-inc/1x8-sse-dup.c
diff --git a/src/f32-gemminc/1x8-sse-load1.c b/src/f32-gemm/gen-inc/1x8-sse-load1.c
similarity index 100%
rename from src/f32-gemminc/1x8-sse-load1.c
rename to src/f32-gemm/gen-inc/1x8-sse-load1.c
diff --git a/src/f32-gemminc/1x8s4-neon.c b/src/f32-gemm/gen-inc/1x8s4-neon.c
similarity index 100%
rename from src/f32-gemminc/1x8s4-neon.c
rename to src/f32-gemm/gen-inc/1x8s4-neon.c
diff --git a/src/f32-gemminc/1x8s4-neonfma.c b/src/f32-gemm/gen-inc/1x8s4-neonfma.c
similarity index 100%
rename from src/f32-gemminc/1x8s4-neonfma.c
rename to src/f32-gemm/gen-inc/1x8s4-neonfma.c
diff --git a/src/f32-gemminc/1x8s4-psimd.c b/src/f32-gemm/gen-inc/1x8s4-psimd.c
similarity index 100%
rename from src/f32-gemminc/1x8s4-psimd.c
rename to src/f32-gemm/gen-inc/1x8s4-psimd.c
diff --git a/src/f32-gemminc/1x8s4-sse.c b/src/f32-gemm/gen-inc/1x8s4-sse.c
similarity index 100%
rename from src/f32-gemminc/1x8s4-sse.c
rename to src/f32-gemm/gen-inc/1x8s4-sse.c
diff --git a/src/f32-gemminc/2x4-scalar.c b/src/f32-gemm/gen-inc/2x4-scalar.c
similarity index 100%
rename from src/f32-gemminc/2x4-scalar.c
rename to src/f32-gemm/gen-inc/2x4-scalar.c
diff --git a/src/f32-gemminc/4x12-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S
similarity index 100%
rename from src/f32-gemminc/4x12-aarch64-neonfma-cortex-a53.S
rename to src/f32-gemm/gen-inc/4x12-aarch64-neonfma-cortex-a53.S
diff --git a/src/f32-gemminc/4x4-scalar.c b/src/f32-gemm/gen-inc/4x4-scalar.c
similarity index 100%
rename from src/f32-gemminc/4x4-scalar.c
rename to src/f32-gemm/gen-inc/4x4-scalar.c
diff --git a/src/f32-gemminc/4x8-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a53.S
similarity index 100%
rename from src/f32-gemminc/4x8-aarch64-neonfma-cortex-a53.S
rename to src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a53.S
diff --git a/src/f32-gemminc/4x8-aarch64-neonfma-cortex-a57.S b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a57.S
similarity index 100%
rename from src/f32-gemminc/4x8-aarch64-neonfma-cortex-a57.S
rename to src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a57.S
diff --git a/src/f32-gemminc/4x8-aarch64-neonfma-cortex-a75.S b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a75.S
similarity index 100%
rename from src/f32-gemminc/4x8-aarch64-neonfma-cortex-a75.S
rename to src/f32-gemm/gen-inc/4x8-aarch64-neonfma-cortex-a75.S
diff --git a/src/f32-gemminc/4x8-aarch64-neonfma-ld128.S b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld128.S
similarity index 100%
rename from src/f32-gemminc/4x8-aarch64-neonfma-ld128.S
rename to src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld128.S
diff --git a/src/f32-gemminc/4x8-aarch64-neonfma-ld64.S b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld64.S
similarity index 100%
rename from src/f32-gemminc/4x8-aarch64-neonfma-ld64.S
rename to src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld64.S
diff --git a/src/f32-gemminc/4x8-avx-broadcast.c b/src/f32-gemm/gen-inc/4x8-avx-broadcast.c
similarity index 100%
rename from src/f32-gemminc/4x8-avx-broadcast.c
rename to src/f32-gemm/gen-inc/4x8-avx-broadcast.c
diff --git a/src/f32-gemminc/4x8-fma3-broadcast.c b/src/f32-gemm/gen-inc/4x8-fma3-broadcast.c
similarity index 100%
rename from src/f32-gemminc/4x8-fma3-broadcast.c
rename to src/f32-gemm/gen-inc/4x8-fma3-broadcast.c
diff --git a/src/f32-gemminc/4x8-neon-dup-ld128.c b/src/f32-gemm/gen-inc/4x8-neon-dup-ld128.c
similarity index 100%
rename from src/f32-gemminc/4x8-neon-dup-ld128.c
rename to src/f32-gemm/gen-inc/4x8-neon-dup-ld128.c
diff --git a/src/f32-gemminc/4x8-neon-dup-ld64.c b/src/f32-gemm/gen-inc/4x8-neon-dup-ld64.c
similarity index 100%
rename from src/f32-gemminc/4x8-neon-dup-ld64.c
rename to src/f32-gemm/gen-inc/4x8-neon-dup-ld64.c
diff --git a/src/f32-gemminc/4x8-neon-lane-ld128.c b/src/f32-gemm/gen-inc/4x8-neon-lane-ld128.c
similarity index 100%
rename from src/f32-gemminc/4x8-neon-lane-ld128.c
rename to src/f32-gemm/gen-inc/4x8-neon-lane-ld128.c
diff --git a/src/f32-gemminc/4x8-neon-lane-ld64.c b/src/f32-gemm/gen-inc/4x8-neon-lane-ld64.c
similarity index 100%
rename from src/f32-gemminc/4x8-neon-lane-ld64.c
rename to src/f32-gemm/gen-inc/4x8-neon-lane-ld64.c
diff --git a/src/f32-gemminc/4x8-neonfma-dup-ld128.c b/src/f32-gemm/gen-inc/4x8-neonfma-dup-ld128.c
similarity index 100%
rename from src/f32-gemminc/4x8-neonfma-dup-ld128.c
rename to src/f32-gemm/gen-inc/4x8-neonfma-dup-ld128.c
diff --git a/src/f32-gemminc/4x8-neonfma-dup-ld64.c b/src/f32-gemm/gen-inc/4x8-neonfma-dup-ld64.c
similarity index 100%
rename from src/f32-gemminc/4x8-neonfma-dup-ld64.c
rename to src/f32-gemm/gen-inc/4x8-neonfma-dup-ld64.c
diff --git a/src/f32-gemminc/4x8-neonfma-lane-ld128.c b/src/f32-gemm/gen-inc/4x8-neonfma-lane-ld128.c
similarity index 100%
rename from src/f32-gemminc/4x8-neonfma-lane-ld128.c
rename to src/f32-gemm/gen-inc/4x8-neonfma-lane-ld128.c
diff --git a/src/f32-gemminc/4x8-neonfma-lane-ld64.c b/src/f32-gemm/gen-inc/4x8-neonfma-lane-ld64.c
similarity index 100%
rename from src/f32-gemminc/4x8-neonfma-lane-ld64.c
rename to src/f32-gemm/gen-inc/4x8-neonfma-lane-ld64.c
diff --git a/src/f32-gemminc/4x8-psimd-loadsplat.c b/src/f32-gemm/gen-inc/4x8-psimd-loadsplat.c
similarity index 100%
rename from src/f32-gemminc/4x8-psimd-loadsplat.c
rename to src/f32-gemm/gen-inc/4x8-psimd-loadsplat.c
diff --git a/src/f32-gemminc/4x8-psimd-splat.c b/src/f32-gemm/gen-inc/4x8-psimd-splat.c
similarity index 100%
rename from src/f32-gemminc/4x8-psimd-splat.c
rename to src/f32-gemm/gen-inc/4x8-psimd-splat.c
diff --git a/src/f32-gemminc/4x8-sse-dup.c b/src/f32-gemm/gen-inc/4x8-sse-dup.c
similarity index 100%
rename from src/f32-gemminc/4x8-sse-dup.c
rename to src/f32-gemm/gen-inc/4x8-sse-dup.c
diff --git a/src/f32-gemminc/4x8-sse-load1.c b/src/f32-gemm/gen-inc/4x8-sse-load1.c
similarity index 100%
rename from src/f32-gemminc/4x8-sse-load1.c
rename to src/f32-gemm/gen-inc/4x8-sse-load1.c
diff --git a/src/f32-gemminc/4x8s4-neon.c b/src/f32-gemm/gen-inc/4x8s4-neon.c
similarity index 100%
rename from src/f32-gemminc/4x8s4-neon.c
rename to src/f32-gemm/gen-inc/4x8s4-neon.c
diff --git a/src/f32-gemminc/4x8s4-neonfma.c b/src/f32-gemm/gen-inc/4x8s4-neonfma.c
similarity index 100%
rename from src/f32-gemminc/4x8s4-neonfma.c
rename to src/f32-gemm/gen-inc/4x8s4-neonfma.c
diff --git a/src/f32-gemminc/4x8s4-psimd.c b/src/f32-gemm/gen-inc/4x8s4-psimd.c
similarity index 100%
rename from src/f32-gemminc/4x8s4-psimd.c
rename to src/f32-gemm/gen-inc/4x8s4-psimd.c
diff --git a/src/f32-gemminc/4x8s4-sse.c b/src/f32-gemm/gen-inc/4x8s4-sse.c
similarity index 100%
rename from src/f32-gemminc/4x8s4-sse.c
rename to src/f32-gemm/gen-inc/4x8s4-sse.c
diff --git a/src/f32-gemminc/5x8-aarch64-neonfma-cortex-a75.S b/src/f32-gemm/gen-inc/5x8-aarch64-neonfma-cortex-a75.S
similarity index 100%
rename from src/f32-gemminc/5x8-aarch64-neonfma-cortex-a75.S
rename to src/f32-gemm/gen-inc/5x8-aarch64-neonfma-cortex-a75.S
diff --git a/src/f32-gemminc/5x8-avx-broadcast.c b/src/f32-gemm/gen-inc/5x8-avx-broadcast.c
similarity index 100%
rename from src/f32-gemminc/5x8-avx-broadcast.c
rename to src/f32-gemm/gen-inc/5x8-avx-broadcast.c
diff --git a/src/f32-gemminc/5x8-fma3-broadcast.c b/src/f32-gemm/gen-inc/5x8-fma3-broadcast.c
similarity index 100%
rename from src/f32-gemminc/5x8-fma3-broadcast.c
rename to src/f32-gemm/gen-inc/5x8-fma3-broadcast.c
diff --git a/src/f32-gemminc/5x8-neon-lane-ld64.c b/src/f32-gemm/gen-inc/5x8-neon-lane-ld64.c
similarity index 100%
rename from src/f32-gemminc/5x8-neon-lane-ld64.c
rename to src/f32-gemm/gen-inc/5x8-neon-lane-ld64.c
diff --git a/src/f32-gemminc/5x8-neonfma-lane-ld64.c b/src/f32-gemm/gen-inc/5x8-neonfma-lane-ld64.c
similarity index 100%
rename from src/f32-gemminc/5x8-neonfma-lane-ld64.c
rename to src/f32-gemm/gen-inc/5x8-neonfma-lane-ld64.c
diff --git a/src/f32-gemminc/6x8-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a53.S
similarity index 100%
rename from src/f32-gemminc/6x8-aarch64-neonfma-cortex-a53.S
rename to src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a53.S
diff --git a/src/f32-gemminc/6x8-aarch64-neonfma-cortex-a57.S b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a57.S
similarity index 100%
rename from src/f32-gemminc/6x8-aarch64-neonfma-cortex-a57.S
rename to src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a57.S
diff --git a/src/f32-gemminc/6x8-aarch64-neonfma-cortex-a73.S b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a73.S
similarity index 100%
rename from src/f32-gemminc/6x8-aarch64-neonfma-cortex-a73.S
rename to src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a73.S
diff --git a/src/f32-gemminc/6x8-aarch64-neonfma-cortex-a75.S b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a75.S
similarity index 100%
rename from src/f32-gemminc/6x8-aarch64-neonfma-cortex-a75.S
rename to src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a75.S
diff --git a/src/f32-gemminc/6x8-aarch64-neonfma-ld128.S b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S
similarity index 100%
rename from src/f32-gemminc/6x8-aarch64-neonfma-ld128.S
rename to src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S
diff --git a/src/f32-gemminc/6x8-aarch64-neonfma-ld64.S b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld64.S
similarity index 100%
rename from src/f32-gemminc/6x8-aarch64-neonfma-ld64.S
rename to src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld64.S
diff --git a/src/f32-gemminc/6x8-avx-broadcast.c b/src/f32-gemm/gen-inc/6x8-avx-broadcast.c
similarity index 100%
rename from src/f32-gemminc/6x8-avx-broadcast.c
rename to src/f32-gemm/gen-inc/6x8-avx-broadcast.c
diff --git a/src/f32-gemminc/6x8-fma3-broadcast.c b/src/f32-gemm/gen-inc/6x8-fma3-broadcast.c
similarity index 100%
rename from src/f32-gemminc/6x8-fma3-broadcast.c
rename to src/f32-gemm/gen-inc/6x8-fma3-broadcast.c
diff --git a/src/f32-gemminc/6x8-neon-dup-ld64.c b/src/f32-gemm/gen-inc/6x8-neon-dup-ld64.c
similarity index 100%
rename from src/f32-gemminc/6x8-neon-dup-ld64.c
rename to src/f32-gemm/gen-inc/6x8-neon-dup-ld64.c
diff --git a/src/f32-gemminc/6x8-neon-lane-ld64.c b/src/f32-gemm/gen-inc/6x8-neon-lane-ld64.c
similarity index 100%
rename from src/f32-gemminc/6x8-neon-lane-ld64.c
rename to src/f32-gemm/gen-inc/6x8-neon-lane-ld64.c
diff --git a/src/f32-gemminc/6x8-neonfma-dup-ld64.c b/src/f32-gemm/gen-inc/6x8-neonfma-dup-ld64.c
similarity index 100%
rename from src/f32-gemminc/6x8-neonfma-dup-ld64.c
rename to src/f32-gemm/gen-inc/6x8-neonfma-dup-ld64.c
diff --git a/src/f32-gemminc/6x8-neonfma-lane-ld64.c b/src/f32-gemm/gen-inc/6x8-neonfma-lane-ld64.c
similarity index 100%
rename from src/f32-gemminc/6x8-neonfma-lane-ld64.c
rename to src/f32-gemm/gen-inc/6x8-neonfma-lane-ld64.c
diff --git a/src/f32-gemminc/6x8-psimd-loadsplat.c b/src/f32-gemm/gen-inc/6x8-psimd-loadsplat.c
similarity index 100%
rename from src/f32-gemminc/6x8-psimd-loadsplat.c
rename to src/f32-gemm/gen-inc/6x8-psimd-loadsplat.c
diff --git a/src/f32-gemminc/6x8-psimd-splat.c b/src/f32-gemm/gen-inc/6x8-psimd-splat.c
similarity index 100%
rename from src/f32-gemminc/6x8-psimd-splat.c
rename to src/f32-gemm/gen-inc/6x8-psimd-splat.c
diff --git a/src/f32-gemminc/6x8s4-neon.c b/src/f32-gemm/gen-inc/6x8s4-neon.c
similarity index 100%
rename from src/f32-gemminc/6x8s4-neon.c
rename to src/f32-gemm/gen-inc/6x8s4-neon.c
diff --git a/src/f32-gemminc/6x8s4-neonfma.c b/src/f32-gemm/gen-inc/6x8s4-neonfma.c
similarity index 100%
rename from src/f32-gemminc/6x8s4-neonfma.c
rename to src/f32-gemm/gen-inc/6x8s4-neonfma.c
diff --git a/src/f32-gemminc/6x8s4-psimd.c b/src/f32-gemm/gen-inc/6x8s4-psimd.c
similarity index 100%
rename from src/f32-gemminc/6x8s4-psimd.c
rename to src/f32-gemm/gen-inc/6x8s4-psimd.c
diff --git a/src/f32-gemminc/7x8-avx-broadcast.c b/src/f32-gemm/gen-inc/7x8-avx-broadcast.c
similarity index 100%
rename from src/f32-gemminc/7x8-avx-broadcast.c
rename to src/f32-gemm/gen-inc/7x8-avx-broadcast.c
diff --git a/src/f32-gemminc/7x8-fma3-broadcast.c b/src/f32-gemm/gen-inc/7x8-fma3-broadcast.c
similarity index 100%
rename from src/f32-gemminc/7x8-fma3-broadcast.c
rename to src/f32-gemm/gen-inc/7x8-fma3-broadcast.c
diff --git a/src/f32-gemminc/8x8-fma3-broadcast.c b/src/f32-gemm/gen-inc/8x8-fma3-broadcast.c
similarity index 100%
rename from src/f32-gemminc/8x8-fma3-broadcast.c
rename to src/f32-gemm/gen-inc/8x8-fma3-broadcast.c
diff --git a/src/f32-gemminc/8x8s4-neon.c b/src/f32-gemm/gen-inc/8x8s4-neon.c
similarity index 100%
rename from src/f32-gemminc/8x8s4-neon.c
rename to src/f32-gemm/gen-inc/8x8s4-neon.c
diff --git a/src/f32-gemminc/8x8s4-neonfma.c b/src/f32-gemm/gen-inc/8x8s4-neonfma.c
similarity index 100%
rename from src/f32-gemminc/8x8s4-neonfma.c
rename to src/f32-gemm/gen-inc/8x8s4-neonfma.c
diff --git a/src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen/1x12-aarch64-neonfma-cortex-a53.S
similarity index 100%
rename from src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S
rename to src/f32-gemm/gen/1x12-aarch64-neonfma-cortex-a53.S
diff --git a/src/f32-gemm/1x4-scalar.c b/src/f32-gemm/gen/1x4-scalar.c
similarity index 100%
rename from src/f32-gemm/1x4-scalar.c
rename to src/f32-gemm/gen/1x4-scalar.c
diff --git a/src/f32-gemm/1x8-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a53.S
similarity index 100%
rename from src/f32-gemm/1x8-aarch64-neonfma-cortex-a53.S
rename to src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a53.S
diff --git a/src/f32-gemm/1x8-aarch64-neonfma-cortex-a57.S b/src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a57.S
similarity index 100%
rename from src/f32-gemm/1x8-aarch64-neonfma-cortex-a57.S
rename to src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a57.S
diff --git a/src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S b/src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a75.S
similarity index 100%
rename from src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S
rename to src/f32-gemm/gen/1x8-aarch64-neonfma-cortex-a75.S
diff --git a/src/f32-gemm/1x8-avx-broadcast.c b/src/f32-gemm/gen/1x8-avx-broadcast.c
similarity index 100%
rename from src/f32-gemm/1x8-avx-broadcast.c
rename to src/f32-gemm/gen/1x8-avx-broadcast.c
diff --git a/src/f32-gemm/1x8-fma3-broadcast.c b/src/f32-gemm/gen/1x8-fma3-broadcast.c
similarity index 100%
rename from src/f32-gemm/1x8-fma3-broadcast.c
rename to src/f32-gemm/gen/1x8-fma3-broadcast.c
diff --git a/src/f32-gemm/1x8-neon-dup-ld64.c b/src/f32-gemm/gen/1x8-neon-dup-ld64.c
similarity index 100%
rename from src/f32-gemm/1x8-neon-dup-ld64.c
rename to src/f32-gemm/gen/1x8-neon-dup-ld64.c
diff --git a/src/f32-gemm/1x8-neon-lane-ld64.c b/src/f32-gemm/gen/1x8-neon-lane-ld64.c
similarity index 100%
rename from src/f32-gemm/1x8-neon-lane-ld64.c
rename to src/f32-gemm/gen/1x8-neon-lane-ld64.c
diff --git a/src/f32-gemm/1x8-neonfma-dup-ld64.c b/src/f32-gemm/gen/1x8-neonfma-dup-ld64.c
similarity index 100%
rename from src/f32-gemm/1x8-neonfma-dup-ld64.c
rename to src/f32-gemm/gen/1x8-neonfma-dup-ld64.c
diff --git a/src/f32-gemm/1x8-neonfma-lane-ld64.c b/src/f32-gemm/gen/1x8-neonfma-lane-ld64.c
similarity index 100%
rename from src/f32-gemm/1x8-neonfma-lane-ld64.c
rename to src/f32-gemm/gen/1x8-neonfma-lane-ld64.c
diff --git a/src/f32-gemm/1x8-psimd-loadsplat.c b/src/f32-gemm/gen/1x8-psimd-loadsplat.c
similarity index 100%
rename from src/f32-gemm/1x8-psimd-loadsplat.c
rename to src/f32-gemm/gen/1x8-psimd-loadsplat.c
diff --git a/src/f32-gemm/1x8-psimd-splat.c b/src/f32-gemm/gen/1x8-psimd-splat.c
similarity index 100%
rename from src/f32-gemm/1x8-psimd-splat.c
rename to src/f32-gemm/gen/1x8-psimd-splat.c
diff --git a/src/f32-gemm/1x8-sse-dup.c b/src/f32-gemm/gen/1x8-sse-dup.c
similarity index 100%
rename from src/f32-gemm/1x8-sse-dup.c
rename to src/f32-gemm/gen/1x8-sse-dup.c
diff --git a/src/f32-gemm/1x8-sse-load1.c b/src/f32-gemm/gen/1x8-sse-load1.c
similarity index 100%
rename from src/f32-gemm/1x8-sse-load1.c
rename to src/f32-gemm/gen/1x8-sse-load1.c
diff --git a/src/f32-gemm/1x8s4-neon.c b/src/f32-gemm/gen/1x8s4-neon.c
similarity index 100%
rename from src/f32-gemm/1x8s4-neon.c
rename to src/f32-gemm/gen/1x8s4-neon.c
diff --git a/src/f32-gemm/1x8s4-neonfma.c b/src/f32-gemm/gen/1x8s4-neonfma.c
similarity index 100%
rename from src/f32-gemm/1x8s4-neonfma.c
rename to src/f32-gemm/gen/1x8s4-neonfma.c
diff --git a/src/f32-gemm/1x8s4-psimd.c b/src/f32-gemm/gen/1x8s4-psimd.c
similarity index 100%
rename from src/f32-gemm/1x8s4-psimd.c
rename to src/f32-gemm/gen/1x8s4-psimd.c
diff --git a/src/f32-gemm/1x8s4-sse.c b/src/f32-gemm/gen/1x8s4-sse.c
similarity index 100%
rename from src/f32-gemm/1x8s4-sse.c
rename to src/f32-gemm/gen/1x8s4-sse.c
diff --git a/src/f32-gemm/2x4-scalar.c b/src/f32-gemm/gen/2x4-scalar.c
similarity index 100%
rename from src/f32-gemm/2x4-scalar.c
rename to src/f32-gemm/gen/2x4-scalar.c
diff --git a/src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S
similarity index 100%
rename from src/f32-gemm/4x12-aarch64-neonfma-cortex-a53.S
rename to src/f32-gemm/gen/4x12-aarch64-neonfma-cortex-a53.S
diff --git a/src/f32-gemm/4x2-neon-lane-ld64.c b/src/f32-gemm/gen/4x2-neon-lane-ld64.c
similarity index 100%
rename from src/f32-gemm/4x2-neon-lane-ld64.c
rename to src/f32-gemm/gen/4x2-neon-lane-ld64.c
diff --git a/src/f32-gemm/4x2-neonfma-lane-ld64.c b/src/f32-gemm/gen/4x2-neonfma-lane-ld64.c
similarity index 100%
rename from src/f32-gemm/4x2-neonfma-lane-ld64.c
rename to src/f32-gemm/gen/4x2-neonfma-lane-ld64.c
diff --git a/src/f32-gemm/4x2-scalar.c b/src/f32-gemm/gen/4x2-scalar.c
similarity index 100%
rename from src/f32-gemm/4x2-scalar.c
rename to src/f32-gemm/gen/4x2-scalar.c
diff --git a/src/f32-gemm/4x4-scalar.c b/src/f32-gemm/gen/4x4-scalar.c
similarity index 100%
rename from src/f32-gemm/4x4-scalar.c
rename to src/f32-gemm/gen/4x4-scalar.c
diff --git a/src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a53.S
similarity index 100%
rename from src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S
rename to src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a53.S
diff --git a/src/f32-gemm/4x8-aarch64-neonfma-cortex-a57.S b/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a57.S
similarity index 100%
rename from src/f32-gemm/4x8-aarch64-neonfma-cortex-a57.S
rename to src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a57.S
diff --git a/src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S b/src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a75.S
similarity index 100%
rename from src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S
rename to src/f32-gemm/gen/4x8-aarch64-neonfma-cortex-a75.S
diff --git a/src/f32-gemm/4x8-aarch64-neonfma-ld128.S b/src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S
similarity index 100%
rename from src/f32-gemm/4x8-aarch64-neonfma-ld128.S
rename to src/f32-gemm/gen/4x8-aarch64-neonfma-ld128.S
diff --git a/src/f32-gemm/4x8-aarch64-neonfma-ld64.S b/src/f32-gemm/gen/4x8-aarch64-neonfma-ld64.S
similarity index 100%
rename from src/f32-gemm/4x8-aarch64-neonfma-ld64.S
rename to src/f32-gemm/gen/4x8-aarch64-neonfma-ld64.S
diff --git a/src/f32-gemm/4x8-avx-broadcast.c b/src/f32-gemm/gen/4x8-avx-broadcast.c
similarity index 100%
rename from src/f32-gemm/4x8-avx-broadcast.c
rename to src/f32-gemm/gen/4x8-avx-broadcast.c
diff --git a/src/f32-gemm/4x8-fma3-broadcast.c b/src/f32-gemm/gen/4x8-fma3-broadcast.c
similarity index 100%
rename from src/f32-gemm/4x8-fma3-broadcast.c
rename to src/f32-gemm/gen/4x8-fma3-broadcast.c
diff --git a/src/f32-gemm/4x8-neon-dup-ld128.c b/src/f32-gemm/gen/4x8-neon-dup-ld128.c
similarity index 100%
rename from src/f32-gemm/4x8-neon-dup-ld128.c
rename to src/f32-gemm/gen/4x8-neon-dup-ld128.c
diff --git a/src/f32-gemm/4x8-neon-dup-ld64.c b/src/f32-gemm/gen/4x8-neon-dup-ld64.c
similarity index 100%
rename from src/f32-gemm/4x8-neon-dup-ld64.c
rename to src/f32-gemm/gen/4x8-neon-dup-ld64.c
diff --git a/src/f32-gemm/4x8-neon-lane-ld128.c b/src/f32-gemm/gen/4x8-neon-lane-ld128.c
similarity index 100%
rename from src/f32-gemm/4x8-neon-lane-ld128.c
rename to src/f32-gemm/gen/4x8-neon-lane-ld128.c
diff --git a/src/f32-gemm/4x8-neon-lane-ld64.c b/src/f32-gemm/gen/4x8-neon-lane-ld64.c
similarity index 100%
rename from src/f32-gemm/4x8-neon-lane-ld64.c
rename to src/f32-gemm/gen/4x8-neon-lane-ld64.c
diff --git a/src/f32-gemm/4x8-neonfma-dup-ld128.c b/src/f32-gemm/gen/4x8-neonfma-dup-ld128.c
similarity index 100%
rename from src/f32-gemm/4x8-neonfma-dup-ld128.c
rename to src/f32-gemm/gen/4x8-neonfma-dup-ld128.c
diff --git a/src/f32-gemm/4x8-neonfma-dup-ld64.c b/src/f32-gemm/gen/4x8-neonfma-dup-ld64.c
similarity index 100%
rename from src/f32-gemm/4x8-neonfma-dup-ld64.c
rename to src/f32-gemm/gen/4x8-neonfma-dup-ld64.c
diff --git a/src/f32-gemm/4x8-neonfma-lane-ld128.c b/src/f32-gemm/gen/4x8-neonfma-lane-ld128.c
similarity index 100%
rename from src/f32-gemm/4x8-neonfma-lane-ld128.c
rename to src/f32-gemm/gen/4x8-neonfma-lane-ld128.c
diff --git a/src/f32-gemm/4x8-neonfma-lane-ld64.c b/src/f32-gemm/gen/4x8-neonfma-lane-ld64.c
similarity index 100%
rename from src/f32-gemm/4x8-neonfma-lane-ld64.c
rename to src/f32-gemm/gen/4x8-neonfma-lane-ld64.c
diff --git a/src/f32-gemm/4x8-psimd-loadsplat.c b/src/f32-gemm/gen/4x8-psimd-loadsplat.c
similarity index 100%
rename from src/f32-gemm/4x8-psimd-loadsplat.c
rename to src/f32-gemm/gen/4x8-psimd-loadsplat.c
diff --git a/src/f32-gemm/4x8-psimd-splat.c b/src/f32-gemm/gen/4x8-psimd-splat.c
similarity index 100%
rename from src/f32-gemm/4x8-psimd-splat.c
rename to src/f32-gemm/gen/4x8-psimd-splat.c
diff --git a/src/f32-gemm/4x8-sse-dup.c b/src/f32-gemm/gen/4x8-sse-dup.c
similarity index 100%
rename from src/f32-gemm/4x8-sse-dup.c
rename to src/f32-gemm/gen/4x8-sse-dup.c
diff --git a/src/f32-gemm/4x8-sse-load1.c b/src/f32-gemm/gen/4x8-sse-load1.c
similarity index 100%
rename from src/f32-gemm/4x8-sse-load1.c
rename to src/f32-gemm/gen/4x8-sse-load1.c
diff --git a/src/f32-gemm/4x8s4-neon.c b/src/f32-gemm/gen/4x8s4-neon.c
similarity index 100%
rename from src/f32-gemm/4x8s4-neon.c
rename to src/f32-gemm/gen/4x8s4-neon.c
diff --git a/src/f32-gemm/4x8s4-neonfma.c b/src/f32-gemm/gen/4x8s4-neonfma.c
similarity index 100%
rename from src/f32-gemm/4x8s4-neonfma.c
rename to src/f32-gemm/gen/4x8s4-neonfma.c
diff --git a/src/f32-gemm/4x8s4-psimd.c b/src/f32-gemm/gen/4x8s4-psimd.c
similarity index 100%
rename from src/f32-gemm/4x8s4-psimd.c
rename to src/f32-gemm/gen/4x8s4-psimd.c
diff --git a/src/f32-gemm/4x8s4-sse.c b/src/f32-gemm/gen/4x8s4-sse.c
similarity index 100%
rename from src/f32-gemm/4x8s4-sse.c
rename to src/f32-gemm/gen/4x8s4-sse.c
diff --git a/src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S b/src/f32-gemm/gen/5x8-aarch64-neonfma-cortex-a75.S
similarity index 100%
rename from src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S
rename to src/f32-gemm/gen/5x8-aarch64-neonfma-cortex-a75.S
diff --git a/src/f32-gemm/5x8-avx-broadcast.c b/src/f32-gemm/gen/5x8-avx-broadcast.c
similarity index 100%
rename from src/f32-gemm/5x8-avx-broadcast.c
rename to src/f32-gemm/gen/5x8-avx-broadcast.c
diff --git a/src/f32-gemm/5x8-fma3-broadcast.c b/src/f32-gemm/gen/5x8-fma3-broadcast.c
similarity index 100%
rename from src/f32-gemm/5x8-fma3-broadcast.c
rename to src/f32-gemm/gen/5x8-fma3-broadcast.c
diff --git a/src/f32-gemm/5x8-neon-lane-ld64.c b/src/f32-gemm/gen/5x8-neon-lane-ld64.c
similarity index 100%
rename from src/f32-gemm/5x8-neon-lane-ld64.c
rename to src/f32-gemm/gen/5x8-neon-lane-ld64.c
diff --git a/src/f32-gemm/5x8-neonfma-lane-ld64.c b/src/f32-gemm/gen/5x8-neonfma-lane-ld64.c
similarity index 100%
rename from src/f32-gemm/5x8-neonfma-lane-ld64.c
rename to src/f32-gemm/gen/5x8-neonfma-lane-ld64.c
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S b/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a53.S
similarity index 100%
rename from src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S
rename to src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a53.S
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a57.S b/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a57.S
similarity index 100%
rename from src/f32-gemm/6x8-aarch64-neonfma-cortex-a57.S
rename to src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a57.S
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S b/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a73.S
similarity index 100%
rename from src/f32-gemm/6x8-aarch64-neonfma-cortex-a73.S
rename to src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a73.S
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S b/src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a75.S
similarity index 100%
rename from src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S
rename to src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a75.S
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-ld128.S b/src/f32-gemm/gen/6x8-aarch64-neonfma-ld128.S
similarity index 100%
rename from src/f32-gemm/6x8-aarch64-neonfma-ld128.S
rename to src/f32-gemm/gen/6x8-aarch64-neonfma-ld128.S
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-ld64.S b/src/f32-gemm/gen/6x8-aarch64-neonfma-ld64.S
similarity index 100%
rename from src/f32-gemm/6x8-aarch64-neonfma-ld64.S
rename to src/f32-gemm/gen/6x8-aarch64-neonfma-ld64.S
diff --git a/src/f32-gemm/6x8-avx-broadcast.c b/src/f32-gemm/gen/6x8-avx-broadcast.c
similarity index 100%
rename from src/f32-gemm/6x8-avx-broadcast.c
rename to src/f32-gemm/gen/6x8-avx-broadcast.c
diff --git a/src/f32-gemm/6x8-fma3-broadcast.c b/src/f32-gemm/gen/6x8-fma3-broadcast.c
similarity index 100%
rename from src/f32-gemm/6x8-fma3-broadcast.c
rename to src/f32-gemm/gen/6x8-fma3-broadcast.c
diff --git a/src/f32-gemm/6x8-neon-dup-ld64.c b/src/f32-gemm/gen/6x8-neon-dup-ld64.c
similarity index 100%
rename from src/f32-gemm/6x8-neon-dup-ld64.c
rename to src/f32-gemm/gen/6x8-neon-dup-ld64.c
diff --git a/src/f32-gemm/6x8-neon-lane-ld64.c b/src/f32-gemm/gen/6x8-neon-lane-ld64.c
similarity index 100%
rename from src/f32-gemm/6x8-neon-lane-ld64.c
rename to src/f32-gemm/gen/6x8-neon-lane-ld64.c
diff --git a/src/f32-gemm/6x8-neonfma-dup-ld64.c b/src/f32-gemm/gen/6x8-neonfma-dup-ld64.c
similarity index 100%
rename from src/f32-gemm/6x8-neonfma-dup-ld64.c
rename to src/f32-gemm/gen/6x8-neonfma-dup-ld64.c
diff --git a/src/f32-gemm/6x8-neonfma-lane-ld64.c b/src/f32-gemm/gen/6x8-neonfma-lane-ld64.c
similarity index 100%
rename from src/f32-gemm/6x8-neonfma-lane-ld64.c
rename to src/f32-gemm/gen/6x8-neonfma-lane-ld64.c
diff --git a/src/f32-gemm/6x8-psimd-loadsplat.c b/src/f32-gemm/gen/6x8-psimd-loadsplat.c
similarity index 100%
rename from src/f32-gemm/6x8-psimd-loadsplat.c
rename to src/f32-gemm/gen/6x8-psimd-loadsplat.c
diff --git a/src/f32-gemm/6x8-psimd-splat.c b/src/f32-gemm/gen/6x8-psimd-splat.c
similarity index 100%
rename from src/f32-gemm/6x8-psimd-splat.c
rename to src/f32-gemm/gen/6x8-psimd-splat.c
diff --git a/src/f32-gemm/6x8s4-neon.c b/src/f32-gemm/gen/6x8s4-neon.c
similarity index 100%
rename from src/f32-gemm/6x8s4-neon.c
rename to src/f32-gemm/gen/6x8s4-neon.c
diff --git a/src/f32-gemm/6x8s4-neonfma.c b/src/f32-gemm/gen/6x8s4-neonfma.c
similarity index 100%
rename from src/f32-gemm/6x8s4-neonfma.c
rename to src/f32-gemm/gen/6x8s4-neonfma.c
diff --git a/src/f32-gemm/6x8s4-psimd.c b/src/f32-gemm/gen/6x8s4-psimd.c
similarity index 100%
rename from src/f32-gemm/6x8s4-psimd.c
rename to src/f32-gemm/gen/6x8s4-psimd.c
diff --git a/src/f32-gemm/7x8-avx-broadcast.c b/src/f32-gemm/gen/7x8-avx-broadcast.c
similarity index 100%
rename from src/f32-gemm/7x8-avx-broadcast.c
rename to src/f32-gemm/gen/7x8-avx-broadcast.c
diff --git a/src/f32-gemm/7x8-fma3-broadcast.c b/src/f32-gemm/gen/7x8-fma3-broadcast.c
similarity index 100%
rename from src/f32-gemm/7x8-fma3-broadcast.c
rename to src/f32-gemm/gen/7x8-fma3-broadcast.c
diff --git a/src/f32-gemm/8x8-fma3-broadcast.c b/src/f32-gemm/gen/8x8-fma3-broadcast.c
similarity index 100%
rename from src/f32-gemm/8x8-fma3-broadcast.c
rename to src/f32-gemm/gen/8x8-fma3-broadcast.c
diff --git a/src/f32-gemm/8x8s4-neon.c b/src/f32-gemm/gen/8x8s4-neon.c
similarity index 100%
rename from src/f32-gemm/8x8s4-neon.c
rename to src/f32-gemm/gen/8x8s4-neon.c
diff --git a/src/f32-gemm/8x8s4-neonfma.c b/src/f32-gemm/gen/8x8s4-neonfma.c
similarity index 100%
rename from src/f32-gemm/8x8s4-neonfma.c
rename to src/f32-gemm/gen/8x8s4-neonfma.c
diff --git a/src/f32-igemm/1x4-scalar.c b/src/f32-igemm/gen/1x4-scalar.c
similarity index 100%
rename from src/f32-igemm/1x4-scalar.c
rename to src/f32-igemm/gen/1x4-scalar.c
diff --git a/src/f32-igemm/1x8-avx-broadcast.c b/src/f32-igemm/gen/1x8-avx-broadcast.c
similarity index 100%
rename from src/f32-igemm/1x8-avx-broadcast.c
rename to src/f32-igemm/gen/1x8-avx-broadcast.c
diff --git a/src/f32-igemm/1x8-fma3-broadcast.c b/src/f32-igemm/gen/1x8-fma3-broadcast.c
similarity index 100%
rename from src/f32-igemm/1x8-fma3-broadcast.c
rename to src/f32-igemm/gen/1x8-fma3-broadcast.c
diff --git a/src/f32-igemm/1x8-neon-dup-ld64.c b/src/f32-igemm/gen/1x8-neon-dup-ld64.c
similarity index 100%
rename from src/f32-igemm/1x8-neon-dup-ld64.c
rename to src/f32-igemm/gen/1x8-neon-dup-ld64.c
diff --git a/src/f32-igemm/1x8-neon-lane-ld64.c b/src/f32-igemm/gen/1x8-neon-lane-ld64.c
similarity index 100%
rename from src/f32-igemm/1x8-neon-lane-ld64.c
rename to src/f32-igemm/gen/1x8-neon-lane-ld64.c
diff --git a/src/f32-igemm/1x8-neonfma-dup-ld64.c b/src/f32-igemm/gen/1x8-neonfma-dup-ld64.c
similarity index 100%
rename from src/f32-igemm/1x8-neonfma-dup-ld64.c
rename to src/f32-igemm/gen/1x8-neonfma-dup-ld64.c
diff --git a/src/f32-igemm/1x8-neonfma-lane-ld64.c b/src/f32-igemm/gen/1x8-neonfma-lane-ld64.c
similarity index 100%
rename from src/f32-igemm/1x8-neonfma-lane-ld64.c
rename to src/f32-igemm/gen/1x8-neonfma-lane-ld64.c
diff --git a/src/f32-igemm/1x8-psimd-loadsplat.c b/src/f32-igemm/gen/1x8-psimd-loadsplat.c
similarity index 100%
rename from src/f32-igemm/1x8-psimd-loadsplat.c
rename to src/f32-igemm/gen/1x8-psimd-loadsplat.c
diff --git a/src/f32-igemm/1x8-psimd-splat.c b/src/f32-igemm/gen/1x8-psimd-splat.c
similarity index 100%
rename from src/f32-igemm/1x8-psimd-splat.c
rename to src/f32-igemm/gen/1x8-psimd-splat.c
diff --git a/src/f32-igemm/1x8-sse-dup.c b/src/f32-igemm/gen/1x8-sse-dup.c
similarity index 100%
rename from src/f32-igemm/1x8-sse-dup.c
rename to src/f32-igemm/gen/1x8-sse-dup.c
diff --git a/src/f32-igemm/1x8-sse-load1.c b/src/f32-igemm/gen/1x8-sse-load1.c
similarity index 100%
rename from src/f32-igemm/1x8-sse-load1.c
rename to src/f32-igemm/gen/1x8-sse-load1.c
diff --git a/src/f32-igemm/1x8s4-neon.c b/src/f32-igemm/gen/1x8s4-neon.c
similarity index 100%
rename from src/f32-igemm/1x8s4-neon.c
rename to src/f32-igemm/gen/1x8s4-neon.c
diff --git a/src/f32-igemm/1x8s4-neonfma.c b/src/f32-igemm/gen/1x8s4-neonfma.c
similarity index 100%
rename from src/f32-igemm/1x8s4-neonfma.c
rename to src/f32-igemm/gen/1x8s4-neonfma.c
diff --git a/src/f32-igemm/1x8s4-psimd.c b/src/f32-igemm/gen/1x8s4-psimd.c
similarity index 100%
rename from src/f32-igemm/1x8s4-psimd.c
rename to src/f32-igemm/gen/1x8s4-psimd.c
diff --git a/src/f32-igemm/1x8s4-sse.c b/src/f32-igemm/gen/1x8s4-sse.c
similarity index 100%
rename from src/f32-igemm/1x8s4-sse.c
rename to src/f32-igemm/gen/1x8s4-sse.c
diff --git a/src/f32-igemm/2x4-scalar.c b/src/f32-igemm/gen/2x4-scalar.c
similarity index 100%
rename from src/f32-igemm/2x4-scalar.c
rename to src/f32-igemm/gen/2x4-scalar.c
diff --git a/src/f32-igemm/4x2-neon-lane-ld64.c b/src/f32-igemm/gen/4x2-neon-lane-ld64.c
similarity index 100%
rename from src/f32-igemm/4x2-neon-lane-ld64.c
rename to src/f32-igemm/gen/4x2-neon-lane-ld64.c
diff --git a/src/f32-igemm/4x2-neonfma-lane-ld64.c b/src/f32-igemm/gen/4x2-neonfma-lane-ld64.c
similarity index 100%
rename from src/f32-igemm/4x2-neonfma-lane-ld64.c
rename to src/f32-igemm/gen/4x2-neonfma-lane-ld64.c
diff --git a/src/f32-igemm/4x2-scalar.c b/src/f32-igemm/gen/4x2-scalar.c
similarity index 100%
rename from src/f32-igemm/4x2-scalar.c
rename to src/f32-igemm/gen/4x2-scalar.c
diff --git a/src/f32-igemm/4x2c4-psimd.c b/src/f32-igemm/gen/4x2c4-psimd.c
similarity index 100%
rename from src/f32-igemm/4x2c4-psimd.c
rename to src/f32-igemm/gen/4x2c4-psimd.c
diff --git a/src/f32-igemm/4x2c4-sse.c b/src/f32-igemm/gen/4x2c4-sse.c
similarity index 100%
rename from src/f32-igemm/4x2c4-sse.c
rename to src/f32-igemm/gen/4x2c4-sse.c
diff --git a/src/f32-igemm/4x4-neon-lane-ld64.c b/src/f32-igemm/gen/4x4-neon-lane-ld64.c
similarity index 100%
rename from src/f32-igemm/4x4-neon-lane-ld64.c
rename to src/f32-igemm/gen/4x4-neon-lane-ld64.c
diff --git a/src/f32-igemm/4x4-neonfma-lane-ld64.c b/src/f32-igemm/gen/4x4-neonfma-lane-ld64.c
similarity index 100%
rename from src/f32-igemm/4x4-neonfma-lane-ld64.c
rename to src/f32-igemm/gen/4x4-neonfma-lane-ld64.c
diff --git a/src/f32-igemm/4x4-scalar.c b/src/f32-igemm/gen/4x4-scalar.c
similarity index 100%
rename from src/f32-igemm/4x4-scalar.c
rename to src/f32-igemm/gen/4x4-scalar.c
diff --git a/src/f32-igemm/4x8-avx-broadcast.c b/src/f32-igemm/gen/4x8-avx-broadcast.c
similarity index 100%
rename from src/f32-igemm/4x8-avx-broadcast.c
rename to src/f32-igemm/gen/4x8-avx-broadcast.c
diff --git a/src/f32-igemm/4x8-fma3-broadcast.c b/src/f32-igemm/gen/4x8-fma3-broadcast.c
similarity index 100%
rename from src/f32-igemm/4x8-fma3-broadcast.c
rename to src/f32-igemm/gen/4x8-fma3-broadcast.c
diff --git a/src/f32-igemm/4x8-neon-dup-ld128.c b/src/f32-igemm/gen/4x8-neon-dup-ld128.c
similarity index 100%
rename from src/f32-igemm/4x8-neon-dup-ld128.c
rename to src/f32-igemm/gen/4x8-neon-dup-ld128.c
diff --git a/src/f32-igemm/4x8-neon-dup-ld64.c b/src/f32-igemm/gen/4x8-neon-dup-ld64.c
similarity index 100%
rename from src/f32-igemm/4x8-neon-dup-ld64.c
rename to src/f32-igemm/gen/4x8-neon-dup-ld64.c
diff --git a/src/f32-igemm/4x8-neon-lane-ld128.c b/src/f32-igemm/gen/4x8-neon-lane-ld128.c
similarity index 100%
rename from src/f32-igemm/4x8-neon-lane-ld128.c
rename to src/f32-igemm/gen/4x8-neon-lane-ld128.c
diff --git a/src/f32-igemm/4x8-neon-lane-ld64.c b/src/f32-igemm/gen/4x8-neon-lane-ld64.c
similarity index 100%
rename from src/f32-igemm/4x8-neon-lane-ld64.c
rename to src/f32-igemm/gen/4x8-neon-lane-ld64.c
diff --git a/src/f32-igemm/4x8-neonfma-dup-ld128.c b/src/f32-igemm/gen/4x8-neonfma-dup-ld128.c
similarity index 100%
rename from src/f32-igemm/4x8-neonfma-dup-ld128.c
rename to src/f32-igemm/gen/4x8-neonfma-dup-ld128.c
diff --git a/src/f32-igemm/4x8-neonfma-dup-ld64.c b/src/f32-igemm/gen/4x8-neonfma-dup-ld64.c
similarity index 100%
rename from src/f32-igemm/4x8-neonfma-dup-ld64.c
rename to src/f32-igemm/gen/4x8-neonfma-dup-ld64.c
diff --git a/src/f32-igemm/4x8-neonfma-lane-ld128.c b/src/f32-igemm/gen/4x8-neonfma-lane-ld128.c
similarity index 100%
rename from src/f32-igemm/4x8-neonfma-lane-ld128.c
rename to src/f32-igemm/gen/4x8-neonfma-lane-ld128.c
diff --git a/src/f32-igemm/4x8-neonfma-lane-ld64.c b/src/f32-igemm/gen/4x8-neonfma-lane-ld64.c
similarity index 100%
rename from src/f32-igemm/4x8-neonfma-lane-ld64.c
rename to src/f32-igemm/gen/4x8-neonfma-lane-ld64.c
diff --git a/src/f32-igemm/4x8-psimd-loadsplat.c b/src/f32-igemm/gen/4x8-psimd-loadsplat.c
similarity index 100%
rename from src/f32-igemm/4x8-psimd-loadsplat.c
rename to src/f32-igemm/gen/4x8-psimd-loadsplat.c
diff --git a/src/f32-igemm/4x8-psimd-splat.c b/src/f32-igemm/gen/4x8-psimd-splat.c
similarity index 100%
rename from src/f32-igemm/4x8-psimd-splat.c
rename to src/f32-igemm/gen/4x8-psimd-splat.c
diff --git a/src/f32-igemm/4x8-sse-dup.c b/src/f32-igemm/gen/4x8-sse-dup.c
similarity index 100%
rename from src/f32-igemm/4x8-sse-dup.c
rename to src/f32-igemm/gen/4x8-sse-dup.c
diff --git a/src/f32-igemm/4x8-sse-load1.c b/src/f32-igemm/gen/4x8-sse-load1.c
similarity index 100%
rename from src/f32-igemm/4x8-sse-load1.c
rename to src/f32-igemm/gen/4x8-sse-load1.c
diff --git a/src/f32-igemm/4x8s4-neon.c b/src/f32-igemm/gen/4x8s4-neon.c
similarity index 100%
rename from src/f32-igemm/4x8s4-neon.c
rename to src/f32-igemm/gen/4x8s4-neon.c
diff --git a/src/f32-igemm/4x8s4-neonfma.c b/src/f32-igemm/gen/4x8s4-neonfma.c
similarity index 100%
rename from src/f32-igemm/4x8s4-neonfma.c
rename to src/f32-igemm/gen/4x8s4-neonfma.c
diff --git a/src/f32-igemm/4x8s4-psimd.c b/src/f32-igemm/gen/4x8s4-psimd.c
similarity index 100%
rename from src/f32-igemm/4x8s4-psimd.c
rename to src/f32-igemm/gen/4x8s4-psimd.c
diff --git a/src/f32-igemm/4x8s4-sse.c b/src/f32-igemm/gen/4x8s4-sse.c
similarity index 100%
rename from src/f32-igemm/4x8s4-sse.c
rename to src/f32-igemm/gen/4x8s4-sse.c
diff --git a/src/f32-igemm/5x8-avx-broadcast.c b/src/f32-igemm/gen/5x8-avx-broadcast.c
similarity index 100%
rename from src/f32-igemm/5x8-avx-broadcast.c
rename to src/f32-igemm/gen/5x8-avx-broadcast.c
diff --git a/src/f32-igemm/5x8-fma3-broadcast.c b/src/f32-igemm/gen/5x8-fma3-broadcast.c
similarity index 100%
rename from src/f32-igemm/5x8-fma3-broadcast.c
rename to src/f32-igemm/gen/5x8-fma3-broadcast.c
diff --git a/src/f32-igemm/6x8-avx-broadcast.c b/src/f32-igemm/gen/6x8-avx-broadcast.c
similarity index 100%
rename from src/f32-igemm/6x8-avx-broadcast.c
rename to src/f32-igemm/gen/6x8-avx-broadcast.c
diff --git a/src/f32-igemm/6x8-fma3-broadcast.c b/src/f32-igemm/gen/6x8-fma3-broadcast.c
similarity index 100%
rename from src/f32-igemm/6x8-fma3-broadcast.c
rename to src/f32-igemm/gen/6x8-fma3-broadcast.c
diff --git a/src/f32-igemm/6x8-neon-dup-ld64.c b/src/f32-igemm/gen/6x8-neon-dup-ld64.c
similarity index 100%
rename from src/f32-igemm/6x8-neon-dup-ld64.c
rename to src/f32-igemm/gen/6x8-neon-dup-ld64.c
diff --git a/src/f32-igemm/6x8-neon-lane-ld64.c b/src/f32-igemm/gen/6x8-neon-lane-ld64.c
similarity index 100%
rename from src/f32-igemm/6x8-neon-lane-ld64.c
rename to src/f32-igemm/gen/6x8-neon-lane-ld64.c
diff --git a/src/f32-igemm/6x8-neonfma-dup-ld64.c b/src/f32-igemm/gen/6x8-neonfma-dup-ld64.c
similarity index 100%
rename from src/f32-igemm/6x8-neonfma-dup-ld64.c
rename to src/f32-igemm/gen/6x8-neonfma-dup-ld64.c
diff --git a/src/f32-igemm/6x8-neonfma-lane-ld64.c b/src/f32-igemm/gen/6x8-neonfma-lane-ld64.c
similarity index 100%
rename from src/f32-igemm/6x8-neonfma-lane-ld64.c
rename to src/f32-igemm/gen/6x8-neonfma-lane-ld64.c
diff --git a/src/f32-igemm/6x8-psimd-loadsplat.c b/src/f32-igemm/gen/6x8-psimd-loadsplat.c
similarity index 100%
rename from src/f32-igemm/6x8-psimd-loadsplat.c
rename to src/f32-igemm/gen/6x8-psimd-loadsplat.c
diff --git a/src/f32-igemm/6x8-psimd-splat.c b/src/f32-igemm/gen/6x8-psimd-splat.c
similarity index 100%
rename from src/f32-igemm/6x8-psimd-splat.c
rename to src/f32-igemm/gen/6x8-psimd-splat.c
diff --git a/src/f32-igemm/6x8s4-neon.c b/src/f32-igemm/gen/6x8s4-neon.c
similarity index 100%
rename from src/f32-igemm/6x8s4-neon.c
rename to src/f32-igemm/gen/6x8s4-neon.c
diff --git a/src/f32-igemm/6x8s4-neonfma.c b/src/f32-igemm/gen/6x8s4-neonfma.c
similarity index 100%
rename from src/f32-igemm/6x8s4-neonfma.c
rename to src/f32-igemm/gen/6x8s4-neonfma.c
diff --git a/src/f32-igemm/6x8s4-psimd.c b/src/f32-igemm/gen/6x8s4-psimd.c
similarity index 100%
rename from src/f32-igemm/6x8s4-psimd.c
rename to src/f32-igemm/gen/6x8s4-psimd.c
diff --git a/src/f32-igemm/7x8-avx-broadcast.c b/src/f32-igemm/gen/7x8-avx-broadcast.c
similarity index 100%
rename from src/f32-igemm/7x8-avx-broadcast.c
rename to src/f32-igemm/gen/7x8-avx-broadcast.c
diff --git a/src/f32-igemm/7x8-fma3-broadcast.c b/src/f32-igemm/gen/7x8-fma3-broadcast.c
similarity index 100%
rename from src/f32-igemm/7x8-fma3-broadcast.c
rename to src/f32-igemm/gen/7x8-fma3-broadcast.c
diff --git a/src/f32-igemm/8x8-fma3-broadcast.c b/src/f32-igemm/gen/8x8-fma3-broadcast.c
similarity index 100%
rename from src/f32-igemm/8x8-fma3-broadcast.c
rename to src/f32-igemm/gen/8x8-fma3-broadcast.c
diff --git a/src/f32-igemm/8x8s4-neon.c b/src/f32-igemm/gen/8x8s4-neon.c
similarity index 100%
rename from src/f32-igemm/8x8s4-neon.c
rename to src/f32-igemm/gen/8x8s4-neon.c
diff --git a/src/f32-igemm/8x8s4-neonfma.c b/src/f32-igemm/gen/8x8s4-neonfma.c
similarity index 100%
rename from src/f32-igemm/8x8s4-neonfma.c
rename to src/f32-igemm/gen/8x8s4-neonfma.c
diff --git a/src/f32-ppmm/2x4-scalar.c b/src/f32-ppmm/gen/2x4-scalar.c
similarity index 100%
rename from src/f32-ppmm/2x4-scalar.c
rename to src/f32-ppmm/gen/2x4-scalar.c
diff --git a/src/f32-ppmm/3x3-scalar.c b/src/f32-ppmm/gen/3x3-scalar.c
similarity index 100%
rename from src/f32-ppmm/3x3-scalar.c
rename to src/f32-ppmm/gen/3x3-scalar.c
diff --git a/src/f32-ppmm/4x2-scalar.c b/src/f32-ppmm/gen/4x2-scalar.c
similarity index 100%
rename from src/f32-ppmm/4x2-scalar.c
rename to src/f32-ppmm/gen/4x2-scalar.c
diff --git a/src/f32-ppmm/4x4-scalar.c b/src/f32-ppmm/gen/4x4-scalar.c
similarity index 100%
rename from src/f32-ppmm/4x4-scalar.c
rename to src/f32-ppmm/gen/4x4-scalar.c
diff --git a/src/f32-ppmm/4x8-neon.c b/src/f32-ppmm/gen/4x8-neon.c
similarity index 100%
rename from src/f32-ppmm/4x8-neon.c
rename to src/f32-ppmm/gen/4x8-neon.c
diff --git a/src/f32-ppmm/4x8-neonfma.c b/src/f32-ppmm/gen/4x8-neonfma.c
similarity index 100%
rename from src/f32-ppmm/4x8-neonfma.c
rename to src/f32-ppmm/gen/4x8-neonfma.c
diff --git a/src/f32-ppmm/4x8-psimd.c b/src/f32-ppmm/gen/4x8-psimd.c
similarity index 100%
rename from src/f32-ppmm/4x8-psimd.c
rename to src/f32-ppmm/gen/4x8-psimd.c
diff --git a/src/f32-ppmm/4x8-sse.c b/src/f32-ppmm/gen/4x8-sse.c
similarity index 100%
rename from src/f32-ppmm/4x8-sse.c
rename to src/f32-ppmm/gen/4x8-sse.c
diff --git a/src/f32-ppmm/8x8-neon.c b/src/f32-ppmm/gen/8x8-neon.c
similarity index 100%
rename from src/f32-ppmm/8x8-neon.c
rename to src/f32-ppmm/gen/8x8-neon.c
diff --git a/src/f32-ppmm/8x8-neonfma.c b/src/f32-ppmm/gen/8x8-neonfma.c
similarity index 100%
rename from src/f32-ppmm/8x8-neonfma.c
rename to src/f32-ppmm/gen/8x8-neonfma.c
diff --git a/src/f32-prelu/neon-2x4.c b/src/f32-prelu/gen/neon-2x4.c
similarity index 100%
rename from src/f32-prelu/neon-2x4.c
rename to src/f32-prelu/gen/neon-2x4.c
diff --git a/src/f32-prelu/neon-2x8.c b/src/f32-prelu/gen/neon-2x8.c
similarity index 100%
rename from src/f32-prelu/neon-2x8.c
rename to src/f32-prelu/gen/neon-2x8.c
diff --git a/src/f32-prelu/psimd-2x4.c b/src/f32-prelu/gen/psimd-2x4.c
similarity index 100%
rename from src/f32-prelu/psimd-2x4.c
rename to src/f32-prelu/gen/psimd-2x4.c
diff --git a/src/f32-prelu/psimd-2x8.c b/src/f32-prelu/gen/psimd-2x8.c
similarity index 100%
rename from src/f32-prelu/psimd-2x8.c
rename to src/f32-prelu/gen/psimd-2x8.c
diff --git a/src/f32-prelu/scalar-2x1.c b/src/f32-prelu/gen/scalar-2x1.c
similarity index 100%
rename from src/f32-prelu/scalar-2x1.c
rename to src/f32-prelu/gen/scalar-2x1.c
diff --git a/src/f32-prelu/scalar-2x4.c b/src/f32-prelu/gen/scalar-2x4.c
similarity index 100%
rename from src/f32-prelu/scalar-2x4.c
rename to src/f32-prelu/gen/scalar-2x4.c
diff --git a/src/f32-prelu/sse2-2x4.c b/src/f32-prelu/gen/sse2-2x4.c
similarity index 100%
rename from src/f32-prelu/sse2-2x4.c
rename to src/f32-prelu/gen/sse2-2x4.c
diff --git a/src/f32-prelu/sse2-2x8.c b/src/f32-prelu/gen/sse2-2x8.c
similarity index 100%
rename from src/f32-prelu/sse2-2x8.c
rename to src/f32-prelu/gen/sse2-2x8.c
diff --git a/src/f32-prelu/sse41-2x4.c b/src/f32-prelu/gen/sse41-2x4.c
similarity index 100%
rename from src/f32-prelu/sse41-2x4.c
rename to src/f32-prelu/gen/sse41-2x4.c
diff --git a/src/f32-prelu/sse41-2x8.c b/src/f32-prelu/gen/sse41-2x8.c
similarity index 100%
rename from src/f32-prelu/sse41-2x8.c
rename to src/f32-prelu/gen/sse41-2x8.c
diff --git a/src/f32-sigmoid/neon-frac-p9-p10-nr1recps-x16.c b/src/f32-sigmoid/gen/neon-frac-p9-p10-nr1recps-x16.c
similarity index 100%
rename from src/f32-sigmoid/neon-frac-p9-p10-nr1recps-x16.c
rename to src/f32-sigmoid/gen/neon-frac-p9-p10-nr1recps-x16.c
diff --git a/src/f32-sigmoid/neonfma-p5-nr2fma-x16.c b/src/f32-sigmoid/gen/neonfma-p5-nr2fma-x16.c
similarity index 100%
rename from src/f32-sigmoid/neonfma-p5-nr2fma-x16.c
rename to src/f32-sigmoid/gen/neonfma-p5-nr2fma-x16.c
diff --git a/src/f32-sigmoid/sse2-p5-div-x16.c b/src/f32-sigmoid/gen/sse2-p5-div-x16.c
similarity index 100%
rename from src/f32-sigmoid/sse2-p5-div-x16.c
rename to src/f32-sigmoid/gen/sse2-p5-div-x16.c
diff --git a/src/f32-sigmoid/sse2-p5-div-x8.c b/src/f32-sigmoid/gen/sse2-p5-div-x8.c
similarity index 100%
rename from src/f32-sigmoid/sse2-p5-div-x8.c
rename to src/f32-sigmoid/gen/sse2-p5-div-x8.c
diff --git a/src/f32-sigmoid/sse41-p5-div-x16.c b/src/f32-sigmoid/gen/sse41-p5-div-x16.c
similarity index 100%
rename from src/f32-sigmoid/sse41-p5-div-x16.c
rename to src/f32-sigmoid/gen/sse41-p5-div-x16.c
diff --git a/src/f32-sigmoid/sse41-p5-div-x8.c b/src/f32-sigmoid/gen/sse41-p5-div-x8.c
similarity index 100%
rename from src/f32-sigmoid/sse41-p5-div-x8.c
rename to src/f32-sigmoid/gen/sse41-p5-div-x8.c
diff --git a/src/f32-spmm/12x1-neonfma.c b/src/f32-spmm/gen/12x1-neonfma.c
similarity index 100%
rename from src/f32-spmm/12x1-neonfma.c
rename to src/f32-spmm/gen/12x1-neonfma.c
diff --git a/src/f32-spmm/12x2-neonfma.c b/src/f32-spmm/gen/12x2-neonfma.c
similarity index 100%
rename from src/f32-spmm/12x2-neonfma.c
rename to src/f32-spmm/gen/12x2-neonfma.c
diff --git a/src/f32-spmm/12x4-neonfma.c b/src/f32-spmm/gen/12x4-neonfma.c
similarity index 100%
rename from src/f32-spmm/12x4-neonfma.c
rename to src/f32-spmm/gen/12x4-neonfma.c
diff --git a/src/f32-spmm/16x1-neonfma-pipelined.c b/src/f32-spmm/gen/16x1-neonfma-pipelined.c
similarity index 100%
rename from src/f32-spmm/16x1-neonfma-pipelined.c
rename to src/f32-spmm/gen/16x1-neonfma-pipelined.c
diff --git a/src/f32-spmm/16x1-neonfma-unroll2.c b/src/f32-spmm/gen/16x1-neonfma-unroll2.c
similarity index 100%
rename from src/f32-spmm/16x1-neonfma-unroll2.c
rename to src/f32-spmm/gen/16x1-neonfma-unroll2.c
diff --git a/src/f32-spmm/16x1-neonfma.c b/src/f32-spmm/gen/16x1-neonfma.c
similarity index 100%
rename from src/f32-spmm/16x1-neonfma.c
rename to src/f32-spmm/gen/16x1-neonfma.c
diff --git a/src/f32-spmm/16x2-neonfma.c b/src/f32-spmm/gen/16x2-neonfma.c
similarity index 100%
rename from src/f32-spmm/16x2-neonfma.c
rename to src/f32-spmm/gen/16x2-neonfma.c
diff --git a/src/f32-spmm/16x4-neonfma.c b/src/f32-spmm/gen/16x4-neonfma.c
similarity index 100%
rename from src/f32-spmm/16x4-neonfma.c
rename to src/f32-spmm/gen/16x4-neonfma.c
diff --git a/src/f32-spmm/1x1-scalar-pipelined.c b/src/f32-spmm/gen/1x1-scalar-pipelined.c
similarity index 100%
rename from src/f32-spmm/1x1-scalar-pipelined.c
rename to src/f32-spmm/gen/1x1-scalar-pipelined.c
diff --git a/src/f32-spmm/1x1-scalar.c b/src/f32-spmm/gen/1x1-scalar.c
similarity index 100%
rename from src/f32-spmm/1x1-scalar.c
rename to src/f32-spmm/gen/1x1-scalar.c
diff --git a/src/f32-spmm/2x1-scalar-pipelined.c b/src/f32-spmm/gen/2x1-scalar-pipelined.c
similarity index 100%
rename from src/f32-spmm/2x1-scalar-pipelined.c
rename to src/f32-spmm/gen/2x1-scalar-pipelined.c
diff --git a/src/f32-spmm/2x1-scalar.c b/src/f32-spmm/gen/2x1-scalar.c
similarity index 100%
rename from src/f32-spmm/2x1-scalar.c
rename to src/f32-spmm/gen/2x1-scalar.c
diff --git a/src/f32-spmm/4x1-neonfma-pipelined.c b/src/f32-spmm/gen/4x1-neonfma-pipelined.c
similarity index 100%
rename from src/f32-spmm/4x1-neonfma-pipelined.c
rename to src/f32-spmm/gen/4x1-neonfma-pipelined.c
diff --git a/src/f32-spmm/4x1-neonfma-unroll2.c b/src/f32-spmm/gen/4x1-neonfma-unroll2.c
similarity index 100%
rename from src/f32-spmm/4x1-neonfma-unroll2.c
rename to src/f32-spmm/gen/4x1-neonfma-unroll2.c
diff --git a/src/f32-spmm/4x1-neonfma.c b/src/f32-spmm/gen/4x1-neonfma.c
similarity index 100%
rename from src/f32-spmm/4x1-neonfma.c
rename to src/f32-spmm/gen/4x1-neonfma.c
diff --git a/src/f32-spmm/4x1-scalar-pipelined.c b/src/f32-spmm/gen/4x1-scalar-pipelined.c
similarity index 100%
rename from src/f32-spmm/4x1-scalar-pipelined.c
rename to src/f32-spmm/gen/4x1-scalar-pipelined.c
diff --git a/src/f32-spmm/4x1-scalar.c b/src/f32-spmm/gen/4x1-scalar.c
similarity index 100%
rename from src/f32-spmm/4x1-scalar.c
rename to src/f32-spmm/gen/4x1-scalar.c
diff --git a/src/f32-spmm/4x1-sse.c b/src/f32-spmm/gen/4x1-sse.c
similarity index 100%
rename from src/f32-spmm/4x1-sse.c
rename to src/f32-spmm/gen/4x1-sse.c
diff --git a/src/f32-spmm/4x2-neonfma.c b/src/f32-spmm/gen/4x2-neonfma.c
similarity index 100%
rename from src/f32-spmm/4x2-neonfma.c
rename to src/f32-spmm/gen/4x2-neonfma.c
diff --git a/src/f32-spmm/4x4-neonfma.c b/src/f32-spmm/gen/4x4-neonfma.c
similarity index 100%
rename from src/f32-spmm/4x4-neonfma.c
rename to src/f32-spmm/gen/4x4-neonfma.c
diff --git a/src/f32-spmm/8x1-neonfma-pipelined.c b/src/f32-spmm/gen/8x1-neonfma-pipelined.c
similarity index 100%
rename from src/f32-spmm/8x1-neonfma-pipelined.c
rename to src/f32-spmm/gen/8x1-neonfma-pipelined.c
diff --git a/src/f32-spmm/8x1-neonfma-unroll2.c b/src/f32-spmm/gen/8x1-neonfma-unroll2.c
similarity index 100%
rename from src/f32-spmm/8x1-neonfma-unroll2.c
rename to src/f32-spmm/gen/8x1-neonfma-unroll2.c
diff --git a/src/f32-spmm/8x1-neonfma.c b/src/f32-spmm/gen/8x1-neonfma.c
similarity index 100%
rename from src/f32-spmm/8x1-neonfma.c
rename to src/f32-spmm/gen/8x1-neonfma.c
diff --git a/src/f32-spmm/8x1-scalar-pipelined.c b/src/f32-spmm/gen/8x1-scalar-pipelined.c
similarity index 100%
rename from src/f32-spmm/8x1-scalar-pipelined.c
rename to src/f32-spmm/gen/8x1-scalar-pipelined.c
diff --git a/src/f32-spmm/8x1-scalar.c b/src/f32-spmm/gen/8x1-scalar.c
similarity index 100%
rename from src/f32-spmm/8x1-scalar.c
rename to src/f32-spmm/gen/8x1-scalar.c
diff --git a/src/f32-spmm/8x1-sse.c b/src/f32-spmm/gen/8x1-sse.c
similarity index 100%
rename from src/f32-spmm/8x1-sse.c
rename to src/f32-spmm/gen/8x1-sse.c
diff --git a/src/f32-spmm/8x2-neonfma.c b/src/f32-spmm/gen/8x2-neonfma.c
similarity index 100%
rename from src/f32-spmm/8x2-neonfma.c
rename to src/f32-spmm/gen/8x2-neonfma.c
diff --git a/src/f32-spmm/8x2-scalar.c b/src/f32-spmm/gen/8x2-scalar.c
similarity index 100%
rename from src/f32-spmm/8x2-scalar.c
rename to src/f32-spmm/gen/8x2-scalar.c
diff --git a/src/f32-spmm/8x4-neonfma.c b/src/f32-spmm/gen/8x4-neonfma.c
similarity index 100%
rename from src/f32-spmm/8x4-neonfma.c
rename to src/f32-spmm/gen/8x4-neonfma.c
diff --git a/src/f32-spmm/8x4-scalar.c b/src/f32-spmm/gen/8x4-scalar.c
similarity index 100%
rename from src/f32-spmm/8x4-scalar.c
rename to src/f32-spmm/gen/8x4-scalar.c
diff --git a/src/f32-vadd/psimd.c b/src/f32-vadd/psimd.c
deleted file mode 100644
index e74097f..0000000
--- a/src/f32-vadd/psimd.c
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/vadd.h>
-
-
-void xnn_f32_vadd_ukernel__psimd(
-    size_t n,
-    const float* a,
-    const float* b,
-    float* y,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const psimd_f32 vy_min = psimd_load_splat_f32(&params->scalar.min);
-  const psimd_f32 vy_max = psimd_load_splat_f32(&params->scalar.max);
-
-  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
-    const psimd_f32 va0 = psimd_load_f32(a);
-    const psimd_f32 va1 = psimd_load_f32(a + 4);
-    a += 8;
-
-    const psimd_f32 vb0 = psimd_load_f32(b);
-    const psimd_f32 vb1 = psimd_load_f32(b + 4);
-    b += 8;
-
-    const psimd_f32 vacc0 = psimd_add_f32(va0, vb0);
-    const psimd_f32 vacc1 = psimd_add_f32(va1, vb1);
-    const psimd_f32 vy0 = psimd_min_f32(psimd_max_f32(vacc0, vy_min), vy_max);
-    const psimd_f32 vy1 = psimd_min_f32(psimd_max_f32(vacc1, vy_min), vy_max);
-
-    psimd_store_f32(y, vy0);
-    psimd_store_f32(y + 4, vy1);
-    y += 8;
-  }
-  if (n >= 4 * sizeof(float)) {
-    const psimd_f32 va = psimd_load_f32(a);
-    a += 4;
-    const psimd_f32 vb = psimd_load_f32(b);
-    b += 4;
-    const psimd_f32 vacc = psimd_add_f32(va, vb);
-    const psimd_f32 vy = psimd_min_f32(psimd_max_f32(vacc, vy_min), vy_max);
-    psimd_store_f32(y, vy);
-    y += 4;
-    n -= 4 * sizeof(float);
-  }
-  if (n != 0) {
-    const psimd_f32 va = psimd_load_f32(a);
-    const psimd_f32 vb = psimd_load_f32(b);
-    const psimd_f32 vacc = psimd_add_f32(va, vb);
-    psimd_f32 vy = psimd_min_f32(psimd_max_f32(vacc, vy_min), vy_max);
-    if (n & 2 * sizeof(float)) {
-      psimd_store2_f32(y, vy);
-      vy = psimd_concat_hi_f32(vy, vy);
-      y += 2;
-    }
-    if (n & 1 * sizeof(float)) {
-      psimd_store1_f32(y, vy);
-    }
-  }
-}
diff --git a/src/f32-vadd/scalar.c b/src/f32-vadd/scalar.c
deleted file mode 100644
index 4b209d2..0000000
--- a/src/f32-vadd/scalar.c
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/math.h>
-#include <xnnpack/vadd.h>
-
-
-void xnn_f32_vadd_ukernel__scalar(
-    size_t n,
-    const float* a,
-    const float* b,
-    float* y,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const float vy_min = params->scalar.min;
-  const float vy_max = params->scalar.max;
-
-  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
-    const float va0 = a[0];
-    const float va1 = a[1];
-    a += 2;
-
-    const float vb0 = b[0];
-    const float vb1 = b[1];
-    b += 2;
-
-    float vy0 = va0 + vb0;
-    float vy1 = va1 + vb1;
-    vy0 = math_max_f32(vy0, vy_min);
-    vy1 = math_max_f32(vy1, vy_min);
-    vy0 = math_min_f32(vy0, vy_max);
-    vy1 = math_min_f32(vy1, vy_max);
-
-    y[0] = vy0;
-    y[1] = vy1;
-    y += 2;
-  }
-  if XNN_UNLIKELY(n != 0) {
-    const float va = *a;
-    const float vb = *b;
-    float vy = va + vb;
-    vy = math_max_f32(vy, vy_min);
-    vy = math_min_f32(vy, vy_max);
-    *y = vy;
-  }
-}
diff --git a/src/f32-vadd/sse.c b/src/f32-vadd/sse.c
deleted file mode 100644
index f65ce63..0000000
--- a/src/f32-vadd/sse.c
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xmmintrin.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/vadd.h>
-
-
-void xnn_f32_vadd_ukernel__sse(
-    size_t n,
-    const float* a,
-    const float* b,
-    float* y,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(n != 0);
-  assert(n % sizeof(float) == 0);
-
-  const __m128 vy_min = _mm_load_ps(params->sse.min);
-  const __m128 vy_max = _mm_load_ps(params->sse.max);
-
-  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
-    const __m128 va0 = _mm_loadu_ps(a);
-    const __m128 va1 = _mm_loadu_ps(a + 4);
-    a += 8;
-
-    const __m128 vb0 = _mm_loadu_ps(b);
-    const __m128 vb1 = _mm_loadu_ps(b + 4);
-    b += 8;
-
-    const __m128 vacc0 = _mm_add_ps(va0, vb0);
-    const __m128 vacc1 = _mm_add_ps(va1, vb1);
-    const __m128 vy0 = _mm_min_ps(_mm_max_ps(vacc0, vy_min), vy_max);
-    const __m128 vy1 = _mm_min_ps(_mm_max_ps(vacc1, vy_min), vy_max);
-
-    _mm_storeu_ps(y, vy0);
-    _mm_storeu_ps(y + 4, vy1);
-    y += 8;
-  }
-  if (n >= 4 * sizeof(float)) {
-    const __m128 va = _mm_loadu_ps(a);
-    a += 4;
-    const __m128 vb = _mm_loadu_ps(b);
-    b += 4;
-    const __m128 vacc = _mm_add_ps(va, vb);
-    const __m128 vy = _mm_min_ps(_mm_max_ps(vacc, vy_min), vy_max);
-    _mm_storeu_ps(y, vy);
-    y += 4;
-    n -= 4 * sizeof(float);
-  }
-  if (n != 0) {
-    const __m128 va = _mm_loadu_ps(a);
-    const __m128 vb = _mm_loadu_ps(b);
-    const __m128 vacc = _mm_add_ps(va, vb);
-    __m128 vy = _mm_min_ps(_mm_max_ps(vacc, vy_min), vy_max);
-    if (n & 2 * sizeof(float)) {
-      _mm_storel_pi((__m64*) y, vy);
-      vy = _mm_movehl_ps(vy, vy);
-      y += 2;
-    }
-    if (n & 1 * sizeof(float)) {
-      _mm_store_ss(y, vy);
-    }
-  }
-}
diff --git a/src/f32-vbinary/vadd-neon-x4.c b/src/f32-vbinary/gen/vadd-neon-x4.c
similarity index 97%
rename from src/f32-vbinary/vadd-neon-x4.c
rename to src/f32-vbinary/gen/vadd-neon-x4.c
index 9e74662..9cde405 100644
--- a/src/f32-vbinary/vadd-neon-x4.c
+++ b/src/f32-vbinary/gen/vadd-neon-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-neon.c.in
+//   Template: src/f32-vbinary/vop-neon.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vadd-neon-x8.c b/src/f32-vbinary/gen/vadd-neon-x8.c
similarity index 97%
rename from src/f32-vbinary/vadd-neon-x8.c
rename to src/f32-vbinary/gen/vadd-neon-x8.c
index fffdcb7..ec8dff6 100644
--- a/src/f32-vbinary/vadd-neon-x8.c
+++ b/src/f32-vbinary/gen/vadd-neon-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-neon.c.in
+//   Template: src/f32-vbinary/vop-neon.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vadd-psimd-x4.c b/src/f32-vbinary/gen/vadd-psimd-x4.c
similarity index 97%
rename from src/f32-vbinary/vadd-psimd-x4.c
rename to src/f32-vbinary/gen/vadd-psimd-x4.c
index 8c98c88..64f902c 100644
--- a/src/f32-vbinary/vadd-psimd-x4.c
+++ b/src/f32-vbinary/gen/vadd-psimd-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-psimd.c.in
+//   Template: src/f32-vbinary/vop-psimd.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vadd-psimd-x8.c b/src/f32-vbinary/gen/vadd-psimd-x8.c
similarity index 97%
rename from src/f32-vbinary/vadd-psimd-x8.c
rename to src/f32-vbinary/gen/vadd-psimd-x8.c
index 41d8e90..2b064aa 100644
--- a/src/f32-vbinary/vadd-psimd-x8.c
+++ b/src/f32-vbinary/gen/vadd-psimd-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-psimd.c.in
+//   Template: src/f32-vbinary/vop-psimd.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vadd-scalar-x1.c b/src/f32-vbinary/gen/vadd-scalar-x1.c
similarity index 94%
rename from src/f32-vbinary/vadd-scalar-x1.c
rename to src/f32-vbinary/gen/vadd-scalar-x1.c
index 11e9031..6cbfb32 100644
--- a/src/f32-vbinary/vadd-scalar-x1.c
+++ b/src/f32-vbinary/gen/vadd-scalar-x1.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-scalar.c.in
+//   Template: src/f32-vbinary/vop-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vadd-scalar-x2.c b/src/f32-vbinary/gen/vadd-scalar-x2.c
similarity index 96%
rename from src/f32-vbinary/vadd-scalar-x2.c
rename to src/f32-vbinary/gen/vadd-scalar-x2.c
index 25e21f3..a493be4 100644
--- a/src/f32-vbinary/vadd-scalar-x2.c
+++ b/src/f32-vbinary/gen/vadd-scalar-x2.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-scalar.c.in
+//   Template: src/f32-vbinary/vop-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vadd-scalar-x4.c b/src/f32-vbinary/gen/vadd-scalar-x4.c
similarity index 97%
rename from src/f32-vbinary/vadd-scalar-x4.c
rename to src/f32-vbinary/gen/vadd-scalar-x4.c
index 761bbbe..6375517 100644
--- a/src/f32-vbinary/vadd-scalar-x4.c
+++ b/src/f32-vbinary/gen/vadd-scalar-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-scalar.c.in
+//   Template: src/f32-vbinary/vop-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vadd-sse-x4.c b/src/f32-vbinary/gen/vadd-sse-x4.c
similarity index 97%
rename from src/f32-vbinary/vadd-sse-x4.c
rename to src/f32-vbinary/gen/vadd-sse-x4.c
index a73e9e6..a90490d 100644
--- a/src/f32-vbinary/vadd-sse-x4.c
+++ b/src/f32-vbinary/gen/vadd-sse-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-sse.c.in
+//   Template: src/f32-vbinary/vop-sse.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vadd-sse-x8.c b/src/f32-vbinary/gen/vadd-sse-x8.c
similarity index 97%
rename from src/f32-vbinary/vadd-sse-x8.c
rename to src/f32-vbinary/gen/vadd-sse-x8.c
index 0e58308..d399297 100644
--- a/src/f32-vbinary/vadd-sse-x8.c
+++ b/src/f32-vbinary/gen/vadd-sse-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-sse.c.in
+//   Template: src/f32-vbinary/vop-sse.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vaddc-neon-x4.c b/src/f32-vbinary/gen/vaddc-neon-x4.c
similarity index 97%
rename from src/f32-vbinary/vaddc-neon-x4.c
rename to src/f32-vbinary/gen/vaddc-neon-x4.c
index 8c911de..f347572 100644
--- a/src/f32-vbinary/vaddc-neon-x4.c
+++ b/src/f32-vbinary/gen/vaddc-neon-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-neon.c.in
+//   Template: src/f32-vbinary/vopc-neon.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vaddc-neon-x8.c b/src/f32-vbinary/gen/vaddc-neon-x8.c
similarity index 97%
rename from src/f32-vbinary/vaddc-neon-x8.c
rename to src/f32-vbinary/gen/vaddc-neon-x8.c
index 907a7e5..9cd1363 100644
--- a/src/f32-vbinary/vaddc-neon-x8.c
+++ b/src/f32-vbinary/gen/vaddc-neon-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-neon.c.in
+//   Template: src/f32-vbinary/vopc-neon.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vaddc-psimd-x4.c b/src/f32-vbinary/gen/vaddc-psimd-x4.c
similarity index 97%
rename from src/f32-vbinary/vaddc-psimd-x4.c
rename to src/f32-vbinary/gen/vaddc-psimd-x4.c
index ac68b36..c29922e 100644
--- a/src/f32-vbinary/vaddc-psimd-x4.c
+++ b/src/f32-vbinary/gen/vaddc-psimd-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-psimd.c.in
+//   Template: src/f32-vbinary/vopc-psimd.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vaddc-psimd-x8.c b/src/f32-vbinary/gen/vaddc-psimd-x8.c
similarity index 97%
rename from src/f32-vbinary/vaddc-psimd-x8.c
rename to src/f32-vbinary/gen/vaddc-psimd-x8.c
index afa7ad1..4f3581e 100644
--- a/src/f32-vbinary/vaddc-psimd-x8.c
+++ b/src/f32-vbinary/gen/vaddc-psimd-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-psimd.c.in
+//   Template: src/f32-vbinary/vopc-psimd.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vaddc-scalar-x1.c b/src/f32-vbinary/gen/vaddc-scalar-x1.c
similarity index 94%
rename from src/f32-vbinary/vaddc-scalar-x1.c
rename to src/f32-vbinary/gen/vaddc-scalar-x1.c
index d591020..a17e7f3 100644
--- a/src/f32-vbinary/vaddc-scalar-x1.c
+++ b/src/f32-vbinary/gen/vaddc-scalar-x1.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-scalar.c.in
+//   Template: src/f32-vbinary/vopc-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vaddc-scalar-x2.c b/src/f32-vbinary/gen/vaddc-scalar-x2.c
similarity index 95%
rename from src/f32-vbinary/vaddc-scalar-x2.c
rename to src/f32-vbinary/gen/vaddc-scalar-x2.c
index a41445e..ca6efbb 100644
--- a/src/f32-vbinary/vaddc-scalar-x2.c
+++ b/src/f32-vbinary/gen/vaddc-scalar-x2.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-scalar.c.in
+//   Template: src/f32-vbinary/vopc-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vaddc-scalar-x4.c b/src/f32-vbinary/gen/vaddc-scalar-x4.c
similarity index 96%
rename from src/f32-vbinary/vaddc-scalar-x4.c
rename to src/f32-vbinary/gen/vaddc-scalar-x4.c
index 92792d9..68834c9 100644
--- a/src/f32-vbinary/vaddc-scalar-x4.c
+++ b/src/f32-vbinary/gen/vaddc-scalar-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-scalar.c.in
+//   Template: src/f32-vbinary/vopc-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vaddc-sse-x4.c b/src/f32-vbinary/gen/vaddc-sse-x4.c
similarity index 97%
rename from src/f32-vbinary/vaddc-sse-x4.c
rename to src/f32-vbinary/gen/vaddc-sse-x4.c
index 4442a01..3899812 100644
--- a/src/f32-vbinary/vaddc-sse-x4.c
+++ b/src/f32-vbinary/gen/vaddc-sse-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-sse.c.in
+//   Template: src/f32-vbinary/vopc-sse.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vaddc-sse-x8.c b/src/f32-vbinary/gen/vaddc-sse-x8.c
similarity index 97%
rename from src/f32-vbinary/vaddc-sse-x8.c
rename to src/f32-vbinary/gen/vaddc-sse-x8.c
index 6580953..86ffed1 100644
--- a/src/f32-vbinary/vaddc-sse-x8.c
+++ b/src/f32-vbinary/gen/vaddc-sse-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-sse.c.in
+//   Template: src/f32-vbinary/vopc-sse.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vmul-neon-x4.c b/src/f32-vbinary/gen/vmul-neon-x4.c
similarity index 97%
rename from src/f32-vbinary/vmul-neon-x4.c
rename to src/f32-vbinary/gen/vmul-neon-x4.c
index 93df938..f7e8b25 100644
--- a/src/f32-vbinary/vmul-neon-x4.c
+++ b/src/f32-vbinary/gen/vmul-neon-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-neon.c.in
+//   Template: src/f32-vbinary/vop-neon.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vmul-neon-x8.c b/src/f32-vbinary/gen/vmul-neon-x8.c
similarity index 97%
rename from src/f32-vbinary/vmul-neon-x8.c
rename to src/f32-vbinary/gen/vmul-neon-x8.c
index 7cdb655..f087f7e 100644
--- a/src/f32-vbinary/vmul-neon-x8.c
+++ b/src/f32-vbinary/gen/vmul-neon-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-neon.c.in
+//   Template: src/f32-vbinary/vop-neon.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vmul-psimd-x4.c b/src/f32-vbinary/gen/vmul-psimd-x4.c
similarity index 97%
rename from src/f32-vbinary/vmul-psimd-x4.c
rename to src/f32-vbinary/gen/vmul-psimd-x4.c
index 7802a34..5b77409 100644
--- a/src/f32-vbinary/vmul-psimd-x4.c
+++ b/src/f32-vbinary/gen/vmul-psimd-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-psimd.c.in
+//   Template: src/f32-vbinary/vop-psimd.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vmul-psimd-x8.c b/src/f32-vbinary/gen/vmul-psimd-x8.c
similarity index 97%
rename from src/f32-vbinary/vmul-psimd-x8.c
rename to src/f32-vbinary/gen/vmul-psimd-x8.c
index c51e94e..07e8e1a 100644
--- a/src/f32-vbinary/vmul-psimd-x8.c
+++ b/src/f32-vbinary/gen/vmul-psimd-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-psimd.c.in
+//   Template: src/f32-vbinary/vop-psimd.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vmul-scalar-x1.c b/src/f32-vbinary/gen/vmul-scalar-x1.c
similarity index 94%
rename from src/f32-vbinary/vmul-scalar-x1.c
rename to src/f32-vbinary/gen/vmul-scalar-x1.c
index 2f47570..bd22c94 100644
--- a/src/f32-vbinary/vmul-scalar-x1.c
+++ b/src/f32-vbinary/gen/vmul-scalar-x1.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-scalar.c.in
+//   Template: src/f32-vbinary/vop-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vmul-scalar-x2.c b/src/f32-vbinary/gen/vmul-scalar-x2.c
similarity index 96%
rename from src/f32-vbinary/vmul-scalar-x2.c
rename to src/f32-vbinary/gen/vmul-scalar-x2.c
index 317be4f..7cce165 100644
--- a/src/f32-vbinary/vmul-scalar-x2.c
+++ b/src/f32-vbinary/gen/vmul-scalar-x2.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-scalar.c.in
+//   Template: src/f32-vbinary/vop-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vmul-scalar-x4.c b/src/f32-vbinary/gen/vmul-scalar-x4.c
similarity index 97%
rename from src/f32-vbinary/vmul-scalar-x4.c
rename to src/f32-vbinary/gen/vmul-scalar-x4.c
index a840d27..7bc69a4 100644
--- a/src/f32-vbinary/vmul-scalar-x4.c
+++ b/src/f32-vbinary/gen/vmul-scalar-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-scalar.c.in
+//   Template: src/f32-vbinary/vop-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vmul-sse-x4.c b/src/f32-vbinary/gen/vmul-sse-x4.c
similarity index 97%
rename from src/f32-vbinary/vmul-sse-x4.c
rename to src/f32-vbinary/gen/vmul-sse-x4.c
index a100481..11d4dd5 100644
--- a/src/f32-vbinary/vmul-sse-x4.c
+++ b/src/f32-vbinary/gen/vmul-sse-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-sse.c.in
+//   Template: src/f32-vbinary/vop-sse.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vmul-sse-x8.c b/src/f32-vbinary/gen/vmul-sse-x8.c
similarity index 97%
rename from src/f32-vbinary/vmul-sse-x8.c
rename to src/f32-vbinary/gen/vmul-sse-x8.c
index b9849f6..bf066fc 100644
--- a/src/f32-vbinary/vmul-sse-x8.c
+++ b/src/f32-vbinary/gen/vmul-sse-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-sse.c.in
+//   Template: src/f32-vbinary/vop-sse.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vmulc-neon-x4.c b/src/f32-vbinary/gen/vmulc-neon-x4.c
similarity index 97%
rename from src/f32-vbinary/vmulc-neon-x4.c
rename to src/f32-vbinary/gen/vmulc-neon-x4.c
index 8b4b8d5..a982917 100644
--- a/src/f32-vbinary/vmulc-neon-x4.c
+++ b/src/f32-vbinary/gen/vmulc-neon-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-neon.c.in
+//   Template: src/f32-vbinary/vopc-neon.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vmulc-neon-x8.c b/src/f32-vbinary/gen/vmulc-neon-x8.c
similarity index 97%
rename from src/f32-vbinary/vmulc-neon-x8.c
rename to src/f32-vbinary/gen/vmulc-neon-x8.c
index 8b362e5..7f4dcfa 100644
--- a/src/f32-vbinary/vmulc-neon-x8.c
+++ b/src/f32-vbinary/gen/vmulc-neon-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-neon.c.in
+//   Template: src/f32-vbinary/vopc-neon.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vmulc-psimd-x4.c b/src/f32-vbinary/gen/vmulc-psimd-x4.c
similarity index 97%
rename from src/f32-vbinary/vmulc-psimd-x4.c
rename to src/f32-vbinary/gen/vmulc-psimd-x4.c
index cbb9ae6..8b9402d 100644
--- a/src/f32-vbinary/vmulc-psimd-x4.c
+++ b/src/f32-vbinary/gen/vmulc-psimd-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-psimd.c.in
+//   Template: src/f32-vbinary/vopc-psimd.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vmulc-psimd-x8.c b/src/f32-vbinary/gen/vmulc-psimd-x8.c
similarity index 97%
rename from src/f32-vbinary/vmulc-psimd-x8.c
rename to src/f32-vbinary/gen/vmulc-psimd-x8.c
index 4a5fbe4..869179f 100644
--- a/src/f32-vbinary/vmulc-psimd-x8.c
+++ b/src/f32-vbinary/gen/vmulc-psimd-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-psimd.c.in
+//   Template: src/f32-vbinary/vopc-psimd.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vmulc-scalar-x1.c b/src/f32-vbinary/gen/vmulc-scalar-x1.c
similarity index 94%
rename from src/f32-vbinary/vmulc-scalar-x1.c
rename to src/f32-vbinary/gen/vmulc-scalar-x1.c
index b0a485a..4444799 100644
--- a/src/f32-vbinary/vmulc-scalar-x1.c
+++ b/src/f32-vbinary/gen/vmulc-scalar-x1.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-scalar.c.in
+//   Template: src/f32-vbinary/vopc-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vmulc-scalar-x2.c b/src/f32-vbinary/gen/vmulc-scalar-x2.c
similarity index 95%
rename from src/f32-vbinary/vmulc-scalar-x2.c
rename to src/f32-vbinary/gen/vmulc-scalar-x2.c
index 7d58a74..5648ca6 100644
--- a/src/f32-vbinary/vmulc-scalar-x2.c
+++ b/src/f32-vbinary/gen/vmulc-scalar-x2.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-scalar.c.in
+//   Template: src/f32-vbinary/vopc-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vmulc-scalar-x4.c b/src/f32-vbinary/gen/vmulc-scalar-x4.c
similarity index 96%
rename from src/f32-vbinary/vmulc-scalar-x4.c
rename to src/f32-vbinary/gen/vmulc-scalar-x4.c
index c8888fc..778317c 100644
--- a/src/f32-vbinary/vmulc-scalar-x4.c
+++ b/src/f32-vbinary/gen/vmulc-scalar-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-scalar.c.in
+//   Template: src/f32-vbinary/vopc-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vmulc-sse-x4.c b/src/f32-vbinary/gen/vmulc-sse-x4.c
similarity index 97%
rename from src/f32-vbinary/vmulc-sse-x4.c
rename to src/f32-vbinary/gen/vmulc-sse-x4.c
index 8dafee9..b967977 100644
--- a/src/f32-vbinary/vmulc-sse-x4.c
+++ b/src/f32-vbinary/gen/vmulc-sse-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-sse.c.in
+//   Template: src/f32-vbinary/vopc-sse.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vmulc-sse-x8.c b/src/f32-vbinary/gen/vmulc-sse-x8.c
similarity index 97%
rename from src/f32-vbinary/vmulc-sse-x8.c
rename to src/f32-vbinary/gen/vmulc-sse-x8.c
index 0f7f892..0690f32 100644
--- a/src/f32-vbinary/vmulc-sse-x8.c
+++ b/src/f32-vbinary/gen/vmulc-sse-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-sse.c.in
+//   Template: src/f32-vbinary/vopc-sse.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vrsubc-neon-x4.c b/src/f32-vbinary/gen/vrsubc-neon-x4.c
similarity index 97%
rename from src/f32-vbinary/vrsubc-neon-x4.c
rename to src/f32-vbinary/gen/vrsubc-neon-x4.c
index b185e54..2524963 100644
--- a/src/f32-vbinary/vrsubc-neon-x4.c
+++ b/src/f32-vbinary/gen/vrsubc-neon-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-neon.c.in
+//   Template: src/f32-vbinary/vopc-neon.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vrsubc-neon-x8.c b/src/f32-vbinary/gen/vrsubc-neon-x8.c
similarity index 97%
rename from src/f32-vbinary/vrsubc-neon-x8.c
rename to src/f32-vbinary/gen/vrsubc-neon-x8.c
index fcaf1d4..5578ecf 100644
--- a/src/f32-vbinary/vrsubc-neon-x8.c
+++ b/src/f32-vbinary/gen/vrsubc-neon-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-neon.c.in
+//   Template: src/f32-vbinary/vopc-neon.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vrsubc-psimd-x4.c b/src/f32-vbinary/gen/vrsubc-psimd-x4.c
similarity index 97%
rename from src/f32-vbinary/vrsubc-psimd-x4.c
rename to src/f32-vbinary/gen/vrsubc-psimd-x4.c
index 80cf600..7d3855b 100644
--- a/src/f32-vbinary/vrsubc-psimd-x4.c
+++ b/src/f32-vbinary/gen/vrsubc-psimd-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-psimd.c.in
+//   Template: src/f32-vbinary/vopc-psimd.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vrsubc-psimd-x8.c b/src/f32-vbinary/gen/vrsubc-psimd-x8.c
similarity index 97%
rename from src/f32-vbinary/vrsubc-psimd-x8.c
rename to src/f32-vbinary/gen/vrsubc-psimd-x8.c
index 0518f5f..d53752f 100644
--- a/src/f32-vbinary/vrsubc-psimd-x8.c
+++ b/src/f32-vbinary/gen/vrsubc-psimd-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-psimd.c.in
+//   Template: src/f32-vbinary/vopc-psimd.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vrsubc-scalar-x1.c b/src/f32-vbinary/gen/vrsubc-scalar-x1.c
similarity index 94%
rename from src/f32-vbinary/vrsubc-scalar-x1.c
rename to src/f32-vbinary/gen/vrsubc-scalar-x1.c
index 338a439..a05c549 100644
--- a/src/f32-vbinary/vrsubc-scalar-x1.c
+++ b/src/f32-vbinary/gen/vrsubc-scalar-x1.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-scalar.c.in
+//   Template: src/f32-vbinary/vopc-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vrsubc-scalar-x2.c b/src/f32-vbinary/gen/vrsubc-scalar-x2.c
similarity index 95%
rename from src/f32-vbinary/vrsubc-scalar-x2.c
rename to src/f32-vbinary/gen/vrsubc-scalar-x2.c
index 2d10e46..90d90ad 100644
--- a/src/f32-vbinary/vrsubc-scalar-x2.c
+++ b/src/f32-vbinary/gen/vrsubc-scalar-x2.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-scalar.c.in
+//   Template: src/f32-vbinary/vopc-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vrsubc-scalar-x4.c b/src/f32-vbinary/gen/vrsubc-scalar-x4.c
similarity index 96%
rename from src/f32-vbinary/vrsubc-scalar-x4.c
rename to src/f32-vbinary/gen/vrsubc-scalar-x4.c
index 03c5923..f256861 100644
--- a/src/f32-vbinary/vrsubc-scalar-x4.c
+++ b/src/f32-vbinary/gen/vrsubc-scalar-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-scalar.c.in
+//   Template: src/f32-vbinary/vopc-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vrsubc-sse-x4.c b/src/f32-vbinary/gen/vrsubc-sse-x4.c
similarity index 97%
rename from src/f32-vbinary/vrsubc-sse-x4.c
rename to src/f32-vbinary/gen/vrsubc-sse-x4.c
index 0cf727e..d26e0be 100644
--- a/src/f32-vbinary/vrsubc-sse-x4.c
+++ b/src/f32-vbinary/gen/vrsubc-sse-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-sse.c.in
+//   Template: src/f32-vbinary/vopc-sse.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vrsubc-sse-x8.c b/src/f32-vbinary/gen/vrsubc-sse-x8.c
similarity index 97%
rename from src/f32-vbinary/vrsubc-sse-x8.c
rename to src/f32-vbinary/gen/vrsubc-sse-x8.c
index 48137f4..961b4a4 100644
--- a/src/f32-vbinary/vrsubc-sse-x8.c
+++ b/src/f32-vbinary/gen/vrsubc-sse-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-sse.c.in
+//   Template: src/f32-vbinary/vopc-sse.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vsub-neon-x4.c b/src/f32-vbinary/gen/vsub-neon-x4.c
similarity index 97%
rename from src/f32-vbinary/vsub-neon-x4.c
rename to src/f32-vbinary/gen/vsub-neon-x4.c
index 859aa62..5963361 100644
--- a/src/f32-vbinary/vsub-neon-x4.c
+++ b/src/f32-vbinary/gen/vsub-neon-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-neon.c.in
+//   Template: src/f32-vbinary/vop-neon.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vsub-neon-x8.c b/src/f32-vbinary/gen/vsub-neon-x8.c
similarity index 97%
rename from src/f32-vbinary/vsub-neon-x8.c
rename to src/f32-vbinary/gen/vsub-neon-x8.c
index 0bd9fa2..a730352 100644
--- a/src/f32-vbinary/vsub-neon-x8.c
+++ b/src/f32-vbinary/gen/vsub-neon-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-neon.c.in
+//   Template: src/f32-vbinary/vop-neon.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vsub-psimd-x4.c b/src/f32-vbinary/gen/vsub-psimd-x4.c
similarity index 97%
rename from src/f32-vbinary/vsub-psimd-x4.c
rename to src/f32-vbinary/gen/vsub-psimd-x4.c
index de35ab2..52b6c08 100644
--- a/src/f32-vbinary/vsub-psimd-x4.c
+++ b/src/f32-vbinary/gen/vsub-psimd-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-psimd.c.in
+//   Template: src/f32-vbinary/vop-psimd.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vsub-psimd-x8.c b/src/f32-vbinary/gen/vsub-psimd-x8.c
similarity index 97%
rename from src/f32-vbinary/vsub-psimd-x8.c
rename to src/f32-vbinary/gen/vsub-psimd-x8.c
index a0bd614..7fb5039 100644
--- a/src/f32-vbinary/vsub-psimd-x8.c
+++ b/src/f32-vbinary/gen/vsub-psimd-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-psimd.c.in
+//   Template: src/f32-vbinary/vop-psimd.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vsub-scalar-x1.c b/src/f32-vbinary/gen/vsub-scalar-x1.c
similarity index 94%
rename from src/f32-vbinary/vsub-scalar-x1.c
rename to src/f32-vbinary/gen/vsub-scalar-x1.c
index 6f88531..21123a9 100644
--- a/src/f32-vbinary/vsub-scalar-x1.c
+++ b/src/f32-vbinary/gen/vsub-scalar-x1.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-scalar.c.in
+//   Template: src/f32-vbinary/vop-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vsub-scalar-x2.c b/src/f32-vbinary/gen/vsub-scalar-x2.c
similarity index 96%
rename from src/f32-vbinary/vsub-scalar-x2.c
rename to src/f32-vbinary/gen/vsub-scalar-x2.c
index b9948ff..ca85ab9 100644
--- a/src/f32-vbinary/vsub-scalar-x2.c
+++ b/src/f32-vbinary/gen/vsub-scalar-x2.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-scalar.c.in
+//   Template: src/f32-vbinary/vop-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vsub-scalar-x4.c b/src/f32-vbinary/gen/vsub-scalar-x4.c
similarity index 97%
rename from src/f32-vbinary/vsub-scalar-x4.c
rename to src/f32-vbinary/gen/vsub-scalar-x4.c
index dca0c77..a9707ef 100644
--- a/src/f32-vbinary/vsub-scalar-x4.c
+++ b/src/f32-vbinary/gen/vsub-scalar-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-scalar.c.in
+//   Template: src/f32-vbinary/vop-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vsub-sse-x4.c b/src/f32-vbinary/gen/vsub-sse-x4.c
similarity index 97%
rename from src/f32-vbinary/vsub-sse-x4.c
rename to src/f32-vbinary/gen/vsub-sse-x4.c
index d8c23be..4b2eb68 100644
--- a/src/f32-vbinary/vsub-sse-x4.c
+++ b/src/f32-vbinary/gen/vsub-sse-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-sse.c.in
+//   Template: src/f32-vbinary/vop-sse.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vsub-sse-x8.c b/src/f32-vbinary/gen/vsub-sse-x8.c
similarity index 97%
rename from src/f32-vbinary/vsub-sse-x8.c
rename to src/f32-vbinary/gen/vsub-sse-x8.c
index 0c157a3..8e2cb32 100644
--- a/src/f32-vbinary/vsub-sse-x8.c
+++ b/src/f32-vbinary/gen/vsub-sse-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vop-sse.c.in
+//   Template: src/f32-vbinary/vop-sse.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vsubc-neon-x4.c b/src/f32-vbinary/gen/vsubc-neon-x4.c
similarity index 97%
rename from src/f32-vbinary/vsubc-neon-x4.c
rename to src/f32-vbinary/gen/vsubc-neon-x4.c
index eab5204..0e17536 100644
--- a/src/f32-vbinary/vsubc-neon-x4.c
+++ b/src/f32-vbinary/gen/vsubc-neon-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-neon.c.in
+//   Template: src/f32-vbinary/vopc-neon.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vsubc-neon-x8.c b/src/f32-vbinary/gen/vsubc-neon-x8.c
similarity index 97%
rename from src/f32-vbinary/vsubc-neon-x8.c
rename to src/f32-vbinary/gen/vsubc-neon-x8.c
index 18cd847..45fcd04 100644
--- a/src/f32-vbinary/vsubc-neon-x8.c
+++ b/src/f32-vbinary/gen/vsubc-neon-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-neon.c.in
+//   Template: src/f32-vbinary/vopc-neon.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vsubc-psimd-x4.c b/src/f32-vbinary/gen/vsubc-psimd-x4.c
similarity index 97%
rename from src/f32-vbinary/vsubc-psimd-x4.c
rename to src/f32-vbinary/gen/vsubc-psimd-x4.c
index 54b4ece..57f52d2 100644
--- a/src/f32-vbinary/vsubc-psimd-x4.c
+++ b/src/f32-vbinary/gen/vsubc-psimd-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-psimd.c.in
+//   Template: src/f32-vbinary/vopc-psimd.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vsubc-psimd-x8.c b/src/f32-vbinary/gen/vsubc-psimd-x8.c
similarity index 97%
rename from src/f32-vbinary/vsubc-psimd-x8.c
rename to src/f32-vbinary/gen/vsubc-psimd-x8.c
index f393a93..3680ac7 100644
--- a/src/f32-vbinary/vsubc-psimd-x8.c
+++ b/src/f32-vbinary/gen/vsubc-psimd-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-psimd.c.in
+//   Template: src/f32-vbinary/vopc-psimd.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vsubc-scalar-x1.c b/src/f32-vbinary/gen/vsubc-scalar-x1.c
similarity index 94%
rename from src/f32-vbinary/vsubc-scalar-x1.c
rename to src/f32-vbinary/gen/vsubc-scalar-x1.c
index 7a346af..2de0cf2 100644
--- a/src/f32-vbinary/vsubc-scalar-x1.c
+++ b/src/f32-vbinary/gen/vsubc-scalar-x1.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-scalar.c.in
+//   Template: src/f32-vbinary/vopc-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vsubc-scalar-x2.c b/src/f32-vbinary/gen/vsubc-scalar-x2.c
similarity index 95%
rename from src/f32-vbinary/vsubc-scalar-x2.c
rename to src/f32-vbinary/gen/vsubc-scalar-x2.c
index 3b40715..ef6138a 100644
--- a/src/f32-vbinary/vsubc-scalar-x2.c
+++ b/src/f32-vbinary/gen/vsubc-scalar-x2.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-scalar.c.in
+//   Template: src/f32-vbinary/vopc-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vsubc-scalar-x4.c b/src/f32-vbinary/gen/vsubc-scalar-x4.c
similarity index 96%
rename from src/f32-vbinary/vsubc-scalar-x4.c
rename to src/f32-vbinary/gen/vsubc-scalar-x4.c
index 1095de8..c9a1164 100644
--- a/src/f32-vbinary/vsubc-scalar-x4.c
+++ b/src/f32-vbinary/gen/vsubc-scalar-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-scalar.c.in
+//   Template: src/f32-vbinary/vopc-scalar.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vsubc-sse-x4.c b/src/f32-vbinary/gen/vsubc-sse-x4.c
similarity index 97%
rename from src/f32-vbinary/vsubc-sse-x4.c
rename to src/f32-vbinary/gen/vsubc-sse-x4.c
index 279e884..a7f8cd4 100644
--- a/src/f32-vbinary/vsubc-sse-x4.c
+++ b/src/f32-vbinary/gen/vsubc-sse-x4.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-sse.c.in
+//   Template: src/f32-vbinary/vopc-sse.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vbinary/vsubc-sse-x8.c b/src/f32-vbinary/gen/vsubc-sse-x8.c
similarity index 97%
rename from src/f32-vbinary/vsubc-sse-x8.c
rename to src/f32-vbinary/gen/vsubc-sse-x8.c
index caa16e9..3b65fc8 100644
--- a/src/f32-vbinary/vsubc-sse-x8.c
+++ b/src/f32-vbinary/gen/vsubc-sse-x8.c
@@ -1,5 +1,5 @@
 // Auto-generated file. Do not edit!
-//   Template: src/f32-binop/vopc-sse.c.in
+//   Template: src/f32-vbinary/vopc-sse.c.in
 //   Generator: tools/xngen
 //
 // Copyright 2019 Google LLC
diff --git a/src/f32-vmulcaddc/c1-scalar-2x.c b/src/f32-vmulcaddc/gen/c1-scalar-2x.c
similarity index 100%
rename from src/f32-vmulcaddc/c1-scalar-2x.c
rename to src/f32-vmulcaddc/gen/c1-scalar-2x.c
diff --git a/src/f32-vmulcaddc/c2-scalar-2x.c b/src/f32-vmulcaddc/gen/c2-scalar-2x.c
similarity index 100%
rename from src/f32-vmulcaddc/c2-scalar-2x.c
rename to src/f32-vmulcaddc/gen/c2-scalar-2x.c
diff --git a/src/f32-vmulcaddc/c4-neon-2x.c b/src/f32-vmulcaddc/gen/c4-neon-2x.c
similarity index 100%
rename from src/f32-vmulcaddc/c4-neon-2x.c
rename to src/f32-vmulcaddc/gen/c4-neon-2x.c
diff --git a/src/f32-vmulcaddc/c4-neonfma-2x.c b/src/f32-vmulcaddc/gen/c4-neonfma-2x.c
similarity index 100%
rename from src/f32-vmulcaddc/c4-neonfma-2x.c
rename to src/f32-vmulcaddc/gen/c4-neonfma-2x.c
diff --git a/src/f32-vmulcaddc/c4-psimd-2x.c b/src/f32-vmulcaddc/gen/c4-psimd-2x.c
similarity index 100%
rename from src/f32-vmulcaddc/c4-psimd-2x.c
rename to src/f32-vmulcaddc/gen/c4-psimd-2x.c
diff --git a/src/f32-vmulcaddc/c4-scalar-2x.c b/src/f32-vmulcaddc/gen/c4-scalar-2x.c
similarity index 100%
rename from src/f32-vmulcaddc/c4-scalar-2x.c
rename to src/f32-vmulcaddc/gen/c4-scalar-2x.c
diff --git a/src/f32-vmulcaddc/c4-sse-2x.c b/src/f32-vmulcaddc/gen/c4-sse-2x.c
similarity index 100%
rename from src/f32-vmulcaddc/c4-sse-2x.c
rename to src/f32-vmulcaddc/gen/c4-sse-2x.c
diff --git a/src/f32-vmulcaddc/c8-neon-2x.c b/src/f32-vmulcaddc/gen/c8-neon-2x.c
similarity index 100%
rename from src/f32-vmulcaddc/c8-neon-2x.c
rename to src/f32-vmulcaddc/gen/c8-neon-2x.c
diff --git a/src/f32-vmulcaddc/c8-neonfma-2x.c b/src/f32-vmulcaddc/gen/c8-neonfma-2x.c
similarity index 100%
rename from src/f32-vmulcaddc/c8-neonfma-2x.c
rename to src/f32-vmulcaddc/gen/c8-neonfma-2x.c
diff --git a/src/f32-vmulcaddc/c8-psimd-2x.c b/src/f32-vmulcaddc/gen/c8-psimd-2x.c
similarity index 100%
rename from src/f32-vmulcaddc/c8-psimd-2x.c
rename to src/f32-vmulcaddc/gen/c8-psimd-2x.c
diff --git a/src/f32-vmulcaddc/c8-sse-2x.c b/src/f32-vmulcaddc/gen/c8-sse-2x.c
similarity index 100%
rename from src/f32-vmulcaddc/c8-sse-2x.c
rename to src/f32-vmulcaddc/gen/c8-sse-2x.c