Rename NEON micro-kernels that use lane intrinsics to include "lane" in their file and function names.

PiperOrigin-RevId: 282000418
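
The rename applies to micro-kernels whose inner loop feeds the
multiply-accumulate from a vector lane directly, via vmlaq_lane_f32
(NEON) or vfmaq_lane_f32 (NEON with FMA); the generator scripts also
gain a DUP=0 parameter, which suggests complementary "dup" variants
that broadcast the lane with vdupq_lane_f32 before a plain vector FMA.
Below is a minimal sketch of one such multiply-accumulate step, modeled
on the pattern in the deleted 1x8-neonfma-ld64.c; the helper name
fma_lane is illustrative only and not part of this change:

  #include <arm_neon.h>

  // One "lane" multiply-accumulate step: lane 0 of the 2-element
  // activation vector a scales the 4-element weight vector b.
  static inline float32x4_t fma_lane(float32x4_t acc, float32x4_t b,
                                     float32x2_t a) {
  #if defined(__aarch64__)
    // AArch64 provides a fused multiply-add with a lane selector.
    return vfmaq_lane_f32(acc, b, a, 0);
  #else
    // ARMv7 NEONv2 has no lane form of the FMA intrinsic: duplicate
    // the lane first, then use the vector-vector FMA.
    const float32x4_t a0 = vdupq_lane_f32(a, 0);
    return vfmaq_f32(acc, a0, b);
  #endif
  }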
diff --git a/BUILD.bazel b/BUILD.bazel
index 2a6803a..fbbaf09 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -277,41 +277,41 @@
     "src/f32-bilinear/neon-c4.c",
     "src/f32-bilinear/neon-c8.c",
     "src/f32-clamp/neon.c",
-    "src/f32-dwconv/up4x9-neon-acc2.c",
     "src/f32-dwconv/up4x9-neon.c",
-    "src/f32-dwconv/up8x9-neon-acc2.c",
+    "src/f32-dwconv/up4x9-neon-acc2.c",
     "src/f32-dwconv/up8x9-neon.c",
+    "src/f32-dwconv/up8x9-neon-acc2.c",
     "src/f32-gavgpool-spchw/neon-x4.c",
     "src/f32-gavgpool/mp7p7q-neon.c",
     "src/f32-gavgpool/up7-neon.c",
-    "src/f32-gemm/1x8-neon-ld64.c",
+    "src/f32-gemm/1x8-neon-lane-ld64.c",
+    "src/f32-gemm/4x2-neon-lane-ld64.c",
+    "src/f32-gemm/4x8-neon-lane-ld128.c",
+    "src/f32-gemm/4x8-neon-lane-ld64.c",
+    "src/f32-gemm/5x8-neon-lane-ld64.c",
+    "src/f32-gemm/6x8-neon-lane-ld64.c",
     "src/f32-gemm/1x8s4-neon.c",
-    "src/f32-gemm/4x2-neon-ld64.c",
-    "src/f32-gemm/4x8-neon-ld128.c",
-    "src/f32-gemm/4x8-neon-ld64.c",
     "src/f32-gemm/4x8s4-neon.c",
-    "src/f32-gemm/5x8-neon-ld64.c",
-    "src/f32-gemm/6x8-neon-ld64.c",
     "src/f32-gemm/6x8s4-neon.c",
     "src/f32-gemm/8x8s4-neon.c",
-    "src/f32-gemminc/1x8-neon-ld64.c",
+    "src/f32-gemminc/1x8-neon-lane-ld64.c",
+    "src/f32-gemminc/4x8-neon-lane-ld128.c",
+    "src/f32-gemminc/4x8-neon-lane-ld64.c",
+    "src/f32-gemminc/5x8-neon-lane-ld64.c",
+    "src/f32-gemminc/6x8-neon-lane-ld64.c",
     "src/f32-gemminc/1x8s4-neon.c",
-    "src/f32-gemminc/4x8-neon-ld128.c",
-    "src/f32-gemminc/4x8-neon-ld64.c",
     "src/f32-gemminc/4x8s4-neon.c",
-    "src/f32-gemminc/5x8-neon-ld64.c",
-    "src/f32-gemminc/6x8-neon-ld64.c",
     "src/f32-gemminc/6x8s4-neon.c",
     "src/f32-gemminc/8x8s4-neon.c",
     "src/f32-hswish/neon.c",
-    "src/f32-igemm/1x8-neon-ld64.c",
+    "src/f32-igemm/1x8-neon-lane-ld64.c",
+    "src/f32-igemm/4x2-neon-lane-ld64.c",
+    "src/f32-igemm/4x4-neon-lane-ld64.c",
+    "src/f32-igemm/4x8-neon-lane-ld128.c",
+    "src/f32-igemm/4x8-neon-lane-ld64.c",
+    "src/f32-igemm/6x8-neon-lane-ld64.c",
     "src/f32-igemm/1x8s4-neon.c",
-    "src/f32-igemm/4x2-neon-ld64.c",
-    "src/f32-igemm/4x4-neon-ld64.c",
-    "src/f32-igemm/4x8-neon-ld128.c",
-    "src/f32-igemm/4x8-neon-ld64.c",
     "src/f32-igemm/4x8s4-neon.c",
-    "src/f32-igemm/6x8-neon-ld64.c",
     "src/f32-igemm/6x8s4-neon.c",
     "src/f32-igemm/8x8s4-neon.c",
     "src/f32-pavgpool/mp9p8q-neon.c",
@@ -366,12 +366,6 @@
 NEONFMA_UKERNELS = [
     "src/f32-bilinear/neonfma-c4.c",
     "src/f32-bilinear/neonfma-c8.c",
-    "src/f32-igemm/1x8-neonfma-ld64.c",
-    "src/f32-igemm/4x2-neonfma-ld64.c",
-    "src/f32-igemm/4x4-neonfma-ld64.c",
-    "src/f32-igemm/4x8-neonfma-ld128.c",
-    "src/f32-igemm/4x8-neonfma-ld64.c",
-    "src/f32-igemm/6x8-neonfma-ld64.c",
     "src/f32-igemm/1x8s4-neonfma.c",
     "src/f32-igemm/4x8s4-neonfma.c",
     "src/f32-igemm/6x8s4-neonfma.c",
@@ -380,21 +374,10 @@
     "src/f32-dwconv/up4x9-neonfma-acc2.c",
     "src/f32-dwconv/up8x9-neonfma.c",
     "src/f32-dwconv/up8x9-neonfma-acc2.c",
-    "src/f32-gemm/1x8-neonfma-ld64.c",
-    "src/f32-gemm/4x2-neonfma-ld64.c",
-    "src/f32-gemm/4x8-neonfma-ld128.c",
-    "src/f32-gemm/4x8-neonfma-ld64.c",
-    "src/f32-gemm/5x8-neonfma-ld64.c",
-    "src/f32-gemm/6x8-neonfma-ld64.c",
     "src/f32-gemm/1x8s4-neonfma.c",
     "src/f32-gemm/4x8s4-neonfma.c",
     "src/f32-gemm/6x8s4-neonfma.c",
     "src/f32-gemm/8x8s4-neonfma.c",
-    "src/f32-gemminc/1x8-neonfma-ld64.c",
-    "src/f32-gemminc/4x8-neonfma-ld128.c",
-    "src/f32-gemminc/4x8-neonfma-ld64.c",
-    "src/f32-gemminc/5x8-neonfma-ld64.c",
-    "src/f32-gemminc/6x8-neonfma-ld64.c",
     "src/f32-gemminc/1x8s4-neonfma.c",
     "src/f32-gemminc/4x8s4-neonfma.c",
     "src/f32-gemminc/6x8s4-neonfma.c",
@@ -412,6 +395,23 @@
 ]
 
 AARCH64_NEONFMA_UKERNELS = [
+    "src/f32-gemm/1x8-neonfma-lane-ld64.c",
+    "src/f32-gemm/4x2-neonfma-lane-ld64.c",
+    "src/f32-gemm/4x8-neonfma-lane-ld128.c",
+    "src/f32-gemm/4x8-neonfma-lane-ld64.c",
+    "src/f32-gemm/5x8-neonfma-lane-ld64.c",
+    "src/f32-gemm/6x8-neonfma-lane-ld64.c",
+    "src/f32-gemminc/1x8-neonfma-lane-ld64.c",
+    "src/f32-gemminc/4x8-neonfma-lane-ld128.c",
+    "src/f32-gemminc/4x8-neonfma-lane-ld64.c",
+    "src/f32-gemminc/5x8-neonfma-lane-ld64.c",
+    "src/f32-gemminc/6x8-neonfma-lane-ld64.c",
+    "src/f32-igemm/1x8-neonfma-lane-ld64.c",
+    "src/f32-igemm/4x2-neonfma-lane-ld64.c",
+    "src/f32-igemm/4x4-neonfma-lane-ld64.c",
+    "src/f32-igemm/4x8-neonfma-lane-ld128.c",
+    "src/f32-igemm/4x8-neonfma-lane-ld64.c",
+    "src/f32-igemm/6x8-neonfma-lane-ld64.c",
     "src/f32-conv-hwc/3x3s2p1c3x4-neonfma-2x2.c",
     "src/f32-conv-hwc/3x3s2p1c3x8-neonfma-2x2.c",
     "src/f32-conv-hwc2spchw/3x3s2p1c3x4-neonfma-2x2.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cbd9213..eeee7de 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -359,169 +359,169 @@
   src/x32-zip/xm-psimd.c)
 
 SET(XNNPACK_NEON_MICROKERNEL_SRCS
-  src/f32-avgpool/mp9p8q-neon.c
-  src/f32-avgpool/up9-neon.c
-  src/f32-bilinear/neon-c4.c
-  src/f32-bilinear/neon-c8.c
-  src/f32-clamp/neon.c
-  src/f32-dwconv/up4x9-neon-acc2.c
-  src/f32-dwconv/up4x9-neon.c
-  src/f32-dwconv/up8x9-neon-acc2.c
-  src/f32-dwconv/up8x9-neon.c
-  src/f32-gavgpool-spchw/neon-x4.c
-  src/f32-gavgpool/mp7p7q-neon.c
-  src/f32-gavgpool/up7-neon.c
-  src/f32-gemm/1x8-neon-ld64.c
-  src/f32-gemm/1x8s4-neon.c
-  src/f32-gemm/4x2-neon-ld64.c
-  src/f32-gemm/4x8-neon-ld128.c
-  src/f32-gemm/4x8-neon-ld64.c
-  src/f32-gemm/4x8s4-neon.c
-  src/f32-gemm/5x8-neon-ld64.c
-  src/f32-gemm/6x8-neon-ld64.c
-  src/f32-gemm/6x8s4-neon.c
-  src/f32-gemm/8x8s4-neon.c
-  src/f32-gemminc/1x8-neon-ld64.c
-  src/f32-gemminc/1x8s4-neon.c
-  src/f32-gemminc/4x8-neon-ld128.c
-  src/f32-gemminc/4x8-neon-ld64.c
-  src/f32-gemminc/4x8s4-neon.c
-  src/f32-gemminc/5x8-neon-ld64.c
-  src/f32-gemminc/6x8-neon-ld64.c
-  src/f32-gemminc/6x8s4-neon.c
-  src/f32-gemminc/8x8s4-neon.c
-  src/f32-hswish/neon.c
-  src/f32-igemm/1x8-neon-ld64.c
-  src/f32-igemm/1x8s4-neon.c
-  src/f32-igemm/4x2-neon-ld64.c
-  src/f32-igemm/4x4-neon-ld64.c
-  src/f32-igemm/4x8-neon-ld128.c
-  src/f32-igemm/4x8-neon-ld64.c
-  src/f32-igemm/4x8s4-neon.c
-  src/f32-igemm/6x8-neon-ld64.c
-  src/f32-igemm/6x8s4-neon.c
-  src/f32-igemm/8x8s4-neon.c
-  src/f32-pavgpool/mp9p8q-neon.c
-  src/f32-pavgpool/up9-neon.c
-  src/f32-ppmm/4x8-neon.c
-  src/f32-ppmm/8x8-neon.c
-  src/f32-prelu/neon-2x4.c
-  src/f32-prelu/neon-2x8.c
-  src/f32-rmax/neon.c
-  src/f32-sigmoid/neon-frac-p9-p10-nr1recps-x16.c
-  src/f32-vbinary/vadd-neon-x4.c
-  src/f32-vbinary/vadd-neon-x8.c
-  src/f32-vbinary/vaddc-neon-x4.c
-  src/f32-vbinary/vaddc-neon-x8.c
-  src/f32-vbinary/vmul-neon-x4.c
-  src/f32-vbinary/vmul-neon-x8.c
-  src/f32-vbinary/vmulc-neon-x4.c
-  src/f32-vbinary/vmulc-neon-x8.c
-  src/f32-vbinary/vrsubc-neon-x4.c
-  src/f32-vbinary/vrsubc-neon-x8.c
-  src/f32-vbinary/vsub-neon-x4.c
-  src/f32-vbinary/vsub-neon-x8.c
-  src/f32-vbinary/vsubc-neon-x4.c
-  src/f32-vbinary/vsubc-neon-x8.c
-  src/f32-vmulcaddc/c4-neon-2x.c
-  src/f32-vmulcaddc/c8-neon-2x.c
-  src/q8-avgpool/mp9p8q-neon.c
-  src/q8-avgpool/up9-neon.c
-  src/q8-dwconv/up8x9-neon.c
-  src/q8-gavgpool/mp7p7q-neon.c
-  src/q8-gavgpool/up7-neon.c
-  src/q8-gemm/4x8-neon.c
-  src/q8-gemm/8x8-neon.c
-  src/q8-igemm/4x8-neon.c
-  src/q8-igemm/8x8-neon.c
-  src/q8-vadd/neon.c
-  src/u8-clamp/neon.c
-  src/u8-maxpool/9p8x-neon-c16.c
-  src/u8-rmax/neon.c
-  src/x32-packx/x4-neon-st4.c
-  src/x32-pad/x2-neon.c
-  src/x32-zip/x2-neon.c
-  src/x32-zip/x3-neon.c
-  src/x32-zip/x4-neon.c
-  src/x32-zip/xm-neon.c
-  src/x8-zip/x2-neon.c
-  src/x8-zip/x3-neon.c
-  src/x8-zip/x4-neon.c
-  src/x8-zip/xm-neon.c)
+    src/f32-avgpool/mp9p8q-neon.c
+    src/f32-avgpool/up9-neon.c
+    src/f32-bilinear/neon-c4.c
+    src/f32-bilinear/neon-c8.c
+    src/f32-clamp/neon.c
+    src/f32-dwconv/up4x9-neon.c
+    src/f32-dwconv/up4x9-neon-acc2.c
+    src/f32-dwconv/up8x9-neon.c
+    src/f32-dwconv/up8x9-neon-acc2.c
+    src/f32-gavgpool-spchw/neon-x4.c
+    src/f32-gavgpool/mp7p7q-neon.c
+    src/f32-gavgpool/up7-neon.c
+    src/f32-gemm/1x8-neon-lane-ld64.c
+    src/f32-gemm/4x2-neon-lane-ld64.c
+    src/f32-gemm/4x8-neon-lane-ld128.c
+    src/f32-gemm/4x8-neon-lane-ld64.c
+    src/f32-gemm/5x8-neon-lane-ld64.c
+    src/f32-gemm/6x8-neon-lane-ld64.c
+    src/f32-gemm/1x8s4-neon.c
+    src/f32-gemm/4x8s4-neon.c
+    src/f32-gemm/6x8s4-neon.c
+    src/f32-gemm/8x8s4-neon.c
+    src/f32-gemminc/1x8-neon-lane-ld64.c
+    src/f32-gemminc/4x8-neon-lane-ld128.c
+    src/f32-gemminc/4x8-neon-lane-ld64.c
+    src/f32-gemminc/5x8-neon-lane-ld64.c
+    src/f32-gemminc/6x8-neon-lane-ld64.c
+    src/f32-gemminc/1x8s4-neon.c
+    src/f32-gemminc/4x8s4-neon.c
+    src/f32-gemminc/6x8s4-neon.c
+    src/f32-gemminc/8x8s4-neon.c
+    src/f32-hswish/neon.c
+    src/f32-igemm/1x8-neon-lane-ld64.c
+    src/f32-igemm/4x2-neon-lane-ld64.c
+    src/f32-igemm/4x4-neon-lane-ld64.c
+    src/f32-igemm/4x8-neon-lane-ld128.c
+    src/f32-igemm/4x8-neon-lane-ld64.c
+    src/f32-igemm/6x8-neon-lane-ld64.c
+    src/f32-igemm/1x8s4-neon.c
+    src/f32-igemm/4x8s4-neon.c
+    src/f32-igemm/6x8s4-neon.c
+    src/f32-igemm/8x8s4-neon.c
+    src/f32-pavgpool/mp9p8q-neon.c
+    src/f32-pavgpool/up9-neon.c
+    src/f32-ppmm/4x8-neon.c
+    src/f32-ppmm/8x8-neon.c
+    src/f32-prelu/neon-2x4.c
+    src/f32-prelu/neon-2x8.c
+    src/f32-rmax/neon.c
+    src/f32-sigmoid/neon-frac-p9-p10-nr1recps-x16.c
+    src/f32-vbinary/vadd-neon-x4.c
+    src/f32-vbinary/vadd-neon-x8.c
+    src/f32-vbinary/vaddc-neon-x4.c
+    src/f32-vbinary/vaddc-neon-x8.c
+    src/f32-vbinary/vmul-neon-x4.c
+    src/f32-vbinary/vmul-neon-x8.c
+    src/f32-vbinary/vmulc-neon-x4.c
+    src/f32-vbinary/vmulc-neon-x8.c
+    src/f32-vbinary/vrsubc-neon-x4.c
+    src/f32-vbinary/vrsubc-neon-x8.c
+    src/f32-vbinary/vsub-neon-x4.c
+    src/f32-vbinary/vsub-neon-x8.c
+    src/f32-vbinary/vsubc-neon-x4.c
+    src/f32-vbinary/vsubc-neon-x8.c
+    src/f32-vmulcaddc/c4-neon-2x.c
+    src/f32-vmulcaddc/c8-neon-2x.c
+    src/q8-avgpool/mp9p8q-neon.c
+    src/q8-avgpool/up9-neon.c
+    src/q8-dwconv/up8x9-neon.c
+    src/q8-gavgpool/mp7p7q-neon.c
+    src/q8-gavgpool/up7-neon.c
+    src/q8-gemm/4x8-neon.c
+    src/q8-gemm/8x8-neon.c
+    src/q8-igemm/4x8-neon.c
+    src/q8-igemm/8x8-neon.c
+    src/q8-vadd/neon.c
+    src/u8-clamp/neon.c
+    src/u8-maxpool/9p8x-neon-c16.c
+    src/u8-rmax/neon.c
+    src/x32-packx/x4-neon-st4.c
+    src/x32-pad/x2-neon.c
+    src/x32-zip/x2-neon.c
+    src/x32-zip/x3-neon.c
+    src/x32-zip/x4-neon.c
+    src/x32-zip/xm-neon.c
+    src/x8-zip/x2-neon.c
+    src/x8-zip/x3-neon.c
+    src/x8-zip/x4-neon.c
+    src/x8-zip/xm-neon.c)
 
 SET(XNNPACK_NEONFMA_MICROKERNEL_SRCS
-  src/f32-bilinear/neonfma-c4.c
-  src/f32-bilinear/neonfma-c8.c
-  src/f32-igemm/1x8-neonfma-ld64.c
-  src/f32-igemm/4x2-neonfma-ld64.c
-  src/f32-igemm/4x4-neonfma-ld64.c
-  src/f32-igemm/4x8-neonfma-ld128.c
-  src/f32-igemm/4x8-neonfma-ld64.c
-  src/f32-igemm/6x8-neonfma-ld64.c
-  src/f32-igemm/1x8s4-neonfma.c
-  src/f32-igemm/4x8s4-neonfma.c
-  src/f32-igemm/6x8s4-neonfma.c
-  src/f32-igemm/8x8s4-neonfma.c
-  src/f32-dwconv/up4x9-neonfma.c
-  src/f32-dwconv/up4x9-neonfma-acc2.c
-  src/f32-dwconv/up8x9-neonfma.c
-  src/f32-dwconv/up8x9-neonfma-acc2.c
-  src/f32-gemm/1x8-neonfma-ld64.c
-  src/f32-gemm/4x2-neonfma-ld64.c
-  src/f32-gemm/4x8-neonfma-ld128.c
-  src/f32-gemm/4x8-neonfma-ld64.c
-  src/f32-gemm/5x8-neonfma-ld64.c
-  src/f32-gemm/6x8-neonfma-ld64.c
-  src/f32-gemm/1x8s4-neonfma.c
-  src/f32-gemm/4x8s4-neonfma.c
-  src/f32-gemm/6x8s4-neonfma.c
-  src/f32-gemm/8x8s4-neonfma.c
-  src/f32-gemminc/1x8-neonfma-ld64.c
-  src/f32-gemminc/4x8-neonfma-ld128.c
-  src/f32-gemminc/4x8-neonfma-ld64.c
-  src/f32-gemminc/5x8-neonfma-ld64.c
-  src/f32-gemminc/6x8-neonfma-ld64.c
-  src/f32-gemminc/1x8s4-neonfma.c
-  src/f32-gemminc/4x8s4-neonfma.c
-  src/f32-gemminc/6x8s4-neonfma.c
-  src/f32-gemminc/8x8s4-neonfma.c
-  src/f32-hswish/neonfma.c
-  src/f32-ppmm/4x8-neonfma.c
-  src/f32-ppmm/8x8-neonfma.c
-  src/f32-sigmoid/neonfma-p5-nr2fma-x16.c
-  src/f32-vmulcaddc/c4-neonfma-2x.c
-  src/f32-vmulcaddc/c8-neonfma-2x.c
-  src/math/exp-neonfma-lut64-p2.c
-  src/math/exp-neonfma-p5.c
-  src/math/expminus-neonfma-p5.c
-  src/math/sigmoid-neonfma-p5-nr2fma.c)
+    src/f32-bilinear/neonfma-c4.c
+    src/f32-bilinear/neonfma-c8.c
+    src/f32-igemm/1x8s4-neonfma.c
+    src/f32-igemm/4x8s4-neonfma.c
+    src/f32-igemm/6x8s4-neonfma.c
+    src/f32-igemm/8x8s4-neonfma.c
+    src/f32-dwconv/up4x9-neonfma.c
+    src/f32-dwconv/up4x9-neonfma-acc2.c
+    src/f32-dwconv/up8x9-neonfma.c
+    src/f32-dwconv/up8x9-neonfma-acc2.c
+    src/f32-gemm/1x8s4-neonfma.c
+    src/f32-gemm/4x8s4-neonfma.c
+    src/f32-gemm/6x8s4-neonfma.c
+    src/f32-gemm/8x8s4-neonfma.c
+    src/f32-gemminc/1x8s4-neonfma.c
+    src/f32-gemminc/4x8s4-neonfma.c
+    src/f32-gemminc/6x8s4-neonfma.c
+    src/f32-gemminc/8x8s4-neonfma.c
+    src/f32-hswish/neonfma.c
+    src/f32-ppmm/4x8-neonfma.c
+    src/f32-ppmm/8x8-neonfma.c
+    src/f32-sigmoid/neonfma-p5-nr2fma-x16.c
+    src/f32-vmulcaddc/c4-neonfma-2x.c
+    src/f32-vmulcaddc/c8-neonfma-2x.c
+    src/math/exp-neonfma-lut64-p2.c
+    src/math/exp-neonfma-p5.c
+    src/math/expminus-neonfma-p5.c
+    src/math/sigmoid-neonfma-p5-nr2fma.c)
 
 SET(AARCH64_XNNPACK_NEONFMA_MICROKERNEL_SRCS
-  src/f32-conv-hwc/3x3s2p1c3x4-neonfma-2x2.c
-  src/f32-conv-hwc/3x3s2p1c3x8-neonfma-2x2.c
-  src/f32-conv-hwc2spchw/3x3s2p1c3x4-neonfma-2x2.c
-  src/f32-dwconv-spchw/3x3p1-neonfma.c
-  src/f32-dwconv-spchw/5x5p2-neonfma.c
-  src/f32-dwconv-spchw/3x3s2p1-neonfma.c
-  src/f32-dwconv-spchw/5x5s2p2-neonfma.c
-  src/f32-spmm/12x1-neonfma.c
-  src/f32-spmm/12x2-neonfma.c
-  src/f32-spmm/12x4-neonfma.c
-  src/f32-spmm/16x1-neonfma-pipelined.c
-  src/f32-spmm/16x1-neonfma-unroll2.c
-  src/f32-spmm/16x1-neonfma.c
-  src/f32-spmm/16x2-neonfma.c
-  src/f32-spmm/16x4-neonfma.c
-  src/f32-spmm/4x1-neonfma-pipelined.c
-  src/f32-spmm/4x1-neonfma-unroll2.c
-  src/f32-spmm/4x1-neonfma.c
-  src/f32-spmm/4x2-neonfma.c
-  src/f32-spmm/4x4-neonfma.c
-  src/f32-spmm/8x1-neonfma-pipelined.c
-  src/f32-spmm/8x1-neonfma-unroll2.c
-  src/f32-spmm/8x1-neonfma.c
-  src/f32-spmm/8x2-neonfma.c
-  src/f32-spmm/8x4-neonfma.c)
+    src/f32-gemm/1x8-neonfma-lane-ld64.c
+    src/f32-gemm/4x2-neonfma-lane-ld64.c
+    src/f32-gemm/4x8-neonfma-lane-ld128.c
+    src/f32-gemm/4x8-neonfma-lane-ld64.c
+    src/f32-gemm/5x8-neonfma-lane-ld64.c
+    src/f32-gemm/6x8-neonfma-lane-ld64.c
+    src/f32-gemminc/1x8-neonfma-lane-ld64.c
+    src/f32-gemminc/4x8-neonfma-lane-ld128.c
+    src/f32-gemminc/4x8-neonfma-lane-ld64.c
+    src/f32-gemminc/5x8-neonfma-lane-ld64.c
+    src/f32-gemminc/6x8-neonfma-lane-ld64.c
+    src/f32-igemm/1x8-neonfma-lane-ld64.c
+    src/f32-igemm/4x2-neonfma-lane-ld64.c
+    src/f32-igemm/4x4-neonfma-lane-ld64.c
+    src/f32-igemm/4x8-neonfma-lane-ld128.c
+    src/f32-igemm/4x8-neonfma-lane-ld64.c
+    src/f32-igemm/6x8-neonfma-lane-ld64.c
+    src/f32-conv-hwc/3x3s2p1c3x4-neonfma-2x2.c
+    src/f32-conv-hwc/3x3s2p1c3x8-neonfma-2x2.c
+    src/f32-conv-hwc2spchw/3x3s2p1c3x4-neonfma-2x2.c
+    src/f32-dwconv-spchw/3x3p1-neonfma.c
+    src/f32-dwconv-spchw/5x5p2-neonfma.c
+    src/f32-dwconv-spchw/3x3s2p1-neonfma.c
+    src/f32-dwconv-spchw/5x5s2p2-neonfma.c
+    src/f32-spmm/12x1-neonfma.c
+    src/f32-spmm/12x2-neonfma.c
+    src/f32-spmm/12x4-neonfma.c
+    src/f32-spmm/16x1-neonfma-pipelined.c
+    src/f32-spmm/16x1-neonfma-unroll2.c
+    src/f32-spmm/16x1-neonfma.c
+    src/f32-spmm/16x2-neonfma.c
+    src/f32-spmm/16x4-neonfma.c
+    src/f32-spmm/4x1-neonfma-pipelined.c
+    src/f32-spmm/4x1-neonfma-unroll2.c
+    src/f32-spmm/4x1-neonfma.c
+    src/f32-spmm/4x2-neonfma.c
+    src/f32-spmm/4x4-neonfma.c
+    src/f32-spmm/8x1-neonfma-pipelined.c
+    src/f32-spmm/8x1-neonfma-unroll2.c
+    src/f32-spmm/8x1-neonfma.c
+    src/f32-spmm/8x2-neonfma.c
+    src/f32-spmm/8x4-neonfma.c)
 
 SET(XNNPACK_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS
   src/f16-gemm/4x8-neonfp16arith-ld64.c
diff --git a/bench/f32-gemm-e2e.cc b/bench/f32-gemm-e2e.cc
index 801d08b..dfe32ff 100644
--- a/bench/f32-gemm-e2e.cc
+++ b/bench/f32-gemm-e2e.cc
@@ -104,18 +104,18 @@
   static void f32_gemm_4x8__aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
     GEMMEnd2EndBenchmark(state, model,
       xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64,
-      xnn_f32_igemm_ukernel_4x8__neonfma_ld64,
-      xnn_f32_gemm_ukernel_1x8__neonfma_ld64,
-      xnn_f32_igemm_ukernel_1x8__neonfma_ld64,
+      xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64,
+      xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
+      xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
       4 /* mr */, 8 /* nr */);
   }
 
   static void f32_gemm_4x8__aarch64_neonfma_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
     GEMMEnd2EndBenchmark(state, model,
       xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128,
-      xnn_f32_igemm_ukernel_4x8__neonfma_ld128,
-      xnn_f32_gemm_ukernel_1x8__neonfma_ld64,
-      xnn_f32_igemm_ukernel_1x8__neonfma_ld64,
+      xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128,
+      xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
+      xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
       4 /* mr */, 8 /* nr */);
   }
 
@@ -167,18 +167,45 @@
   static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
     GEMMEnd2EndBenchmark(state, model,
       xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64,
-      xnn_f32_igemm_ukernel_6x8__neonfma_ld64,
-      xnn_f32_gemm_ukernel_1x8__neonfma_ld64,
-      xnn_f32_igemm_ukernel_1x8__neonfma_ld64,
+      xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64,
+      xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
+      xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
       6 /* mr */, 8 /* nr */);
   }
 
   static void f32_gemm_6x8__aarch64_neonfma_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
     GEMMEnd2EndBenchmark(state, model,
       xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128,
-      xnn_f32_igemm_ukernel_6x8__neonfma_ld64,
-      xnn_f32_gemm_ukernel_1x8__neonfma_ld64,
-      xnn_f32_igemm_ukernel_1x8__neonfma_ld64,
+      xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64,
+      xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
+      xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
+      6 /* mr */, 8 /* nr */);
+  }
+
+  static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64,
+      xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64,
+      xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
+      xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
+      4 /* mr */, 8 /* nr */);
+  }
+
+  static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128,
+      xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128,
+      xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
+      xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
+      4 /* mr */, 8 /* nr */);
+  }
+
+  static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64,
+      xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64,
+      xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
+      xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
       6 /* mr */, 8 /* nr */);
   }
 
@@ -220,60 +247,42 @@
 
   BENCHMARK_CAPTURE(f32_gemm_6x8__aarch64_neonfma_ld128, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
   BENCHMARK_CAPTURE(f32_gemm_6x8__aarch64_neonfma_ld128, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_gemm_4x8__neonfma_lane_ld64, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_gemm_4x8__neonfma_lane_ld64, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_gemm_4x8__neonfma_lane_ld128, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_gemm_4x8__neonfma_lane_ld128, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_gemm_6x8__neonfma_lane_ld64, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_gemm_6x8__neonfma_lane_ld64, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  static void f32_gemm_4x8__neon_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+  static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
     GEMMEnd2EndBenchmark(state, model,
-      xnn_f32_gemm_ukernel_4x8__neon_ld64,
-      xnn_f32_igemm_ukernel_4x8__neon_ld64,
-      xnn_f32_gemm_ukernel_1x8__neon_ld64,
-      xnn_f32_igemm_ukernel_1x8__neon_ld64,
+      xnn_f32_gemm_ukernel_4x8__neon_lane_ld64,
+      xnn_f32_igemm_ukernel_4x8__neon_lane_ld64,
+      xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
+      xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
       4 /* mr */, 8 /* nr */);
   }
 
-  static void f32_gemm_4x8__neon_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
+  static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
     GEMMEnd2EndBenchmark(state, model,
-      xnn_f32_gemm_ukernel_4x8__neon_ld128,
-      xnn_f32_igemm_ukernel_4x8__neon_ld128,
-      xnn_f32_gemm_ukernel_1x8__neon_ld64,
-      xnn_f32_igemm_ukernel_1x8__neon_ld64,
+      xnn_f32_gemm_ukernel_4x8__neon_lane_ld128,
+      xnn_f32_igemm_ukernel_4x8__neon_lane_ld128,
+      xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
+      xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
       4 /* mr */, 8 /* nr */);
   }
 
-  static void f32_gemm_6x8__neon_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+  static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
     GEMMEnd2EndBenchmark(state, model,
-      xnn_f32_gemm_ukernel_6x8__neon_ld64,
-      xnn_f32_igemm_ukernel_6x8__neon_ld64,
-      xnn_f32_gemm_ukernel_1x8__neon_ld64,
-      xnn_f32_igemm_ukernel_1x8__neon_ld64,
-      6 /* mr */, 8 /* nr */);
-  }
-
-  static void f32_gemm_4x8__neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
-    GEMMEnd2EndBenchmark(state, model,
-      xnn_f32_gemm_ukernel_4x8__neonfma_ld64,
-      xnn_f32_igemm_ukernel_4x8__neonfma_ld64,
-      xnn_f32_gemm_ukernel_1x8__neonfma_ld64,
-      xnn_f32_igemm_ukernel_1x8__neonfma_ld64,
-      4 /* mr */, 8 /* nr */);
-  }
-
-  static void f32_gemm_4x8__neonfma_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
-    GEMMEnd2EndBenchmark(state, model,
-      xnn_f32_gemm_ukernel_4x8__neonfma_ld128,
-      xnn_f32_igemm_ukernel_4x8__neonfma_ld128,
-      xnn_f32_gemm_ukernel_1x8__neonfma_ld64,
-      xnn_f32_igemm_ukernel_1x8__neonfma_ld64,
-      4 /* mr */, 8 /* nr */);
-  }
-
-  static void f32_gemm_6x8__neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
-    GEMMEnd2EndBenchmark(state, model,
-      xnn_f32_gemm_ukernel_6x8__neonfma_ld64,
-      xnn_f32_igemm_ukernel_6x8__neonfma_ld64,
-      xnn_f32_gemm_ukernel_1x8__neonfma_ld64,
-      xnn_f32_igemm_ukernel_1x8__neonfma_ld64,
+      xnn_f32_gemm_ukernel_6x8__neon_lane_ld64,
+      xnn_f32_igemm_ukernel_6x8__neon_lane_ld64,
+      xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
+      xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
       6 /* mr */, 8 /* nr */);
   }
 
@@ -331,23 +340,14 @@
       8 /* mr */, 8 /* nr */, 0 /* log2(kr) */, 2 /* log2(sr) */);
   }
 
-  BENCHMARK_CAPTURE(f32_gemm_4x8__neon_ld64, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
-  BENCHMARK_CAPTURE(f32_gemm_4x8__neon_ld64, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_gemm_4x8__neon_lane_ld64, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_gemm_4x8__neon_lane_ld64, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
 
-  BENCHMARK_CAPTURE(f32_gemm_4x8__neon_ld128, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
-  BENCHMARK_CAPTURE(f32_gemm_4x8__neon_ld128, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_gemm_4x8__neon_lane_ld128, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_gemm_4x8__neon_lane_ld128, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
 
-  BENCHMARK_CAPTURE(f32_gemm_6x8__neon_ld64, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
-  BENCHMARK_CAPTURE(f32_gemm_6x8__neon_ld64, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
-
-  BENCHMARK_CAPTURE(f32_gemm_4x8__neonfma_ld64, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
-  BENCHMARK_CAPTURE(f32_gemm_4x8__neonfma_ld64, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
-
-  BENCHMARK_CAPTURE(f32_gemm_4x8__neonfma_ld128, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
-  BENCHMARK_CAPTURE(f32_gemm_4x8__neonfma_ld128, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
-
-  BENCHMARK_CAPTURE(f32_gemm_6x8__neonfma_ld64, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
-  BENCHMARK_CAPTURE(f32_gemm_6x8__neonfma_ld64, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_gemm_6x8__neon_lane_ld64, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
+  BENCHMARK_CAPTURE(f32_gemm_6x8__neon_lane_ld64, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
 
   BENCHMARK_CAPTURE(f32_gemm_4x8s4__neon, mobilenet_v1, models::MobileNetV1)->Unit(benchmark::kMicrosecond)->UseRealTime();
   BENCHMARK_CAPTURE(f32_gemm_4x8s4__neon, mobilenet_v2, models::MobileNetV2)->Unit(benchmark::kMicrosecond)->UseRealTime();
diff --git a/bench/f32-gemm.cc b/bench/f32-gemm.cc
index e47a621..1467aef 100644
--- a/bench/f32-gemm.cc
+++ b/bench/f32-gemm.cc
@@ -387,6 +387,25 @@
   static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1);
   }
+  static void f32_gemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1);
+  }
+
+  static void f32_gemm_4x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64, 4, 8, 1, 1);
+  }
+
+  static void f32_gemm_4x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128, 4, 8, 1, 1);
+  }
+
+  static void f32_gemm_5x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64, 5, 8, 1, 1);
+  }
+
+  static void f32_gemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64, 6, 8, 1, 1);
+  }
 
   BENCHMARK_GEMM(f32_gemm_1x12__aarch64_neonfma_cortex_a53)
   BENCHMARK_GEMM(f32_gemm_1x8__aarch64_neonfma_cortex_a53)
@@ -405,47 +424,33 @@
   BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a75)
   BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld64)
   BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld128)
+  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_lane_ld64)
+  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld128)
+  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_lane_ld64)
+  BENCHMARK_GEMM(f32_gemm_5x8__neonfma_lane_ld64)
+  BENCHMARK_GEMM(f32_gemm_6x8__neonfma_lane_ld64)
+
 #endif  // XNN_ARCH_ARM64
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  static void f32_gemm_1x8__neon_ld64(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neon_ld64, 1, 8, 1, 1);
+  static void f32_gemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1);
   }
 
-  static void f32_gemm_1x8__neonfma_ld64(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neonfma_ld64, 1, 8, 1, 1);
+  static void f32_gemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1);
   }
 
-  static void f32_gemm_4x8__neon_ld64(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_ld64, 4, 8, 1, 1);
+  static void f32_gemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1);
   }
 
-  static void f32_gemm_4x8__neon_ld128(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neon_ld128, 4, 8, 1, 1);
+  static void f32_gemm_5x8__neon_lane_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neon_lane_ld64, 5, 8, 1, 1);
   }
 
-  static void f32_gemm_5x8__neon_ld64(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neon_ld64, 5, 8, 1, 1);
-  }
-
-  static void f32_gemm_6x8__neon_ld64(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_ld64, 6, 8, 1, 1);
-  }
-
-  static void f32_gemm_4x8__neonfma_ld64(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_ld64, 4, 8, 1, 1);
-  }
-
-  static void f32_gemm_4x8__neonfma_ld128(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__neonfma_ld128, 4, 8, 1, 1);
-  }
-
-  static void f32_gemm_5x8__neonfma_ld64(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_5x8__neonfma_ld64, 5, 8, 1, 1);
-  }
-
-  static void f32_gemm_6x8__neonfma_ld64(benchmark::State& state, const char* net) {
-    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neonfma_ld64, 6, 8, 1, 1);
+  static void f32_gemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1);
   }
 
   static void f32_gemm_1x8s4__neon(benchmark::State& state, const char* net) {
@@ -488,14 +493,10 @@
     PPMM2PBenchmark(state, xnn_f32_ppmm_ukernel_4x8__neonfma, xnn_x32_packx_ukernel_4x__neon_st4, 4, 8);
   }
 
-  BENCHMARK_GEMM(f32_gemm_1x8__neon_ld64)
-  BENCHMARK_GEMM(f32_gemm_1x8__neonfma_ld64)
-  BENCHMARK_GEMM(f32_gemm_4x8__neon_ld128)
-  BENCHMARK_GEMM(f32_gemm_4x8__neon_ld64)
-  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_ld128)
-  BENCHMARK_GEMM(f32_gemm_4x8__neonfma_ld64)
-  BENCHMARK_GEMM(f32_gemm_5x8__neon_ld64)
-  BENCHMARK_GEMM(f32_gemm_5x8__neonfma_ld64)
+  BENCHMARK_GEMM(f32_gemm_1x8__neon_lane_ld64)
+  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld128)
+  BENCHMARK_GEMM(f32_gemm_4x8__neon_lane_ld64)
+  BENCHMARK_GEMM(f32_gemm_5x8__neon_lane_ld64)
   BENCHMARK_GEMM(f32_gemm_1x8s4__neon)
   BENCHMARK_GEMM(f32_gemm_1x8s4__neonfma)
   BENCHMARK_GEMM(f32_gemm_4x8s4__neon)
diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc
index 2bfe51c..ac0a677 100644
--- a/bench/f32-igemm.cc
+++ b/bench/f32-igemm.cc
@@ -151,52 +151,28 @@
 }
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  static void f32_igemm_1x8__neon_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neon_ld64, 1, 8, 1, 1);
+  static void f32_igemm_1x8__neon_lane_ld64(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neon_lane_ld64, 1, 8, 1, 1);
   }
 
-  static void f32_igemm_4x2__neon_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x2__neon_ld64, 4, 2, 1, 1);
+  static void f32_igemm_4x2__neon_lane_ld64(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x2__neon_lane_ld64, 4, 2, 1, 1);
   }
 
-  static void f32_igemm_4x4__neon_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x4__neon_ld64, 4, 4, 1, 1);
+  static void f32_igemm_4x4__neon_lane_ld64(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x4__neon_lane_ld64, 4, 4, 1, 1);
   }
 
-  static void f32_igemm_4x8__neon_ld128(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_ld128, 4, 8, 1, 1);
+  static void f32_igemm_4x8__neon_lane_ld128(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_lane_ld128, 4, 8, 1, 1);
   }
 
-  static void f32_igemm_4x8__neon_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_ld64, 4, 8, 1, 1);
+  static void f32_igemm_4x8__neon_lane_ld64(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neon_lane_ld64, 4, 8, 1, 1);
   }
 
-  static void f32_igemm_6x8__neon_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neon_ld64, 6, 8, 1, 1);
-  }
-
-  static void f32_igemm_1x8__neonfma_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neonfma_ld64, 1, 8, 1, 1);
-  }
-
-  static void f32_igemm_4x2__neonfma_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x2__neonfma_ld64, 4, 2, 1, 1);
-  }
-
-  static void f32_igemm_4x4__neonfma_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x4__neonfma_ld64, 4, 4, 1, 1);
-  }
-
-  static void f32_igemm_4x8__neonfma_ld128(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neonfma_ld128, 4, 8, 1, 1);
-  }
-
-  static void f32_igemm_4x8__neonfma_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neonfma_ld64, 4, 8, 1, 1);
-  }
-
-  static void f32_igemm_6x8__neonfma_ld64(benchmark::State& state, const char* net) {
-    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neonfma_ld64, 6, 8, 1, 1);
+  static void f32_igemm_6x8__neon_lane_ld64(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neon_lane_ld64, 6, 8, 1, 1);
   }
 
   static void f32_igemm_1x8s4__neon(benchmark::State& state, const char* net) {
@@ -231,16 +207,12 @@
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_8x8s4__neonfma, 8, 8, 1, 4);
   }
 
-  BENCHMARK_CONV(f32_igemm_1x8__neon_ld64)
-  BENCHMARK_CONV(f32_igemm_1x8__neonfma_ld64)
-  BENCHMARK_CONV(f32_igemm_4x2__neon_ld64)
-  BENCHMARK_CONV(f32_igemm_4x2__neonfma_ld64)
-  BENCHMARK_CONV(f32_igemm_4x4__neon_ld64)
-  BENCHMARK_CONV(f32_igemm_4x4__neonfma_ld64)
-  BENCHMARK_CONV(f32_igemm_4x8__neon_ld128)
-  BENCHMARK_CONV(f32_igemm_4x8__neon_ld64)
-  BENCHMARK_CONV(f32_igemm_4x8__neonfma_ld128)
-  BENCHMARK_CONV(f32_igemm_4x8__neonfma_ld64)
+  BENCHMARK_CONV(f32_igemm_1x8__neon_lane_ld64)
+  BENCHMARK_CONV(f32_igemm_4x2__neon_lane_ld64)
+  BENCHMARK_CONV(f32_igemm_4x4__neon_lane_ld64)
+  BENCHMARK_CONV(f32_igemm_4x8__neon_lane_ld128)
+  BENCHMARK_CONV(f32_igemm_4x8__neon_lane_ld64)
+  BENCHMARK_CONV(f32_igemm_6x8__neon_lane_ld64)
   BENCHMARK_CONV(f32_igemm_1x8s4__neon)
   BENCHMARK_CONV(f32_igemm_1x8s4__neonfma)
   BENCHMARK_CONV(f32_igemm_4x8s4__neon)
@@ -249,8 +221,6 @@
   BENCHMARK_CONV(f32_igemm_6x8s4__neonfma)
   BENCHMARK_CONV(f32_igemm_8x8s4__neon)
   BENCHMARK_CONV(f32_igemm_8x8s4__neonfma)
-  BENCHMARK_CONV(f32_igemm_6x8__neon_ld64)
-  BENCHMARK_CONV(f32_igemm_6x8__neonfma_ld64)
 #endif
 
 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
@@ -302,6 +272,30 @@
     IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1);
   }
 
+  static void f32_igemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1);
+  }
+
+  static void f32_igemm_4x2__neonfma_lane_ld64(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64, 4, 2, 1, 1);
+  }
+
+  static void f32_igemm_4x4__neonfma_lane_ld64(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64, 4, 4, 1, 1);
+  }
+
+  static void f32_igemm_4x8__neonfma_lane_ld128(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128, 4, 8, 1, 1);
+  }
+
+  static void f32_igemm_4x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64, 4, 8, 1, 1);
+  }
+
+  static void f32_igemm_6x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
+    IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64, 6, 8, 1, 1);
+  }
+
   BENCHMARK_CONV(f32_igemm_1x12__aarch64_neonfma_cortex_a53)
   BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_cortex_a53)
   BENCHMARK_CONV(f32_igemm_1x8__aarch64_neonfma_cortex_a57)
@@ -314,6 +308,12 @@
   BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a57)
   BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a73)
   BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a75)
+  BENCHMARK_CONV(f32_igemm_1x8__neonfma_lane_ld64)
+  BENCHMARK_CONV(f32_igemm_4x2__neonfma_lane_ld64)
+  BENCHMARK_CONV(f32_igemm_4x4__neonfma_lane_ld64)
+  BENCHMARK_CONV(f32_igemm_4x8__neonfma_lane_ld128)
+  BENCHMARK_CONV(f32_igemm_4x8__neonfma_lane_ld64)
+  BENCHMARK_CONV(f32_igemm_6x8__neonfma_lane_ld64)
 #endif  /* XNN_ARCH_ARM64 */
 
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/scripts/generate-f32-gemm.sh b/scripts/generate-f32-gemm.sh
index 0c6567b..4332dc3 100755
--- a/scripts/generate-f32-gemm.sh
+++ b/scripts/generate-f32-gemm.sh
@@ -70,39 +70,32 @@
 
 ################################### ARM NEON ##################################
 ### LD64 micro-kernels
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=1 -D INC=0 -o src/f32-gemm/1x8-neonfma-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=1 -D INC=1 -o src/f32-gemminc/1x8-neonfma-ld64.c
-
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/4x8-neon-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=0 -D INC=1 -o src/f32-gemminc/4x8-neon-ld64.c
-
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=1 -D INC=0 -o src/f32-gemm/4x8-neonfma-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=1 -D INC=1 -o src/f32-gemminc/4x8-neonfma-ld64.c
-
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=5 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/5x8-neon-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=5 -D NR=8  -D FMA=0 -D INC=1 -o src/f32-gemminc/5x8-neon-ld64.c
-
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=5 -D NR=8  -D FMA=1 -D INC=0 -o src/f32-gemm/5x8-neonfma-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=5 -D NR=8  -D FMA=1 -D INC=1 -o src/f32-gemminc/5x8-neonfma-ld64.c
-
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/6x8-neon-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=0 -D INC=1 -o src/f32-gemminc/6x8-neon-ld64.c
-
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=1 -D INC=0 -o src/f32-gemm/6x8-neonfma-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=1 -D INC=1 -o src/f32-gemminc/6x8-neonfma-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=1 -D INC=0 -D DUP=0 -o src/f32-gemm/1x8-neonfma-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=1 -D INC=1 -D DUP=0 -o src/f32-gemminc/1x8-neonfma-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=0 -D INC=0 -D DUP=0 -o src/f32-gemm/4x8-neon-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=0 -D INC=1 -D DUP=0 -o src/f32-gemminc/4x8-neon-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=1 -D INC=0 -D DUP=0 -o src/f32-gemm/4x8-neonfma-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=4 -D NR=8  -D FMA=1 -D INC=1 -D DUP=0 -o src/f32-gemminc/4x8-neonfma-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=5 -D NR=8  -D FMA=0 -D INC=0 -D DUP=0 -o src/f32-gemm/5x8-neon-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=5 -D NR=8  -D FMA=0 -D INC=1 -D DUP=0 -o src/f32-gemminc/5x8-neon-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=5 -D NR=8  -D FMA=1 -D INC=0 -D DUP=0 -o src/f32-gemm/5x8-neonfma-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=5 -D NR=8  -D FMA=1 -D INC=1 -D DUP=0 -o src/f32-gemminc/5x8-neonfma-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=0 -D INC=0 -D DUP=0 -o src/f32-gemm/6x8-neon-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=0 -D INC=1 -D DUP=0 -o src/f32-gemminc/6x8-neon-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=1 -D INC=0 -D DUP=0 -o src/f32-gemm/6x8-neonfma-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=6 -D NR=8  -D FMA=1 -D INC=1 -D DUP=0 -o src/f32-gemminc/6x8-neonfma-lane-ld64.c
 ### LD128 micro-kernels
-tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/4x8-neon-ld128.c
-tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=0 -D INC=1 -o src/f32-gemminc/4x8-neon-ld128.c
+tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=0 -D INC=0 -D DUP=0 -o src/f32-gemm/4x8-neon-lane-ld128.c
+tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=0 -D INC=1 -D DUP=0 -o src/f32-gemminc/4x8-neon-lane-ld128.c
+tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=1 -D INC=0 -D DUP=0 -o src/f32-gemm/4x8-neonfma-lane-ld128.c
+tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=1 -D INC=1 -D DUP=0 -o src/f32-gemminc/4x8-neonfma-lane-ld128.c
 
-tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=1 -D INC=0 -o src/f32-gemm/4x8-neonfma-ld128.c
-tools/xngen src/f32-gemm/neon-ld128.c.in     -D MR=4 -D NR=8  -D FMA=1 -D INC=1 -o src/f32-gemminc/4x8-neonfma-ld128.c
-
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/1x8-neon-ld64.c
-tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=0 -D INC=1 -o src/f32-gemminc/1x8-neon-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=0 -D INC=0 -D DUP=0 -o src/f32-gemm/1x8-neon-lane-ld64.c
+tools/xngen src/f32-gemm/neon-ld64.c.in      -D MR=1 -D NR=8  -D FMA=0 -D INC=1 -D DUP=0 -o src/f32-gemminc/1x8-neon-lane-ld64.c
 ### MRx2 micro-kernels
-tools/xngen src/f32-gemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2  -D FMA=0 -D INC=0 -o src/f32-gemm/4x2-neon-ld64.c
+tools/xngen src/f32-gemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2  -D FMA=0 -D INC=0 -D DUP=0 -o src/f32-gemm/4x2-neon-lane-ld64.c
 
-tools/xngen src/f32-gemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2  -D FMA=1 -D INC=0 -o src/f32-gemm/4x2-neonfma-ld64.c
+tools/xngen src/f32-gemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2  -D FMA=1 -D INC=0 -D DUP=0 -o src/f32-gemm/4x2-neonfma-lane-ld64.c
 
 ### LOAD4+PERMUTE micro-kernels
 tools/xngen src/f32-gemm/neon-shuffle.c.in   -D MR=1 -D NR=8  -D FMA=0 -D INC=0 -o src/f32-gemm/1x8s4-neon.c
diff --git a/scripts/generate-f32-igemm.sh b/scripts/generate-f32-igemm.sh
index ec34c0c..12b1160 100755
--- a/scripts/generate-f32-igemm.sh
+++ b/scripts/generate-f32-igemm.sh
@@ -12,29 +12,29 @@
 
 ################################### ARM NEON ##################################
 ### LD64 micro-kernels
-tools/xngen src/f32-igemm/neon-ld64.c.in -D MR=1 -D NR=8  -D FMA=0 -o src/f32-igemm/1x8-neon-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in -D MR=1 -D NR=8  -D FMA=1 -o src/f32-igemm/1x8-neonfma-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in -D MR=4 -D NR=4  -D FMA=0 -o src/f32-igemm/4x4-neon-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in -D MR=4 -D NR=4  -D FMA=1 -o src/f32-igemm/4x4-neonfma-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in -D MR=4 -D NR=8  -D FMA=0 -o src/f32-igemm/4x8-neon-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in -D MR=4 -D NR=8  -D FMA=1 -o src/f32-igemm/4x8-neonfma-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in -D MR=6 -D NR=8  -D FMA=0 -o src/f32-igemm/6x8-neon-ld64.c
-tools/xngen src/f32-igemm/neon-ld64.c.in -D MR=6 -D NR=8  -D FMA=1 -o src/f32-igemm/6x8-neonfma-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=1 -D NR=8 -D FMA=0 -D DUP=0 -o src/f32-igemm/1x8-neon-lane-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=1 -D NR=8 -D FMA=1 -D DUP=0 -o src/f32-igemm/1x8-neonfma-lane-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=4 -D NR=4 -D FMA=0 -D DUP=0 -o src/f32-igemm/4x4-neon-lane-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=4 -D NR=4 -D FMA=1 -D DUP=0 -o src/f32-igemm/4x4-neonfma-lane-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=4 -D NR=8 -D FMA=0 -D DUP=0 -o src/f32-igemm/4x8-neon-lane-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=4 -D NR=8 -D FMA=1 -D DUP=0 -o src/f32-igemm/4x8-neonfma-lane-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=6 -D NR=8 -D FMA=0 -D DUP=0 -o src/f32-igemm/6x8-neon-lane-ld64.c
+tools/xngen src/f32-igemm/neon-ld64.c.in      -D MR=6 -D NR=8 -D FMA=1 -D DUP=0 -o src/f32-igemm/6x8-neonfma-lane-ld64.c
 ### LD128 micro-kernels
-tools/xngen src/f32-igemm/neon-ld128.c.in -D MR=4 -D NR=8 -D FMA=0 -o src/f32-igemm/4x8-neon-ld128.c
-tools/xngen src/f32-igemm/neon-ld128.c.in -D MR=4 -D NR=8 -D FMA=1 -o src/f32-igemm/4x8-neonfma-ld128.c
-### MRx2 micro-kernels
-tools/xngen src/f32-igemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2 -D FMA=0 -o src/f32-igemm/4x2-neon-ld64.c
-tools/xngen src/f32-igemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2 -D FMA=1 -o src/f32-igemm/4x2-neonfma-ld64.c
+tools/xngen src/f32-igemm/neon-ld128.c.in     -D MR=4 -D NR=8 -D FMA=0 -D DUP=0 -o src/f32-igemm/4x8-neon-lane-ld128.c
+tools/xngen src/f32-igemm/neon-ld128.c.in     -D MR=4 -D NR=8 -D FMA=1 -D DUP=0 -o src/f32-igemm/4x8-neonfma-lane-ld128.c
+### MRx2 micro-kernels
+tools/xngen src/f32-igemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2 -D FMA=0 -D DUP=0 -o src/f32-igemm/4x2-neon-lane-ld64.c
+tools/xngen src/f32-igemm/MRx2-neon-ld64.c.in -D MR=4 -D NR=2 -D FMA=1 -D DUP=0 -o src/f32-igemm/4x2-neonfma-lane-ld64.c
 ### LOAD4+PERMUTE micro-kernels
-tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=1 -D NR=8  -D FMA=0 -o src/f32-igemm/1x8s4-neon.c
-tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=1 -D NR=8  -D FMA=1 -o src/f32-igemm/1x8s4-neonfma.c
-tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=4 -D NR=8  -D FMA=0 -o src/f32-igemm/4x8s4-neon.c
-tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=4 -D NR=8  -D FMA=1 -o src/f32-igemm/4x8s4-neonfma.c
-tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=6 -D NR=8  -D FMA=0 -o src/f32-igemm/6x8s4-neon.c
-tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=6 -D NR=8  -D FMA=1 -o src/f32-igemm/6x8s4-neonfma.c
-tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=8 -D NR=8  -D FMA=0 -o src/f32-igemm/8x8s4-neon.c
-tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=8 -D NR=8  -D FMA=1 -o src/f32-igemm/8x8s4-neonfma.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=1 -D NR=8 -D FMA=0 -o src/f32-igemm/1x8s4-neon.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=1 -D NR=8 -D FMA=1 -o src/f32-igemm/1x8s4-neonfma.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=4 -D NR=8 -D FMA=0 -o src/f32-igemm/4x8s4-neon.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=4 -D NR=8 -D FMA=1 -o src/f32-igemm/4x8s4-neonfma.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=6 -D NR=8 -D FMA=0 -o src/f32-igemm/6x8s4-neon.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=6 -D NR=8 -D FMA=1 -o src/f32-igemm/6x8s4-neonfma.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=8 -D NR=8 -D FMA=0 -o src/f32-igemm/8x8s4-neon.c
+tools/xngen src/f32-igemm/neon-shuffle.c.in   -D MR=8 -D NR=8 -D FMA=1 -o src/f32-igemm/8x8s4-neonfma.c
 
 #################################### PSIMD ####################################
 ### LOAD1+BROADCAST micro-kernels
diff --git a/src/f32-gemm/1x8-neon-ld64.c b/src/f32-gemm/1x8-neon-lane-ld64.c
similarity index 85%
rename from src/f32-gemm/1x8-neon-ld64.c
rename to src/f32-gemm/1x8-neon-lane-ld64.c
index 9c6f8b6..31f7e17 100644
--- a/src/f32-gemm/1x8-neon-ld64.c
+++ b/src/f32-gemm/1x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_1x8__neon_ld64(
+void xnn_f32_gemm_ukernel_1x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -49,13 +50,13 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -63,8 +64,8 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
+      vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+      vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemm/1x8-neon-ld64.c b/src/f32-gemm/1x8-neonfma-lane-ld64.c
similarity index 84%
copy from src/f32-gemm/1x8-neon-ld64.c
copy to src/f32-gemm/1x8-neonfma-lane-ld64.c
index 9c6f8b6..428e181 100644
--- a/src/f32-gemm/1x8-neon-ld64.c
+++ b/src/f32-gemm/1x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_1x8__neon_ld64(
+void xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -49,13 +50,13 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -63,8 +64,8 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
+      vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+      vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemm/1x8-neonfma-ld64.c b/src/f32-gemm/1x8-neonfma-ld64.c
deleted file mode 100644
index 4df05f9..0000000
--- a/src/f32-gemm/1x8-neonfma-ld64.c
+++ /dev/null
@@ -1,117 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemm_ukernel_1x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 1);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
-
-    size_t k = kc;
-    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-      const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-      #endif
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-
-      const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-      vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-      vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
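
The file deleted above branched on __aarch64__ because the lane form of the FMA intrinsic is, in practice, only available on AArch64; 32-bit builds broadcast the lane first and then use the plain fused multiply-add. A minimal sketch of the two equivalent formulations, under an assumed helper name:

    #include <arm_neon.h>

    // Computes acc + b * a[0]; both branches are numerically identical.
    static inline float32x4_t acc_fma_lane0(float32x4_t acc, float32x4_t b, float32x2_t a) {
    #if defined(__aarch64__)
      // Lane form: a single FMLA, the scalar stays in a register lane.
      return vfmaq_lane_f32(acc, b, a, 0);
    #else
      // Fallback: duplicate the lane across a quad register, then fuse.
      const float32x4_t a0 = vdupq_lane_f32(a, 0);
      return vfmaq_f32(acc, a0, b);
    #endif
    }
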
diff --git a/src/f32-gemm/4x2-neon-ld64.c b/src/f32-gemm/4x2-neon-lane-ld64.c
similarity index 98%
rename from src/f32-gemm/4x2-neon-ld64.c
rename to src/f32-gemm/4x2-neon-lane-ld64.c
index 872a842..0ec7cbb 100644
--- a/src/f32-gemm/4x2-neon-ld64.c
+++ b/src/f32-gemm/4x2-neon-lane-ld64.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_4x2__neon_ld64(
+void xnn_f32_gemm_ukernel_4x2__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/f32-gemm/4x2-neonfma-ld64.c b/src/f32-gemm/4x2-neonfma-lane-ld64.c
similarity index 98%
rename from src/f32-gemm/4x2-neonfma-ld64.c
rename to src/f32-gemm/4x2-neonfma-lane-ld64.c
index 4014c0e..bf81b04 100644
--- a/src/f32-gemm/4x2-neonfma-ld64.c
+++ b/src/f32-gemm/4x2-neonfma-lane-ld64.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_4x2__neonfma_ld64(
+void xnn_f32_gemm_ukernel_4x2__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/f32-gemm/4x8-neon-ld128.c b/src/f32-gemm/4x8-neon-lane-ld128.c
similarity index 64%
copy from src/f32-gemm/4x8-neon-ld128.c
copy to src/f32-gemm/4x8-neon-lane-ld128.c
index 3ebede2..b8561d3 100644
--- a/src/f32-gemm/4x8-neon-ld128.c
+++ b/src/f32-gemm/4x8-neon-lane-ld128.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_4x8__neon_ld128(
+void xnn_f32_gemm_ukernel_4x8__neon_lane_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -77,50 +78,50 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, vget_low_f32(va0), 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, vget_low_f32(va1), 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, vget_low_f32(va2), 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, vget_low_f32(va3), 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, vget_low_f32(va0), 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, vget_low_f32(va1), 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, vget_low_f32(va2), 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, vget_low_f32(va3), 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0);
 
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, vget_low_f32(va0), 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, vget_low_f32(va1), 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, vget_low_f32(va2), 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, vget_low_f32(va3), 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, vget_low_f32(va0), 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, vget_low_f32(va1), 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, vget_low_f32(va2), 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, vget_low_f32(va3), 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, vget_low_f32(va2), 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, vget_low_f32(va0), 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, vget_low_f32(va1), 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, vget_low_f32(va2), 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1);
 
       const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c2, vget_high_f32(va0), 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c2, vget_high_f32(va1), 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c2, vget_high_f32(va2), 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c2, vget_high_f32(va3), 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c2, vget_high_f32(va0), 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c2, vget_high_f32(va1), 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c2, vget_high_f32(va2), 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c2, vget_high_f32(va3), 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c2, vget_high_f32(va0), 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c2, vget_high_f32(va1), 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c2, vget_high_f32(va2), 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0);
 
       const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c3, vget_high_f32(va0), 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c3, vget_high_f32(va1), 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c3, vget_high_f32(va2), 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c3, vget_high_f32(va3), 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c3, vget_high_f32(va0), 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c3, vget_high_f32(va1), 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c3, vget_high_f32(va2), 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c3, vget_high_f32(va3), 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c3, vget_high_f32(va0), 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c3, vget_high_f32(va1), 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c3, vget_high_f32(va2), 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c3, vget_high_f32(va0), 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c3, vget_high_f32(va1), 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c3, vget_high_f32(va2), 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1);
     }
     if XNN_UNLIKELY(k != 0) {
       do {
@@ -132,14 +133,14 @@
         const float32x4_t vb0123 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-        vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-        vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-        vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-        vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-        vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-        vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-        vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+        vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+        vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
+        vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
+        vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+        vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+        vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
+        vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
+        vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
 
         k -= sizeof(float);
       } while (k != 0);
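
In the ld128 variant above, each iteration loads four k-steps of A with one 128-bit load and peels them off lane by lane: vget_low_f32/vget_high_f32 split the quad into pairs, and the lane index selects the multiplier. A condensed sketch for one accumulator row, assuming w points at four consecutive 4-wide weight rows:

    #include <arm_neon.h>

    static inline float32x4_t dot4_ld128(float32x4_t acc, const float* w, float32x4_t va) {
      acc = vmlaq_lane_f32(acc, vld1q_f32(w +  0), vget_low_f32(va),  0);  // k+0
      acc = vmlaq_lane_f32(acc, vld1q_f32(w +  4), vget_low_f32(va),  1);  // k+1
      acc = vmlaq_lane_f32(acc, vld1q_f32(w +  8), vget_high_f32(va), 0);  // k+2
      acc = vmlaq_lane_f32(acc, vld1q_f32(w + 12), vget_high_f32(va), 1);  // k+3
      return acc;
    }
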
diff --git a/src/f32-gemm/4x8-neon-ld64.c b/src/f32-gemm/4x8-neon-lane-ld64.c
similarity index 77%
copy from src/f32-gemm/4x8-neon-ld64.c
copy to src/f32-gemm/4x8-neon-lane-ld64.c
index 6c19c67..af9ff56 100644
--- a/src/f32-gemm/4x8-neon-ld64.c
+++ b/src/f32-gemm/4x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_4x8__neon_ld64(
+void xnn_f32_gemm_ukernel_4x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -76,25 +77,25 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -105,14 +106,14 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+      vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+      vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
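
The XNN_UNLIKELY(k != 0) block in the hunk above is the k-remainder path: when kc is not a multiple of the unroll, the last scalar of each A row is broadcast to all four lanes with vld1q_dup_f32 and multiplied against the next weight row. A minimal sketch with illustrative names:

    #include <arm_neon.h>

    static inline float32x4_t step_k_tail(float32x4_t acc, const float* a, const float* w) {
      const float32x4_t va = vld1q_dup_f32(a);  // broadcast the leftover scalar
      const float32x4_t vb = vld1q_f32(w);      // next 4-wide weight row
      return vmlaq_f32(acc, va, vb);
    }
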
diff --git a/src/f32-gemm/4x8-neon-ld128.c b/src/f32-gemm/4x8-neonfma-lane-ld128.c
similarity index 64%
rename from src/f32-gemm/4x8-neon-ld128.c
rename to src/f32-gemm/4x8-neonfma-lane-ld128.c
index 3ebede2..ee3e06a 100644
--- a/src/f32-gemm/4x8-neon-ld128.c
+++ b/src/f32-gemm/4x8-neonfma-lane-ld128.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_4x8__neon_ld128(
+void xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -77,50 +78,50 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, vget_low_f32(va0), 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, vget_low_f32(va1), 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, vget_low_f32(va2), 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, vget_low_f32(va3), 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, vget_low_f32(va0), 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, vget_low_f32(va1), 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, vget_low_f32(va2), 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, vget_low_f32(va3), 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0);
 
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, vget_low_f32(va0), 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, vget_low_f32(va1), 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, vget_low_f32(va2), 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, vget_low_f32(va3), 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, vget_low_f32(va0), 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, vget_low_f32(va1), 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, vget_low_f32(va2), 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, vget_low_f32(va3), 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, vget_low_f32(va2), 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, vget_low_f32(va0), 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, vget_low_f32(va1), 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, vget_low_f32(va2), 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1);
 
       const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c2, vget_high_f32(va0), 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c2, vget_high_f32(va1), 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c2, vget_high_f32(va2), 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c2, vget_high_f32(va3), 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c2, vget_high_f32(va0), 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c2, vget_high_f32(va1), 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c2, vget_high_f32(va2), 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c2, vget_high_f32(va3), 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c2, vget_high_f32(va0), 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c2, vget_high_f32(va1), 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c2, vget_high_f32(va2), 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0);
 
       const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c3, vget_high_f32(va0), 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c3, vget_high_f32(va1), 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c3, vget_high_f32(va2), 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c3, vget_high_f32(va3), 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c3, vget_high_f32(va0), 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c3, vget_high_f32(va1), 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c3, vget_high_f32(va2), 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c3, vget_high_f32(va3), 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c3, vget_high_f32(va0), 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c3, vget_high_f32(va1), 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c3, vget_high_f32(va2), 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c3, vget_high_f32(va0), 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c3, vget_high_f32(va1), 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c3, vget_high_f32(va2), 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1);
     }
     if XNN_UNLIKELY(k != 0) {
       do {
@@ -132,14 +133,14 @@
         const float32x4_t vb0123 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-        vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-        vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-        vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-        vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-        vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-        vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-        vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+        vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+        vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+        vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+        vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+        vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+        vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+        vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
 
         k -= sizeof(float);
       } while (k != 0);
diff --git a/src/f32-gemm/4x8-neon-ld64.c b/src/f32-gemm/4x8-neonfma-lane-ld64.c
similarity index 77%
rename from src/f32-gemm/4x8-neon-ld64.c
rename to src/f32-gemm/4x8-neonfma-lane-ld64.c
index 6c19c67..6a7b6f9 100644
--- a/src/f32-gemm/4x8-neon-ld64.c
+++ b/src/f32-gemm/4x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_4x8__neon_ld64(
+void xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -76,25 +77,25 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -105,14 +106,14 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+      vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+      vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
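
The epilogue visible in these hunks clamps every accumulator into [min, max] taken from the output params, applying vminq_f32 against the broadcast max and vmaxq_f32 against the broadcast min. A sketch of that step, with assumed pointer arguments standing in for &params->scalar.min and &params->scalar.max:

    #include <arm_neon.h>

    static inline float32x4_t clamp_acc(float32x4_t acc, const float* pmin, const float* pmax) {
      acc = vminq_f32(acc, vld1q_dup_f32(pmax));
      return vmaxq_f32(acc, vld1q_dup_f32(pmin));
    }
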
diff --git a/src/f32-gemm/4x8-neonfma-ld128.c b/src/f32-gemm/4x8-neonfma-ld128.c
deleted file mode 100644
index 08888d6..0000000
--- a/src/f32-gemm/4x8-neonfma-ld128.c
+++ /dev/null
@@ -1,285 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld128.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemm_ukernel_4x8__neonfma_ld128(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 4);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
-    float32x4_t vacc1x0123 = vacc0x0123;
-    float32x4_t vacc1x4567 = vacc0x4567;
-    float32x4_t vacc2x0123 = vacc0x0123;
-    float32x4_t vacc2x4567 = vacc0x4567;
-    float32x4_t vacc3x0123 = vacc0x0123;
-    float32x4_t vacc3x4567 = vacc0x4567;
-
-    size_t k = kc;
-    for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
-      const float32x4_t va0 = vld1q_f32(a0); a0 += 4;
-      const float32x4_t va1 = vld1q_f32(a1); a1 += 4;
-      const float32x4_t va2 = vld1q_f32(a2); a2 += 4;
-      const float32x4_t va3 = vld1q_f32(a3); a3 += 4;
-
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c0, va1, 0);
-        vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c0, va2, 0);
-        vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c0, va3, 0);
-        vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c0, va0, 0);
-        vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c0, va1, 0);
-        vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c0, va2, 0);
-        vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c0, va3, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0);
-        const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0);
-        const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0);
-        const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c0, vb0123c0);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c0, vb0123c0);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c0, vb4567c0);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c0, vb4567c0);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c0, vb4567c0);
-      #endif
-
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c1, va1, 1);
-        vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c1, va2, 1);
-        vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c1, va3, 1);
-        vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c1, va0, 1);
-        vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c1, va1, 1);
-        vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c1, va2, 1);
-        vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c1, va3, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1);
-        const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1);
-        const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1);
-        const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c1, vb0123c1);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c1, vb0123c1);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c1, vb4567c1);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c1, vb4567c1);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c1, vb4567c1);
-      #endif
-
-      const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c2, va0, 2);
-        vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c2, va1, 2);
-        vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c2, va2, 2);
-        vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c2, va3, 2);
-        vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c2, va0, 2);
-        vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c2, va1, 2);
-        vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c2, va2, 2);
-        vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c2, va3, 2);
-      #else
-        const float32x4_t va0c2 = vdupq_lane_f32(vget_high_f32(va0), 0);
-        const float32x4_t va1c2 = vdupq_lane_f32(vget_high_f32(va1), 0);
-        const float32x4_t va2c2 = vdupq_lane_f32(vget_high_f32(va2), 0);
-        const float32x4_t va3c2 = vdupq_lane_f32(vget_high_f32(va3), 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c2, vb0123c2);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c2, vb0123c2);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c2, vb0123c2);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c2, vb0123c2);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c2, vb4567c2);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c2, vb4567c2);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c2, vb4567c2);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c2, vb4567c2);
-      #endif
-
-      const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c3, va0, 3);
-        vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c3, va1, 3);
-        vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c3, va2, 3);
-        vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c3, va3, 3);
-        vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c3, va0, 3);
-        vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c3, va1, 3);
-        vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c3, va2, 3);
-        vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c3, va3, 3);
-      #else
-        const float32x4_t va0c3 = vdupq_lane_f32(vget_high_f32(va0), 1);
-        const float32x4_t va1c3 = vdupq_lane_f32(vget_high_f32(va1), 1);
-        const float32x4_t va2c3 = vdupq_lane_f32(vget_high_f32(va2), 1);
-        const float32x4_t va3c3 = vdupq_lane_f32(vget_high_f32(va3), 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c3, vb0123c3);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c3, vb0123c3);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c3, vb0123c3);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c3, vb0123c3);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c3, vb4567c3);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c3, vb4567c3);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c3, vb4567c3);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c3, vb4567c3);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      do {
-        const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-        const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
-        const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
-        const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;
-
-        const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-        vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-        vacc1x0123 = vfmaq_f32(vacc1x0123, va1,   vb0123);
-        vacc2x0123 = vfmaq_f32(vacc2x0123, va2,   vb0123);
-        vacc3x0123 = vfmaq_f32(vacc3x0123, va3,   vb0123);
-        vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-        vacc1x4567 = vfmaq_f32(vacc1x4567, va1,   vb4567);
-        vacc2x4567 = vfmaq_f32(vacc2x4567, va2,   vb4567);
-        vacc3x4567 = vfmaq_f32(vacc3x4567, va3,   vb4567);
-
-        k -= sizeof(float);
-      } while (k != 0);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a3 = (const float*) ((uintptr_t) a3 - kc);
-      a2 = (const float*) ((uintptr_t) a2 - kc);
-      a1 = (const float*) ((uintptr_t) a1 - kc);
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
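
The deleted ld128 file above used the laneq form on AArch64: the multiplier lane is taken from a full 128-bit vector, so all four k-steps index lanes 0..3 of a single load instead of splitting it with vget_low/vget_high. An illustrative helper, AArch64-only like the original:

    #include <arm_neon.h>

    #if defined(__aarch64__)
    // acc + b * a[lane], with lane in 0..3 of a 128-bit vector.
    static inline float32x4_t acc_fma_laneq2(float32x4_t acc, float32x4_t b, float32x4_t a) {
      return vfmaq_laneq_f32(acc, b, a, 2);
    }
    #endif
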
diff --git a/src/f32-gemm/4x8-neonfma-ld64.c b/src/f32-gemm/4x8-neonfma-ld64.c
deleted file mode 100644
index c0f9dd5..0000000
--- a/src/f32-gemm/4x8-neonfma-ld64.c
+++ /dev/null
@@ -1,225 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemm_ukernel_4x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 4);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
-    float32x4_t vacc1x0123 = vacc0x0123;
-    float32x4_t vacc1x4567 = vacc0x4567;
-    float32x4_t vacc2x0123 = vacc0x0123;
-    float32x4_t vacc2x4567 = vacc0x4567;
-    float32x4_t vacc3x0123 = vacc0x0123;
-    float32x4_t vacc3x4567 = vacc0x4567;
-
-    size_t k = kc;
-    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-      const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-      const float32x2_t va1 = vld1_f32(a1); a1 += 2;
-      const float32x2_t va2 = vld1_f32(a2); a2 += 2;
-      const float32x2_t va3 = vld1_f32(a3); a3 += 2;
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-        const float32x4_t va1c0 = vdupq_lane_f32(va1, 0);
-        const float32x4_t va2c0 = vdupq_lane_f32(va2, 0);
-        const float32x4_t va3c0 = vdupq_lane_f32(va3, 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c0, vb0123c0);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c0, vb0123c0);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c0, vb4567c0);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c0, vb4567c0);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c0, vb4567c0);
-      #endif
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-        const float32x4_t va1c1 = vdupq_lane_f32(va1, 1);
-        const float32x4_t va2c1 = vdupq_lane_f32(va2, 1);
-        const float32x4_t va3c1 = vdupq_lane_f32(va3, 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c1, vb0123c1);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c1, vb0123c1);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c1, vb4567c1);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c1, vb4567c1);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c1, vb4567c1);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-      const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
-      const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
-      const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;
-
-      const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-      vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vfmaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vfmaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vfmaq_f32(vacc3x0123, va3,   vb0123);
-      vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vfmaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vfmaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vfmaq_f32(vacc3x4567, va3,   vb4567);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a3 = (const float*) ((uintptr_t) a3 - kc);
-      a2 = (const float*) ((uintptr_t) a2 - kc);
-      a1 = (const float*) ((uintptr_t) a1 - kc);
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
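
Both deleted files end with the same nc-tail store pattern, which survives unchanged in the renamed kernels: the remaining column count is halved step by step, storing 4, then 2, then 1 lane. A self-contained sketch of that pattern for a single output row:

    #include <stddef.h>
    #include <arm_neon.h>

    static void store_row_tail(float* c, size_t nc, float32x4_t lo, float32x4_t hi) {
      if (nc & 4) { vst1q_f32(c, lo); c += 4; lo = hi; }
      float32x2_t pair = vget_low_f32(lo);
      if (nc & 2) { vst1_f32(c, pair); c += 2; pair = vget_high_f32(lo); }
      if (nc & 1) { vst1_lane_f32(c, pair, 0); }
    }
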
diff --git a/src/f32-gemm/5x8-neon-ld64.c b/src/f32-gemm/5x8-neon-lane-ld64.c
similarity index 76%
rename from src/f32-gemm/5x8-neon-ld64.c
rename to src/f32-gemm/5x8-neon-lane-ld64.c
index 4ca0a20..1c46d30 100644
--- a/src/f32-gemm/5x8-neon-ld64.c
+++ b/src/f32-gemm/5x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_5x8__neon_ld64(
+void xnn_f32_gemm_ukernel_5x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -85,29 +86,29 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+      vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+      vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -119,16 +120,16 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vmlaq_f32(vacc4x0123, va4,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vmlaq_f32(vacc4x4567, va4,   vb4567);
+      vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+      vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123);
+      vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
+      vacc4x4567 = vmlaq_f32(vacc4x4567, va4, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemm/5x8-neon-ld64.c b/src/f32-gemm/5x8-neonfma-lane-ld64.c
similarity index 76%
copy from src/f32-gemm/5x8-neon-ld64.c
copy to src/f32-gemm/5x8-neonfma-lane-ld64.c
index 4ca0a20..4b2d2b8 100644
--- a/src/f32-gemm/5x8-neon-ld64.c
+++ b/src/f32-gemm/5x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_5x8__neon_ld64(
+void xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -85,29 +86,29 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+      vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+      vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -119,16 +120,16 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vmlaq_f32(vacc4x0123, va4,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vmlaq_f32(vacc4x4567, va4,   vb4567);
+      vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+      vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123);
+      vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
+      vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemm/5x8-neonfma-ld64.c b/src/f32-gemm/5x8-neonfma-ld64.c
deleted file mode 100644
index 515db66..0000000
--- a/src/f32-gemm/5x8-neonfma-ld64.c
+++ /dev/null
@@ -1,261 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemm_ukernel_5x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 5);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
-  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 4) {
-    a4 = a3;
-    c4 = c3;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
-    float32x4_t vacc1x0123 = vacc0x0123;
-    float32x4_t vacc1x4567 = vacc0x4567;
-    float32x4_t vacc2x0123 = vacc0x0123;
-    float32x4_t vacc2x4567 = vacc0x4567;
-    float32x4_t vacc3x0123 = vacc0x0123;
-    float32x4_t vacc3x4567 = vacc0x4567;
-    float32x4_t vacc4x0123 = vacc0x0123;
-    float32x4_t vacc4x4567 = vacc0x4567;
-
-    size_t k = kc;
-    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-      const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-      const float32x2_t va1 = vld1_f32(a1); a1 += 2;
-      const float32x2_t va2 = vld1_f32(a2); a2 += 2;
-      const float32x2_t va3 = vld1_f32(a3); a3 += 2;
-      const float32x2_t va4 = vld1_f32(a4); a4 += 2;
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-        vacc4x0123 = vfmaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-        vacc4x4567 = vfmaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-        const float32x4_t va1c0 = vdupq_lane_f32(va1, 0);
-        const float32x4_t va2c0 = vdupq_lane_f32(va2, 0);
-        const float32x4_t va3c0 = vdupq_lane_f32(va3, 0);
-        const float32x4_t va4c0 = vdupq_lane_f32(va4, 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c0, vb0123c0);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c0, vb0123c0);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c0, vb0123c0);
-        vacc4x0123 = vfmaq_f32(vacc4x0123,   va4c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c0, vb4567c0);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c0, vb4567c0);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c0, vb4567c0);
-        vacc4x4567 = vfmaq_f32(vacc4x4567,   va4c0, vb4567c0);
-      #endif
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-        vacc4x0123 = vfmaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-        vacc4x4567 = vfmaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-        const float32x4_t va1c1 = vdupq_lane_f32(va1, 1);
-        const float32x4_t va2c1 = vdupq_lane_f32(va2, 1);
-        const float32x4_t va3c1 = vdupq_lane_f32(va3, 1);
-        const float32x4_t va4c1 = vdupq_lane_f32(va4, 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c1, vb0123c1);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c1, vb0123c1);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c1, vb0123c1);
-        vacc4x0123 = vfmaq_f32(vacc4x0123,   va4c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c1, vb4567c1);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c1, vb4567c1);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c1, vb4567c1);
-        vacc4x4567 = vfmaq_f32(vacc4x4567,   va4c1, vb4567c1);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-      const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
-      const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
-      const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;
-      const float32x4_t va4 = vld1q_dup_f32(a4); a4 += 1;
-
-      const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-      vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vfmaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vfmaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vfmaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vfmaq_f32(vacc4x0123, va4,   vb0123);
-      vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vfmaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vfmaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vfmaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vfmaq_f32(vacc4x4567, va4,   vb4567);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc4x0123 = vminq_f32(vacc4x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-    vacc4x4567 = vminq_f32(vacc4x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc4x0123 = vmaxq_f32(vacc4x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-    vacc4x4567 = vmaxq_f32(vacc4x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c4, vacc4x0123);
-      vst1q_f32(c4 + 4, vacc4x4567);
-      c4 = (float*) ((uintptr_t) c4 + cn_stride);
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a4 = (const float*) ((uintptr_t) a4 - kc);
-      a3 = (const float*) ((uintptr_t) a3 - kc);
-      a2 = (const float*) ((uintptr_t) a2 - kc);
-      a1 = (const float*) ((uintptr_t) a1 - kc);
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c4, vacc4x0123); c4 += 4;
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc4x0123 = vacc4x4567;
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc4x01 = vget_low_f32(vacc4x0123);
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c4, vacc4x01); c4 += 2;
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc4x01 = vget_high_f32(vacc4x0123);
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c4, vacc4x01, 0);
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/f32-gemm/6x8-neon-ld64.c b/src/f32-gemm/6x8-neon-lane-ld64.c
similarity index 75%
rename from src/f32-gemm/6x8-neon-ld64.c
rename to src/f32-gemm/6x8-neon-lane-ld64.c
index b2322d8..e6ca249 100644
--- a/src/f32-gemm/6x8-neon-ld64.c
+++ b/src/f32-gemm/6x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_6x8__neon_ld64(
+void xnn_f32_gemm_ukernel_6x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -94,33 +95,33 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-      vacc5x0123 = vmlaq_lane_f32(vacc5x0123,   vb0123c0, va5, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
-      vacc5x4567 = vmlaq_lane_f32(vacc5x4567,   vb4567c0, va5, 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
+      vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, va5, 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+      vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
+      vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-      vacc5x0123 = vmlaq_lane_f32(vacc5x0123,   vb0123c1, va5, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
-      vacc5x4567 = vmlaq_lane_f32(vacc5x4567,   vb4567c1, va5, 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
+      vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, va5, 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+      vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
+      vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -133,18 +134,18 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vmlaq_f32(vacc4x0123, va4,   vb0123);
-      vacc5x0123 = vmlaq_f32(vacc5x0123, va5,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vmlaq_f32(vacc4x4567, va4,   vb4567);
-      vacc5x4567 = vmlaq_f32(vacc5x4567, va5,   vb4567);
+      vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+      vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123);
+      vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123);
+      vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
+      vacc4x4567 = vmlaq_f32(vacc4x4567, va4, vb4567);
+      vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemm/6x8-neon-ld64.c b/src/f32-gemm/6x8-neonfma-lane-ld64.c
similarity index 75%
copy from src/f32-gemm/6x8-neon-ld64.c
copy to src/f32-gemm/6x8-neonfma-lane-ld64.c
index b2322d8..0a34f44 100644
--- a/src/f32-gemm/6x8-neon-ld64.c
+++ b/src/f32-gemm/6x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_6x8__neon_ld64(
+void xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -94,33 +95,33 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-      vacc5x0123 = vmlaq_lane_f32(vacc5x0123,   vb0123c0, va5, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
-      vacc5x4567 = vmlaq_lane_f32(vacc5x4567,   vb4567c0, va5, 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
+      vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, va5, 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+      vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
+      vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-      vacc5x0123 = vmlaq_lane_f32(vacc5x0123,   vb0123c1, va5, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
-      vacc5x4567 = vmlaq_lane_f32(vacc5x4567,   vb4567c1, va5, 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
+      vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, va5, 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+      vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
+      vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -133,18 +134,18 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vmlaq_f32(vacc4x0123, va4,   vb0123);
-      vacc5x0123 = vmlaq_f32(vacc5x0123, va5,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vmlaq_f32(vacc4x4567, va4,   vb4567);
-      vacc5x4567 = vmlaq_f32(vacc5x4567, va5,   vb4567);
+      vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+      vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123);
+      vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123);
+      vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
+      vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567);
+      vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemm/6x8-neonfma-ld64.c b/src/f32-gemm/6x8-neonfma-ld64.c
deleted file mode 100644
index 682dd07..0000000
--- a/src/f32-gemm/6x8-neonfma-ld64.c
+++ /dev/null
@@ -1,297 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemm_ukernel_6x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 6);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
-  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 4) {
-    a4 = a3;
-    c4 = c3;
-  }
-  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
-  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 6) {
-    a5 = a4;
-    c5 = c4;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
-    float32x4_t vacc1x0123 = vacc0x0123;
-    float32x4_t vacc1x4567 = vacc0x4567;
-    float32x4_t vacc2x0123 = vacc0x0123;
-    float32x4_t vacc2x4567 = vacc0x4567;
-    float32x4_t vacc3x0123 = vacc0x0123;
-    float32x4_t vacc3x4567 = vacc0x4567;
-    float32x4_t vacc4x0123 = vacc0x0123;
-    float32x4_t vacc4x4567 = vacc0x4567;
-    float32x4_t vacc5x0123 = vacc0x0123;
-    float32x4_t vacc5x4567 = vacc0x4567;
-
-    size_t k = kc;
-    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-      const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-      const float32x2_t va1 = vld1_f32(a1); a1 += 2;
-      const float32x2_t va2 = vld1_f32(a2); a2 += 2;
-      const float32x2_t va3 = vld1_f32(a3); a3 += 2;
-      const float32x2_t va4 = vld1_f32(a4); a4 += 2;
-      const float32x2_t va5 = vld1_f32(a5); a5 += 2;
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-        vacc4x0123 = vfmaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-        vacc5x0123 = vfmaq_lane_f32(vacc5x0123,   vb0123c0, va5, 0);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-        vacc4x4567 = vfmaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
-        vacc5x4567 = vfmaq_lane_f32(vacc5x4567,   vb4567c0, va5, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-        const float32x4_t va1c0 = vdupq_lane_f32(va1, 0);
-        const float32x4_t va2c0 = vdupq_lane_f32(va2, 0);
-        const float32x4_t va3c0 = vdupq_lane_f32(va3, 0);
-        const float32x4_t va4c0 = vdupq_lane_f32(va4, 0);
-        const float32x4_t va5c0 = vdupq_lane_f32(va5, 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c0, vb0123c0);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c0, vb0123c0);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c0, vb0123c0);
-        vacc4x0123 = vfmaq_f32(vacc4x0123,   va4c0, vb0123c0);
-        vacc5x0123 = vfmaq_f32(vacc5x0123,   va5c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c0, vb4567c0);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c0, vb4567c0);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c0, vb4567c0);
-        vacc4x4567 = vfmaq_f32(vacc4x4567,   va4c0, vb4567c0);
-        vacc5x4567 = vfmaq_f32(vacc5x4567,   va5c0, vb4567c0);
-      #endif
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-        vacc4x0123 = vfmaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-        vacc5x0123 = vfmaq_lane_f32(vacc5x0123,   vb0123c1, va5, 1);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-        vacc4x4567 = vfmaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
-        vacc5x4567 = vfmaq_lane_f32(vacc5x4567,   vb4567c1, va5, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-        const float32x4_t va1c1 = vdupq_lane_f32(va1, 1);
-        const float32x4_t va2c1 = vdupq_lane_f32(va2, 1);
-        const float32x4_t va3c1 = vdupq_lane_f32(va3, 1);
-        const float32x4_t va4c1 = vdupq_lane_f32(va4, 1);
-        const float32x4_t va5c1 = vdupq_lane_f32(va5, 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c1, vb0123c1);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c1, vb0123c1);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c1, vb0123c1);
-        vacc4x0123 = vfmaq_f32(vacc4x0123,   va4c1, vb0123c1);
-        vacc5x0123 = vfmaq_f32(vacc5x0123,   va5c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c1, vb4567c1);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c1, vb4567c1);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c1, vb4567c1);
-        vacc4x4567 = vfmaq_f32(vacc4x4567,   va4c1, vb4567c1);
-        vacc5x4567 = vfmaq_f32(vacc5x4567,   va5c1, vb4567c1);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-      const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
-      const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
-      const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;
-      const float32x4_t va4 = vld1q_dup_f32(a4); a4 += 1;
-      const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1;
-
-      const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-      vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vfmaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vfmaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vfmaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vfmaq_f32(vacc4x0123, va4,   vb0123);
-      vacc5x0123 = vfmaq_f32(vacc5x0123, va5,   vb0123);
-      vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vfmaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vfmaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vfmaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vfmaq_f32(vacc4x4567, va4,   vb4567);
-      vacc5x4567 = vfmaq_f32(vacc5x4567, va5,   vb4567);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc4x0123 = vminq_f32(vacc4x0123, vmax);
-    vacc5x0123 = vminq_f32(vacc5x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-    vacc4x4567 = vminq_f32(vacc4x4567, vmax);
-    vacc5x4567 = vminq_f32(vacc5x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc4x0123 = vmaxq_f32(vacc4x0123, vmin);
-    vacc5x0123 = vmaxq_f32(vacc5x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-    vacc4x4567 = vmaxq_f32(vacc4x4567, vmin);
-    vacc5x4567 = vmaxq_f32(vacc5x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c5, vacc5x0123);
-      vst1q_f32(c5 + 4, vacc5x4567);
-      c5 = (float*) ((uintptr_t) c5 + cn_stride);
-      vst1q_f32(c4, vacc4x0123);
-      vst1q_f32(c4 + 4, vacc4x4567);
-      c4 = (float*) ((uintptr_t) c4 + cn_stride);
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a5 = (const float*) ((uintptr_t) a5 - kc);
-      a4 = (const float*) ((uintptr_t) a4 - kc);
-      a3 = (const float*) ((uintptr_t) a3 - kc);
-      a2 = (const float*) ((uintptr_t) a2 - kc);
-      a1 = (const float*) ((uintptr_t) a1 - kc);
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c5, vacc5x0123); c5 += 4;
-        vst1q_f32(c4, vacc4x0123); c4 += 4;
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc5x0123 = vacc5x4567;
-        vacc4x0123 = vacc4x4567;
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc5x01 = vget_low_f32(vacc5x0123);
-      float32x2_t vacc4x01 = vget_low_f32(vacc4x0123);
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c5, vacc5x01); c5 += 2;
-        vst1_f32(c4, vacc4x01); c4 += 2;
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc5x01 = vget_high_f32(vacc5x0123);
-        vacc4x01 = vget_high_f32(vacc4x0123);
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c5, vacc5x01, 0);
-        vst1_lane_f32(c4, vacc4x01, 0);
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/f32-gemm/MRx2-neon-ld64.c.in b/src/f32-gemm/MRx2-neon-ld64.c.in
index 0b94074..718d410 100644
--- a/src/f32-gemm/MRx2-neon-ld64.c.in
+++ b/src/f32-gemm/MRx2-neon-ld64.c.in
@@ -11,7 +11,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm${"inc" if INC else ""}_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}_ld64(
+void xnn_f32_gemm${"inc" if INC else ""}_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/f32-gemm/neon-ld128.c.in b/src/f32-gemm/neon-ld128.c.in
index b6cad3c..ecee857 100644
--- a/src/f32-gemm/neon-ld128.c.in
+++ b/src/f32-gemm/neon-ld128.c.in
@@ -5,6 +5,9 @@
 
 $assert NR % 4 == 0
 $ABC = "0123456789ABCDEFGHIJKLMN"
+$VMULADDQ_F32 = "vfmaq_f32" if FMA else "vmlaq_f32"
+$VMULADDQ_LANE_F32 = "vfmaq_lane_f32" if FMA else "vmlaq_lane_f32"
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -12,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm${"inc" if INC else ""}_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}_ld128(
+void xnn_f32_gemm${"inc" if INC else ""}_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}_${"dup" if DUP else "lane"}_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -81,22 +84,16 @@
         $for N in range(0, NR, 4):
           const float32x4_t vb${ABC[N:N+4]}c${L} = vld1q_f32(w); w += 4;
 
-        $if FMA:
-          #if defined(__aarch64__)
-            $for N in range(0, NR, 4):
-              $for M in range(MR):
-                vacc${M}x${ABC[N:N+4]} = vfmaq_laneq_f32(vacc${M}x${ABC[N:N+4]},   vb${ABC[N:N+4]}c${L}, va${M}, ${L});
-          #else
+        $if DUP:
+          $for M in range(MR):
+            const float32x4_t va${M}c${L} = vdupq_lane_f32(${VGET_PART_F32}(va${M}), ${L % 2});
+          $for N in range(0, NR, 4):
             $for M in range(MR):
-              const float32x4_t va${M}c${L} = vdupq_lane_f32(${VGET_PART_F32}(va${M}), ${L %   2});
-            $for N in range(0, NR, 4):
-              $for M in range(MR):
-                vacc${M}x${ABC[N:N+4]} = vfmaq_f32(vacc${M}x${ABC[N:N+4]},   va${M}c${L}, vb${ABC[N:N+4]}c${L});
-          #endif
+              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}c${L}, vb${ABC[N:N+4]}c${L});
         $else:
           $for N in range(0, NR, 4):
             $for M in range(MR):
-              vacc${M}x${ABC[N:N+4]} = vmlaq_lane_f32(vacc${M}x${ABC[N:N+4]},   vb${ABC[N:N+4]}c${L}, ${VGET_PART_F32}(va${M}), ${L % 2});
+              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_LANE_F32}(vacc${M}x${ABC[N:N+4]}, vb${ABC[N:N+4]}c${L}, ${VGET_PART_F32}(va${M}), ${L % 2});
     }
     if XNN_UNLIKELY(k != 0) {
       do {
@@ -108,10 +105,7 @@
 
         $for N in range(0, NR, 4):
           $for M in range(MR):
-            $if FMA:
-              vacc${M}x${ABC[N:N+4]} = vfmaq_f32(vacc${M}x${ABC[N:N+4]}, va${M},   vb${ABC[N:N+4]});
-            $else:
-              vacc${M}x${ABC[N:N+4]} = vmlaq_f32(vacc${M}x${ABC[N:N+4]}, va${M},   vb${ABC[N:N+4]});
+            vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}, vb${ABC[N:N+4]});
 
         k -= sizeof(float);
       } while (k != 0);
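
Aside: a minimal sketch, not from the patch, of what one unrolled K-step of the ld128 template above resolves to for a single accumulator row in the non-FMA (plain NEON) case. The helper name step_ld128 and the vb[4] array are illustrative only; the template emits this inline across all MR rows and NR columns.

#include <arm_neon.h>

/* One K-step: va holds 4 consecutive A values loaded as a full 128-bit
   register, addressed as low/high halves with two lanes each (L % 2). */
static inline float32x4_t step_ld128(float32x4_t vacc, const float32x4_t vb[4],
                                     float32x4_t va) {
  vacc = vmlaq_lane_f32(vacc, vb[0], vget_low_f32(va), 0);   /* L=0: low half, lane 0 */
  vacc = vmlaq_lane_f32(vacc, vb[1], vget_low_f32(va), 1);   /* L=1: low half, lane 1 */
  vacc = vmlaq_lane_f32(vacc, vb[2], vget_high_f32(va), 0);  /* L=2: high half, lane 0 */
  vacc = vmlaq_lane_f32(vacc, vb[3], vget_high_f32(va), 1);  /* L=3: high half, lane 1 */
  return vacc;
}

With FMA=True the same shape is emitted with vfmaq_lane_f32 via the new ${VMULADDQ_LANE_F32} substitution, which is what the renamed *-neonfma-lane-ld128 kernels now contain unconditionally.
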
diff --git a/src/f32-gemm/neon-ld64.c.in b/src/f32-gemm/neon-ld64.c.in
index 5a89619..76e0d67 100644
--- a/src/f32-gemm/neon-ld64.c.in
+++ b/src/f32-gemm/neon-ld64.c.in
@@ -5,6 +5,9 @@
 
 $assert NR % 4 == 0
 $ABC = "0123456789ABCDEFGHIJKLMN"
+$VMULADDQ_F32 = "vfmaq_f32" if FMA else "vmlaq_f32"
+$VMULADDQ_LANE_F32 = "vfmaq_lane_f32" if FMA else "vmlaq_lane_f32"
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -12,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm${"inc" if INC else ""}_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}_ld64(
+void xnn_f32_gemm${"inc" if INC else ""}_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}_${"dup" if DUP else "lane"}_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -79,22 +82,16 @@
         $for N in range(0, NR, 4):
           const float32x4_t vb${ABC[N:N+4]}c${L} = vld1q_f32(w); w += 4;
 
-        $if FMA:
-          #if defined(__aarch64__)
-            $for N in range(0, NR, 4):
-              $for M in range(MR):
-                vacc${M}x${ABC[N:N+4]} = vfmaq_lane_f32(vacc${M}x${ABC[N:N+4]},   vb${ABC[N:N+4]}c${L}, va${M}, ${L});
-          #else
-            $for M in range(MR):
-              const float32x4_t va${M}c${L} = vdupq_lane_f32(va${M}, ${L});
-            $for N in range(0, NR, 4):
-              $for M in range(MR):
-                vacc${M}x${ABC[N:N+4]} = vfmaq_f32(vacc${M}x${ABC[N:N+4]},   va${M}c${L}, vb${ABC[N:N+4]}c${L});
-          #endif
-        $else:
+        $if DUP:
+          $for M in range(MR):
+            const float32x4_t va${M}c${L} = vdupq_lane_f32(va${M}, ${L});
           $for N in range(0, NR, 4):
             $for M in range(MR):
-              vacc${M}x${ABC[N:N+4]} = vmlaq_lane_f32(vacc${M}x${ABC[N:N+4]},   vb${ABC[N:N+4]}c${L}, va${M}, ${L});
+              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}c${L}, vb${ABC[N:N+4]}c${L});
+        $else:
+          $for N in range(0, NR, 4):
+            $for M in range(MR):
+              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_LANE_F32}(vacc${M}x${ABC[N:N+4]}, vb${ABC[N:N+4]}c${L}, va${M}, ${L});
     }
     if XNN_UNLIKELY(k != 0) {
       $for M in range(MR):
@@ -105,10 +102,7 @@
 
       $for N in range(0, NR, 4):
         $for M in range(MR):
-          $if FMA:
-            vacc${M}x${ABC[N:N+4]} = vfmaq_f32(vacc${M}x${ABC[N:N+4]}, va${M},   vb${ABC[N:N+4]});
-          $else:
-            vacc${M}x${ABC[N:N+4]} = vmlaq_f32(vacc${M}x${ABC[N:N+4]}, va${M},   vb${ABC[N:N+4]});
+          vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}, vb${ABC[N:N+4]});
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     $for N in range(0, NR, 4):
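
Aside: a minimal sketch, not from the patch, of the two code shapes the reworked ld64 template can now emit for one multiply-add — the distinction the new lane/dup kernel-name suffix encodes. Both helper names are illustrative, the lane index 1 stands in for ${L}, and the non-FMA variants substitute vmlaq_* for vfmaq_*.

#include <arm_neon.h>

static inline float32x4_t madd_lane(float32x4_t vacc, float32x4_t vb, float32x2_t va) {
  /* "lane" flavor: the intrinsic broadcasts lane 1 of va itself. */
  return vfmaq_lane_f32(vacc, vb, va, 1);
}

static inline float32x4_t madd_dup(float32x4_t vacc, float32x4_t vb, float32x2_t va) {
  /* "dup" flavor: broadcast the lane first, then use the plain q-form. */
  const float32x4_t va_c1 = vdupq_lane_f32(va, 1);
  return vfmaq_f32(vacc, va_c1, vb);
}

The old template made this choice with a compile-time #if defined(__aarch64__) inside the FMA branch; moving it into the kernel name via the DUP flag means each variant is its own source file, presumably so both can be built and compared directly.
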
diff --git a/src/f32-gemminc/1x8-neon-ld64.c b/src/f32-gemminc/1x8-neon-lane-ld64.c
similarity index 85%
rename from src/f32-gemminc/1x8-neon-ld64.c
rename to src/f32-gemminc/1x8-neon-lane-ld64.c
index a5d04dd..5ddcd4f 100644
--- a/src/f32-gemminc/1x8-neon-ld64.c
+++ b/src/f32-gemminc/1x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_1x8__neon_ld64(
+void xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -51,13 +52,13 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -65,8 +66,8 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
+      vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+      vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemminc/1x8-neon-ld64.c b/src/f32-gemminc/1x8-neonfma-lane-ld64.c
similarity index 85%
copy from src/f32-gemminc/1x8-neon-ld64.c
copy to src/f32-gemminc/1x8-neonfma-lane-ld64.c
index a5d04dd..ff189db 100644
--- a/src/f32-gemminc/1x8-neon-ld64.c
+++ b/src/f32-gemminc/1x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_1x8__neon_ld64(
+void xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -51,13 +52,13 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -65,8 +66,8 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
+      vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+      vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemminc/1x8-neonfma-ld64.c b/src/f32-gemminc/1x8-neonfma-ld64.c
deleted file mode 100644
index d67a419..0000000
--- a/src/f32-gemminc/1x8-neonfma-ld64.c
+++ /dev/null
@@ -1,119 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemminc_ukernel_1x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const float*restrict acc,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 1);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-  assert(acc != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(acc); acc += 4;
-
-    size_t k = kc;
-    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-      const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-      #endif
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-
-      const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-      vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-      vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/f32-gemminc/4x8-neon-ld128.c b/src/f32-gemminc/4x8-neon-lane-ld128.c
similarity index 65%
copy from src/f32-gemminc/4x8-neon-ld128.c
copy to src/f32-gemminc/4x8-neon-lane-ld128.c
index 07d7562..9117e4f 100644
--- a/src/f32-gemminc/4x8-neon-ld128.c
+++ b/src/f32-gemminc/4x8-neon-lane-ld128.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_4x8__neon_ld128(
+void xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -79,50 +80,50 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, vget_low_f32(va0), 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, vget_low_f32(va1), 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, vget_low_f32(va2), 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, vget_low_f32(va3), 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, vget_low_f32(va0), 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, vget_low_f32(va1), 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, vget_low_f32(va2), 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, vget_low_f32(va3), 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0);
 
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, vget_low_f32(va0), 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, vget_low_f32(va1), 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, vget_low_f32(va2), 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, vget_low_f32(va3), 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, vget_low_f32(va0), 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, vget_low_f32(va1), 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, vget_low_f32(va2), 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, vget_low_f32(va3), 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, vget_low_f32(va2), 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, vget_low_f32(va0), 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, vget_low_f32(va1), 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, vget_low_f32(va2), 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1);
 
       const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c2, vget_high_f32(va0), 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c2, vget_high_f32(va1), 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c2, vget_high_f32(va2), 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c2, vget_high_f32(va3), 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c2, vget_high_f32(va0), 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c2, vget_high_f32(va1), 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c2, vget_high_f32(va2), 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c2, vget_high_f32(va3), 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c2, vget_high_f32(va0), 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c2, vget_high_f32(va1), 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c2, vget_high_f32(va2), 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0);
 
       const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c3, vget_high_f32(va0), 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c3, vget_high_f32(va1), 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c3, vget_high_f32(va2), 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c3, vget_high_f32(va3), 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c3, vget_high_f32(va0), 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c3, vget_high_f32(va1), 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c3, vget_high_f32(va2), 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c3, vget_high_f32(va3), 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c3, vget_high_f32(va0), 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c3, vget_high_f32(va1), 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c3, vget_high_f32(va2), 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c3, vget_high_f32(va0), 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c3, vget_high_f32(va1), 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c3, vget_high_f32(va2), 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1);
     }
     if XNN_UNLIKELY(k != 0) {
       do {
@@ -134,14 +135,14 @@
         const float32x4_t vb0123 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-        vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-        vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-        vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-        vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-        vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-        vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-        vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+        vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+        vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
+        vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
+        vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+        vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+        vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
+        vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
+        vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
 
         k -= sizeof(float);
       } while (k != 0);
diff --git a/src/f32-gemminc/4x8-neon-ld64.c b/src/f32-gemminc/4x8-neon-lane-ld64.c
similarity index 77%
copy from src/f32-gemminc/4x8-neon-ld64.c
copy to src/f32-gemminc/4x8-neon-lane-ld64.c
index f7677c3..a301d1e 100644
--- a/src/f32-gemminc/4x8-neon-ld64.c
+++ b/src/f32-gemminc/4x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_4x8__neon_ld64(
+void xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -78,25 +79,25 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -107,14 +108,14 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+      vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+      vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemminc/4x8-neon-ld128.c b/src/f32-gemminc/4x8-neonfma-lane-ld128.c
similarity index 65%
rename from src/f32-gemminc/4x8-neon-ld128.c
rename to src/f32-gemminc/4x8-neonfma-lane-ld128.c
index 07d7562..b73a0fa 100644
--- a/src/f32-gemminc/4x8-neon-ld128.c
+++ b/src/f32-gemminc/4x8-neonfma-lane-ld128.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_4x8__neon_ld128(
+void xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -79,50 +80,50 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, vget_low_f32(va0), 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, vget_low_f32(va1), 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, vget_low_f32(va2), 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, vget_low_f32(va3), 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, vget_low_f32(va0), 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, vget_low_f32(va1), 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, vget_low_f32(va2), 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, vget_low_f32(va3), 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0);
 
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, vget_low_f32(va0), 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, vget_low_f32(va1), 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, vget_low_f32(va2), 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, vget_low_f32(va3), 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, vget_low_f32(va0), 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, vget_low_f32(va1), 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, vget_low_f32(va2), 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, vget_low_f32(va3), 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, vget_low_f32(va2), 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, vget_low_f32(va0), 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, vget_low_f32(va1), 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, vget_low_f32(va2), 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1);
 
       const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c2, vget_high_f32(va0), 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c2, vget_high_f32(va1), 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c2, vget_high_f32(va2), 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c2, vget_high_f32(va3), 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c2, vget_high_f32(va0), 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c2, vget_high_f32(va1), 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c2, vget_high_f32(va2), 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c2, vget_high_f32(va3), 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c2, vget_high_f32(va0), 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c2, vget_high_f32(va1), 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c2, vget_high_f32(va2), 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0);
 
       const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c3, vget_high_f32(va0), 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c3, vget_high_f32(va1), 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c3, vget_high_f32(va2), 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c3, vget_high_f32(va3), 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c3, vget_high_f32(va0), 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c3, vget_high_f32(va1), 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c3, vget_high_f32(va2), 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c3, vget_high_f32(va3), 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c3, vget_high_f32(va0), 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c3, vget_high_f32(va1), 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c3, vget_high_f32(va2), 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c3, vget_high_f32(va0), 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c3, vget_high_f32(va1), 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c3, vget_high_f32(va2), 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1);
     }
     if XNN_UNLIKELY(k != 0) {
       do {
@@ -134,14 +135,14 @@
         const float32x4_t vb0123 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-        vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-        vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-        vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-        vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-        vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-        vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-        vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+        vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+        vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+        vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+        vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+        vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+        vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+        vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
 
         k -= sizeof(float);
       } while (k != 0);
diff --git a/src/f32-gemminc/4x8-neon-ld64.c b/src/f32-gemminc/4x8-neonfma-lane-ld64.c
similarity index 77%
rename from src/f32-gemminc/4x8-neon-ld64.c
rename to src/f32-gemminc/4x8-neonfma-lane-ld64.c
index f7677c3..6e15065 100644
--- a/src/f32-gemminc/4x8-neon-ld64.c
+++ b/src/f32-gemminc/4x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_4x8__neon_ld64(
+void xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -78,25 +79,25 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -107,14 +108,14 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+      vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+      vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemminc/4x8-neonfma-ld128.c b/src/f32-gemminc/4x8-neonfma-ld128.c
deleted file mode 100644
index 49c074c..0000000
--- a/src/f32-gemminc/4x8-neonfma-ld128.c
+++ /dev/null
@@ -1,287 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld128.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemminc_ukernel_4x8__neonfma_ld128(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const float*restrict acc,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 4);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-  assert(acc != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc1x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc1x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc2x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc2x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc3x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc3x4567 = vld1q_f32(acc); acc += 4;
-
-    size_t k = kc;
-    for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
-      const float32x4_t va0 = vld1q_f32(a0); a0 += 4;
-      const float32x4_t va1 = vld1q_f32(a1); a1 += 4;
-      const float32x4_t va2 = vld1q_f32(a2); a2 += 4;
-      const float32x4_t va3 = vld1q_f32(a3); a3 += 4;
-
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c0, va1, 0);
-        vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c0, va2, 0);
-        vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c0, va3, 0);
-        vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c0, va0, 0);
-        vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c0, va1, 0);
-        vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c0, va2, 0);
-        vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c0, va3, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0);
-        const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0);
-        const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0);
-        const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c0, vb0123c0);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c0, vb0123c0);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c0, vb4567c0);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c0, vb4567c0);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c0, vb4567c0);
-      #endif
-
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c1, va1, 1);
-        vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c1, va2, 1);
-        vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c1, va3, 1);
-        vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c1, va0, 1);
-        vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c1, va1, 1);
-        vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c1, va2, 1);
-        vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c1, va3, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1);
-        const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1);
-        const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1);
-        const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c1, vb0123c1);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c1, vb0123c1);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c1, vb4567c1);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c1, vb4567c1);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c1, vb4567c1);
-      #endif
-
-      const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c2, va0, 2);
-        vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c2, va1, 2);
-        vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c2, va2, 2);
-        vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c2, va3, 2);
-        vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c2, va0, 2);
-        vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c2, va1, 2);
-        vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c2, va2, 2);
-        vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c2, va3, 2);
-      #else
-        const float32x4_t va0c2 = vdupq_lane_f32(vget_high_f32(va0), 0);
-        const float32x4_t va1c2 = vdupq_lane_f32(vget_high_f32(va1), 0);
-        const float32x4_t va2c2 = vdupq_lane_f32(vget_high_f32(va2), 0);
-        const float32x4_t va3c2 = vdupq_lane_f32(vget_high_f32(va3), 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c2, vb0123c2);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c2, vb0123c2);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c2, vb0123c2);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c2, vb0123c2);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c2, vb4567c2);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c2, vb4567c2);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c2, vb4567c2);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c2, vb4567c2);
-      #endif
-
-      const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c3, va0, 3);
-        vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c3, va1, 3);
-        vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c3, va2, 3);
-        vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c3, va3, 3);
-        vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c3, va0, 3);
-        vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c3, va1, 3);
-        vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c3, va2, 3);
-        vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c3, va3, 3);
-      #else
-        const float32x4_t va0c3 = vdupq_lane_f32(vget_high_f32(va0), 1);
-        const float32x4_t va1c3 = vdupq_lane_f32(vget_high_f32(va1), 1);
-        const float32x4_t va2c3 = vdupq_lane_f32(vget_high_f32(va2), 1);
-        const float32x4_t va3c3 = vdupq_lane_f32(vget_high_f32(va3), 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c3, vb0123c3);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c3, vb0123c3);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c3, vb0123c3);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c3, vb0123c3);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c3, vb4567c3);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c3, vb4567c3);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c3, vb4567c3);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c3, vb4567c3);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      do {
-        const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-        const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
-        const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
-        const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;
-
-        const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-        vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-        vacc1x0123 = vfmaq_f32(vacc1x0123, va1,   vb0123);
-        vacc2x0123 = vfmaq_f32(vacc2x0123, va2,   vb0123);
-        vacc3x0123 = vfmaq_f32(vacc3x0123, va3,   vb0123);
-        vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-        vacc1x4567 = vfmaq_f32(vacc1x4567, va1,   vb4567);
-        vacc2x4567 = vfmaq_f32(vacc2x4567, va2,   vb4567);
-        vacc3x4567 = vfmaq_f32(vacc3x4567, va3,   vb4567);
-
-        k -= sizeof(float);
-      } while (k != 0);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a3 = (const float*) ((uintptr_t) a3 - kc);
-      a2 = (const float*) ((uintptr_t) a2 - kc);
-      a1 = (const float*) ((uintptr_t) a1 - kc);
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
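
The deleted file above picked between two spellings of the same per-lane update: `vfmaq_laneq_f32` (lane taken from a 128-bit vector) exists only on AArch64, so the 32-bit path first broadcast the lane with `vdupq_lane_f32` and then used plain `vfmaq_f32`. A minimal sketch of that pattern, assuming a hypothetical helper name:

```c
#include <arm_neon.h>

/* Sketch of the #if defined(__aarch64__) split from the deleted kernel:
 * both branches compute acc + b * a[2], but the FMA form gives a single
 * rounding, and the laneq intrinsic is A64-only. */
static float32x4_t fma_lane_c2(float32x4_t acc, float32x4_t b, float32x4_t a) {
#if defined(__aarch64__)
  return vfmaq_laneq_f32(acc, b, a, 2);
#else
  const float32x4_t a2 = vdupq_lane_f32(vget_high_f32(a), 0);  /* broadcast a[2] */
  return vfmaq_f32(acc, a2, b);
#endif
}
```
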
diff --git a/src/f32-gemminc/4x8-neonfma-ld64.c b/src/f32-gemminc/4x8-neonfma-ld64.c
deleted file mode 100644
index f0eefcf..0000000
--- a/src/f32-gemminc/4x8-neonfma-ld64.c
+++ /dev/null
@@ -1,227 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemminc_ukernel_4x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const float*restrict acc,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 4);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-  assert(acc != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc1x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc1x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc2x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc2x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc3x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc3x4567 = vld1q_f32(acc); acc += 4;
-
-    size_t k = kc;
-    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-      const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-      const float32x2_t va1 = vld1_f32(a1); a1 += 2;
-      const float32x2_t va2 = vld1_f32(a2); a2 += 2;
-      const float32x2_t va3 = vld1_f32(a3); a3 += 2;
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-        const float32x4_t va1c0 = vdupq_lane_f32(va1, 0);
-        const float32x4_t va2c0 = vdupq_lane_f32(va2, 0);
-        const float32x4_t va3c0 = vdupq_lane_f32(va3, 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c0, vb0123c0);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c0, vb0123c0);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c0, vb4567c0);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c0, vb4567c0);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c0, vb4567c0);
-      #endif
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-        const float32x4_t va1c1 = vdupq_lane_f32(va1, 1);
-        const float32x4_t va2c1 = vdupq_lane_f32(va2, 1);
-        const float32x4_t va3c1 = vdupq_lane_f32(va3, 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c1, vb0123c1);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c1, vb0123c1);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c1, vb4567c1);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c1, vb4567c1);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c1, vb4567c1);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-      const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
-      const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
-      const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;
-
-      const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-      vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vfmaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vfmaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vfmaq_f32(vacc3x0123, va3,   vb0123);
-      vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vfmaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vfmaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vfmaq_f32(vacc3x4567, va3,   vb4567);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a3 = (const float*) ((uintptr_t) a3 - kc);
-      a2 = (const float*) ((uintptr_t) a2 - kc);
-      a1 = (const float*) ((uintptr_t) a1 - kc);
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/f32-gemminc/5x8-neon-ld64.c b/src/f32-gemminc/5x8-neon-lane-ld64.c
similarity index 76%
rename from src/f32-gemminc/5x8-neon-ld64.c
rename to src/f32-gemminc/5x8-neon-lane-ld64.c
index f2fe644..cb07b18 100644
--- a/src/f32-gemminc/5x8-neon-ld64.c
+++ b/src/f32-gemminc/5x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_5x8__neon_ld64(
+void xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -87,29 +88,29 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+      vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+      vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -121,16 +122,16 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vmlaq_f32(vacc4x0123, va4,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vmlaq_f32(vacc4x4567, va4,   vb4567);
+      vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+      vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123);
+      vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
+      vacc4x4567 = vmlaq_f32(vacc4x4567, va4, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemminc/5x8-neon-ld64.c b/src/f32-gemminc/5x8-neonfma-lane-ld64.c
similarity index 76%
copy from src/f32-gemminc/5x8-neon-ld64.c
copy to src/f32-gemminc/5x8-neonfma-lane-ld64.c
index f2fe644..1efd94c 100644
--- a/src/f32-gemminc/5x8-neon-ld64.c
+++ b/src/f32-gemminc/5x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_5x8__neon_ld64(
+void xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -87,29 +88,29 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+      vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+      vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -121,16 +122,16 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vmlaq_f32(vacc4x0123, va4,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vmlaq_f32(vacc4x4567, va4,   vb4567);
+      vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+      vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123);
+      vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
+      vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemminc/5x8-neonfma-ld64.c b/src/f32-gemminc/5x8-neonfma-ld64.c
deleted file mode 100644
index 6bbada0..0000000
--- a/src/f32-gemminc/5x8-neonfma-ld64.c
+++ /dev/null
@@ -1,263 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemminc_ukernel_5x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const float*restrict acc,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 5);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-  assert(acc != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
-  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 4) {
-    a4 = a3;
-    c4 = c3;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc1x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc1x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc2x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc2x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc3x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc3x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc4x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc4x4567 = vld1q_f32(acc); acc += 4;
-
-    size_t k = kc;
-    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-      const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-      const float32x2_t va1 = vld1_f32(a1); a1 += 2;
-      const float32x2_t va2 = vld1_f32(a2); a2 += 2;
-      const float32x2_t va3 = vld1_f32(a3); a3 += 2;
-      const float32x2_t va4 = vld1_f32(a4); a4 += 2;
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-        vacc4x0123 = vfmaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-        vacc4x4567 = vfmaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-        const float32x4_t va1c0 = vdupq_lane_f32(va1, 0);
-        const float32x4_t va2c0 = vdupq_lane_f32(va2, 0);
-        const float32x4_t va3c0 = vdupq_lane_f32(va3, 0);
-        const float32x4_t va4c0 = vdupq_lane_f32(va4, 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c0, vb0123c0);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c0, vb0123c0);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c0, vb0123c0);
-        vacc4x0123 = vfmaq_f32(vacc4x0123,   va4c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c0, vb4567c0);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c0, vb4567c0);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c0, vb4567c0);
-        vacc4x4567 = vfmaq_f32(vacc4x4567,   va4c0, vb4567c0);
-      #endif
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-        vacc4x0123 = vfmaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-        vacc4x4567 = vfmaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-        const float32x4_t va1c1 = vdupq_lane_f32(va1, 1);
-        const float32x4_t va2c1 = vdupq_lane_f32(va2, 1);
-        const float32x4_t va3c1 = vdupq_lane_f32(va3, 1);
-        const float32x4_t va4c1 = vdupq_lane_f32(va4, 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c1, vb0123c1);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c1, vb0123c1);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c1, vb0123c1);
-        vacc4x0123 = vfmaq_f32(vacc4x0123,   va4c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c1, vb4567c1);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c1, vb4567c1);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c1, vb4567c1);
-        vacc4x4567 = vfmaq_f32(vacc4x4567,   va4c1, vb4567c1);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-      const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
-      const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
-      const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;
-      const float32x4_t va4 = vld1q_dup_f32(a4); a4 += 1;
-
-      const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-      vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vfmaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vfmaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vfmaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vfmaq_f32(vacc4x0123, va4,   vb0123);
-      vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vfmaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vfmaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vfmaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vfmaq_f32(vacc4x4567, va4,   vb4567);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc4x0123 = vminq_f32(vacc4x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-    vacc4x4567 = vminq_f32(vacc4x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc4x0123 = vmaxq_f32(vacc4x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-    vacc4x4567 = vmaxq_f32(vacc4x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c4, vacc4x0123);
-      vst1q_f32(c4 + 4, vacc4x4567);
-      c4 = (float*) ((uintptr_t) c4 + cn_stride);
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a4 = (const float*) ((uintptr_t) a4 - kc);
-      a3 = (const float*) ((uintptr_t) a3 - kc);
-      a2 = (const float*) ((uintptr_t) a2 - kc);
-      a1 = (const float*) ((uintptr_t) a1 - kc);
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c4, vacc4x0123); c4 += 4;
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc4x0123 = vacc4x4567;
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc4x01 = vget_low_f32(vacc4x0123);
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c4, vacc4x01); c4 += 2;
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc4x01 = vget_high_f32(vacc4x0123);
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c4, vacc4x01, 0);
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
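
Note: the deleted kernel above guarded its inner loop on __aarch64__: the AArch64 path broadcasts a lane inside the fused multiply-add, while the fallback broadcasts first and then fuses. A minimal standalone sketch of that pattern (the helper name is illustrative, not part of XNNPACK):

#include <arm_neon.h>

// One lane-broadcast multiply-add round, as in the deleted #if/#else block.
static float32x4_t lane_fma(float32x4_t vacc, float32x4_t vb, float32x2_t va) {
#if defined(__aarch64__)
  // vfmaq_lane_f32 fuses the lane broadcast into the FMA instruction.
  return vfmaq_lane_f32(vacc, vb, va, 0);
#else
  // Without the lane form, broadcast lane 0 explicitly, then fuse.
  const float32x4_t va_c0 = vdupq_lane_f32(va, 0);
  return vfmaq_f32(vacc, va_c0, vb);
#endif
}

The renames in this change make that choice explicit in the kernel name instead: the new -lane- variants use the lane form unconditionally.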
diff --git a/src/f32-gemminc/6x8-neon-ld64.c b/src/f32-gemminc/6x8-neon-lane-ld64.c
similarity index 76%
copy from src/f32-gemminc/6x8-neon-ld64.c
copy to src/f32-gemminc/6x8-neon-lane-ld64.c
index f657dc4..0dad0fd 100644
--- a/src/f32-gemminc/6x8-neon-ld64.c
+++ b/src/f32-gemminc/6x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_6x8__neon_ld64(
+void xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -96,33 +97,33 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-      vacc5x0123 = vmlaq_lane_f32(vacc5x0123,   vb0123c0, va5, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
-      vacc5x4567 = vmlaq_lane_f32(vacc5x4567,   vb4567c0, va5, 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
+      vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, va5, 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+      vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
+      vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-      vacc5x0123 = vmlaq_lane_f32(vacc5x0123,   vb0123c1, va5, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
-      vacc5x4567 = vmlaq_lane_f32(vacc5x4567,   vb4567c1, va5, 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
+      vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, va5, 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+      vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
+      vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -135,18 +136,18 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vmlaq_f32(vacc4x0123, va4,   vb0123);
-      vacc5x0123 = vmlaq_f32(vacc5x0123, va5,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vmlaq_f32(vacc4x4567, va4,   vb4567);
-      vacc5x4567 = vmlaq_f32(vacc5x4567, va5,   vb4567);
+      vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+      vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123);
+      vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123);
+      vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
+      vacc4x4567 = vmlaq_f32(vacc4x4567, va4, vb4567);
+      vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
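
Note: this copy keeps the plain-NEON vmlaq intrinsics and only normalizes spacing. The ld64 structure it preserves loads two k-values per row into a 64-bit register and spends one lane on each round; a hedged single-row sketch (helper name illustrative):

#include <arm_neon.h>

// One ld64 main-loop step: two k-values, one lane per multiply-accumulate.
static float32x4_t step_ld64(float32x4_t vacc, const float* a, const float* w) {
  const float32x2_t va = vld1_f32(a);          // values for k and k+1
  const float32x4_t vb_c0 = vld1q_f32(w);      // weights for k
  vacc = vmlaq_lane_f32(vacc, vb_c0, va, 0);
  const float32x4_t vb_c1 = vld1q_f32(w + 4);  // weights for k+1
  vacc = vmlaq_lane_f32(vacc, vb_c1, va, 1);
  return vacc;
}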
diff --git a/src/f32-gemminc/6x8-neon-ld64.c b/src/f32-gemminc/6x8-neonfma-lane-ld64.c
similarity index 76%
rename from src/f32-gemminc/6x8-neon-ld64.c
rename to src/f32-gemminc/6x8-neonfma-lane-ld64.c
index f657dc4..7c2fe58 100644
--- a/src/f32-gemminc/6x8-neon-ld64.c
+++ b/src/f32-gemminc/6x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_6x8__neon_ld64(
+void xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -96,33 +97,33 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-      vacc5x0123 = vmlaq_lane_f32(vacc5x0123,   vb0123c0, va5, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
-      vacc5x4567 = vmlaq_lane_f32(vacc5x4567,   vb4567c0, va5, 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
+      vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, va5, 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+      vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
+      vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-      vacc5x0123 = vmlaq_lane_f32(vacc5x0123,   vb0123c1, va5, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
-      vacc5x4567 = vmlaq_lane_f32(vacc5x4567,   vb4567c1, va5, 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
+      vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, va5, 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+      vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
+      vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -135,18 +136,18 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vmlaq_f32(vacc4x0123, va4,   vb0123);
-      vacc5x0123 = vmlaq_f32(vacc5x0123, va5,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vmlaq_f32(vacc4x4567, va4,   vb4567);
-      vacc5x4567 = vmlaq_f32(vacc5x4567, va5,   vb4567);
+      vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+      vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123);
+      vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123);
+      vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
+      vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567);
+      vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
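
Note: the only substantive change in this rename is vmlaq_* -> vfmaq_*. The distinction, sketched under the assumption of a NEON target with FMA support (function names are illustrative):

#include <arm_neon.h>

// Plain NEON multiply-accumulate: the product is rounded before the add.
static float32x4_t mul_add(float32x4_t vacc, float32x4_t va, float32x4_t vb) {
  return vmlaq_f32(vacc, va, vb);
}

// NEON with FMA: one fused operation, a single rounding step.
static float32x4_t mul_add_fused(float32x4_t vacc, float32x4_t va, float32x4_t vb) {
  return vfmaq_f32(vacc, va, vb);
}

Because the fused form rounds once, the two kernels can produce bit-different (though both valid) results, which is why they are kept as separate named variants.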
diff --git a/src/f32-gemminc/6x8-neonfma-ld64.c b/src/f32-gemminc/6x8-neonfma-ld64.c
deleted file mode 100644
index 22519ef..0000000
--- a/src/f32-gemminc/6x8-neonfma-ld64.c
+++ /dev/null
@@ -1,299 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemminc_ukernel_6x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const float*restrict acc,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 6);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-  assert(acc != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
-  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 4) {
-    a4 = a3;
-    c4 = c3;
-  }
-  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
-  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 6) {
-    a5 = a4;
-    c5 = c4;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc1x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc1x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc2x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc2x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc3x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc3x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc4x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc4x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc5x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc5x4567 = vld1q_f32(acc); acc += 4;
-
-    size_t k = kc;
-    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-      const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-      const float32x2_t va1 = vld1_f32(a1); a1 += 2;
-      const float32x2_t va2 = vld1_f32(a2); a2 += 2;
-      const float32x2_t va3 = vld1_f32(a3); a3 += 2;
-      const float32x2_t va4 = vld1_f32(a4); a4 += 2;
-      const float32x2_t va5 = vld1_f32(a5); a5 += 2;
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-        vacc4x0123 = vfmaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-        vacc5x0123 = vfmaq_lane_f32(vacc5x0123,   vb0123c0, va5, 0);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-        vacc4x4567 = vfmaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
-        vacc5x4567 = vfmaq_lane_f32(vacc5x4567,   vb4567c0, va5, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-        const float32x4_t va1c0 = vdupq_lane_f32(va1, 0);
-        const float32x4_t va2c0 = vdupq_lane_f32(va2, 0);
-        const float32x4_t va3c0 = vdupq_lane_f32(va3, 0);
-        const float32x4_t va4c0 = vdupq_lane_f32(va4, 0);
-        const float32x4_t va5c0 = vdupq_lane_f32(va5, 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c0, vb0123c0);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c0, vb0123c0);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c0, vb0123c0);
-        vacc4x0123 = vfmaq_f32(vacc4x0123,   va4c0, vb0123c0);
-        vacc5x0123 = vfmaq_f32(vacc5x0123,   va5c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c0, vb4567c0);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c0, vb4567c0);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c0, vb4567c0);
-        vacc4x4567 = vfmaq_f32(vacc4x4567,   va4c0, vb4567c0);
-        vacc5x4567 = vfmaq_f32(vacc5x4567,   va5c0, vb4567c0);
-      #endif
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-        vacc4x0123 = vfmaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-        vacc5x0123 = vfmaq_lane_f32(vacc5x0123,   vb0123c1, va5, 1);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-        vacc4x4567 = vfmaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
-        vacc5x4567 = vfmaq_lane_f32(vacc5x4567,   vb4567c1, va5, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-        const float32x4_t va1c1 = vdupq_lane_f32(va1, 1);
-        const float32x4_t va2c1 = vdupq_lane_f32(va2, 1);
-        const float32x4_t va3c1 = vdupq_lane_f32(va3, 1);
-        const float32x4_t va4c1 = vdupq_lane_f32(va4, 1);
-        const float32x4_t va5c1 = vdupq_lane_f32(va5, 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c1, vb0123c1);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c1, vb0123c1);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c1, vb0123c1);
-        vacc4x0123 = vfmaq_f32(vacc4x0123,   va4c1, vb0123c1);
-        vacc5x0123 = vfmaq_f32(vacc5x0123,   va5c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c1, vb4567c1);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c1, vb4567c1);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c1, vb4567c1);
-        vacc4x4567 = vfmaq_f32(vacc4x4567,   va4c1, vb4567c1);
-        vacc5x4567 = vfmaq_f32(vacc5x4567,   va5c1, vb4567c1);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-      const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
-      const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
-      const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;
-      const float32x4_t va4 = vld1q_dup_f32(a4); a4 += 1;
-      const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1;
-
-      const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-      vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vfmaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vfmaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vfmaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vfmaq_f32(vacc4x0123, va4,   vb0123);
-      vacc5x0123 = vfmaq_f32(vacc5x0123, va5,   vb0123);
-      vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vfmaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vfmaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vfmaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vfmaq_f32(vacc4x4567, va4,   vb4567);
-      vacc5x4567 = vfmaq_f32(vacc5x4567, va5,   vb4567);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc4x0123 = vminq_f32(vacc4x0123, vmax);
-    vacc5x0123 = vminq_f32(vacc5x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-    vacc4x4567 = vminq_f32(vacc4x4567, vmax);
-    vacc5x4567 = vminq_f32(vacc5x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc4x0123 = vmaxq_f32(vacc4x0123, vmin);
-    vacc5x0123 = vmaxq_f32(vacc5x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-    vacc4x4567 = vmaxq_f32(vacc4x4567, vmin);
-    vacc5x4567 = vmaxq_f32(vacc5x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c5, vacc5x0123);
-      vst1q_f32(c5 + 4, vacc5x4567);
-      c5 = (float*) ((uintptr_t) c5 + cn_stride);
-      vst1q_f32(c4, vacc4x0123);
-      vst1q_f32(c4 + 4, vacc4x4567);
-      c4 = (float*) ((uintptr_t) c4 + cn_stride);
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a5 = (const float*) ((uintptr_t) a5 - kc);
-      a4 = (const float*) ((uintptr_t) a4 - kc);
-      a3 = (const float*) ((uintptr_t) a3 - kc);
-      a2 = (const float*) ((uintptr_t) a2 - kc);
-      a1 = (const float*) ((uintptr_t) a1 - kc);
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c5, vacc5x0123); c5 += 4;
-        vst1q_f32(c4, vacc4x0123); c4 += 4;
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc5x0123 = vacc5x4567;
-        vacc4x0123 = vacc4x4567;
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc5x01 = vget_low_f32(vacc5x0123);
-      float32x2_t vacc4x01 = vget_low_f32(vacc4x0123);
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c5, vacc5x01); c5 += 2;
-        vst1_f32(c4, vacc4x01); c4 += 2;
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc5x01 = vget_high_f32(vacc5x0123);
-        vacc4x01 = vget_high_f32(vacc4x0123);
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c5, vacc5x01, 0);
-        vst1_lane_f32(c4, vacc4x01, 0);
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
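
Note: every kernel in this family shares the tail visible in the deletion above for the final columns of a row: peel off 4, then 2, then 1 remaining outputs. A compact single-row sketch (helper name illustrative):

#include <arm_neon.h>
#include <stddef.h>

// Store the last nc (< 8) outputs of one row: peel 4, then 2, then 1.
static void store_tail(float* c, float32x4_t vacc0123, float32x4_t vacc4567,
                       size_t nc) {
  if (nc & 4) {
    vst1q_f32(c, vacc0123); c += 4;
    vacc0123 = vacc4567;  // shift the upper half down for the next peels
  }
  float32x2_t vacc01 = vget_low_f32(vacc0123);
  if (nc & 2) {
    vst1_f32(c, vacc01); c += 2;
    vacc01 = vget_high_f32(vacc0123);
  }
  if (nc & 1) {
    vst1_lane_f32(c, vacc01, 0);
  }
}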
diff --git a/src/f32-igemm/1x8-neon-ld64.c b/src/f32-igemm/1x8-neon-lane-ld64.c
similarity index 98%
rename from src/f32-igemm/1x8-neon-ld64.c
rename to src/f32-igemm/1x8-neon-lane-ld64.c
index 72768ed..157b73b 100644
--- a/src/f32-igemm/1x8-neon-ld64.c
+++ b/src/f32-igemm/1x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_1x8__neon_ld64(
+void xnn_f32_igemm_ukernel_1x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -69,6 +70,7 @@
 
         vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
         vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+
       }
       if XNN_UNLIKELY(k != 0) {
         const float32x4_t va0 = vld1q_dup_f32(a0);
diff --git a/src/f32-igemm/1x8-neon-ld64.c b/src/f32-igemm/1x8-neonfma-lane-ld64.c
similarity index 86%
copy from src/f32-igemm/1x8-neon-ld64.c
copy to src/f32-igemm/1x8-neonfma-lane-ld64.c
index 72768ed..78095b9 100644
--- a/src/f32-igemm/1x8-neon-ld64.c
+++ b/src/f32-igemm/1x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_1x8__neon_ld64(
+void xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -62,13 +63,14 @@
         const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
         const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+
       }
       if XNN_UNLIKELY(k != 0) {
         const float32x4_t va0 = vld1q_dup_f32(a0);
@@ -76,8 +78,8 @@
         const float32x4_t vb0123 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
-        vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+        vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
       }
       p -= 1 * sizeof(void*);
     } while (p != 0);
diff --git a/src/f32-igemm/1x8-neonfma-ld64.c b/src/f32-igemm/1x8-neonfma-ld64.c
deleted file mode 100644
index b10885c..0000000
--- a/src/f32-igemm/1x8-neonfma-ld64.c
+++ /dev/null
@@ -1,131 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-igemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/igemm.h>
-
-
-void xnn_f32_igemm_ukernel_1x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    size_t ks,
-    const float**restrict a,
-    const float*restrict w,
-    float*restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    size_t a_offset,
-    const float* zero,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 1);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(ks != 0);
-  assert(ks % (1 * sizeof(void*)) == 0);
-  assert(a_offset % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  float* c0 = c;
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
-
-    size_t p = ks;
-    do {
-      const float* restrict a0 = a[0];
-      assert(a0 != NULL);
-      if XNN_UNPREDICTABLE(a0 != zero) {
-        a0 = (const float*) ((uintptr_t) a0 + a_offset);
-      }
-      a += 1;
-
-      size_t k = kc;
-      for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-        const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-
-        const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
-          vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
-        #else
-          const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-          vacc0x0123 = vfmaq_f32(vacc0x0123, va0c0, vb0123c0);
-          vacc0x4567 = vfmaq_f32(vacc0x4567, va0c0, vb4567c0);
-        #endif
-        const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
-          vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
-        #else
-          const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-          vacc0x0123 = vfmaq_f32(vacc0x0123, va0c1, vb0123c1);
-          vacc0x4567 = vfmaq_f32(vacc0x4567, va0c1, vb4567c1);
-        #endif
-      }
-      if XNN_UNLIKELY(k != 0) {
-        const float32x4_t va0 = vld1q_dup_f32(a0);
-
-        const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
-        vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
-      }
-      p -= 1 * sizeof(void*);
-    } while (p != 0);
-
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a = (const float**restrict) ((uintptr_t) a - ks);
-      nc -= 8;
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
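
Note: the igemm kernels renamed below all resolve their row pointers through an indirection buffer, as in the deleted prologue above: real rows are rebased by a_offset, while rows aliasing the shared zero buffer are used as-is. A minimal sketch of that resolution (helper name illustrative):

#include <stddef.h>
#include <stdint.h>

// Resolve one row pointer from the igemm indirection buffer.
static const float* resolve_row(const float* a0, const float* zero,
                                size_t a_offset) {
  if (a0 != zero) {
    a0 = (const float*) ((uintptr_t) a0 + a_offset);
  }
  return a0;
}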
diff --git a/src/f32-igemm/4x2-neon-ld64.c b/src/f32-igemm/4x2-neon-lane-ld64.c
similarity index 98%
rename from src/f32-igemm/4x2-neon-ld64.c
rename to src/f32-igemm/4x2-neon-lane-ld64.c
index f6ca89c..cbd75b8 100644
--- a/src/f32-igemm/4x2-neon-ld64.c
+++ b/src/f32-igemm/4x2-neon-lane-ld64.c
@@ -14,7 +14,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_4x2__neon_ld64(
+void xnn_f32_igemm_ukernel_4x2__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/f32-igemm/4x2-neonfma-ld64.c b/src/f32-igemm/4x2-neonfma-lane-ld64.c
similarity index 98%
rename from src/f32-igemm/4x2-neonfma-ld64.c
rename to src/f32-igemm/4x2-neonfma-lane-ld64.c
index c675e2e..080f4b5 100644
--- a/src/f32-igemm/4x2-neonfma-ld64.c
+++ b/src/f32-igemm/4x2-neonfma-lane-ld64.c
@@ -14,7 +14,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_4x2__neonfma_ld64(
+void xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/f32-igemm/4x4-neon-ld64.c b/src/f32-igemm/4x4-neon-lane-ld64.c
similarity index 98%
rename from src/f32-igemm/4x4-neon-ld64.c
rename to src/f32-igemm/4x4-neon-lane-ld64.c
index 71b37ca..b379470 100644
--- a/src/f32-igemm/4x4-neon-ld64.c
+++ b/src/f32-igemm/4x4-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_4x4__neon_ld64(
+void xnn_f32_igemm_ukernel_4x4__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -103,6 +104,7 @@
         vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
         vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
         vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+
       }
       if XNN_UNLIKELY(k != 0) {
         const float32x4_t va0 = vld1q_dup_f32(a0);
diff --git a/src/f32-igemm/4x4-neon-ld64.c b/src/f32-igemm/4x4-neonfma-lane-ld64.c
similarity index 85%
copy from src/f32-igemm/4x4-neon-ld64.c
copy to src/f32-igemm/4x4-neonfma-lane-ld64.c
index 71b37ca..10992a5 100644
--- a/src/f32-igemm/4x4-neon-ld64.c
+++ b/src/f32-igemm/4x4-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_4x4__neon_ld64(
+void xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -93,16 +94,17 @@
 
         const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
         const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+
       }
       if XNN_UNLIKELY(k != 0) {
         const float32x4_t va0 = vld1q_dup_f32(a0);
@@ -112,10 +114,10 @@
 
         const float32x4_t vb0123 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
-        vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
-        vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
-        vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+        vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+        vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+        vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
       }
       p -= 4 * sizeof(void*);
     } while (p != 0);
diff --git a/src/f32-igemm/4x4-neonfma-ld64.c b/src/f32-igemm/4x4-neonfma-ld64.c
deleted file mode 100644
index e1649f3..0000000
--- a/src/f32-igemm/4x4-neonfma-ld64.c
+++ /dev/null
@@ -1,195 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-igemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/igemm.h>
-
-
-void xnn_f32_igemm_ukernel_4x4__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    size_t ks,
-    const float**restrict a,
-    const float*restrict w,
-    float*restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    size_t a_offset,
-    const float* zero,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 4);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(ks != 0);
-  assert(ks % (4 * sizeof(void*)) == 0);
-  assert(a_offset % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  float* c0 = c;
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    c1 = c0;
-  }
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    c2 = c1;
-  }
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 4) {
-    c3 = c2;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc1x0123 = vacc0x0123;
-    float32x4_t vacc2x0123 = vacc0x0123;
-    float32x4_t vacc3x0123 = vacc0x0123;
-
-    size_t p = ks;
-    do {
-      const float* restrict a0 = a[0];
-      assert(a0 != NULL);
-      if XNN_UNPREDICTABLE(a0 != zero) {
-        a0 = (const float*) ((uintptr_t) a0 + a_offset);
-      }
-      const float* restrict a1 = a[1];
-      assert(a1 != NULL);
-      if XNN_UNPREDICTABLE(a1 != zero) {
-        a1 = (const float*) ((uintptr_t) a1 + a_offset);
-      }
-      const float* restrict a2 = a[2];
-      assert(a2 != NULL);
-      if XNN_UNPREDICTABLE(a2 != zero) {
-        a2 = (const float*) ((uintptr_t) a2 + a_offset);
-      }
-      const float* restrict a3 = a[3];
-      assert(a3 != NULL);
-      if XNN_UNPREDICTABLE(a3 != zero) {
-        a3 = (const float*) ((uintptr_t) a3 + a_offset);
-      }
-      a += 4;
-
-      size_t k = kc;
-      for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-        const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-        const float32x2_t va1 = vld1_f32(a1); a1 += 2;
-        const float32x2_t va2 = vld1_f32(a2); a2 += 2;
-        const float32x2_t va3 = vld1_f32(a3); a3 += 2;
-
-        const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
-          vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
-          vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
-          vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
-        #else
-          const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-          const float32x4_t va1c0 = vdupq_lane_f32(va1, 0);
-          const float32x4_t va2c0 = vdupq_lane_f32(va2, 0);
-          const float32x4_t va3c0 = vdupq_lane_f32(va3, 0);
-          vacc0x0123 = vfmaq_f32(vacc0x0123, va0c0, vb0123c0);
-          vacc1x0123 = vfmaq_f32(vacc1x0123, va1c0, vb0123c0);
-          vacc2x0123 = vfmaq_f32(vacc2x0123, va2c0, vb0123c0);
-          vacc3x0123 = vfmaq_f32(vacc3x0123, va3c0, vb0123c0);
-        #endif
-        const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
-          vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
-          vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
-          vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
-        #else
-          const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-          const float32x4_t va1c1 = vdupq_lane_f32(va1, 1);
-          const float32x4_t va2c1 = vdupq_lane_f32(va2, 1);
-          const float32x4_t va3c1 = vdupq_lane_f32(va3, 1);
-          vacc0x0123 = vfmaq_f32(vacc0x0123, va0c1, vb0123c1);
-          vacc1x0123 = vfmaq_f32(vacc1x0123, va1c1, vb0123c1);
-          vacc2x0123 = vfmaq_f32(vacc2x0123, va2c1, vb0123c1);
-          vacc3x0123 = vfmaq_f32(vacc3x0123, va3c1, vb0123c1);
-        #endif
-      }
-      if XNN_UNLIKELY(k != 0) {
-        const float32x4_t va0 = vld1q_dup_f32(a0);
-        const float32x4_t va1 = vld1q_dup_f32(a1);
-        const float32x4_t va2 = vld1q_dup_f32(a2);
-        const float32x4_t va3 = vld1q_dup_f32(a3);
-
-        const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-
-        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
-        vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
-        vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
-        vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
-      }
-      p -= 4 * sizeof(void*);
-    } while (p != 0);
-
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-
-    if XNN_LIKELY(nc >= 4) {
-      vst1q_f32(c3, vacc3x0123);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a = (const float**restrict) ((uintptr_t) a - ks);
-      nc -= 4;
-    } else {
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
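
Note: before any store, each kernel clamps its accumulators against the output params, exactly as in the deleted epilogue above. Minimal sketch (the scalar arguments stand in for params->scalar.min and params->scalar.max):

#include <arm_neon.h>

// Output clamping applied before every store: min against max, max against min.
static float32x4_t clamp_output(float32x4_t vacc, float output_min, float output_max) {
  vacc = vminq_f32(vacc, vdupq_n_f32(output_max));
  vacc = vmaxq_f32(vacc, vdupq_n_f32(output_min));
  return vacc;
}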
diff --git a/src/f32-igemm/4x8-neon-ld128.c b/src/f32-igemm/4x8-neon-lane-ld128.c
similarity index 66%
rename from src/f32-igemm/4x8-neon-ld128.c
rename to src/f32-igemm/4x8-neon-lane-ld128.c
index cc3e3e3..31aa833 100644
--- a/src/f32-igemm/4x8-neon-ld128.c
+++ b/src/f32-igemm/4x8-neon-lane-ld128.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_4x8__neon_ld128(
+void xnn_f32_igemm_ukernel_4x8__neon_lane_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -99,50 +100,50 @@
         const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, vget_low_f32(va0), 0);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, vget_low_f32(va1), 0);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, vget_low_f32(va2), 0);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, vget_low_f32(va3), 0);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, vget_low_f32(va0), 0);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, vget_low_f32(va1), 0);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, vget_low_f32(va2), 0);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, vget_low_f32(va3), 0);
+        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0);
+        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0);
+        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0);
+        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0);
+        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0);
+        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0);
+        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0);
+        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0);
 
         const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, vget_low_f32(va0), 1);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, vget_low_f32(va1), 1);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, vget_low_f32(va2), 1);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, vget_low_f32(va3), 1);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, vget_low_f32(va0), 1);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, vget_low_f32(va1), 1);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, vget_low_f32(va2), 1);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, vget_low_f32(va3), 1);
+        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1);
+        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1);
+        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, vget_low_f32(va2), 1);
+        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1);
+        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, vget_low_f32(va0), 1);
+        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, vget_low_f32(va1), 1);
+        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, vget_low_f32(va2), 1);
+        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1);
 
         const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c2, vget_high_f32(va0), 0);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c2, vget_high_f32(va1), 0);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c2, vget_high_f32(va2), 0);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c2, vget_high_f32(va3), 0);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c2, vget_high_f32(va0), 0);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c2, vget_high_f32(va1), 0);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c2, vget_high_f32(va2), 0);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c2, vget_high_f32(va3), 0);
+        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
+        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
+        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
+        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
+        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c2, vget_high_f32(va0), 0);
+        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c2, vget_high_f32(va1), 0);
+        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c2, vget_high_f32(va2), 0);
+        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0);
 
         const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c3, vget_high_f32(va0), 1);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c3, vget_high_f32(va1), 1);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c3, vget_high_f32(va2), 1);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c3, vget_high_f32(va3), 1);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c3, vget_high_f32(va0), 1);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c3, vget_high_f32(va1), 1);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c3, vget_high_f32(va2), 1);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c3, vget_high_f32(va3), 1);
+        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c3, vget_high_f32(va0), 1);
+        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c3, vget_high_f32(va1), 1);
+        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c3, vget_high_f32(va2), 1);
+        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1);
+        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c3, vget_high_f32(va0), 1);
+        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c3, vget_high_f32(va1), 1);
+        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c3, vget_high_f32(va2), 1);
+        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1);
       }
       if XNN_UNLIKELY(k != 0) {
         do {
@@ -154,14 +155,14 @@
           const float32x4_t vb0123 = vld1q_f32(w); w += 4;
           const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-          vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-          vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-          vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-          vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-          vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-          vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-          vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-          vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+          vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+          vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
+          vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
+          vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+          vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+          vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
+          vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
+          vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
 
           k -= sizeof(float);
         } while (k != 0);
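
Note: unlike the ld64 kernels, the ld128 variant above loads four k-values per row in one 128-bit load and selects lanes via vget_low_f32/vget_high_f32. A hedged single-row sketch of that inner step (helper name illustrative):

#include <arm_neon.h>

// One ld128 main-loop step: four k-values, four lane multiply-accumulates.
static float32x4_t step_ld128(float32x4_t vacc, const float* a, const float* w) {
  const float32x4_t va = vld1q_f32(a);         // values for k .. k+3
  const float32x4_t vb_c0 = vld1q_f32(w);      // weights for k
  vacc = vmlaq_lane_f32(vacc, vb_c0, vget_low_f32(va), 0);
  const float32x4_t vb_c1 = vld1q_f32(w + 4);  // weights for k+1
  vacc = vmlaq_lane_f32(vacc, vb_c1, vget_low_f32(va), 1);
  const float32x4_t vb_c2 = vld1q_f32(w + 8);  // weights for k+2
  vacc = vmlaq_lane_f32(vacc, vb_c2, vget_high_f32(va), 0);
  const float32x4_t vb_c3 = vld1q_f32(w + 12); // weights for k+3
  vacc = vmlaq_lane_f32(vacc, vb_c3, vget_high_f32(va), 1);
  return vacc;
}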
diff --git a/src/f32-igemm/4x8-neon-ld64.c b/src/f32-igemm/4x8-neon-lane-ld64.c
similarity index 99%
rename from src/f32-igemm/4x8-neon-ld64.c
rename to src/f32-igemm/4x8-neon-lane-ld64.c
index 0997247..2fbc77a 100644
--- a/src/f32-igemm/4x8-neon-ld64.c
+++ b/src/f32-igemm/4x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_4x8__neon_ld64(
+void xnn_f32_igemm_ukernel_4x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -117,6 +118,7 @@
         vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
         vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
         vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+
       }
       if XNN_UNLIKELY(k != 0) {
         const float32x4_t va0 = vld1q_dup_f32(a0);
diff --git a/src/f32-igemm/4x8-neon-ld128.c b/src/f32-igemm/4x8-neonfma-lane-ld128.c
similarity index 66%
copy from src/f32-igemm/4x8-neon-ld128.c
copy to src/f32-igemm/4x8-neonfma-lane-ld128.c
index cc3e3e3..86d2fc3 100644
--- a/src/f32-igemm/4x8-neon-ld128.c
+++ b/src/f32-igemm/4x8-neonfma-lane-ld128.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_4x8__neon_ld128(
+void xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -99,50 +100,50 @@
         const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, vget_low_f32(va0), 0);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, vget_low_f32(va1), 0);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, vget_low_f32(va2), 0);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, vget_low_f32(va3), 0);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, vget_low_f32(va0), 0);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, vget_low_f32(va1), 0);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, vget_low_f32(va2), 0);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, vget_low_f32(va3), 0);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0);
+        vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0);
+        vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0);
+        vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0);
 
         const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, vget_low_f32(va0), 1);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, vget_low_f32(va1), 1);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, vget_low_f32(va2), 1);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, vget_low_f32(va3), 1);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, vget_low_f32(va0), 1);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, vget_low_f32(va1), 1);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, vget_low_f32(va2), 1);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, vget_low_f32(va3), 1);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, vget_low_f32(va2), 1);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, vget_low_f32(va0), 1);
+        vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, vget_low_f32(va1), 1);
+        vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, vget_low_f32(va2), 1);
+        vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1);
 
         const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c2, vget_high_f32(va0), 0);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c2, vget_high_f32(va1), 0);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c2, vget_high_f32(va2), 0);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c2, vget_high_f32(va3), 0);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c2, vget_high_f32(va0), 0);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c2, vget_high_f32(va1), 0);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c2, vget_high_f32(va2), 0);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c2, vget_high_f32(va3), 0);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c2, vget_high_f32(va0), 0);
+        vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c2, vget_high_f32(va1), 0);
+        vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c2, vget_high_f32(va2), 0);
+        vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0);
 
         const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c3, vget_high_f32(va0), 1);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c3, vget_high_f32(va1), 1);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c3, vget_high_f32(va2), 1);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c3, vget_high_f32(va3), 1);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c3, vget_high_f32(va0), 1);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c3, vget_high_f32(va1), 1);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c3, vget_high_f32(va2), 1);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c3, vget_high_f32(va3), 1);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c3, vget_high_f32(va0), 1);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c3, vget_high_f32(va1), 1);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c3, vget_high_f32(va2), 1);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c3, vget_high_f32(va0), 1);
+        vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c3, vget_high_f32(va1), 1);
+        vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c3, vget_high_f32(va2), 1);
+        vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1);
       }
       if XNN_UNLIKELY(k != 0) {
         do {
@@ -154,14 +155,14 @@
           const float32x4_t vb0123 = vld1q_f32(w); w += 4;
           const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-          vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-          vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-          vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-          vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-          vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-          vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-          vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-          vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+          vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+          vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+          vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+          vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+          vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+          vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+          vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+          vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
 
           k -= sizeof(float);
         } while (k != 0);
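
Note: the copy above produces the neonfma flavor of the 4x8 ld128 kernel by swapping vmlaq_lane_f32/vmlaq_f32 for vfmaq_lane_f32/vfmaq_f32. A minimal sketch, not taken from the change itself, contrasting the two multiply-accumulate forms (vfmaq_lane_f32 is always available on AArch64; on 32-bit targets it depends on FMA support in the toolchain):

    #include <arm_neon.h>

    // NEON multiply-accumulate: the product and the sum round separately.
    float32x4_t acc_neon(float32x4_t acc, float32x4_t b, float32x2_t a) {
      return vmlaq_lane_f32(acc, b, a, 0);  // acc + b * a[0]
    }

    // NEONFMA fused multiply-add: one rounding for the whole operation.
    float32x4_t acc_neonfma(float32x4_t acc, float32x4_t b, float32x2_t a) {
      return vfmaq_lane_f32(acc, b, a, 0);  // fma(b, a[0], acc)
    }
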
diff --git a/src/f32-igemm/4x8-neon-ld64.c b/src/f32-igemm/4x8-neonfma-lane-ld64.c
similarity index 79%
copy from src/f32-igemm/4x8-neon-ld64.c
copy to src/f32-igemm/4x8-neonfma-lane-ld64.c
index 0997247..5503926 100644
--- a/src/f32-igemm/4x8-neon-ld64.c
+++ b/src/f32-igemm/4x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_4x8__neon_ld64(
+void xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -98,25 +99,26 @@
         const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+        vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+        vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+        vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
         const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+        vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+        vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+        vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+
       }
       if XNN_UNLIKELY(k != 0) {
         const float32x4_t va0 = vld1q_dup_f32(a0);
@@ -127,14 +129,14 @@
         const float32x4_t vb0123 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
-        vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
-        vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
-        vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
-        vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
-        vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
-        vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
-        vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
+        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+        vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+        vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+        vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+        vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+        vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+        vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+        vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
       }
       p -= 4 * sizeof(void*);
     } while (p != 0);
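
Note: the ld64 variants consume two A elements per main-loop iteration, issuing one lane multiply-accumulate per lane, and fall back to a broadcast load for an odd trailing element. A hypothetical 1x4 reduction in the same style (dot_1x4_ld64 and its signature are illustrative, not part of XNNPACK; assumes an AArch64 or otherwise FMA-capable target):

    #include <stddef.h>
    #include <arm_neon.h>

    void dot_1x4_ld64(size_t kc, const float* a, const float* w, float* c) {
      float32x4_t vacc = vld1q_f32(w); w += 4;         // initial value (bias)
      for (; kc >= 2 * sizeof(float); kc -= 2 * sizeof(float)) {
        const float32x2_t va = vld1_f32(a); a += 2;    // two A elements
        const float32x4_t vb0 = vld1q_f32(w); w += 4;  // B panel for k+0
        const float32x4_t vb1 = vld1q_f32(w); w += 4;  // B panel for k+1
        vacc = vfmaq_lane_f32(vacc, vb0, va, 0);
        vacc = vfmaq_lane_f32(vacc, vb1, va, 1);
      }
      if (kc != 0) {                                   // odd remainder
        const float32x4_t va = vld1q_dup_f32(a);       // broadcast last A
        const float32x4_t vb = vld1q_f32(w);
        vacc = vfmaq_f32(vacc, va, vb);
      }
      vst1q_f32(c, vacc);
    }
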
diff --git a/src/f32-igemm/4x8-neonfma-ld128.c b/src/f32-igemm/4x8-neonfma-ld128.c
deleted file mode 100644
index a090a06..0000000
--- a/src/f32-igemm/4x8-neonfma-ld128.c
+++ /dev/null
@@ -1,306 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-igemm/neon-ld128.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/igemm.h>
-
-
-void xnn_f32_igemm_ukernel_4x8__neonfma_ld128(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    size_t ks,
-    const float**restrict a,
-    const float*restrict w,
-    float*restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    size_t a_offset,
-    const float* zero,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 4);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(ks != 0);
-  assert(ks % (4 * sizeof(void*)) == 0);
-  assert(a_offset % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  float* c0 = c;
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    c1 = c0;
-  }
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    c2 = c1;
-  }
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 4) {
-    c3 = c2;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
-    float32x4_t vacc1x0123 = vacc0x0123;
-    float32x4_t vacc1x4567 = vacc0x4567;
-    float32x4_t vacc2x0123 = vacc0x0123;
-    float32x4_t vacc2x4567 = vacc0x4567;
-    float32x4_t vacc3x0123 = vacc0x0123;
-    float32x4_t vacc3x4567 = vacc0x4567;
-
-    size_t p = ks;
-    do {
-      const float* restrict a0 = a[0];
-      assert(a0 != NULL);
-      if XNN_UNPREDICTABLE(a0 != zero) {
-        a0 = (const float*) ((uintptr_t) a0 + a_offset);
-      }
-      const float* restrict a1 = a[1];
-      assert(a1 != NULL);
-      if XNN_UNPREDICTABLE(a1 != zero) {
-        a1 = (const float*) ((uintptr_t) a1 + a_offset);
-      }
-      const float* restrict a2 = a[2];
-      assert(a2 != NULL);
-      if XNN_UNPREDICTABLE(a2 != zero) {
-        a2 = (const float*) ((uintptr_t) a2 + a_offset);
-      }
-      const float* restrict a3 = a[3];
-      assert(a3 != NULL);
-      if XNN_UNPREDICTABLE(a3 != zero) {
-        a3 = (const float*) ((uintptr_t) a3 + a_offset);
-      }
-      a += 4;
-
-      size_t k = kc;
-      for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
-        const float32x4_t va0 = vld1q_f32(a0); a0 += 4;
-        const float32x4_t va1 = vld1q_f32(a1); a1 += 4;
-        const float32x4_t va2 = vld1q_f32(a2); a2 += 4;
-        const float32x4_t va3 = vld1q_f32(a3); a3 += 4;
-
-
-        const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c0, va0, 0);
-          vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c0, va1, 0);
-          vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c0, va2, 0);
-          vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c0, va3, 0);
-          vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c0, va0, 0);
-          vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c0, va1, 0);
-          vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c0, va2, 0);
-          vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c0, va3, 0);
-        #else
-          const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0);
-          const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0);
-          const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0);
-          const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0);
-          vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-          vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c0, vb0123c0);
-          vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c0, vb0123c0);
-          vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c0, vb0123c0);
-          vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-          vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c0, vb4567c0);
-          vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c0, vb4567c0);
-          vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c0, vb4567c0);
-        #endif
-
-        const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c1, va0, 1);
-          vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c1, va1, 1);
-          vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c1, va2, 1);
-          vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c1, va3, 1);
-          vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c1, va0, 1);
-          vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c1, va1, 1);
-          vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c1, va2, 1);
-          vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c1, va3, 1);
-        #else
-          const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1);
-          const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1);
-          const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1);
-          const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1);
-          vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-          vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c1, vb0123c1);
-          vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c1, vb0123c1);
-          vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c1, vb0123c1);
-          vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-          vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c1, vb4567c1);
-          vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c1, vb4567c1);
-          vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c1, vb4567c1);
-        #endif
-
-        const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c2, va0, 2);
-          vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c2, va1, 2);
-          vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c2, va2, 2);
-          vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c2, va3, 2);
-          vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c2, va0, 2);
-          vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c2, va1, 2);
-          vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c2, va2, 2);
-          vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c2, va3, 2);
-        #else
-          const float32x4_t va0c2 = vdupq_lane_f32(vget_high_f32(va0), 0);
-          const float32x4_t va1c2 = vdupq_lane_f32(vget_high_f32(va1), 0);
-          const float32x4_t va2c2 = vdupq_lane_f32(vget_high_f32(va2), 0);
-          const float32x4_t va3c2 = vdupq_lane_f32(vget_high_f32(va3), 0);
-          vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c2, vb0123c2);
-          vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c2, vb0123c2);
-          vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c2, vb0123c2);
-          vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c2, vb0123c2);
-          vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c2, vb4567c2);
-          vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c2, vb4567c2);
-          vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c2, vb4567c2);
-          vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c2, vb4567c2);
-        #endif
-
-        const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c3, va0, 3);
-          vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c3, va1, 3);
-          vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c3, va2, 3);
-          vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c3, va3, 3);
-          vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c3, va0, 3);
-          vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c3, va1, 3);
-          vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c3, va2, 3);
-          vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c3, va3, 3);
-        #else
-          const float32x4_t va0c3 = vdupq_lane_f32(vget_high_f32(va0), 1);
-          const float32x4_t va1c3 = vdupq_lane_f32(vget_high_f32(va1), 1);
-          const float32x4_t va2c3 = vdupq_lane_f32(vget_high_f32(va2), 1);
-          const float32x4_t va3c3 = vdupq_lane_f32(vget_high_f32(va3), 1);
-          vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c3, vb0123c3);
-          vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c3, vb0123c3);
-          vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c3, vb0123c3);
-          vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c3, vb0123c3);
-          vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c3, vb4567c3);
-          vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c3, vb4567c3);
-          vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c3, vb4567c3);
-          vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c3, vb4567c3);
-        #endif
-      }
-      if XNN_UNLIKELY(k != 0) {
-        do {
-          const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-          const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
-          const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
-          const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;
-
-          const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-          const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-          vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-          vacc1x0123 = vfmaq_f32(vacc1x0123, va1,   vb0123);
-          vacc2x0123 = vfmaq_f32(vacc2x0123, va2,   vb0123);
-          vacc3x0123 = vfmaq_f32(vacc3x0123, va3,   vb0123);
-          vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-          vacc1x4567 = vfmaq_f32(vacc1x4567, va1,   vb4567);
-          vacc2x4567 = vfmaq_f32(vacc2x4567, va2,   vb4567);
-          vacc3x4567 = vfmaq_f32(vacc3x4567, va3,   vb4567);
-
-          k -= sizeof(float);
-        } while (k != 0);
-      }
-
-      p -= 4 * sizeof(void*);
-    } while (p != 0);
-
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a = (const float**restrict) ((uintptr_t) a - ks);
-      nc -= 8;
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
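
Note: the deleted 4x8-neonfma-ld128.c guarded every step behind #if defined(__aarch64__), choosing between the q-register lane intrinsic and a dup-then-multiply fallback; after this change the lane and dup flavors are generated as separate kernels rather than one file carrying both. A sketch of the equivalence that guard encoded (fma_lane2 is an illustrative helper, not from the tree; the fallback still assumes an FMA-capable 32-bit target):

    #include <arm_neon.h>

    float32x4_t fma_lane2(float32x4_t acc, float32x4_t b, float32x4_t a) {
    #if defined(__aarch64__)
      return vfmaq_laneq_f32(acc, b, a, 2);  // select lane 2 of a q-register
    #else
      // Same lane via the 64-bit halves: lane 0 of the high half is lane 2.
      const float32x4_t a2 = vdupq_lane_f32(vget_high_f32(a), 0);
      return vfmaq_f32(acc, a2, b);
    #endif
    }
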
diff --git a/src/f32-igemm/4x8-neonfma-ld64.c b/src/f32-igemm/4x8-neonfma-ld64.c
deleted file mode 100644
index 5aff360..0000000
--- a/src/f32-igemm/4x8-neonfma-ld64.c
+++ /dev/null
@@ -1,245 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-igemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/igemm.h>
-
-
-void xnn_f32_igemm_ukernel_4x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    size_t ks,
-    const float**restrict a,
-    const float*restrict w,
-    float*restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    size_t a_offset,
-    const float* zero,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 4);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(ks != 0);
-  assert(ks % (4 * sizeof(void*)) == 0);
-  assert(a_offset % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  float* c0 = c;
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    c1 = c0;
-  }
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    c2 = c1;
-  }
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 4) {
-    c3 = c2;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
-    float32x4_t vacc1x0123 = vacc0x0123;
-    float32x4_t vacc1x4567 = vacc0x4567;
-    float32x4_t vacc2x0123 = vacc0x0123;
-    float32x4_t vacc2x4567 = vacc0x4567;
-    float32x4_t vacc3x0123 = vacc0x0123;
-    float32x4_t vacc3x4567 = vacc0x4567;
-
-    size_t p = ks;
-    do {
-      const float* restrict a0 = a[0];
-      assert(a0 != NULL);
-      if XNN_UNPREDICTABLE(a0 != zero) {
-        a0 = (const float*) ((uintptr_t) a0 + a_offset);
-      }
-      const float* restrict a1 = a[1];
-      assert(a1 != NULL);
-      if XNN_UNPREDICTABLE(a1 != zero) {
-        a1 = (const float*) ((uintptr_t) a1 + a_offset);
-      }
-      const float* restrict a2 = a[2];
-      assert(a2 != NULL);
-      if XNN_UNPREDICTABLE(a2 != zero) {
-        a2 = (const float*) ((uintptr_t) a2 + a_offset);
-      }
-      const float* restrict a3 = a[3];
-      assert(a3 != NULL);
-      if XNN_UNPREDICTABLE(a3 != zero) {
-        a3 = (const float*) ((uintptr_t) a3 + a_offset);
-      }
-      a += 4;
-
-      size_t k = kc;
-      for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-        const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-        const float32x2_t va1 = vld1_f32(a1); a1 += 2;
-        const float32x2_t va2 = vld1_f32(a2); a2 += 2;
-        const float32x2_t va3 = vld1_f32(a3); a3 += 2;
-
-        const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
-          vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
-          vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
-          vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
-          vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
-          vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
-          vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
-          vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
-        #else
-          const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-          const float32x4_t va1c0 = vdupq_lane_f32(va1, 0);
-          const float32x4_t va2c0 = vdupq_lane_f32(va2, 0);
-          const float32x4_t va3c0 = vdupq_lane_f32(va3, 0);
-          vacc0x0123 = vfmaq_f32(vacc0x0123, va0c0, vb0123c0);
-          vacc1x0123 = vfmaq_f32(vacc1x0123, va1c0, vb0123c0);
-          vacc2x0123 = vfmaq_f32(vacc2x0123, va2c0, vb0123c0);
-          vacc3x0123 = vfmaq_f32(vacc3x0123, va3c0, vb0123c0);
-          vacc0x4567 = vfmaq_f32(vacc0x4567, va0c0, vb4567c0);
-          vacc1x4567 = vfmaq_f32(vacc1x4567, va1c0, vb4567c0);
-          vacc2x4567 = vfmaq_f32(vacc2x4567, va2c0, vb4567c0);
-          vacc3x4567 = vfmaq_f32(vacc3x4567, va3c0, vb4567c0);
-        #endif
-        const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
-          vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
-          vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
-          vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
-          vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
-          vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
-          vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
-          vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
-        #else
-          const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-          const float32x4_t va1c1 = vdupq_lane_f32(va1, 1);
-          const float32x4_t va2c1 = vdupq_lane_f32(va2, 1);
-          const float32x4_t va3c1 = vdupq_lane_f32(va3, 1);
-          vacc0x0123 = vfmaq_f32(vacc0x0123, va0c1, vb0123c1);
-          vacc1x0123 = vfmaq_f32(vacc1x0123, va1c1, vb0123c1);
-          vacc2x0123 = vfmaq_f32(vacc2x0123, va2c1, vb0123c1);
-          vacc3x0123 = vfmaq_f32(vacc3x0123, va3c1, vb0123c1);
-          vacc0x4567 = vfmaq_f32(vacc0x4567, va0c1, vb4567c1);
-          vacc1x4567 = vfmaq_f32(vacc1x4567, va1c1, vb4567c1);
-          vacc2x4567 = vfmaq_f32(vacc2x4567, va2c1, vb4567c1);
-          vacc3x4567 = vfmaq_f32(vacc3x4567, va3c1, vb4567c1);
-        #endif
-      }
-      if XNN_UNLIKELY(k != 0) {
-        const float32x4_t va0 = vld1q_dup_f32(a0);
-        const float32x4_t va1 = vld1q_dup_f32(a1);
-        const float32x4_t va2 = vld1q_dup_f32(a2);
-        const float32x4_t va3 = vld1q_dup_f32(a3);
-
-        const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
-        vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
-        vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
-        vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
-        vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
-        vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
-        vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
-        vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
-      }
-      p -= 4 * sizeof(void*);
-    } while (p != 0);
-
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a = (const float**restrict) ((uintptr_t) a - ks);
-      nc -= 8;
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
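
Note: both deleted files end with the same nc tail: store four columns, then two, then one, shifting the surviving lanes down after each partial store. A condensed sketch of that pattern for one output row (store_tail is hypothetical and assumes nc < 8):

    #include <stddef.h>
    #include <arm_neon.h>

    void store_tail(float* c, float32x4_t lo, float32x4_t hi, size_t nc) {
      if (nc & 4) { vst1q_f32(c, lo); c += 4; lo = hi; }
      float32x2_t pair = vget_low_f32(lo);
      if (nc & 2) { vst1_f32(c, pair); c += 2; pair = vget_high_f32(lo); }
      if (nc & 1) { vst1_lane_f32(c, pair, 0); }
    }
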
diff --git a/src/f32-igemm/6x8-neon-ld64.c b/src/f32-igemm/6x8-neon-lane-ld64.c
similarity index 99%
rename from src/f32-igemm/6x8-neon-ld64.c
rename to src/f32-igemm/6x8-neon-lane-ld64.c
index a4ca689..ec13852 100644
--- a/src/f32-igemm/6x8-neon-ld64.c
+++ b/src/f32-igemm/6x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_6x8__neon_ld64(
+void xnn_f32_igemm_ukernel_6x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -149,6 +150,7 @@
         vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
         vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
         vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
+
       }
       if XNN_UNLIKELY(k != 0) {
         const float32x4_t va0 = vld1q_dup_f32(a0);
diff --git a/src/f32-igemm/6x8-neon-ld64.c b/src/f32-igemm/6x8-neonfma-lane-ld64.c
similarity index 77%
copy from src/f32-igemm/6x8-neon-ld64.c
copy to src/f32-igemm/6x8-neonfma-lane-ld64.c
index a4ca689..9fd0562 100644
--- a/src/f32-igemm/6x8-neon-ld64.c
+++ b/src/f32-igemm/6x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_6x8__neon_ld64(
+void xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -122,33 +123,34 @@
         const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
-        vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
-        vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, va5, 0);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
-        vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
-        vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+        vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
+        vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, va5, 0);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+        vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+        vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+        vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+        vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
+        vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
         const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
-        vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
-        vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, va5, 1);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
-        vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
-        vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+        vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
+        vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, va5, 1);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+        vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+        vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+        vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+        vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
+        vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
+
       }
       if XNN_UNLIKELY(k != 0) {
         const float32x4_t va0 = vld1q_dup_f32(a0);
@@ -161,18 +163,18 @@
         const float32x4_t vb0123 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
-        vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
-        vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
-        vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
-        vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123);
-        vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123);
-        vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
-        vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
-        vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
-        vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
-        vacc4x4567 = vmlaq_f32(vacc4x4567, va4, vb4567);
-        vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567);
+        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+        vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+        vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+        vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+        vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123);
+        vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123);
+        vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+        vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+        vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+        vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
+        vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567);
+        vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567);
       }
       p -= 6 * sizeof(void*);
     } while (p != 0);
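
Note: the renamed kernels keep the shared epilogue: the scalar output bounds are broadcast once, then every accumulator is clamped before the stores. A one-accumulator sketch (clamp_output is an illustrative name; in the kernels the broadcasts are hoisted out of the per-accumulator clamps):

    #include <arm_neon.h>

    float32x4_t clamp_output(float32x4_t v, const float* min, const float* max) {
      v = vminq_f32(v, vld1q_dup_f32(max));  // apply the upper bound
      v = vmaxq_f32(v, vld1q_dup_f32(min));  // then the lower bound
      return v;
    }
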
diff --git a/src/f32-igemm/6x8-neonfma-ld64.c b/src/f32-igemm/6x8-neonfma-ld64.c
deleted file mode 100644
index 2f64b47..0000000
--- a/src/f32-igemm/6x8-neonfma-ld64.c
+++ /dev/null
@@ -1,321 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-igemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/igemm.h>
-
-
-void xnn_f32_igemm_ukernel_6x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    size_t ks,
-    const float**restrict a,
-    const float*restrict w,
-    float*restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    size_t a_offset,
-    const float* zero,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 6);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(ks != 0);
-  assert(ks % (6 * sizeof(void*)) == 0);
-  assert(a_offset % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  float* c0 = c;
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    c1 = c0;
-  }
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    c2 = c1;
-  }
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 4) {
-    c3 = c2;
-  }
-  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 4) {
-    c4 = c3;
-  }
-  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 6) {
-    c5 = c4;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
-    float32x4_t vacc1x0123 = vacc0x0123;
-    float32x4_t vacc1x4567 = vacc0x4567;
-    float32x4_t vacc2x0123 = vacc0x0123;
-    float32x4_t vacc2x4567 = vacc0x4567;
-    float32x4_t vacc3x0123 = vacc0x0123;
-    float32x4_t vacc3x4567 = vacc0x4567;
-    float32x4_t vacc4x0123 = vacc0x0123;
-    float32x4_t vacc4x4567 = vacc0x4567;
-    float32x4_t vacc5x0123 = vacc0x0123;
-    float32x4_t vacc5x4567 = vacc0x4567;
-
-    size_t p = ks;
-    do {
-      const float* restrict a0 = a[0];
-      assert(a0 != NULL);
-      if XNN_UNPREDICTABLE(a0 != zero) {
-        a0 = (const float*) ((uintptr_t) a0 + a_offset);
-      }
-      const float* restrict a1 = a[1];
-      assert(a1 != NULL);
-      if XNN_UNPREDICTABLE(a1 != zero) {
-        a1 = (const float*) ((uintptr_t) a1 + a_offset);
-      }
-      const float* restrict a2 = a[2];
-      assert(a2 != NULL);
-      if XNN_UNPREDICTABLE(a2 != zero) {
-        a2 = (const float*) ((uintptr_t) a2 + a_offset);
-      }
-      const float* restrict a3 = a[3];
-      assert(a3 != NULL);
-      if XNN_UNPREDICTABLE(a3 != zero) {
-        a3 = (const float*) ((uintptr_t) a3 + a_offset);
-      }
-      const float* restrict a4 = a[4];
-      assert(a4 != NULL);
-      if XNN_UNPREDICTABLE(a4 != zero) {
-        a4 = (const float*) ((uintptr_t) a4 + a_offset);
-      }
-      const float* restrict a5 = a[5];
-      assert(a5 != NULL);
-      if XNN_UNPREDICTABLE(a5 != zero) {
-        a5 = (const float*) ((uintptr_t) a5 + a_offset);
-      }
-      a += 6;
-
-      size_t k = kc;
-      for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-        const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-        const float32x2_t va1 = vld1_f32(a1); a1 += 2;
-        const float32x2_t va2 = vld1_f32(a2); a2 += 2;
-        const float32x2_t va3 = vld1_f32(a3); a3 += 2;
-        const float32x2_t va4 = vld1_f32(a4); a4 += 2;
-        const float32x2_t va5 = vld1_f32(a5); a5 += 2;
-
-        const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
-          vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
-          vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
-          vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
-          vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
-          vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, va5, 0);
-          vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
-          vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
-          vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
-          vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
-          vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
-          vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
-        #else
-          const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-          const float32x4_t va1c0 = vdupq_lane_f32(va1, 0);
-          const float32x4_t va2c0 = vdupq_lane_f32(va2, 0);
-          const float32x4_t va3c0 = vdupq_lane_f32(va3, 0);
-          const float32x4_t va4c0 = vdupq_lane_f32(va4, 0);
-          const float32x4_t va5c0 = vdupq_lane_f32(va5, 0);
-          vacc0x0123 = vfmaq_f32(vacc0x0123, va0c0, vb0123c0);
-          vacc1x0123 = vfmaq_f32(vacc1x0123, va1c0, vb0123c0);
-          vacc2x0123 = vfmaq_f32(vacc2x0123, va2c0, vb0123c0);
-          vacc3x0123 = vfmaq_f32(vacc3x0123, va3c0, vb0123c0);
-          vacc4x0123 = vfmaq_f32(vacc4x0123, va4c0, vb0123c0);
-          vacc5x0123 = vfmaq_f32(vacc5x0123, va5c0, vb0123c0);
-          vacc0x4567 = vfmaq_f32(vacc0x4567, va0c0, vb4567c0);
-          vacc1x4567 = vfmaq_f32(vacc1x4567, va1c0, vb4567c0);
-          vacc2x4567 = vfmaq_f32(vacc2x4567, va2c0, vb4567c0);
-          vacc3x4567 = vfmaq_f32(vacc3x4567, va3c0, vb4567c0);
-          vacc4x4567 = vfmaq_f32(vacc4x4567, va4c0, vb4567c0);
-          vacc5x4567 = vfmaq_f32(vacc5x4567, va5c0, vb4567c0);
-        #endif
-        const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
-          vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
-          vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
-          vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
-          vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
-          vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, va5, 1);
-          vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
-          vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
-          vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
-          vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
-          vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
-          vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
-        #else
-          const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-          const float32x4_t va1c1 = vdupq_lane_f32(va1, 1);
-          const float32x4_t va2c1 = vdupq_lane_f32(va2, 1);
-          const float32x4_t va3c1 = vdupq_lane_f32(va3, 1);
-          const float32x4_t va4c1 = vdupq_lane_f32(va4, 1);
-          const float32x4_t va5c1 = vdupq_lane_f32(va5, 1);
-          vacc0x0123 = vfmaq_f32(vacc0x0123, va0c1, vb0123c1);
-          vacc1x0123 = vfmaq_f32(vacc1x0123, va1c1, vb0123c1);
-          vacc2x0123 = vfmaq_f32(vacc2x0123, va2c1, vb0123c1);
-          vacc3x0123 = vfmaq_f32(vacc3x0123, va3c1, vb0123c1);
-          vacc4x0123 = vfmaq_f32(vacc4x0123, va4c1, vb0123c1);
-          vacc5x0123 = vfmaq_f32(vacc5x0123, va5c1, vb0123c1);
-          vacc0x4567 = vfmaq_f32(vacc0x4567, va0c1, vb4567c1);
-          vacc1x4567 = vfmaq_f32(vacc1x4567, va1c1, vb4567c1);
-          vacc2x4567 = vfmaq_f32(vacc2x4567, va2c1, vb4567c1);
-          vacc3x4567 = vfmaq_f32(vacc3x4567, va3c1, vb4567c1);
-          vacc4x4567 = vfmaq_f32(vacc4x4567, va4c1, vb4567c1);
-          vacc5x4567 = vfmaq_f32(vacc5x4567, va5c1, vb4567c1);
-        #endif
-      }
-      if XNN_UNLIKELY(k != 0) {
-        const float32x4_t va0 = vld1q_dup_f32(a0);
-        const float32x4_t va1 = vld1q_dup_f32(a1);
-        const float32x4_t va2 = vld1q_dup_f32(a2);
-        const float32x4_t va3 = vld1q_dup_f32(a3);
-        const float32x4_t va4 = vld1q_dup_f32(a4);
-        const float32x4_t va5 = vld1q_dup_f32(a5);
-
-        const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
-        vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
-        vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
-        vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
-        vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123);
-        vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123);
-        vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
-        vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
-        vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
-        vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
-        vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567);
-        vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567);
-      }
-      p -= 6 * sizeof(void*);
-    } while (p != 0);
-
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc4x0123 = vminq_f32(vacc4x0123, vmax);
-    vacc5x0123 = vminq_f32(vacc5x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-    vacc4x4567 = vminq_f32(vacc4x4567, vmax);
-    vacc5x4567 = vminq_f32(vacc5x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc4x0123 = vmaxq_f32(vacc4x0123, vmin);
-    vacc5x0123 = vmaxq_f32(vacc5x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-    vacc4x4567 = vmaxq_f32(vacc4x4567, vmin);
-    vacc5x4567 = vmaxq_f32(vacc5x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c5, vacc5x0123);
-      vst1q_f32(c5 + 4, vacc5x4567);
-      c5 = (float*) ((uintptr_t) c5 + cn_stride);
-      vst1q_f32(c4, vacc4x0123);
-      vst1q_f32(c4 + 4, vacc4x4567);
-      c4 = (float*) ((uintptr_t) c4 + cn_stride);
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a = (const float**restrict) ((uintptr_t) a - ks);
-      nc -= 8;
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c5, vacc5x0123); c5 += 4;
-        vst1q_f32(c4, vacc4x0123); c4 += 4;
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc5x0123 = vacc5x4567;
-        vacc4x0123 = vacc4x4567;
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc5x01 = vget_low_f32(vacc5x0123);
-      float32x2_t vacc4x01 = vget_low_f32(vacc4x0123);
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c5, vacc5x01); c5 += 2;
-        vst1_f32(c4, vacc4x01); c4 += 2;
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc5x01 = vget_high_f32(vacc5x0123);
-        vacc4x01 = vget_high_f32(vacc4x0123);
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c5, vacc5x01, 0);
-        vst1_lane_f32(c4, vacc4x01, 0);
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
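
Note: the 6x8 kernel deleted above uses the same row-pointer clamping as its 4x8 siblings: each output pointer past mr aliases the previous row, so stores for absent rows become harmless repeats instead of out-of-bounds writes. A trimmed three-row sketch of the idiom (setup_row_pointers is illustrative, not from the tree):

    #include <stddef.h>
    #include <stdint.h>

    static void setup_row_pointers(float* c, size_t cm_stride, size_t mr, float* out[3]) {
      float* c0 = c;
      float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
      if (mr < 2) {
        c1 = c0;  // row 1 absent: alias row 0
      }
      float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
      if (mr <= 2) {
        c2 = c1;  // row 2 absent: alias row 1
      }
      out[0] = c0;
      out[1] = c1;
      out[2] = c2;
    }
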
diff --git a/src/f32-igemm/MRx2-neon-ld64.c.in b/src/f32-igemm/MRx2-neon-ld64.c.in
index 9159f95..10bdeab 100644
--- a/src/f32-igemm/MRx2-neon-ld64.c.in
+++ b/src/f32-igemm/MRx2-neon-ld64.c.in
@@ -11,7 +11,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}_ld64(
+void xnn_f32_igemm_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/f32-igemm/neon-ld128.c.in b/src/f32-igemm/neon-ld128.c.in
index b4b4b8a..472a863 100644
--- a/src/f32-igemm/neon-ld128.c.in
+++ b/src/f32-igemm/neon-ld128.c.in
@@ -5,8 +5,8 @@
 
 $assert NR % 4 == 0
 $ABC = "0123456789ABCDEFGHIJKLMN"
-$VMULADDQ_F32 = "${VMULADDQ_F32}" if FMA else "${VMULADDQ_F32}"
-$VMULADDQ_LANE_F32 = "${VMULADDQ_LANE_F32}" if FMA else "${VMULADDQ_LANE_F32}"
+$VMULADDQ_F32 = "vfmaq_f32" if FMA else "vmlaq_f32"
+$VMULADDQ_LANE_F32 = "vfmaq_lane_f32" if FMA else "vmlaq_lane_f32"
 
 #include <assert.h>
 
@@ -85,16 +85,16 @@
           $for N in range(0, NR, 4):
             const float32x4_t vb${ABC[N:N+4]}c${L} = vld1q_f32(w); w += 4;
 
-        $if DUP:
-          $for M in range(MR):
-            const float32x4_t va${M}c${L} = vdupq_lane_f32(${VGET_PART_F32}(va${M}), ${L % 2});
-          $for N in range(0, NR, 4):
+          $if DUP:
             $for M in range(MR):
-              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}c${L}, vb${ABC[N:N+4]}c${L});
-        $else:
-          $for N in range(0, NR, 4):
-            $for M in range(MR):
-              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_LANE_F32}(vacc${M}x${ABC[N:N+4]}, vb${ABC[N:N+4]}c${L}, va${M}, ${L});
+              const float32x4_t va${M}c${L} = vdupq_lane_f32(${VGET_PART_F32}(va${M}), ${L % 2});
+            $for N in range(0, NR, 4):
+              $for M in range(MR):
+                vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}c${L}, vb${ABC[N:N+4]}c${L});
+          $else:
+            $for N in range(0, NR, 4):
+              $for M in range(MR):
+                vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_LANE_F32}(vacc${M}x${ABC[N:N+4]}, vb${ABC[N:N+4]}c${L}, ${VGET_PART_F32}(va${M}), ${L % 2});
       }
       if XNN_UNLIKELY(k != 0) {
         do {
@@ -106,10 +106,7 @@
 
           $for N in range(0, NR, 4):
             $for M in range(MR):
-              $if FMA:
-                vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}, vb${ABC[N:N+4]});
-              $else:
-                vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}, vb${ABC[N:N+4]});
+              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}, vb${ABC[N:N+4]});
 
           k -= sizeof(float);
         } while (k != 0);
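
Note: besides replacing the self-referential $VMULADDQ_* assignments with concrete intrinsic names, this hunk nests the $if DUP block inside the per-lane loop and makes the non-DUP branch pass ${VGET_PART_F32}(va${M}) with lane ${L % 2}. That addressing rule exists because the lane intrinsics take a 64-bit lane operand, so lane L (0..3) of a 128-bit A vector becomes lane L % 2 of its low or high half. A sketch (a_half_for_lane is an illustrative helper):

    #include <arm_neon.h>

    float32x2_t a_half_for_lane(float32x4_t va, int l) {
      return (l < 2) ? vget_low_f32(va) : vget_high_f32(va);  // pair with lane l % 2
    }
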
diff --git a/src/f32-igemm/neon-ld64.c.in b/src/f32-igemm/neon-ld64.c.in
index b816af4..ca93e3e 100644
--- a/src/f32-igemm/neon-ld64.c.in
+++ b/src/f32-igemm/neon-ld64.c.in
@@ -5,8 +5,8 @@
 
 $assert NR % 4 == 0
 $ABC = "0123456789ABCDEFGHIJKLMN"
-$VMULADDQ_F32 = "${VMULADDQ_F32}" if FMA else "${VMULADDQ_F32}"
-$VMULADDQ_LANE_F32 = "${VMULADDQ_LANE_F32}" if FMA else "${VMULADDQ_LANE_F32}"
+$VMULADDQ_F32 = "vfmaq_f32" if FMA else "vmlaq_f32"
+$VMULADDQ_LANE_F32 = "vfmaq_lane_f32" if FMA else "vmlaq_lane_f32"
 
 #include <assert.h>
 
@@ -83,16 +83,16 @@
           $for N in range(0, NR, 4):
             const float32x4_t vb${ABC[N:N+4]}c${L} = vld1q_f32(w); w += 4;
 
-        $if DUP:
-          $for M in range(MR):
-            const float32x4_t va${M}c${L} = vdupq_lane_f32(va${M}, ${L});
-          $for N in range(0, NR, 4):
+          $if DUP:
             $for M in range(MR):
-              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}c${L}, vb${ABC[N:N+4]}c${L});
-        $else:
-           $for N in range(0, NR, 4):
-             $for M in range(MR):
-               vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_LANE_F32}(vacc${M}x${ABC[N:N+4]}, vb${ABC[N:N+4]}c${L}, va${M}, ${L});
+              const float32x4_t va${M}c${L} = vdupq_lane_f32(va${M}, ${L});
+            $for N in range(0, NR, 4):
+              $for M in range(MR):
+                vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}c${L}, vb${ABC[N:N+4]}c${L});
+          $else:
+             $for N in range(0, NR, 4):
+               $for M in range(MR):
+                 vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_LANE_F32}(vacc${M}x${ABC[N:N+4]}, vb${ABC[N:N+4]}c${L}, va${M}, ${L});
 
       }
       if XNN_UNLIKELY(k != 0) {
@@ -104,10 +104,7 @@
 
         $for N in range(0, NR, 4):
           $for M in range(MR):
-            $if FMA:
-              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}, vb${ABC[N:N+4]});
-            $else:
-              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}, vb${ABC[N:N+4]});
+            vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}, vb${ABC[N:N+4]});
       }
       p -= ${MR} * sizeof(void*);
     } while (p != 0);
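
The remainder hunks in both templates above collapse an $if FMA / $else pair whose bodies had become textually identical, so the guard was dead weight. A hedged sketch of what the generated K-remainder looks like for one accumulator, assuming AArch64 NEON; the function name and single-accumulator shape are illustrative, not from the source:

    #include <arm_neon.h>
    #include <stddef.h>

    // Process the K remainder one float at a time: broadcast a single A value
    // and issue one plain (non-lane) multiply-add per 4 columns of B.
    static float32x4_t gemm_k_tail(size_t k /* bytes left */, const float* a,
                                   const float* w, float32x4_t vacc) {
      do {
        const float32x4_t va = vld1q_dup_f32(a); a += 1;  // one A value, broadcast
        const float32x4_t vb = vld1q_f32(w); w += 4;      // next 4 B values
        vacc = vfmaq_f32(vacc, va, vb);                   // same op for FMA and non-FMA
        k -= sizeof(float);
      } while (k != 0);
      return vacc;
    }
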
diff --git a/src/init.c b/src/init.c
index b5795af..bbc162f 100644
--- a/src/init.c
+++ b/src/init.c
@@ -132,16 +132,16 @@
   /**************************** F32 micro-kernels ****************************/
   #ifndef XNN_NO_F32_OPERATORS
     xnn_params.f32.gemm = (struct gemm_parameters) {
-      .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__neon_ld128,
-      .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__neon_ld128,
-      .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neon_ld64,
-      .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neon_ld64,
+      .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__neon_lane_ld128,
+      .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__neon_lane_ld128,
+      .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
+      .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
       .mr = 4,
       .nr = 8,
     };
     xnn_params.f32.gemm2 = (struct gemm_parameters) {
       .gemm = NULL,
-      .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__neon_ld64,
+      .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__neon_lane_ld64,
       .mr = 4,
       .nr = 2,
     };
@@ -372,8 +372,8 @@
           break;
         default:
           xnn_params.f32.gemm = (struct gemm_parameters) {
-            .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__neonfma_ld64,
-            .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__neonfma_ld64,
+            .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64,
+            .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64,
             .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
             .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
             .mr = 6,
@@ -383,10 +383,10 @@
       }
     #else  // XNN_ENABLE_ASSEMBLY
       xnn_params.f32.gemm = (struct gemm_parameters) {
-        .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__neonfma_ld64,
-        .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__neonfma_ld64,
-        .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neonfma_ld64,
-        .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neonfma_ld64,
+        .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64,
+        .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64,
+        .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
+        .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
         .mr = 6,
         .nr = 8,
       };
@@ -394,7 +394,7 @@
 
     xnn_params.f32.gemm2 = (struct gemm_parameters) {
       .gemm = NULL,
-      .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__neonfma_ld64,
+      .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64,
       .mr = 4,
       .nr = 2,
     };
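
A compact sketch of the selection pattern in src/init.c above, reduced to two outcomes for illustration (the enum and chooser are stand-ins, not part of the source): core-specific assembly kernels win where available, and the renamed neonfma lane kernels serve as both the `default:` case and the no-assembly fallback.

    // Hedged sketch of the init-time dispatch shown in the hunks above.
    enum f32_gemm_kernel {
      GEMM_6X8_AARCH64_NEONFMA_CORTEX_A75,  // core-specific assembly
      GEMM_6X8_NEONFMA_LANE_LD64,           // portable intrinsics default
    };

    static enum f32_gemm_kernel choose_f32_gemm(int core_is_cortex_a75,
                                                int have_assembly) {
      if (have_assembly && core_is_cortex_a75) {
        return GEMM_6X8_AARCH64_NEONFMA_CORTEX_A75;
      }
      // matches both the `default:` case and the #else branch in the diff
      return GEMM_6X8_NEONFMA_LANE_LD64;
    }
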
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 4f16309..bf801e1 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -39,8 +39,8 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__avx_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__fma3_broadcast)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neon_ld64)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__psimd_splat)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__sse_dup)
@@ -51,8 +51,8 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8s4__sse)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_2x4__scalar)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__neon_ld64)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__neonfma_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__scalar)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x4__scalar)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53)
@@ -62,10 +62,10 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__avx_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__fma3_broadcast)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_ld128)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_ld64)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_ld128)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__psimd_splat)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__sse_dup)
@@ -77,8 +77,8 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__avx_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__fma3_broadcast)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__neon_ld64)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73)
@@ -87,8 +87,8 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__avx_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__fma3_broadcast)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neon_ld64)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__psimd_splat)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8s4__neon)
@@ -122,8 +122,8 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__avx_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neon_ld64)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__psimd_splat)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__sse_dup)
@@ -142,10 +142,10 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__avx_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_ld128)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_ld64)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__psimd_splat)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__sse_dup)
@@ -157,8 +157,8 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__avx_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__neon_ld64)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73)
@@ -167,8 +167,8 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__avx_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neon_ld64)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__psimd_splat)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8s4__neon)
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 5faf766..6ce4e92 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -41,8 +41,8 @@
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__avx_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__fma3_broadcast)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neon_ld64)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__psimd_splat)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__sse_dup)
@@ -53,22 +53,22 @@
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8s4__sse)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_2x4__scalar)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__neon_ld64)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__scalar)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2c4__psimd)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2c4__sse)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__neon_ld64)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__scalar)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__avx_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__fma3_broadcast)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_ld128)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_ld64)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_ld128)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__psimd_splat)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__sse_dup)
@@ -86,8 +86,8 @@
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__avx_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__fma3_broadcast)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neon_ld64)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__psimd_splat)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8s4__neon)
diff --git a/test/f32-gemm.cc b/test/f32-gemm.cc
index d42b8bc..49e1f70 100644
--- a/test/f32-gemm.cc
+++ b/test/f32-gemm.cc
@@ -8315,7 +8315,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMM_1X8__NEON_LD64, k_eq_2) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -8325,10 +8325,10 @@
       .m(1)
       .n(8)
       .k(2)
-      .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, strided_cn) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -8339,10 +8339,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, k_eq_2_strided_a) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_eq_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -8353,10 +8353,10 @@
       .n(8)
       .k(2)
       .a_stride(5)
-      .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, k_eq_2_subtile) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8369,12 +8369,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, k_eq_2_subtile_m) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -8386,11 +8386,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, k_eq_2_subtile_n) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8402,11 +8402,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, k_lt_2) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -8417,11 +8417,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, k_lt_2_strided_a) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_lt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -8433,11 +8433,11 @@
         .n(8)
         .k(k)
         .a_stride(5)
-        .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, k_lt_2_subtile) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -8451,13 +8451,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, k_gt_2) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -8468,11 +8468,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, k_gt_2_strided_a) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_gt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -8484,11 +8484,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, k_gt_2_subtile) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -8502,13 +8502,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, k_div_2) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -8519,11 +8519,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, k_div_2_strided_a) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_div_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -8535,11 +8535,11 @@
         .n(8)
         .k(k)
         .a_stride(23)
-        .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, k_div_2_subtile) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -8553,13 +8553,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, n_gt_8) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8571,12 +8571,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, n_gt_8_strided_cn) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8589,12 +8589,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, n_gt_8_strided_a) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8607,12 +8607,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, n_gt_8_subtile) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8626,13 +8626,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, n_div_8) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8644,12 +8644,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, n_div_8_strided_cn) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8662,12 +8662,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, n_div_8_strided_a) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8680,12 +8680,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, n_div_8_subtile) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8699,13 +8699,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, strided_cm_subtile) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -8720,13 +8720,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, qmin) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -8737,10 +8737,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, qmax) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -8751,10 +8751,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_1X8__NEON_LD64, strided_cm) {
+  TEST(F32_GEMM_1X8__NEON_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -8765,13 +8765,13 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_gemm_ukernel_1x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMM_4X2__NEON_LD64, k_eq_2) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -8781,10 +8781,10 @@
       .m(4)
       .n(2)
       .k(2)
-      .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, strided_cn) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -8795,10 +8795,10 @@
       .n(2)
       .k(2)
       .cn_stride(5)
-      .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, k_eq_2_strided_a) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_eq_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -8809,10 +8809,10 @@
       .n(2)
       .k(2)
       .a_stride(5)
-      .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, k_eq_2_subtile) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 2; n++) {
@@ -8825,12 +8825,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, k_eq_2_subtile_m) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -8842,11 +8842,11 @@
         .n(2)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, k_eq_2_subtile_n) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 2; n++) {
       GemmMicrokernelTester()
@@ -8858,11 +8858,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, k_lt_2) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -8873,11 +8873,11 @@
         .m(4)
         .n(2)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, k_lt_2_strided_a) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_lt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -8889,11 +8889,11 @@
         .n(2)
         .k(k)
         .a_stride(5)
-        .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, k_lt_2_subtile) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -8907,13 +8907,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, k_gt_2) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -8924,11 +8924,11 @@
         .m(4)
         .n(2)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, k_gt_2_strided_a) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_gt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -8940,11 +8940,11 @@
         .n(2)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, k_gt_2_subtile) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -8958,13 +8958,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, k_div_2) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -8975,11 +8975,11 @@
         .m(4)
         .n(2)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, k_div_2_strided_a) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_div_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -8991,11 +8991,11 @@
         .n(2)
         .k(k)
         .a_stride(23)
-        .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, k_div_2_subtile) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9009,13 +9009,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, n_gt_2) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_gt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 3; n < 4; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9027,12 +9027,12 @@
           .m(4)
           .n(2)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, n_gt_2_strided_cn) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_gt_2_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 3; n < 4; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9045,12 +9045,12 @@
           .n(2)
           .k(k)
           .cn_stride(5)
-          .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, n_gt_2_strided_a) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_gt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 3; n < 4; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9063,12 +9063,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, n_gt_2_subtile) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 3; n < 4; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9082,13 +9082,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, n_div_2) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_div_2) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 4; n <= 6; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9100,12 +9100,12 @@
           .m(4)
           .n(2)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, n_div_2_strided_cn) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_div_2_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 4; n <= 6; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9118,12 +9118,12 @@
           .n(n)
           .k(k)
           .cn_stride(5)
-          .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, n_div_2_strided_a) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_div_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 4; n <= 6; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9136,12 +9136,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, n_div_2_subtile) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 4; n <= 6; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9155,13 +9155,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, strided_cm_subtile) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9176,13 +9176,13 @@
             .k(k)
             .cm_stride(5)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, qmin) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9193,10 +9193,10 @@
       .n(2)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, qmax) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9207,10 +9207,10 @@
       .n(2)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_4X2__NEON_LD64, strided_cm) {
+  TEST(F32_GEMM_4X2__NEON_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9221,13 +9221,13 @@
       .n(2)
       .k(2)
       .cm_stride(5)
-      .Test(xnn_f32_gemm_ukernel_4x2__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMM_4X8__NEON_LD64, k_eq_2) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9237,10 +9237,10 @@
       .m(4)
       .n(8)
       .k(2)
-      .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, strided_cn) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9251,10 +9251,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, k_eq_2_strided_a) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_eq_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9265,10 +9265,10 @@
       .n(8)
       .k(2)
       .a_stride(5)
-      .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, k_eq_2_subtile) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -9281,12 +9281,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, k_eq_2_subtile_m) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -9298,11 +9298,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, k_eq_2_subtile_n) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -9314,11 +9314,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, k_lt_2) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -9329,11 +9329,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, k_lt_2_strided_a) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_lt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -9345,11 +9345,11 @@
         .n(8)
         .k(k)
         .a_stride(5)
-        .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, k_lt_2_subtile) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9363,13 +9363,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, k_gt_2) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -9380,11 +9380,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, k_gt_2_strided_a) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_gt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -9396,11 +9396,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, k_gt_2_subtile) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9414,13 +9414,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, k_div_2) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -9431,11 +9431,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, k_div_2_strided_a) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_div_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -9447,11 +9447,11 @@
         .n(8)
         .k(k)
         .a_stride(23)
-        .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, k_div_2_subtile) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9465,13 +9465,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, n_gt_8) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9483,12 +9483,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, n_gt_8_strided_cn) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9501,12 +9501,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, n_gt_8_strided_a) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9519,12 +9519,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, n_gt_8_subtile) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9538,13 +9538,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, n_div_8) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9556,12 +9556,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, n_div_8_strided_cn) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9574,12 +9574,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, n_div_8_strided_a) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9592,12 +9592,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, n_div_8_subtile) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9611,13 +9611,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, strided_cm_subtile) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9632,13 +9632,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, qmin) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9649,10 +9649,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, qmax) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9663,10 +9663,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD64, strided_cm) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9677,13 +9677,13 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_gemm_ukernel_4x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMM_4X8__NEON_LD128, k_eq_4) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_eq_4) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9693,10 +9693,10 @@
       .m(4)
       .n(8)
       .k(4)
-      .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+      .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, strided_cn) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9707,10 +9707,10 @@
       .n(8)
       .k(4)
       .cn_stride(11)
-      .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+      .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, k_eq_4_strided_a) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_eq_4_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9721,10 +9721,10 @@
       .n(8)
       .k(4)
       .a_stride(7)
-      .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+      .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, k_eq_4_subtile) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_eq_4_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -9737,12 +9737,12 @@
           .n(n)
           .k(4)
           .iterations(1)
-          .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, k_eq_4_subtile_m) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_eq_4_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -9754,11 +9754,11 @@
         .n(8)
         .k(4)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, k_eq_4_subtile_n) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_eq_4_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -9770,11 +9770,11 @@
         .n(n)
         .k(4)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, k_lt_4) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_lt_4) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 4; k++) {
       GemmMicrokernelTester()
@@ -9785,11 +9785,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, k_lt_4_strided_a) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_lt_4_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 4; k++) {
       GemmMicrokernelTester()
@@ -9801,11 +9801,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, k_lt_4_subtile) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_lt_4_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 4; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9819,13 +9819,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, k_gt_4) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_gt_4) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 5; k < 8; k++) {
       GemmMicrokernelTester()
@@ -9836,11 +9836,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, k_gt_4_strided_a) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_gt_4_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 5; k < 8; k++) {
       GemmMicrokernelTester()
@@ -9852,11 +9852,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, k_gt_4_subtile) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_gt_4_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 5; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9870,13 +9870,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, k_div_4) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_div_4) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 8; k <= 40; k += 4) {
       GemmMicrokernelTester()
@@ -9887,11 +9887,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, k_div_4_strided_a) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_div_4_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 8; k <= 40; k += 4) {
       GemmMicrokernelTester()
@@ -9903,11 +9903,11 @@
         .n(8)
         .k(k)
         .a_stride(43)
-        .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, k_div_4_subtile) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_div_4_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 8; k <= 40; k += 4) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9921,13 +9921,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, n_gt_8) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -9939,12 +9939,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, n_gt_8_strided_cn) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -9957,12 +9957,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, n_gt_8_strided_a) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -9975,12 +9975,12 @@
           .n(n)
           .k(k)
           .a_stride(23)
-          .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, n_gt_8_subtile) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -9994,13 +9994,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, n_div_8) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -10012,12 +10012,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, n_div_8_strided_cn) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -10030,12 +10030,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, n_div_8_strided_a) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -10048,12 +10048,12 @@
           .n(n)
           .k(k)
           .a_stride(23)
-          .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, n_div_8_subtile) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -10067,13 +10067,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, strided_cm_subtile) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 20; k += 5) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -10088,13 +10088,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, qmin) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -10105,10 +10105,10 @@
       .n(8)
       .k(4)
       .qmin(128)
-      .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+      .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, qmax) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -10119,10 +10119,10 @@
       .n(8)
       .k(4)
       .qmax(128)
-      .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+      .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
   }
 
-  TEST(F32_GEMM_4X8__NEON_LD128, strided_cm) {
+  TEST(F32_GEMM_4X8__NEON_LANE_LD128, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -10133,13 +10133,13 @@
       .n(8)
       .k(4)
       .cm_stride(11)
-      .Test(xnn_f32_gemm_ukernel_4x8__neon_ld128);
+      .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMM_5X8__NEON_LD64, k_eq_2) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(5)
@@ -10149,10 +10149,10 @@
       .m(5)
       .n(8)
       .k(2)
-      .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, strided_cn) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(5)
@@ -10163,10 +10163,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, k_eq_2_strided_a) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_eq_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(5)
@@ -10177,10 +10177,10 @@
       .n(8)
       .k(2)
       .a_stride(5)
-      .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, k_eq_2_subtile) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 5; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -10193,12 +10193,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, k_eq_2_subtile_m) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 5; m++) {
       GemmMicrokernelTester()
@@ -10210,11 +10210,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, k_eq_2_subtile_n) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -10226,11 +10226,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, k_lt_2) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -10241,11 +10241,11 @@
         .m(5)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, k_lt_2_strided_a) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_lt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -10257,11 +10257,11 @@
         .n(8)
         .k(k)
         .a_stride(5)
-        .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, k_lt_2_subtile) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 5; m++) {
@@ -10275,13 +10275,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, k_gt_2) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -10292,11 +10292,11 @@
         .m(5)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, k_gt_2_strided_a) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_gt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -10308,11 +10308,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, k_gt_2_subtile) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 5; m++) {
@@ -10326,13 +10326,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, k_div_2) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -10343,11 +10343,11 @@
         .m(5)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, k_div_2_strided_a) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_div_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -10359,11 +10359,11 @@
         .n(8)
         .k(k)
         .a_stride(23)
-        .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, k_div_2_subtile) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 5; m++) {
@@ -10377,13 +10377,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, n_gt_8) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10395,12 +10395,12 @@
           .m(5)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, n_gt_8_strided_cn) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10413,12 +10413,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, n_gt_8_strided_a) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10431,12 +10431,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, n_gt_8_subtile) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10450,13 +10450,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, n_div_8) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10468,12 +10468,12 @@
           .m(5)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, n_div_8_strided_cn) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10486,12 +10486,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, n_div_8_strided_a) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10504,12 +10504,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, n_div_8_subtile) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10523,13 +10523,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, strided_cm_subtile) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 5; m++) {
@@ -10544,13 +10544,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, qmin) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(5)
@@ -10561,10 +10561,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, qmax) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(5)
@@ -10575,10 +10575,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_5X8__NEON_LD64, strided_cm) {
+  TEST(F32_GEMM_5X8__NEON_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(5)
@@ -10589,13 +10589,13 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_gemm_ukernel_5x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMM_6X8__NEON_LD64, k_eq_2) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(6)
@@ -10605,10 +10605,10 @@
       .m(6)
       .n(8)
       .k(2)
-      .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, strided_cn) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(6)
@@ -10619,10 +10619,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, k_eq_2_strided_a) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_eq_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(6)
@@ -10633,10 +10633,10 @@
       .n(8)
       .k(2)
       .a_stride(5)
-      .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, k_eq_2_subtile) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 6; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -10649,12 +10649,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, k_eq_2_subtile_m) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 6; m++) {
       GemmMicrokernelTester()
@@ -10666,11 +10666,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, k_eq_2_subtile_n) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -10682,11 +10682,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, k_lt_2) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -10697,11 +10697,11 @@
         .m(6)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, k_lt_2_strided_a) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_lt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -10713,11 +10713,11 @@
         .n(8)
         .k(k)
         .a_stride(5)
-        .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, k_lt_2_subtile) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -10731,13 +10731,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, k_gt_2) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -10748,11 +10748,11 @@
         .m(6)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, k_gt_2_strided_a) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_gt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -10764,11 +10764,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, k_gt_2_subtile) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -10782,13 +10782,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, k_div_2) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -10799,11 +10799,11 @@
         .m(6)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, k_div_2_strided_a) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_div_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -10815,11 +10815,11 @@
         .n(8)
         .k(k)
         .a_stride(23)
-        .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, k_div_2_subtile) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -10833,13 +10833,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, n_gt_8) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10851,12 +10851,12 @@
           .m(6)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, n_gt_8_strided_cn) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10869,12 +10869,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, n_gt_8_strided_a) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10887,12 +10887,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, n_gt_8_subtile) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10906,13 +10906,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, n_div_8) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10924,12 +10924,12 @@
           .m(6)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, n_div_8_strided_cn) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10942,12 +10942,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, n_div_8_strided_a) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10960,12 +10960,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, n_div_8_subtile) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10979,13 +10979,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, strided_cm_subtile) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -11000,13 +11000,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, qmin) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(6)
@@ -11017,10 +11017,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, qmax) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(6)
@@ -11031,10 +11031,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMM_6X8__NEON_LD64, strided_cm) {
+  TEST(F32_GEMM_6X8__NEON_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(6)
@@ -11045,7 +11045,7 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_gemm_ukernel_6x8__neon_ld64);
+      .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
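The hunks below additionally narrow the NEONFMA guards from XNN_ARCH_ARM || XNN_ARCH_ARM64 to XNN_ARCH_ARM64: the lane-indexed fused multiply-add intrinsics that the renamed *-neonfma-lane-* microkernels are named for exist only in the AArch64 ACLE, while the plain-NEON lane form is also available on 32-bit ARM. A minimal sketch of the distinction behind the new "lane" infix follows; the helper name and register shapes are illustrative assumptions, not code from this change.

#include <arm_neon.h>

// Hypothetical helper, for illustration only: accumulate one column of B
// scaled by lane 0 of A -- the lane-indexed multiply-accumulate pattern
// that the "lane" microkernel names refer to.
static inline float32x4_t acc_lane0(float32x4_t vacc, float32x4_t vb, float32x4_t va) {
#if defined(__aarch64__)
  // The lane form of fused multiply-add is an AArch64-only intrinsic,
  // matching the XNN_ARCH_ARM64 guard on the NEONFMA lane tests below.
  return vfmaq_lane_f32(vacc, vb, vget_low_f32(va), 0);
#else
  // The plain NEON lane multiply-accumulate also exists on 32-bit ARM, so
  // the NEON lane tests above keep the dual XNN_ARCH_ARM || XNN_ARCH_ARM64 guard.
  return vmlaq_lane_f32(vacc, vb, vget_low_f32(va), 0);
#endif
}
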
@@ -12874,8 +12874,8 @@
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, k_eq_2) {
+#if XNN_ARCH_ARM64
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(1)
@@ -12885,10 +12885,10 @@
       .m(1)
       .n(8)
       .k(2)
-      .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, strided_cn) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(1)
@@ -12899,10 +12899,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, k_eq_2_strided_a) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(1)
@@ -12913,10 +12913,10 @@
       .n(8)
       .k(2)
       .a_stride(5)
-      .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, k_eq_2_subtile) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -12929,12 +12929,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, k_eq_2_subtile_m) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -12946,11 +12946,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, k_eq_2_subtile_n) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -12962,11 +12962,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, k_lt_2) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -12977,11 +12977,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, k_lt_2_strided_a) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -12993,11 +12993,11 @@
         .n(8)
         .k(k)
         .a_stride(5)
-        .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, k_lt_2_subtile) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -13011,13 +13011,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, k_gt_2) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -13028,11 +13028,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, k_gt_2_strided_a) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -13044,11 +13044,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, k_gt_2_subtile) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -13062,13 +13062,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, k_div_2) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -13079,11 +13079,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, k_div_2_strided_a) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -13095,11 +13095,11 @@
         .n(8)
         .k(k)
         .a_stride(23)
-        .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, k_div_2_subtile) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -13113,13 +13113,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, n_gt_8) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13131,12 +13131,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, n_gt_8_strided_cn) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13149,12 +13149,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, n_gt_8_strided_a) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13167,12 +13167,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, n_gt_8_subtile) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13186,13 +13186,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, n_div_8) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13204,12 +13204,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, n_div_8_strided_cn) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13222,12 +13222,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, n_div_8_strided_a) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13240,12 +13240,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, n_div_8_subtile) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13259,13 +13259,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, strided_cm_subtile) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -13280,13 +13280,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, qmin) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(1)
@@ -13297,10 +13297,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, qmax) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(1)
@@ -13311,10 +13311,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_1X8__NEONFMA_LD64, strided_cm) {
+  TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(1)
@@ -13325,13 +13325,13 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_gemm_ukernel_1x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
   }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, k_eq_2) {
+#if XNN_ARCH_ARM64
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -13341,10 +13341,10 @@
       .m(4)
       .n(8)
       .k(2)
-      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, strided_cn) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -13355,10 +13355,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, k_eq_2_strided_a) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -13369,10 +13369,10 @@
       .n(8)
       .k(2)
       .a_stride(5)
-      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, k_eq_2_subtile) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -13385,12 +13385,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, k_eq_2_subtile_m) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -13402,11 +13402,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, k_eq_2_subtile_n) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -13418,11 +13418,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, k_lt_2) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -13433,11 +13433,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, k_lt_2_strided_a) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -13449,11 +13449,11 @@
         .n(8)
         .k(k)
         .a_stride(5)
-        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, k_lt_2_subtile) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13467,13 +13467,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, k_gt_2) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -13484,11 +13484,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, k_gt_2_strided_a) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -13500,11 +13500,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, k_gt_2_subtile) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13518,13 +13518,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, k_div_2) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -13535,11 +13535,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, k_div_2_strided_a) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -13551,11 +13551,11 @@
         .n(8)
         .k(k)
         .a_stride(23)
-        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, k_div_2_subtile) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13569,13 +13569,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, n_gt_8) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13587,12 +13587,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, n_gt_8_strided_cn) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13605,12 +13605,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, n_gt_8_strided_a) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13623,12 +13623,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, n_gt_8_subtile) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13642,13 +13642,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, n_div_8) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13660,12 +13660,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, n_div_8_strided_cn) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13678,12 +13678,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, n_div_8_strided_a) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13696,12 +13696,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, n_div_8_subtile) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13715,13 +13715,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, strided_cm_subtile) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13736,13 +13736,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, qmin) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -13753,10 +13753,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, qmax) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -13767,10 +13767,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD64, strided_cm) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -13781,13 +13781,13 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
   }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, k_eq_4) {
+#if XNN_ARCH_ARM64
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -13797,10 +13797,10 @@
       .m(4)
       .n(8)
       .k(4)
-      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, strided_cn) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -13811,10 +13811,10 @@
       .n(8)
       .k(4)
       .cn_stride(11)
-      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, k_eq_4_strided_a) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -13825,10 +13825,10 @@
       .n(8)
       .k(4)
       .a_stride(7)
-      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, k_eq_4_subtile) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -13841,12 +13841,12 @@
           .n(n)
           .k(4)
           .iterations(1)
-          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, k_eq_4_subtile_m) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_m) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -13858,11 +13858,11 @@
         .n(8)
         .k(4)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, k_eq_4_subtile_n) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_n) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -13874,11 +13874,11 @@
         .n(n)
         .k(4)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, k_lt_4) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_lt_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 4; k++) {
       GemmMicrokernelTester()
@@ -13889,11 +13889,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, k_lt_4_strided_a) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_lt_4_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 4; k++) {
       GemmMicrokernelTester()
@@ -13905,11 +13905,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, k_lt_4_subtile) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_lt_4_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 4; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13923,13 +13923,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, k_gt_4) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_gt_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 5; k < 8; k++) {
       GemmMicrokernelTester()
@@ -13940,11 +13940,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, k_gt_4_strided_a) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_gt_4_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 5; k < 8; k++) {
       GemmMicrokernelTester()
@@ -13956,11 +13956,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, k_gt_4_subtile) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_gt_4_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 5; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13974,13 +13974,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, k_div_4) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_div_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 8; k <= 40; k += 4) {
       GemmMicrokernelTester()
@@ -13991,11 +13991,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, k_div_4_strided_a) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_div_4_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 8; k <= 40; k += 4) {
       GemmMicrokernelTester()
@@ -14007,11 +14007,11 @@
         .n(8)
         .k(k)
         .a_stride(43)
-        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, k_div_4_subtile) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_div_4_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 8; k <= 40; k += 4) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -14025,13 +14025,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, n_gt_8) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -14043,12 +14043,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, n_gt_8_strided_cn) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -14061,12 +14061,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, n_gt_8_strided_a) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -14079,12 +14079,12 @@
           .n(n)
           .k(k)
           .a_stride(23)
-          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, n_gt_8_subtile) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -14098,13 +14098,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, n_div_8) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_div_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -14116,12 +14116,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, n_div_8_strided_cn) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -14134,12 +14134,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, n_div_8_strided_a) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -14152,12 +14152,12 @@
           .n(n)
           .k(k)
           .a_stride(23)
-          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, n_div_8_subtile) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -14171,13 +14171,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, strided_cm_subtile) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 20; k += 5) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -14192,13 +14192,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, qmin) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -14209,10 +14209,10 @@
       .n(8)
       .k(4)
       .qmin(128)
-      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, qmax) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, qmax) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -14223,10 +14223,10 @@
       .n(8)
       .k(4)
       .qmax(128)
-      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
   }
 
-  TEST(F32_GEMM_4X8__NEONFMA_LD128, strided_cm) {
+  TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, strided_cm) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -14237,13 +14237,13 @@
       .n(8)
       .k(4)
       .cm_stride(11)
-      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_ld128);
+      .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
   }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, k_eq_2) {
+#if XNN_ARCH_ARM64
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(5)
@@ -14253,10 +14253,10 @@
       .m(5)
       .n(8)
       .k(2)
-      .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, strided_cn) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(5)
@@ -14267,10 +14267,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, k_eq_2_strided_a) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(5)
@@ -14281,10 +14281,10 @@
       .n(8)
       .k(2)
       .a_stride(5)
-      .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, k_eq_2_subtile) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 5; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -14297,12 +14297,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, k_eq_2_subtile_m) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 5; m++) {
       GemmMicrokernelTester()
@@ -14314,11 +14314,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, k_eq_2_subtile_n) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -14330,11 +14330,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, k_lt_2) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -14345,11 +14345,11 @@
         .m(5)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, k_lt_2_strided_a) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -14361,11 +14361,11 @@
         .n(8)
         .k(k)
         .a_stride(5)
-        .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, k_lt_2_subtile) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 5; m++) {
@@ -14379,13 +14379,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, k_gt_2) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -14396,11 +14396,11 @@
         .m(5)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, k_gt_2_strided_a) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -14412,11 +14412,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, k_gt_2_subtile) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 5; m++) {
@@ -14430,13 +14430,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, k_div_2) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -14447,11 +14447,11 @@
         .m(5)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, k_div_2_strided_a) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -14463,11 +14463,11 @@
         .n(8)
         .k(k)
         .a_stride(23)
-        .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, k_div_2_subtile) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 5; m++) {
@@ -14481,13 +14481,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, n_gt_8) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14499,12 +14499,12 @@
           .m(5)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, n_gt_8_strided_cn) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14517,12 +14517,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, n_gt_8_strided_a) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14535,12 +14535,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, n_gt_8_subtile) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14554,13 +14554,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, n_div_8) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14572,12 +14572,12 @@
           .m(5)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, n_div_8_strided_cn) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14590,12 +14590,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, n_div_8_strided_a) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14608,12 +14608,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, n_div_8_subtile) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14627,13 +14627,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, strided_cm_subtile) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 5; m++) {
@@ -14648,13 +14648,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, qmin) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(5)
@@ -14665,10 +14665,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, qmax) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(5)
@@ -14679,10 +14679,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_5X8__NEONFMA_LD64, strided_cm) {
+  TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(5)
@@ -14693,13 +14693,13 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_gemm_ukernel_5x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
   }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, k_eq_2) {
+#if XNN_ARCH_ARM64
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(6)
@@ -14709,10 +14709,10 @@
       .m(6)
       .n(8)
       .k(2)
-      .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, strided_cn) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(6)
@@ -14723,10 +14723,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, k_eq_2_strided_a) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(6)
@@ -14737,10 +14737,10 @@
       .n(8)
       .k(2)
       .a_stride(5)
-      .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, k_eq_2_subtile) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 6; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -14753,12 +14753,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, k_eq_2_subtile_m) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 6; m++) {
       GemmMicrokernelTester()
@@ -14770,11 +14770,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, k_eq_2_subtile_n) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -14786,11 +14786,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, k_lt_2) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -14801,11 +14801,11 @@
         .m(6)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, k_lt_2_strided_a) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -14817,11 +14817,11 @@
         .n(8)
         .k(k)
         .a_stride(5)
-        .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, k_lt_2_subtile) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -14835,13 +14835,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, k_gt_2) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -14852,11 +14852,11 @@
         .m(6)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, k_gt_2_strided_a) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -14868,11 +14868,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, k_gt_2_subtile) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -14886,13 +14886,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, k_div_2) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -14903,11 +14903,11 @@
         .m(6)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, k_div_2_strided_a) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -14919,11 +14919,11 @@
         .n(8)
         .k(k)
         .a_stride(23)
-        .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, k_div_2_subtile) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -14937,13 +14937,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, n_gt_8) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14955,12 +14955,12 @@
           .m(6)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, n_gt_8_strided_cn) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14973,12 +14973,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, n_gt_8_strided_a) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14991,12 +14991,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, n_gt_8_subtile) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -15010,13 +15010,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, n_div_8) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -15028,12 +15028,12 @@
           .m(6)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, n_div_8_strided_cn) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -15046,12 +15046,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, n_div_8_strided_a) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -15064,12 +15064,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, n_div_8_subtile) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -15083,13 +15083,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, strided_cm_subtile) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -15104,13 +15104,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, qmin) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(6)
@@ -15121,10 +15121,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, qmax) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(6)
@@ -15135,10 +15135,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMM_6X8__NEONFMA_LD64, strided_cm) {
+  TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(6)
@@ -15149,9 +15149,9 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_gemm_ukernel_6x8__neonfma_ld64);
+      .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
   }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
diff --git a/test/f32-gemm.yaml b/test/f32-gemm.yaml
index 2b42de5..56b93fa 100644
--- a/test/f32-gemm.yaml
+++ b/test/f32-gemm.yaml
@@ -65,17 +65,17 @@
 - name: xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128
   k-block: 4
   assembly: true
-- name: xnn_f32_gemm_ukernel_1x8__neon_ld64
+- name: xnn_f32_gemm_ukernel_1x8__neon_lane_ld64
   k-block: 2
-- name: xnn_f32_gemm_ukernel_4x2__neon_ld64
+- name: xnn_f32_gemm_ukernel_4x2__neon_lane_ld64
   k-block: 2
-- name: xnn_f32_gemm_ukernel_4x8__neon_ld64
+- name: xnn_f32_gemm_ukernel_4x8__neon_lane_ld64
   k-block: 2
-- name: xnn_f32_gemm_ukernel_4x8__neon_ld128
+- name: xnn_f32_gemm_ukernel_4x8__neon_lane_ld128
   k-block: 4
-- name: xnn_f32_gemm_ukernel_5x8__neon_ld64
+- name: xnn_f32_gemm_ukernel_5x8__neon_lane_ld64
   k-block: 2
-- name: xnn_f32_gemm_ukernel_6x8__neon_ld64
+- name: xnn_f32_gemm_ukernel_6x8__neon_lane_ld64
   k-block: 2
 - name: xnn_f32_gemm_ukernel_1x8s4__neon
   k-block: 4
@@ -85,16 +85,26 @@
   k-block: 4
 - name: xnn_f32_gemm_ukernel_8x8s4__neon
   k-block: 4
-- name: xnn_f32_gemm_ukernel_1x8__neonfma_ld64
+- name: xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64
   k-block: 2
-- name: xnn_f32_gemm_ukernel_4x8__neonfma_ld64
+  arch:
+    - aarch64
+- name: xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64
   k-block: 2
-- name: xnn_f32_gemm_ukernel_4x8__neonfma_ld128
+  arch:
+    - aarch64
+- name: xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128
   k-block: 4
-- name: xnn_f32_gemm_ukernel_5x8__neonfma_ld64
+  arch:
+    - aarch64
+- name: xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64
   k-block: 2
-- name: xnn_f32_gemm_ukernel_6x8__neonfma_ld64
+  arch:
+    - aarch64
+- name: xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64
   k-block: 2
+  arch:
+    - aarch64
 - name: xnn_f32_gemm_ukernel_1x8s4__neonfma
   k-block: 4
 - name: xnn_f32_gemm_ukernel_4x8s4__neonfma
diff --git a/test/f32-gemminc.cc b/test/f32-gemminc.cc
index 19c7d5d..fbab1c5 100644
--- a/test/f32-gemminc.cc
+++ b/test/f32-gemminc.cc
@@ -8315,7 +8315,7 @@
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMMINC_1X8__NEON_LD64, k_eq_2) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -8325,10 +8325,10 @@
       .m(1)
       .n(8)
       .k(2)
-      .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, strided_cn) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -8339,10 +8339,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, k_eq_2_strided_a) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_eq_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -8353,10 +8353,10 @@
       .n(8)
       .k(2)
       .a_stride(5)
-      .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, k_eq_2_subtile) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8369,12 +8369,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, k_eq_2_subtile_m) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -8386,11 +8386,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, k_eq_2_subtile_n) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8402,11 +8402,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, k_lt_2) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -8417,11 +8417,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, k_lt_2_strided_a) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_lt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -8433,11 +8433,11 @@
         .n(8)
         .k(k)
         .a_stride(5)
-        .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, k_lt_2_subtile) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -8451,13 +8451,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, k_gt_2) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -8468,11 +8468,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, k_gt_2_strided_a) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_gt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -8484,11 +8484,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, k_gt_2_subtile) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -8502,13 +8502,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, k_div_2) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -8519,11 +8519,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, k_div_2_strided_a) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_div_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -8535,11 +8535,11 @@
         .n(8)
         .k(k)
         .a_stride(23)
-        .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, k_div_2_subtile) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -8553,13 +8553,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, n_gt_8) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8571,12 +8571,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, n_gt_8_strided_cn) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8589,12 +8589,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, n_gt_8_strided_a) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8607,12 +8607,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, n_gt_8_subtile) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8626,13 +8626,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, n_div_8) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8644,12 +8644,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, n_div_8_strided_cn) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8662,12 +8662,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, n_div_8_strided_a) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8680,12 +8680,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, n_div_8_subtile) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8699,13 +8699,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, strided_cm_subtile) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -8720,13 +8720,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, qmin) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -8737,10 +8737,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, qmax) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -8751,10 +8751,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_1X8__NEON_LD64, strided_cm) {
+  TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -8765,13 +8765,13 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_1x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMMINC_4X8__NEON_LD64, k_eq_2) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -8781,10 +8781,10 @@
       .m(4)
       .n(8)
       .k(2)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, strided_cn) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -8795,10 +8795,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, k_eq_2_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_eq_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -8809,10 +8809,10 @@
       .n(8)
       .k(2)
       .a_stride(5)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, k_eq_2_subtile) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8825,12 +8825,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, k_eq_2_subtile_m) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -8842,11 +8842,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, k_eq_2_subtile_n) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8858,11 +8858,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, k_lt_2) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -8873,11 +8873,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, k_lt_2_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_lt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -8889,11 +8889,11 @@
         .n(8)
         .k(k)
         .a_stride(5)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, k_lt_2_subtile) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -8907,13 +8907,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, k_gt_2) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -8924,11 +8924,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, k_gt_2_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_gt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -8940,11 +8940,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, k_gt_2_subtile) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -8958,13 +8958,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, k_div_2) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -8975,11 +8975,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, k_div_2_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_div_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -8991,11 +8991,11 @@
         .n(8)
         .k(k)
         .a_stride(23)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, k_div_2_subtile) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9009,13 +9009,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, n_gt_8) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9027,12 +9027,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, n_gt_8_strided_cn) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9045,12 +9045,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, n_gt_8_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9063,12 +9063,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, n_gt_8_subtile) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9082,13 +9082,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, n_div_8) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9100,12 +9100,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, n_div_8_strided_cn) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9118,12 +9118,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, n_div_8_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9136,12 +9136,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, n_div_8_subtile) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9155,13 +9155,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, strided_cm_subtile) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9176,13 +9176,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, qmin) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9193,10 +9193,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, qmax) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9207,10 +9207,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD64, strided_cm) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9221,13 +9221,13 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMMINC_4X8__NEON_LD128, k_eq_4) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_eq_4) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9237,10 +9237,10 @@
       .m(4)
       .n(8)
       .k(4)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, strided_cn) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9251,10 +9251,10 @@
       .n(8)
       .k(4)
       .cn_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, k_eq_4_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_eq_4_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9265,10 +9265,10 @@
       .n(8)
       .k(4)
       .a_stride(7)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, k_eq_4_subtile) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_eq_4_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -9281,12 +9281,12 @@
           .n(n)
           .k(4)
           .iterations(1)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, k_eq_4_subtile_m) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_eq_4_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -9298,11 +9298,11 @@
         .n(8)
         .k(4)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, k_eq_4_subtile_n) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_eq_4_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -9314,11 +9314,11 @@
         .n(n)
         .k(4)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, k_lt_4) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_lt_4) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 4; k++) {
       GemmMicrokernelTester()
@@ -9329,11 +9329,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, k_lt_4_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_lt_4_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 4; k++) {
       GemmMicrokernelTester()
@@ -9345,11 +9345,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, k_lt_4_subtile) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_lt_4_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 4; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9363,13 +9363,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, k_gt_4) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_gt_4) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 5; k < 8; k++) {
       GemmMicrokernelTester()
@@ -9380,11 +9380,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, k_gt_4_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_gt_4_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 5; k < 8; k++) {
       GemmMicrokernelTester()
@@ -9396,11 +9396,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, k_gt_4_subtile) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_gt_4_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 5; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9414,13 +9414,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, k_div_4) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_div_4) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 8; k <= 40; k += 4) {
       GemmMicrokernelTester()
@@ -9431,11 +9431,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, k_div_4_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_div_4_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 8; k <= 40; k += 4) {
       GemmMicrokernelTester()
@@ -9447,11 +9447,11 @@
         .n(8)
         .k(k)
         .a_stride(43)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, k_div_4_subtile) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_div_4_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 8; k <= 40; k += 4) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9465,13 +9465,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, n_gt_8) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -9483,12 +9483,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, n_gt_8_strided_cn) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -9501,12 +9501,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, n_gt_8_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -9519,12 +9519,12 @@
           .n(n)
           .k(k)
           .a_stride(23)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, n_gt_8_subtile) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -9538,13 +9538,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, n_div_8) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -9556,12 +9556,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, n_div_8_strided_cn) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -9574,12 +9574,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, n_div_8_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -9592,12 +9592,12 @@
           .n(n)
           .k(k)
           .a_stride(23)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, n_div_8_subtile) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -9611,13 +9611,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, strided_cm_subtile) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 20; k += 5) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -9632,13 +9632,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, qmin) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9649,10 +9649,10 @@
       .n(8)
       .k(4)
       .qmin(128)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, qmax) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9663,10 +9663,10 @@
       .n(8)
       .k(4)
       .qmax(128)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
   }
 
-  TEST(F32_GEMMINC_4X8__NEON_LD128, strided_cm) {
+  TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -9677,13 +9677,13 @@
       .n(8)
       .k(4)
       .cm_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neon_ld128);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMMINC_5X8__NEON_LD64, k_eq_2) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(5)
@@ -9693,10 +9693,10 @@
       .m(5)
       .n(8)
       .k(2)
-      .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, strided_cn) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(5)
@@ -9707,10 +9707,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, k_eq_2_strided_a) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_eq_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(5)
@@ -9721,10 +9721,10 @@
       .n(8)
       .k(2)
       .a_stride(5)
-      .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, k_eq_2_subtile) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 5; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -9737,12 +9737,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, k_eq_2_subtile_m) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 5; m++) {
       GemmMicrokernelTester()
@@ -9754,11 +9754,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, k_eq_2_subtile_n) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -9770,11 +9770,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, k_lt_2) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -9785,11 +9785,11 @@
         .m(5)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, k_lt_2_strided_a) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_lt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -9801,11 +9801,11 @@
         .n(8)
         .k(k)
         .a_stride(5)
-        .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, k_lt_2_subtile) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 5; m++) {
@@ -9819,13 +9819,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, k_gt_2) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -9836,11 +9836,11 @@
         .m(5)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, k_gt_2_strided_a) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_gt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -9852,11 +9852,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, k_gt_2_subtile) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 5; m++) {
@@ -9870,13 +9870,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, k_div_2) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -9887,11 +9887,11 @@
         .m(5)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, k_div_2_strided_a) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_div_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -9903,11 +9903,11 @@
         .n(8)
         .k(k)
         .a_stride(23)
-        .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, k_div_2_subtile) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 5; m++) {
@@ -9921,13 +9921,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, n_gt_8) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9939,12 +9939,12 @@
           .m(5)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, n_gt_8_strided_cn) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9957,12 +9957,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, n_gt_8_strided_a) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9975,12 +9975,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, n_gt_8_subtile) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -9994,13 +9994,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, n_div_8) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10012,12 +10012,12 @@
           .m(5)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, n_div_8_strided_cn) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10030,12 +10030,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, n_div_8_strided_a) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10048,12 +10048,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, n_div_8_subtile) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10067,13 +10067,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, strided_cm_subtile) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 5; m++) {
@@ -10088,13 +10088,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, qmin) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(5)
@@ -10105,10 +10105,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, qmax) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(5)
@@ -10119,10 +10119,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_5X8__NEON_LD64, strided_cm) {
+  TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(5)
@@ -10133,13 +10133,13 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_5x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMMINC_6X8__NEON_LD64, k_eq_2) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(6)
@@ -10149,10 +10149,10 @@
       .m(6)
       .n(8)
       .k(2)
-      .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, strided_cn) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(6)
@@ -10163,10 +10163,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, k_eq_2_strided_a) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_eq_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(6)
@@ -10177,10 +10177,10 @@
       .n(8)
       .k(2)
       .a_stride(5)
-      .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, k_eq_2_subtile) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 6; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -10193,12 +10193,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, k_eq_2_subtile_m) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 6; m++) {
       GemmMicrokernelTester()
@@ -10210,11 +10210,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, k_eq_2_subtile_n) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -10226,11 +10226,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, k_lt_2) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -10241,11 +10241,11 @@
         .m(6)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, k_lt_2_strided_a) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_lt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -10257,11 +10257,11 @@
         .n(8)
         .k(k)
         .a_stride(5)
-        .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, k_lt_2_subtile) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -10275,13 +10275,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, k_gt_2) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -10292,11 +10292,11 @@
         .m(6)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, k_gt_2_strided_a) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_gt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -10308,11 +10308,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, k_gt_2_subtile) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -10326,13 +10326,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, k_div_2) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -10343,11 +10343,11 @@
         .m(6)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, k_div_2_strided_a) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_div_2_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -10359,11 +10359,11 @@
         .n(8)
         .k(k)
         .a_stride(23)
-        .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, k_div_2_subtile) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -10377,13 +10377,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, n_gt_8) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10395,12 +10395,12 @@
           .m(6)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, n_gt_8_strided_cn) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10413,12 +10413,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, n_gt_8_strided_a) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10431,12 +10431,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, n_gt_8_subtile) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10450,13 +10450,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, n_div_8) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10468,12 +10468,12 @@
           .m(6)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, n_div_8_strided_cn) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10486,12 +10486,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, n_div_8_strided_a) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10504,12 +10504,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, n_div_8_subtile) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10523,13 +10523,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, strided_cm_subtile) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -10544,13 +10544,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, qmin) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(6)
@@ -10561,10 +10561,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, qmax) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(6)
@@ -10575,10 +10575,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_6X8__NEON_LD64, strided_cm) {
+  TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(6)
@@ -10589,7 +10589,7 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_6x8__neon_ld64);
+      .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
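
Note: the NEONFMA lane tests below also narrow their guard from
XNN_ARCH_ARM || XNN_ARCH_ARM64 to XNN_ARCH_ARM64. To our understanding the
lane forms of the fused multiply-add intrinsics (e.g. vfmaq_lane_f32) are
only defined by the AArch64 ACLE; AArch32 NEON provides vfmaq_f32 but no
_lane variant. A hedged sketch of the A64-only form:

    #include <arm_neon.h>

    #if defined(__aarch64__)
    // Sketch only: fused multiply-accumulate by lane, as used by the
    // *neonfma_lane* kernels; unavailable as an intrinsic on AArch32.
    static float32x4_t fma_lane_step(float32x4_t acc, float32x4_t b,
                                     float32x2_t a) {
      return vfmaq_lane_f32(acc, b, a, 0);  // acc += b * a[0], single rounding
    }
    #endif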
 
@@ -12418,8 +12418,8 @@
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, k_eq_2) {
+#if XNN_ARCH_ARM64
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(1)
@@ -12429,10 +12429,10 @@
       .m(1)
       .n(8)
       .k(2)
-      .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, strided_cn) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(1)
@@ -12443,10 +12443,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, k_eq_2_strided_a) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(1)
@@ -12457,10 +12457,10 @@
       .n(8)
       .k(2)
       .a_stride(5)
-      .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, k_eq_2_subtile) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -12473,12 +12473,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, k_eq_2_subtile_m) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -12490,11 +12490,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, k_eq_2_subtile_n) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -12506,11 +12506,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, k_lt_2) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -12521,11 +12521,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, k_lt_2_strided_a) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -12537,11 +12537,11 @@
         .n(8)
         .k(k)
         .a_stride(5)
-        .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, k_lt_2_subtile) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -12555,13 +12555,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, k_gt_2) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -12572,11 +12572,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, k_gt_2_strided_a) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -12588,11 +12588,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, k_gt_2_subtile) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -12606,13 +12606,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, k_div_2) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -12623,11 +12623,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, k_div_2_strided_a) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -12639,11 +12639,11 @@
         .n(8)
         .k(k)
         .a_stride(23)
-        .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, k_div_2_subtile) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -12657,13 +12657,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, n_gt_8) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12675,12 +12675,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, n_gt_8_strided_cn) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12693,12 +12693,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, n_gt_8_strided_a) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12711,12 +12711,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, n_gt_8_subtile) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12730,13 +12730,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, n_div_8) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12748,12 +12748,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, n_div_8_strided_cn) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12766,12 +12766,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, n_div_8_strided_a) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12784,12 +12784,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, n_div_8_subtile) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12803,13 +12803,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, strided_cm_subtile) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -12824,13 +12824,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, qmin) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(1)
@@ -12841,10 +12841,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, qmax) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(1)
@@ -12855,10 +12855,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_1X8__NEONFMA_LD64, strided_cm) {
+  TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(1)
@@ -12869,13 +12869,13 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
   }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, k_eq_2) {
+#if XNN_ARCH_ARM64
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -12885,10 +12885,10 @@
       .m(4)
       .n(8)
       .k(2)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, strided_cn) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -12899,10 +12899,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, k_eq_2_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -12913,10 +12913,10 @@
       .n(8)
       .k(2)
       .a_stride(5)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, k_eq_2_subtile) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -12929,12 +12929,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, k_eq_2_subtile_m) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -12946,11 +12946,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, k_eq_2_subtile_n) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -12962,11 +12962,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, k_lt_2) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -12977,11 +12977,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, k_lt_2_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -12993,11 +12993,11 @@
         .n(8)
         .k(k)
         .a_stride(5)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, k_lt_2_subtile) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13011,13 +13011,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, k_gt_2) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -13028,11 +13028,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, k_gt_2_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -13044,11 +13044,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, k_gt_2_subtile) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13062,13 +13062,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, k_div_2) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -13079,11 +13079,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, k_div_2_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -13095,11 +13095,11 @@
         .n(8)
         .k(k)
         .a_stride(23)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, k_div_2_subtile) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13113,13 +13113,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, n_gt_8) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13131,12 +13131,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, n_gt_8_strided_cn) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13149,12 +13149,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, n_gt_8_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13167,12 +13167,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, n_gt_8_subtile) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13186,13 +13186,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, n_div_8) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13204,12 +13204,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, n_div_8_strided_cn) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13222,12 +13222,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, n_div_8_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13240,12 +13240,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, n_div_8_subtile) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -13259,13 +13259,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, strided_cm_subtile) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13280,13 +13280,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, qmin) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -13297,10 +13297,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, qmax) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -13311,10 +13311,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD64, strided_cm) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -13325,13 +13325,13 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
   }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, k_eq_4) {
+#if XNN_ARCH_ARM64
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -13341,10 +13341,10 @@
       .m(4)
       .n(8)
       .k(4)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, strided_cn) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -13355,10 +13355,10 @@
       .n(8)
       .k(4)
       .cn_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, k_eq_4_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_eq_4_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -13369,10 +13369,10 @@
       .n(8)
       .k(4)
       .a_stride(7)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, k_eq_4_subtile) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -13385,12 +13385,12 @@
           .n(n)
           .k(4)
           .iterations(1)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, k_eq_4_subtile_m) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_m) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -13402,11 +13402,11 @@
         .n(8)
         .k(4)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, k_eq_4_subtile_n) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_n) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -13418,11 +13418,11 @@
         .n(n)
         .k(4)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, k_lt_4) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_lt_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 4; k++) {
       GemmMicrokernelTester()
@@ -13433,11 +13433,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, k_lt_4_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_lt_4_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 4; k++) {
       GemmMicrokernelTester()
@@ -13449,11 +13449,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, k_lt_4_subtile) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_lt_4_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 4; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13467,13 +13467,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, k_gt_4) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_gt_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 5; k < 8; k++) {
       GemmMicrokernelTester()
@@ -13484,11 +13484,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, k_gt_4_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_gt_4_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 5; k < 8; k++) {
       GemmMicrokernelTester()
@@ -13500,11 +13500,11 @@
         .n(8)
         .k(k)
         .a_stride(11)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, k_gt_4_subtile) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_gt_4_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 5; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13518,13 +13518,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, k_div_4) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_div_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 8; k <= 40; k += 4) {
       GemmMicrokernelTester()
@@ -13535,11 +13535,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, k_div_4_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_div_4_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 8; k <= 40; k += 4) {
       GemmMicrokernelTester()
@@ -13551,11 +13551,11 @@
         .n(8)
         .k(k)
         .a_stride(43)
-        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, k_div_4_subtile) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_div_4_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 8; k <= 40; k += 4) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13569,13 +13569,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, n_gt_8) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -13587,12 +13587,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, n_gt_8_strided_cn) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -13605,12 +13605,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, n_gt_8_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -13623,12 +13623,12 @@
           .n(n)
           .k(k)
           .a_stride(23)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, n_gt_8_subtile) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -13642,13 +13642,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, n_div_8) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_div_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -13660,12 +13660,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, n_div_8_strided_cn) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -13678,12 +13678,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, n_div_8_strided_a) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -13696,12 +13696,12 @@
           .n(n)
           .k(k)
           .a_stride(23)
-          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, n_div_8_subtile) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -13715,13 +13715,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, strided_cm_subtile) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 20; k += 5) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -13736,13 +13736,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, qmin) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -13753,10 +13753,10 @@
       .n(8)
       .k(4)
       .qmin(128)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, qmax) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, qmax) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -13767,10 +13767,10 @@
       .n(8)
       .k(4)
       .qmax(128)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
   }
 
-  TEST(F32_GEMMINC_4X8__NEONFMA_LD128, strided_cm) {
+  TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, strided_cm) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -13781,13 +13781,13 @@
       .n(8)
       .k(4)
       .cm_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128);
+      .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
   }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, k_eq_2) {
+#if XNN_ARCH_ARM64
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(5)
@@ -13797,10 +13797,10 @@
       .m(5)
       .n(8)
       .k(2)
-      .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, strided_cn) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(5)
@@ -13811,10 +13811,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, k_eq_2_strided_a) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(5)
@@ -13825,10 +13825,10 @@
       .n(8)
       .k(2)
       .a_stride(5)
-      .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, k_eq_2_subtile) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 5; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -13841,12 +13841,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, k_eq_2_subtile_m) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 5; m++) {
       GemmMicrokernelTester()
@@ -13858,11 +13858,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, k_eq_2_subtile_n) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -13874,11 +13874,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, k_lt_2) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -13889,11 +13889,11 @@
         .m(5)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, k_lt_2_strided_a) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -13905,11 +13905,11 @@
         .n(8)
         .k(k)
         .a_stride(5)
-        .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, k_lt_2_subtile) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 5; m++) {
@@ -13923,13 +13923,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, k_gt_2) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -13940,11 +13940,11 @@
         .m(5)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, k_gt_2_strided_a) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -13956,11 +13956,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, k_gt_2_subtile) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 5; m++) {
@@ -13974,13 +13974,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, k_div_2) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -13991,11 +13991,11 @@
         .m(5)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, k_div_2_strided_a) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -14007,11 +14007,11 @@
         .n(8)
         .k(k)
         .a_stride(23)
-        .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, k_div_2_subtile) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 5; m++) {
@@ -14025,13 +14025,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, n_gt_8) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14043,12 +14043,12 @@
           .m(5)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, n_gt_8_strided_cn) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14061,12 +14061,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, n_gt_8_strided_a) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14079,12 +14079,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, n_gt_8_subtile) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14098,13 +14098,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, n_div_8) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14116,12 +14116,12 @@
           .m(5)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, n_div_8_strided_cn) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14134,12 +14134,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, n_div_8_strided_a) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14152,12 +14152,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, n_div_8_subtile) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14171,13 +14171,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, strided_cm_subtile) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 5; m++) {
@@ -14192,13 +14192,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, qmin) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(5)
@@ -14209,10 +14209,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, qmax) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(5)
@@ -14223,10 +14223,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_5X8__NEONFMA_LD64, strided_cm) {
+  TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(5)
@@ -14237,13 +14237,13 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
   }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, k_eq_2) {
+#if XNN_ARCH_ARM64
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(6)
@@ -14253,10 +14253,10 @@
       .m(6)
       .n(8)
       .k(2)
-      .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, strided_cn) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(6)
@@ -14267,10 +14267,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, k_eq_2_strided_a) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(6)
@@ -14281,10 +14281,10 @@
       .n(8)
       .k(2)
       .a_stride(5)
-      .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, k_eq_2_subtile) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 6; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -14297,12 +14297,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, k_eq_2_subtile_m) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 6; m++) {
       GemmMicrokernelTester()
@@ -14314,11 +14314,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, k_eq_2_subtile_n) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -14330,11 +14330,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, k_lt_2) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -14345,11 +14345,11 @@
         .m(6)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, k_lt_2_strided_a) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -14361,11 +14361,11 @@
         .n(8)
         .k(k)
         .a_stride(5)
-        .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, k_lt_2_subtile) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -14379,13 +14379,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, k_gt_2) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -14396,11 +14396,11 @@
         .m(6)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, k_gt_2_strided_a) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -14412,11 +14412,11 @@
         .n(8)
         .k(k)
         .a_stride(7)
-        .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, k_gt_2_subtile) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -14430,13 +14430,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, k_div_2) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -14447,11 +14447,11 @@
         .m(6)
         .n(8)
         .k(k)
-        .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, k_div_2_strided_a) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -14463,11 +14463,11 @@
         .n(8)
         .k(k)
         .a_stride(23)
-        .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, k_div_2_subtile) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -14481,13 +14481,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, n_gt_8) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14499,12 +14499,12 @@
           .m(6)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, n_gt_8_strided_cn) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14517,12 +14517,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, n_gt_8_strided_a) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14535,12 +14535,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, n_gt_8_subtile) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14554,13 +14554,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, n_div_8) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14572,12 +14572,12 @@
           .m(6)
           .n(8)
           .k(k)
-          .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, n_div_8_strided_cn) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14590,12 +14590,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, n_div_8_strided_a) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14608,12 +14608,12 @@
           .n(n)
           .k(k)
           .a_stride(13)
-          .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, n_div_8_subtile) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -14627,13 +14627,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, strided_cm_subtile) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -14648,13 +14648,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, qmin) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(6)
@@ -14665,10 +14665,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, qmax) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(6)
@@ -14679,10 +14679,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_GEMMINC_6X8__NEONFMA_LD64, strided_cm) {
+  TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(6)
@@ -14693,9 +14693,9 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64);
+      .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
   }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
diff --git a/test/f32-gemminc.yaml b/test/f32-gemminc.yaml
index eae6d5c..57409cf 100644
--- a/test/f32-gemminc.yaml
+++ b/test/f32-gemminc.yaml
@@ -65,15 +65,15 @@
 - name: xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128
   k-block: 4
   assembly: true
-- name: xnn_f32_gemminc_ukernel_1x8__neon_ld64
+- name: xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64
   k-block: 2
-- name: xnn_f32_gemminc_ukernel_4x8__neon_ld64
+- name: xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64
   k-block: 2
-- name: xnn_f32_gemminc_ukernel_4x8__neon_ld128
+- name: xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128
   k-block: 4
-- name: xnn_f32_gemminc_ukernel_5x8__neon_ld64
+- name: xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64
   k-block: 2
-- name: xnn_f32_gemminc_ukernel_6x8__neon_ld64
+- name: xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64
   k-block: 2
 - name: xnn_f32_gemminc_ukernel_1x8s4__neon
   k-block: 4
@@ -83,16 +83,26 @@
   k-block: 4
 - name: xnn_f32_gemminc_ukernel_8x8s4__neon
   k-block: 4
-- name: xnn_f32_gemminc_ukernel_1x8__neonfma_ld64
+- name: xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64
   k-block: 2
-- name: xnn_f32_gemminc_ukernel_4x8__neonfma_ld64
+  arch:
+    - aarch64
+- name: xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64
   k-block: 2
-- name: xnn_f32_gemminc_ukernel_4x8__neonfma_ld128
+  arch:
+    - aarch64
+- name: xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128
   k-block: 4
-- name: xnn_f32_gemminc_ukernel_5x8__neonfma_ld64
+  arch:
+    - aarch64
+- name: xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64
   k-block: 2
-- name: xnn_f32_gemminc_ukernel_6x8__neonfma_ld64
+  arch:
+    - aarch64
+- name: xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64
   k-block: 2
+  arch:
+    - aarch64
 - name: xnn_f32_gemminc_ukernel_1x8s4__neonfma
   k-block: 4
 - name: xnn_f32_gemminc_ukernel_4x8s4__neonfma
diff --git a/test/f32-igemm.cc b/test/f32-igemm.cc
index aebe011..f9ad11b 100644
--- a/test/f32-igemm.cc
+++ b/test/f32-igemm.cc
@@ -5979,8 +5979,8 @@
 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_IGEMM_1X8__NEON_LD64, k_eq_2) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -5990,10 +5990,10 @@
       .m(1)
       .n(8)
       .k(2)
-      .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, strided_cn) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -6004,10 +6004,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, k_eq_2_subtile) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -6020,12 +6020,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, k_eq_2_subtile_m) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 1; m++) {
       GemmMicrokernelTester()
@@ -6037,11 +6037,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, k_eq_2_subtile_n) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -6053,11 +6053,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, k_lt_2) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -6068,11 +6068,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, k_lt_2_subtile) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6086,13 +6086,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, k_gt_2) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -6103,11 +6103,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, k_gt_2_subtile) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6121,13 +6121,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, k_div_2) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -6138,11 +6138,11 @@
         .m(1)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, k_div_2_subtile) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6156,13 +6156,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, n_gt_8) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6174,12 +6174,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, n_gt_8_strided_cn) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6192,12 +6192,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, n_gt_8_subtile) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6211,13 +6211,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, n_div_8) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6229,12 +6229,12 @@
           .m(1)
           .n(8)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, n_div_8_strided_cn) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6247,12 +6247,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, n_div_8_subtile) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6266,13 +6266,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, small_kernel) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       GemmMicrokernelTester()
@@ -6284,11 +6284,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, small_kernel_subtile) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6303,13 +6303,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, n_gt_8_small_kernel) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6322,12 +6322,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, n_div_8_small_kernel) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6340,12 +6340,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, strided_cm_subtile) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 1; m++) {
@@ -6360,13 +6360,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, a_offset) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       GemmMicrokernelTester()
@@ -6379,11 +6379,11 @@
         .k(k)
         .ks(3)
         .a_offset(13)
-        .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, zero) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 1; mz++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6398,12 +6398,12 @@
           .ks(3)
           .a_offset(13)
           .zero_index(mz)
-          .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, qmin) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -6414,10 +6414,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, qmax) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -6428,10 +6428,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_1X8__NEON_LD64, strided_cm) {
+  TEST(F32_IGEMM_1X8__NEON_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(1)
@@ -6442,13 +6442,13 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_igemm_ukernel_1x8__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_IGEMM_4X2__NEON_LD64, k_eq_2) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -6458,10 +6458,10 @@
       .m(4)
       .n(2)
       .k(2)
-      .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, strided_cn) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -6472,10 +6472,10 @@
       .n(2)
       .k(2)
       .cn_stride(5)
-      .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, k_eq_2_subtile) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 2; n++) {
@@ -6488,12 +6488,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, k_eq_2_subtile_m) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -6505,11 +6505,11 @@
         .n(2)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, k_eq_2_subtile_n) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 2; n++) {
       GemmMicrokernelTester()
@@ -6521,11 +6521,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, k_lt_2) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -6536,11 +6536,11 @@
         .m(4)
         .n(2)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, k_lt_2_subtile) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -6554,13 +6554,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, k_gt_2) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -6571,11 +6571,11 @@
         .m(4)
         .n(2)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, k_gt_2_subtile) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -6589,13 +6589,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, k_div_2) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -6606,11 +6606,11 @@
         .m(4)
         .n(2)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, k_div_2_subtile) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -6624,13 +6624,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, n_gt_2) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_gt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 3; n < 4; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6642,12 +6642,12 @@
           .m(4)
           .n(2)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, n_gt_2_strided_cn) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_gt_2_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 3; n < 4; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6660,12 +6660,12 @@
           .n(2)
           .k(k)
           .cn_stride(5)
-          .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, n_gt_2_subtile) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 3; n < 4; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6679,13 +6679,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, n_div_2) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_div_2) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 4; n <= 6; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6697,12 +6697,12 @@
           .m(4)
           .n(2)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, n_div_2_strided_cn) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_div_2_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 4; n <= 6; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6715,12 +6715,12 @@
           .n(n)
           .k(k)
           .cn_stride(5)
-          .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, n_div_2_subtile) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 4; n <= 6; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6734,13 +6734,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, small_kernel) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       GemmMicrokernelTester()
@@ -6752,11 +6752,11 @@
         .n(2)
         .k(k)
         .ks(3)
-        .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, small_kernel_subtile) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -6771,13 +6771,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, n_gt_2_small_kernel) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_gt_2_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 3; n < 4; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6790,12 +6790,12 @@
           .n(2)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, n_div_2_small_kernel) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_div_2_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 4; n <= 6; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6808,12 +6808,12 @@
           .n(2)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, strided_cm_subtile) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -6828,13 +6828,13 @@
             .k(k)
             .cm_stride(5)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, a_offset) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       GemmMicrokernelTester()
@@ -6847,11 +6847,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, zero) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -6866,12 +6866,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, qmin) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -6882,10 +6882,10 @@
       .n(2)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, qmax) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -6896,10 +6896,10 @@
       .n(2)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X2__NEON_LD64, strided_cm) {
+  TEST(F32_IGEMM_4X2__NEON_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -6910,13 +6910,13 @@
       .n(2)
       .k(2)
       .cm_stride(5)
-      .Test(xnn_f32_igemm_ukernel_4x2__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
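
// --- Editorial sketch, not part of the diff ------------------------------
// The rename above reflects what these microkernels do internally: the
// "lane" suffix marks kernels whose inner loop multiplies a whole vector
// of packed weights by a single lane of the activation register via a
// lane-indexed NEON intrinsic. A minimal, hypothetical illustration of
// that pattern (names and shapes here are assumptions, not XNNPACK code):
//
// #include <arm_neon.h>
//
// // One k-step of a 1x8 "lane" update. va holds two consecutive A values
// // loaded with a 64-bit vld1_f32 -- the "ld64" part of the kernel name.
// static inline void lane_update_step(
//     float32x2_t va,           // two A values for this row
//     const float* w,           // 8 packed B values for this k step
//     float32x4_t* vacc0123,
//     float32x4_t* vacc4567) {
//   const float32x4_t vb0123 = vld1q_f32(w);
//   const float32x4_t vb4567 = vld1q_f32(w + 4);
//   // The defining operation: multiply all of B by lane 0 of va.
//   *vacc0123 = vmlaq_lane_f32(*vacc0123, vb0123, va, 0);
//   *vacc4567 = vmlaq_lane_f32(*vacc4567, vb4567, va, 0);
// }
// --------------------------------------------------------------------------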
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_IGEMM_4X4__NEON_LD64, k_eq_2) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -6926,10 +6926,10 @@
       .m(4)
       .n(4)
       .k(2)
-      .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, strided_cn) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -6940,10 +6940,10 @@
       .n(4)
       .k(2)
       .cn_stride(7)
-      .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, k_eq_2_subtile) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -6956,12 +6956,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, k_eq_2_subtile_m) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -6973,11 +6973,11 @@
         .n(4)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, k_eq_2_subtile_n) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -6989,11 +6989,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, k_lt_2) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -7004,11 +7004,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, k_lt_2_subtile) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7022,13 +7022,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, k_gt_2) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -7039,11 +7039,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, k_gt_2_subtile) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7057,13 +7057,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, k_div_2) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -7074,11 +7074,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, k_div_2_subtile) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7092,13 +7092,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, n_gt_4) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_gt_4) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -7110,12 +7110,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, n_gt_4_strided_cn) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -7128,12 +7128,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, n_gt_4_subtile) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -7147,13 +7147,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, n_div_4) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_div_4) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -7165,12 +7165,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, n_div_4_strided_cn) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -7183,12 +7183,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, n_div_4_subtile) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_div_4_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -7202,13 +7202,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, small_kernel) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       GemmMicrokernelTester()
@@ -7220,11 +7220,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, small_kernel_subtile) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7239,13 +7239,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, n_gt_4_small_kernel) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_gt_4_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -7258,12 +7258,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, n_div_4_small_kernel) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_div_4_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -7276,12 +7276,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, strided_cm_subtile) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7296,13 +7296,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, a_offset) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       GemmMicrokernelTester()
@@ -7315,11 +7315,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, zero) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -7334,12 +7334,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, qmin) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -7350,10 +7350,10 @@
       .n(4)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, qmax) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -7364,10 +7364,10 @@
       .n(4)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X4__NEON_LD64, strided_cm) {
+  TEST(F32_IGEMM_4X4__NEON_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -7378,13 +7378,13 @@
       .n(4)
       .k(2)
       .cm_stride(7)
-      .Test(xnn_f32_igemm_ukernel_4x4__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
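
// --- Editorial sketch, not part of the diff ------------------------------
// Every test in this file follows the same shape: a fluent builder whose
// setters return *this, so tile and problem dimensions can be chained
// before Test() runs the microkernel and checks it against a reference.
// A stripped-down, hypothetical model of that pattern (the real
// GemmMicrokernelTester in XNNPACK's test/ directory does far more):
//
// #include <cstddef>
// #include <cstdint>
//
// class TesterSketch {
//  public:
//   TesterSketch& mr(uint32_t v) { mr_ = v; return *this; }  // row tile
//   TesterSketch& nr(uint32_t v) { nr_ = v; return *this; }  // column tile
//   TesterSketch& m(uint32_t v)  { m_ = v;  return *this; }  // rows used
//   TesterSketch& n(uint32_t v)  { n_ = v;  return *this; }  // columns used
//   TesterSketch& k(size_t v)    { k_ = v;  return *this; }  // reduction dim
//
//   template <typename UKernelFn>
//   void Test(UKernelFn ukernel) const {
//     // Real tester: randomize inputs, pack weights, invoke ukernel,
//     // and compare every output element to a scalar reference.
//     (void) ukernel;
//   }
//
//  private:
//   uint32_t mr_ = 1, nr_ = 1, m_ = 1, n_ = 1;
//   size_t k_ = 1;
// };
// --------------------------------------------------------------------------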
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_IGEMM_4X8__NEON_LD128, k_eq_4) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_eq_4) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -7394,10 +7394,10 @@
       .m(4)
       .n(8)
       .k(4)
-      .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+      .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, strided_cn) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -7408,10 +7408,10 @@
       .n(8)
       .k(4)
       .cn_stride(11)
-      .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+      .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, k_eq_4_subtile) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_eq_4_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -7424,12 +7424,12 @@
           .n(n)
           .k(4)
           .iterations(1)
-          .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, k_eq_4_subtile_m) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_eq_4_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -7441,11 +7441,11 @@
         .n(8)
         .k(4)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, k_eq_4_subtile_n) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_eq_4_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -7457,11 +7457,11 @@
         .n(n)
         .k(4)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, k_lt_4) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_lt_4) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 4; k++) {
       GemmMicrokernelTester()
@@ -7472,11 +7472,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, k_lt_4_subtile) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_lt_4_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 4; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7490,13 +7490,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, k_gt_4) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_gt_4) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 5; k < 8; k++) {
       GemmMicrokernelTester()
@@ -7507,11 +7507,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, k_gt_4_subtile) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_gt_4_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 5; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7525,13 +7525,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, k_div_4) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_div_4) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 8; k <= 40; k += 4) {
       GemmMicrokernelTester()
@@ -7542,11 +7542,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, k_div_4_subtile) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_div_4_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 8; k <= 40; k += 4) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7560,13 +7560,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, n_gt_8) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -7578,12 +7578,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, n_gt_8_strided_cn) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -7596,12 +7596,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, n_gt_8_subtile) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -7615,13 +7615,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, n_div_8) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -7633,12 +7633,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, n_div_8_strided_cn) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -7651,12 +7651,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, n_div_8_subtile) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -7670,13 +7670,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, small_kernel) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 20; k += 5) {
       GemmMicrokernelTester()
@@ -7688,11 +7688,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, small_kernel_subtile) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 20; k += 5) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7707,13 +7707,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, n_gt_8_small_kernel) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -7726,12 +7726,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, n_div_8_small_kernel) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -7744,12 +7744,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, strided_cm_subtile) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 20; k += 5) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7764,13 +7764,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+            .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, a_offset) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 20; k += 5) {
       GemmMicrokernelTester()
@@ -7783,11 +7783,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+        .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, zero) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -7802,12 +7802,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+          .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, qmin) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -7818,10 +7818,10 @@
       .n(8)
       .k(4)
       .qmin(128)
-      .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+      .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, qmax) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -7832,10 +7832,10 @@
       .n(8)
       .k(4)
       .qmax(128)
-      .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+      .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD128, strided_cm) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD128, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -7846,13 +7846,13 @@
       .n(8)
       .k(4)
       .cm_stride(11)
-      .Test(xnn_f32_igemm_ukernel_4x8__neon_ld128);
+      .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
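
// --- Editorial note, not part of the diff --------------------------------
// The ld64/ld128 suffix, preserved by this rename, names the width of the
// per-step activation load, which is why the *_LD128 tests above pivot on
// k == 4 while the *_LD64 tests pivot on k == 2:
//
// #include <arm_neon.h>
//
// void a_load_widths(const float* a) {
//   float32x2_t va2 = vld1_f32(a);   // "ld64": 64-bit load, 2 floats, k step 2
//   float32x4_t va4 = vld1q_f32(a);  // "ld128": 128-bit load, 4 floats, k step 4
//   (void) va2;
//   (void) va4;
// }
// --------------------------------------------------------------------------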
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_IGEMM_4X8__NEON_LD64, k_eq_2) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -7862,10 +7862,10 @@
       .m(4)
       .n(8)
       .k(2)
-      .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, strided_cn) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -7876,10 +7876,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, k_eq_2_subtile) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -7892,12 +7892,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, k_eq_2_subtile_m) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -7909,11 +7909,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, k_eq_2_subtile_n) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -7925,11 +7925,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, k_lt_2) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -7940,11 +7940,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, k_lt_2_subtile) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7958,13 +7958,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, k_gt_2) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -7975,11 +7975,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, k_gt_2_subtile) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -7993,13 +7993,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, k_div_2) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -8010,11 +8010,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, k_div_2_subtile) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -8028,13 +8028,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, n_gt_8) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8046,12 +8046,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, n_gt_8_strided_cn) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8064,12 +8064,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, n_gt_8_subtile) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8083,13 +8083,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, n_div_8) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8101,12 +8101,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, n_div_8_strided_cn) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8119,12 +8119,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, n_div_8_subtile) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8138,13 +8138,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, small_kernel) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       GemmMicrokernelTester()
@@ -8156,11 +8156,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, small_kernel_subtile) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -8175,13 +8175,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, n_gt_8_small_kernel) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8194,12 +8194,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, n_div_8_small_kernel) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8212,12 +8212,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, strided_cm_subtile) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -8232,13 +8232,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, a_offset) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       GemmMicrokernelTester()
@@ -8251,11 +8251,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, zero) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8270,12 +8270,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, qmin) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -8286,10 +8286,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, qmax) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -8300,10 +8300,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X8__NEON_LD64, strided_cm) {
+  TEST(F32_IGEMM_4X8__NEON_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -8314,13 +8314,13 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_igemm_ukernel_4x8__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
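
// --- Editorial sketch, not part of the diff ------------------------------
// The igemm-specific settings exercised above -- ks(3), a_offset(...), and
// zero_index(mz) -- come from IGEMM reading activations through an
// indirection buffer: ks pointers per output row, a byte offset applied to
// every pointer except the shared zero buffer, whose entries must
// contribute nothing. A hypothetical scalar model of that addressing
// (not XNNPACK code):
//
// #include <cstddef>
// #include <cstdint>
//
// float igemm_row_dot(
//     const float** indirect_a,  // ks pointers for one output row
//     size_t ks, size_t k,
//     size_t a_offset,           // byte offset applied to non-zero rows
//     const float* zero,         // shared zero buffer
//     const float* w) {          // ks * k packed weights
//   float acc = 0.0f;
//   for (size_t s = 0; s < ks; s++) {
//     const float* a = indirect_a[s];
//     if (a != zero) {
//       a = reinterpret_cast<const float*>(
//           reinterpret_cast<uintptr_t>(a) + a_offset);
//     }
//     for (size_t i = 0; i < k; i++) {
//       acc += a[i] * w[s * k + i];
//     }
//   }
//   return acc;
// }
// --------------------------------------------------------------------------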
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_IGEMM_6X8__NEON_LD64, k_eq_2) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(6)
@@ -8330,10 +8330,10 @@
       .m(6)
       .n(8)
       .k(2)
-      .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, strided_cn) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(6)
@@ -8344,10 +8344,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, k_eq_2_subtile) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 6; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -8360,12 +8360,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, k_eq_2_subtile_m) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 6; m++) {
       GemmMicrokernelTester()
@@ -8377,11 +8377,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, k_eq_2_subtile_n) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -8393,11 +8393,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, k_lt_2) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -8408,11 +8408,11 @@
         .m(6)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, k_lt_2_subtile) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -8426,13 +8426,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, k_gt_2) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -8443,11 +8443,11 @@
         .m(6)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, k_gt_2_subtile) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -8461,13 +8461,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, k_div_2) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -8478,11 +8478,11 @@
         .m(6)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, k_div_2_subtile) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -8496,13 +8496,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, n_gt_8) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8514,12 +8514,12 @@
           .m(6)
           .n(8)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, n_gt_8_strided_cn) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8532,12 +8532,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, n_gt_8_subtile) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8551,13 +8551,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, n_div_8) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8569,12 +8569,12 @@
           .m(6)
           .n(8)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, n_div_8_strided_cn) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8587,12 +8587,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, n_div_8_subtile) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8606,13 +8606,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, small_kernel) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       GemmMicrokernelTester()
@@ -8624,11 +8624,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, small_kernel_subtile) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -8643,13 +8643,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, n_gt_8_small_kernel) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8662,12 +8662,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, n_div_8_small_kernel) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8680,12 +8680,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, strided_cm_subtile) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -8700,13 +8700,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+            .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, a_offset) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 10; k += 3) {
       GemmMicrokernelTester()
@@ -8719,11 +8719,11 @@
         .k(k)
         .ks(3)
         .a_offset(67)
-        .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+        .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, zero) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 6; mz++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -8738,12 +8738,12 @@
           .ks(3)
           .a_offset(67)
           .zero_index(mz)
-          .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+          .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, qmin) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(6)
@@ -8754,10 +8754,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, qmax) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(6)
@@ -8768,10 +8768,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
   }
 
-  TEST(F32_IGEMM_6X8__NEON_LD64, strided_cm) {
+  TEST(F32_IGEMM_6X8__NEON_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(6)
@@ -8782,7 +8782,7 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_igemm_ukernel_6x8__neon_ld64);
+      .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
   }
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
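
// --- Editorial note, not part of the diff --------------------------------
// In the NEONFMA hunks below, the guard narrows from
// (XNN_ARCH_ARM || XNN_ARCH_ARM64) to XNN_ARCH_ARM64 alone: the
// lane-indexed FMA intrinsics these renamed kernels rely on, such as
// vfmaq_lane_f32, exist only in the AArch64 instruction set. A minimal
// illustration (hypothetical helper name):
//
// #if defined(__aarch64__)
// #include <arm_neon.h>
// float32x4_t fma_by_lane(float32x4_t acc, float32x4_t b, float32x2_t a) {
//   return vfmaq_lane_f32(acc, b, a, 0);  // AArch64-only lane-indexed FMA
// }
// #endif
// --------------------------------------------------------------------------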
 
@@ -10659,8 +10659,8 @@
 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, k_eq_2) {
+#if XNN_ARCH_ARM64
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -10670,10 +10670,10 @@
       .m(4)
       .n(2)
       .k(2)
-      .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, strided_cn) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -10684,10 +10684,10 @@
       .n(2)
       .k(2)
       .cn_stride(5)
-      .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, k_eq_2_subtile) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 2; n++) {
@@ -10700,12 +10700,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, k_eq_2_subtile_m) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -10717,11 +10717,11 @@
         .n(2)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, k_eq_2_subtile_n) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n <= 2; n++) {
       GemmMicrokernelTester()
@@ -10733,11 +10733,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, k_lt_2) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -10748,11 +10748,11 @@
         .m(4)
         .n(2)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, k_lt_2_subtile) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -10766,13 +10766,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, k_gt_2) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -10783,11 +10783,11 @@
         .m(4)
         .n(2)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, k_gt_2_subtile) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -10801,13 +10801,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, k_div_2) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -10818,11 +10818,11 @@
         .m(4)
         .n(2)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, k_div_2_subtile) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -10836,13 +10836,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, n_gt_2) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_gt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 3; n < 4; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10854,12 +10854,12 @@
           .m(4)
           .n(2)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, n_gt_2_strided_cn) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_gt_2_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 3; n < 4; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10872,12 +10872,12 @@
           .n(2)
           .k(k)
           .cn_stride(5)
-          .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, n_gt_2_subtile) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 3; n < 4; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10891,13 +10891,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, n_div_2) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_div_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 4; n <= 6; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10909,12 +10909,12 @@
           .m(4)
           .n(2)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, n_div_2_strided_cn) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_div_2_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 4; n <= 6; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10927,12 +10927,12 @@
           .n(n)
           .k(k)
           .cn_stride(5)
-          .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, n_div_2_subtile) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 4; n <= 6; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -10946,13 +10946,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, small_kernel) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, small_kernel) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       GemmMicrokernelTester()
@@ -10964,11 +10964,11 @@
         .n(2)
         .k(k)
         .ks(3)
-        .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, small_kernel_subtile) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -10983,13 +10983,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, n_gt_2_small_kernel) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_gt_2_small_kernel) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 3; n < 4; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -11002,12 +11002,12 @@
           .n(2)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, n_div_2_small_kernel) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_div_2_small_kernel) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 4; n <= 6; n += 2) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -11020,12 +11020,12 @@
           .n(2)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, strided_cm_subtile) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -11040,13 +11040,13 @@
             .k(k)
             .cm_stride(5)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, a_offset) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, a_offset) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       GemmMicrokernelTester()
@@ -11059,11 +11059,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, zero) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, zero) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -11078,12 +11078,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, qmin) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -11094,10 +11094,10 @@
       .n(2)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, qmax) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -11108,10 +11108,10 @@
       .n(2)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X2__NEONFMA_LD64, strided_cm) {
+  TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -11122,13 +11122,13 @@
       .n(2)
       .k(2)
       .cm_stride(5)
-      .Test(xnn_f32_igemm_ukernel_4x2__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
   }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, k_eq_2) {
+#if XNN_ARCH_ARM64
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -11138,10 +11138,10 @@
       .m(4)
       .n(4)
       .k(2)
-      .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, strided_cn) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -11152,10 +11152,10 @@
       .n(4)
       .k(2)
       .cn_stride(7)
-      .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, k_eq_2_subtile) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 4; n++) {
@@ -11168,12 +11168,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, k_eq_2_subtile_m) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -11185,11 +11185,11 @@
         .n(4)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, k_eq_2_subtile_n) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n <= 4; n++) {
       GemmMicrokernelTester()
@@ -11201,11 +11201,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, k_lt_2) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -11216,11 +11216,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, k_lt_2_subtile) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -11234,13 +11234,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, k_gt_2) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -11251,11 +11251,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, k_gt_2_subtile) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -11269,13 +11269,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, k_div_2) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -11286,11 +11286,11 @@
         .m(4)
         .n(4)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, k_div_2_subtile) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -11304,13 +11304,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, n_gt_4) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_gt_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -11322,12 +11322,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, n_gt_4_strided_cn) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_gt_4_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -11340,12 +11340,12 @@
           .n(4)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, n_gt_4_subtile) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_gt_4_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -11359,13 +11359,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, n_div_4) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_div_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -11377,12 +11377,12 @@
           .m(4)
           .n(4)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, n_div_4_strided_cn) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_div_4_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -11395,12 +11395,12 @@
           .n(n)
           .k(k)
           .cn_stride(7)
-          .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, n_div_4_subtile) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_div_4_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -11414,13 +11414,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, small_kernel) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, small_kernel) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       GemmMicrokernelTester()
@@ -11432,11 +11432,11 @@
         .n(4)
         .k(k)
         .ks(3)
-        .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, small_kernel_subtile) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -11451,13 +11451,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, n_gt_4_small_kernel) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_gt_4_small_kernel) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 5; n < 8; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -11470,12 +11470,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, n_div_4_small_kernel) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_div_4_small_kernel) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 8; n <= 12; n += 4) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -11488,12 +11488,12 @@
           .n(4)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, strided_cm_subtile) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -11508,13 +11508,13 @@
             .k(k)
             .cm_stride(7)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, a_offset) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, a_offset) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       GemmMicrokernelTester()
@@ -11527,11 +11527,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, zero) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, zero) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -11546,12 +11546,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, qmin) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -11562,10 +11562,10 @@
       .n(4)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, qmax) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -11576,10 +11576,10 @@
       .n(4)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X4__NEONFMA_LD64, strided_cm) {
+  TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -11590,13 +11590,13 @@
       .n(4)
       .k(2)
       .cm_stride(7)
-      .Test(xnn_f32_igemm_ukernel_4x4__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
   }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, k_eq_4) {
+#if XNN_ARCH_ARM64
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -11606,10 +11606,10 @@
       .m(4)
       .n(8)
       .k(4)
-      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, strided_cn) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -11620,10 +11620,10 @@
       .n(8)
       .k(4)
       .cn_stride(11)
-      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, k_eq_4_subtile) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -11636,12 +11636,12 @@
           .n(n)
           .k(4)
           .iterations(1)
-          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, k_eq_4_subtile_m) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_m) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -11653,11 +11653,11 @@
         .n(8)
         .k(4)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, k_eq_4_subtile_n) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_n) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -11669,11 +11669,11 @@
         .n(n)
         .k(4)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, k_lt_4) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_lt_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 4; k++) {
       GemmMicrokernelTester()
@@ -11684,11 +11684,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, k_lt_4_subtile) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_lt_4_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 4; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -11702,13 +11702,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, k_gt_4) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_gt_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 5; k < 8; k++) {
       GemmMicrokernelTester()
@@ -11719,11 +11719,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, k_gt_4_subtile) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_gt_4_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 5; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -11737,13 +11737,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, k_div_4) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_div_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 8; k <= 40; k += 4) {
       GemmMicrokernelTester()
@@ -11754,11 +11754,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, k_div_4_subtile) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_div_4_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 8; k <= 40; k += 4) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -11772,13 +11772,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, n_gt_8) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -11790,12 +11790,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, n_gt_8_strided_cn) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -11808,12 +11808,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, n_gt_8_subtile) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -11827,13 +11827,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, n_div_8) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_div_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -11845,12 +11845,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, n_div_8_strided_cn) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -11863,12 +11863,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, n_div_8_subtile) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -11882,13 +11882,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, small_kernel) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, small_kernel) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 20; k += 5) {
       GemmMicrokernelTester()
@@ -11900,11 +11900,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, small_kernel_subtile) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 20; k += 5) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -11919,13 +11919,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, n_gt_8_small_kernel) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -11938,12 +11938,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, n_div_8_small_kernel) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -11956,12 +11956,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, strided_cm_subtile) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 20; k += 5) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -11976,13 +11976,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, a_offset) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, a_offset) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 20; k += 5) {
       GemmMicrokernelTester()
@@ -11995,11 +11995,11 @@
         .k(k)
         .ks(3)
         .a_offset(83)
-        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, zero) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, zero) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 20; k += 5) {
@@ -12014,12 +12014,12 @@
           .ks(3)
           .a_offset(83)
           .zero_index(mz)
-          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, qmin) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -12030,10 +12030,10 @@
       .n(8)
       .k(4)
       .qmin(128)
-      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, qmax) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, qmax) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -12044,10 +12044,10 @@
       .n(8)
       .k(4)
       .qmax(128)
-      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD128, strided_cm) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, strided_cm) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -12058,13 +12058,13 @@
       .n(8)
       .k(4)
       .cm_stride(11)
-      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld128);
+      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
   }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, k_eq_2) {
+#if XNN_ARCH_ARM64
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -12074,10 +12074,10 @@
       .m(4)
       .n(8)
       .k(2)
-      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, strided_cn) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -12088,10 +12088,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, k_eq_2_subtile) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -12104,12 +12104,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, k_eq_2_subtile_m) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -12121,11 +12121,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, k_eq_2_subtile_n) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -12137,11 +12137,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, k_lt_2) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -12152,11 +12152,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, k_lt_2_subtile) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -12170,13 +12170,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, k_gt_2) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -12187,11 +12187,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, k_gt_2_subtile) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -12205,13 +12205,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, k_div_2) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -12222,11 +12222,11 @@
         .m(4)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, k_div_2_subtile) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -12240,13 +12240,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, n_gt_8) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12258,12 +12258,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, n_gt_8_strided_cn) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12276,12 +12276,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, n_gt_8_subtile) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12295,13 +12295,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, n_div_8) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12313,12 +12313,12 @@
           .m(4)
           .n(8)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, n_div_8_strided_cn) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12331,12 +12331,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, n_div_8_subtile) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12350,13 +12350,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, small_kernel) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, small_kernel) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       GemmMicrokernelTester()
@@ -12368,11 +12368,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, small_kernel_subtile) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -12387,13 +12387,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, n_gt_8_small_kernel) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12406,12 +12406,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, n_div_8_small_kernel) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12424,12 +12424,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, strided_cm_subtile) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -12444,13 +12444,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, a_offset) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, a_offset) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       GemmMicrokernelTester()
@@ -12463,11 +12463,11 @@
         .k(k)
         .ks(3)
         .a_offset(43)
-        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, zero) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, zero) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12482,12 +12482,12 @@
           .ks(3)
           .a_offset(43)
           .zero_index(mz)
-          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, qmin) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -12498,10 +12498,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, qmax) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -12512,10 +12512,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_IGEMM_4X8__NEONFMA_LD64, strided_cm) {
+  TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -12526,13 +12526,13 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
   }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, k_eq_2) {
+#if XNN_ARCH_ARM64
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(6)
@@ -12542,10 +12542,10 @@
       .m(6)
       .n(8)
       .k(2)
-      .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, strided_cn) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(6)
@@ -12556,10 +12556,10 @@
       .n(8)
       .k(2)
       .cn_stride(11)
-      .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, k_eq_2_subtile) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 6; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -12572,12 +12572,12 @@
           .n(n)
           .k(2)
           .iterations(1)
-          .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, k_eq_2_subtile_m) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 6; m++) {
       GemmMicrokernelTester()
@@ -12589,11 +12589,11 @@
         .n(8)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, k_eq_2_subtile_n) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -12605,11 +12605,11 @@
         .n(n)
         .k(2)
         .iterations(1)
-        .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, k_lt_2) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_lt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       GemmMicrokernelTester()
@@ -12620,11 +12620,11 @@
         .m(6)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, k_lt_2_subtile) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k < 2; k++) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -12638,13 +12638,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, k_gt_2) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_gt_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       GemmMicrokernelTester()
@@ -12655,11 +12655,11 @@
         .m(6)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, k_gt_2_subtile) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 3; k < 4; k++) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -12673,13 +12673,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, k_div_2) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_div_2) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       GemmMicrokernelTester()
@@ -12690,11 +12690,11 @@
         .m(6)
         .n(8)
         .k(k)
-        .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, k_div_2_subtile) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 4; k <= 20; k += 2) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -12708,13 +12708,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, n_gt_8) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12726,12 +12726,12 @@
           .m(6)
           .n(8)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, n_gt_8_strided_cn) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12744,12 +12744,12 @@
           .n(8)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, n_gt_8_subtile) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12763,13 +12763,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, n_div_8) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_div_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12781,12 +12781,12 @@
           .m(6)
           .n(8)
           .k(k)
-          .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, n_div_8_strided_cn) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12799,12 +12799,12 @@
           .n(n)
           .k(k)
           .cn_stride(11)
-          .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, n_div_8_subtile) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12818,13 +12818,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, small_kernel) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, small_kernel) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       GemmMicrokernelTester()
@@ -12836,11 +12836,11 @@
         .n(8)
         .k(k)
         .ks(3)
-        .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, small_kernel_subtile) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -12855,13 +12855,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, n_gt_8_small_kernel) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_gt_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12874,12 +12874,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, n_div_8_small_kernel) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_div_8_small_kernel) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12892,12 +12892,12 @@
           .n(8)
           .k(k)
           .ks(3)
-          .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, strided_cm_subtile) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       for (uint32_t m = 1; m <= 6; m++) {
@@ -12912,13 +12912,13 @@
             .k(k)
             .cm_stride(11)
             .iterations(1)
-            .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+            .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
         }
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, a_offset) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, a_offset) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (size_t k = 1; k <= 10; k += 3) {
       GemmMicrokernelTester()
@@ -12931,11 +12931,11 @@
         .k(k)
         .ks(3)
         .a_offset(67)
-        .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+        .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, zero) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, zero) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t mz = 0; mz < 6; mz++) {
       for (size_t k = 1; k <= 10; k += 3) {
@@ -12950,12 +12950,12 @@
           .ks(3)
           .a_offset(67)
           .zero_index(mz)
-          .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+          .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
       }
     }
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, qmin) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(6)
@@ -12966,10 +12966,10 @@
       .n(8)
       .k(2)
       .qmin(128)
-      .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, qmax) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(6)
@@ -12980,10 +12980,10 @@
       .n(8)
       .k(2)
       .qmax(128)
-      .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
   }
 
-  TEST(F32_IGEMM_6X8__NEONFMA_LD64, strided_cm) {
+  TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(6)
@@ -12994,9 +12994,9 @@
       .n(8)
       .k(2)
       .cm_stride(11)
-      .Test(xnn_f32_igemm_ukernel_6x8__neonfma_ld64);
+      .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
   }
-#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
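(Background, not part of the patch: the "lane" suffix in the renamed kernels refers to by-lane
multiply-accumulate NEON intrinsics. A minimal standalone sketch — hypothetical helper, not
XNNPACK code — of the operation involved: plain NEON vmlaq_lane_f32 is available on both
AArch32 and AArch64, while the fused vfmaq_lane_f32 is an A64-only intrinsic in ACLE, which
presumably motivates the XNN_ARCH_ARM64 guards the NEONFMA lane tests gain above.

  #include <arm_neon.h>

  // Accumulate acc += a * b[lane 0], broadcasting one lane of b across the vector.
  static inline float32x4_t madd_by_lane(float32x4_t acc, float32x4_t a, float32x2_t b) {
  #if defined(__aarch64__)
    return vfmaq_lane_f32(acc, a, b, 0);  // fused multiply-add by lane (A64-only intrinsic)
  #else
    return vmlaq_lane_f32(acc, a, b, 0);  // separate multiply + add (ARMv7 NEON)
  #endif
  }
)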
diff --git a/test/f32-igemm.yaml b/test/f32-igemm.yaml
index 84499e4..def84ef 100644
--- a/test/f32-igemm.yaml
+++ b/test/f32-igemm.yaml
@@ -49,17 +49,19 @@
   k-block: 4
   pipelined: true
   assembly: true
-- name: xnn_f32_igemm_ukernel_1x8__neon_ld64
+- name: xnn_f32_igemm_ukernel_1x8__neon_lane_ld64
   k-block: 2
-- name: xnn_f32_igemm_ukernel_4x2__neon_ld64
+  arch:
+    - aarch64
+- name: xnn_f32_igemm_ukernel_4x2__neon_lane_ld64
   k-block: 2
-- name: xnn_f32_igemm_ukernel_4x4__neon_ld64
+- name: xnn_f32_igemm_ukernel_4x4__neon_lane_ld64
   k-block: 2
-- name: xnn_f32_igemm_ukernel_4x8__neon_ld128
+- name: xnn_f32_igemm_ukernel_4x8__neon_lane_ld128
   k-block: 4
-- name: xnn_f32_igemm_ukernel_4x8__neon_ld64
+- name: xnn_f32_igemm_ukernel_4x8__neon_lane_ld64
   k-block: 2
-- name: xnn_f32_igemm_ukernel_6x8__neon_ld64
+- name: xnn_f32_igemm_ukernel_6x8__neon_lane_ld64
   k-block: 2
 - name: xnn_f32_igemm_ukernel_1x8s4__neon
   k-block: 4
@@ -69,16 +71,26 @@
   k-block: 4
 - name: xnn_f32_igemm_ukernel_8x8s4__neon
   k-block: 4
-- name: xnn_f32_igemm_ukernel_4x2__neonfma_ld64
+- name: xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64
   k-block: 2
-- name: xnn_f32_igemm_ukernel_4x4__neonfma_ld64
+  arch:
+    - aarch64
+- name: xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64
   k-block: 2
-- name: xnn_f32_igemm_ukernel_4x8__neonfma_ld128
+  arch:
+    - aarch64
+- name: xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128
   k-block: 4
-- name: xnn_f32_igemm_ukernel_4x8__neonfma_ld64
+  arch:
+    - aarch64
+- name: xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64
   k-block: 2
-- name: xnn_f32_igemm_ukernel_6x8__neonfma_ld64
+  arch:
+    - aarch64
+- name: xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64
   k-block: 2
+  arch:
+    - aarch64
 - name: xnn_f32_igemm_ukernel_1x8s4__neonfma
   k-block: 4
 - name: xnn_f32_igemm_ukernel_4x8s4__neonfma