Rename NEON GEMM micro-kernels using lane intrinsics to -lane variants.

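Rename the f32 GEMM micro-kernels built on lane-indexed multiply-accumulate
intrinsics so that "lane" appears in both the file and function names
(e.g. xnn_f32_gemm_ukernel_1x8__neon_ld64 becomes
xnn_f32_gemm_ukernel_1x8__neon_lane_ld64). NEONFMA lane variants are split
out as explicit copies that use vfmaq_* in place of vmlaq_*, and the old
neonfma kernels that dispatched at compile time via #if defined(__aarch64__)
(1x8-neonfma-ld64.c, 4x8-neonfma-ld64.c, 4x8-neonfma-ld128.c) are deleted.
Stray double spaces before the second operand of the multiply-accumulate
calls are also removed.

For context, a minimal sketch of the two lane-indexed forms the renamed
kernels are built on (not part of this patch; the wrapper names are
illustrative only):

    #include <arm_neon.h>

    // NEON multiply-accumulate by lane (VMLA): separate multiply and add,
    // i.e. two roundings. Available on both ARMv7 NEON and AArch64.
    float32x4_t mla_lane(float32x4_t acc, float32x4_t b, float32x2_t a) {
      return vmlaq_lane_f32(acc, b, a, 0);  // acc + b * a[0]
    }

    // Fused multiply-add by lane (VFMA): a single fused operation with one
    // rounding. The lane form is what the deleted kernels guarded with
    // #if defined(__aarch64__); their fallback path used plain vfmaq_f32
    // after a vdupq_lane_f32 broadcast.
    float32x4_t fma_lane(float32x4_t acc, float32x4_t b, float32x2_t a) {
      return vfmaq_lane_f32(acc, b, a, 0);  // fma(b, a[0], acc)
    }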
PiperOrigin-RevId: 282000418
diff --git a/src/f32-gemm/1x8-neon-ld64.c b/src/f32-gemm/1x8-neon-lane-ld64.c
similarity index 85%
rename from src/f32-gemm/1x8-neon-ld64.c
rename to src/f32-gemm/1x8-neon-lane-ld64.c
index 9c6f8b6..31f7e17 100644
--- a/src/f32-gemm/1x8-neon-ld64.c
+++ b/src/f32-gemm/1x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_1x8__neon_ld64(
+void xnn_f32_gemm_ukernel_1x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -49,13 +50,13 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -63,8 +64,8 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
+      vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+      vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemm/1x8-neon-ld64.c b/src/f32-gemm/1x8-neonfma-lane-ld64.c
similarity index 84%
copy from src/f32-gemm/1x8-neon-ld64.c
copy to src/f32-gemm/1x8-neonfma-lane-ld64.c
index 9c6f8b6..428e181 100644
--- a/src/f32-gemm/1x8-neon-ld64.c
+++ b/src/f32-gemm/1x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_1x8__neon_ld64(
+void xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -49,13 +50,13 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -63,8 +64,8 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
+      vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+      vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemm/1x8-neonfma-ld64.c b/src/f32-gemm/1x8-neonfma-ld64.c
deleted file mode 100644
index 4df05f9..0000000
--- a/src/f32-gemm/1x8-neonfma-ld64.c
+++ /dev/null
@@ -1,117 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemm_ukernel_1x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 1);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
-
-    size_t k = kc;
-    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-      const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-      #endif
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-
-      const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-      vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-      vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/f32-gemm/4x2-neon-ld64.c b/src/f32-gemm/4x2-neon-lane-ld64.c
similarity index 98%
rename from src/f32-gemm/4x2-neon-ld64.c
rename to src/f32-gemm/4x2-neon-lane-ld64.c
index 872a842..0ec7cbb 100644
--- a/src/f32-gemm/4x2-neon-ld64.c
+++ b/src/f32-gemm/4x2-neon-lane-ld64.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_4x2__neon_ld64(
+void xnn_f32_gemm_ukernel_4x2__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/f32-gemm/4x2-neonfma-ld64.c b/src/f32-gemm/4x2-neonfma-lane-ld64.c
similarity index 98%
rename from src/f32-gemm/4x2-neonfma-ld64.c
rename to src/f32-gemm/4x2-neonfma-lane-ld64.c
index 4014c0e..bf81b04 100644
--- a/src/f32-gemm/4x2-neonfma-ld64.c
+++ b/src/f32-gemm/4x2-neonfma-lane-ld64.c
@@ -14,7 +14,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_4x2__neonfma_ld64(
+void xnn_f32_gemm_ukernel_4x2__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/f32-gemm/4x8-neon-ld128.c b/src/f32-gemm/4x8-neon-lane-ld128.c
similarity index 64%
copy from src/f32-gemm/4x8-neon-ld128.c
copy to src/f32-gemm/4x8-neon-lane-ld128.c
index 3ebede2..b8561d3 100644
--- a/src/f32-gemm/4x8-neon-ld128.c
+++ b/src/f32-gemm/4x8-neon-lane-ld128.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_4x8__neon_ld128(
+void xnn_f32_gemm_ukernel_4x8__neon_lane_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -77,50 +78,50 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, vget_low_f32(va0), 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, vget_low_f32(va1), 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, vget_low_f32(va2), 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, vget_low_f32(va3), 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, vget_low_f32(va0), 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, vget_low_f32(va1), 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, vget_low_f32(va2), 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, vget_low_f32(va3), 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0);
 
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, vget_low_f32(va0), 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, vget_low_f32(va1), 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, vget_low_f32(va2), 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, vget_low_f32(va3), 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, vget_low_f32(va0), 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, vget_low_f32(va1), 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, vget_low_f32(va2), 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, vget_low_f32(va3), 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, vget_low_f32(va2), 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, vget_low_f32(va0), 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, vget_low_f32(va1), 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, vget_low_f32(va2), 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1);
 
       const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c2, vget_high_f32(va0), 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c2, vget_high_f32(va1), 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c2, vget_high_f32(va2), 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c2, vget_high_f32(va3), 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c2, vget_high_f32(va0), 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c2, vget_high_f32(va1), 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c2, vget_high_f32(va2), 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c2, vget_high_f32(va3), 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c2, vget_high_f32(va0), 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c2, vget_high_f32(va1), 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c2, vget_high_f32(va2), 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0);
 
       const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c3, vget_high_f32(va0), 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c3, vget_high_f32(va1), 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c3, vget_high_f32(va2), 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c3, vget_high_f32(va3), 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c3, vget_high_f32(va0), 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c3, vget_high_f32(va1), 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c3, vget_high_f32(va2), 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c3, vget_high_f32(va3), 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c3, vget_high_f32(va0), 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c3, vget_high_f32(va1), 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c3, vget_high_f32(va2), 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c3, vget_high_f32(va0), 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c3, vget_high_f32(va1), 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c3, vget_high_f32(va2), 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1);
     }
     if XNN_UNLIKELY(k != 0) {
       do {
@@ -132,14 +133,14 @@
         const float32x4_t vb0123 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-        vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-        vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-        vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-        vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-        vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-        vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-        vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+        vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+        vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
+        vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
+        vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+        vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+        vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
+        vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
+        vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
 
         k -= sizeof(float);
       } while (k != 0);
diff --git a/src/f32-gemm/4x8-neon-ld64.c b/src/f32-gemm/4x8-neon-lane-ld64.c
similarity index 77%
copy from src/f32-gemm/4x8-neon-ld64.c
copy to src/f32-gemm/4x8-neon-lane-ld64.c
index 6c19c67..af9ff56 100644
--- a/src/f32-gemm/4x8-neon-ld64.c
+++ b/src/f32-gemm/4x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_4x8__neon_ld64(
+void xnn_f32_gemm_ukernel_4x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -76,25 +77,25 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -105,14 +106,14 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+      vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+      vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemm/4x8-neon-ld128.c b/src/f32-gemm/4x8-neonfma-lane-ld128.c
similarity index 64%
rename from src/f32-gemm/4x8-neon-ld128.c
rename to src/f32-gemm/4x8-neonfma-lane-ld128.c
index 3ebede2..ee3e06a 100644
--- a/src/f32-gemm/4x8-neon-ld128.c
+++ b/src/f32-gemm/4x8-neonfma-lane-ld128.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_4x8__neon_ld128(
+void xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -77,50 +78,50 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, vget_low_f32(va0), 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, vget_low_f32(va1), 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, vget_low_f32(va2), 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, vget_low_f32(va3), 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, vget_low_f32(va0), 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, vget_low_f32(va1), 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, vget_low_f32(va2), 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, vget_low_f32(va3), 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0);
 
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, vget_low_f32(va0), 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, vget_low_f32(va1), 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, vget_low_f32(va2), 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, vget_low_f32(va3), 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, vget_low_f32(va0), 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, vget_low_f32(va1), 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, vget_low_f32(va2), 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, vget_low_f32(va3), 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, vget_low_f32(va2), 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, vget_low_f32(va0), 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, vget_low_f32(va1), 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, vget_low_f32(va2), 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1);
 
       const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c2, vget_high_f32(va0), 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c2, vget_high_f32(va1), 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c2, vget_high_f32(va2), 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c2, vget_high_f32(va3), 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c2, vget_high_f32(va0), 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c2, vget_high_f32(va1), 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c2, vget_high_f32(va2), 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c2, vget_high_f32(va3), 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c2, vget_high_f32(va0), 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c2, vget_high_f32(va1), 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c2, vget_high_f32(va2), 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0);
 
       const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c3, vget_high_f32(va0), 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c3, vget_high_f32(va1), 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c3, vget_high_f32(va2), 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c3, vget_high_f32(va3), 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c3, vget_high_f32(va0), 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c3, vget_high_f32(va1), 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c3, vget_high_f32(va2), 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c3, vget_high_f32(va3), 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c3, vget_high_f32(va0), 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c3, vget_high_f32(va1), 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c3, vget_high_f32(va2), 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c3, vget_high_f32(va0), 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c3, vget_high_f32(va1), 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c3, vget_high_f32(va2), 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1);
     }
     if XNN_UNLIKELY(k != 0) {
       do {
@@ -132,14 +133,14 @@
         const float32x4_t vb0123 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-        vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-        vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-        vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-        vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-        vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-        vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-        vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+        vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+        vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+        vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+        vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+        vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+        vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+        vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
 
         k -= sizeof(float);
       } while (k != 0);
diff --git a/src/f32-gemm/4x8-neon-ld64.c b/src/f32-gemm/4x8-neonfma-lane-ld64.c
similarity index 77%
rename from src/f32-gemm/4x8-neon-ld64.c
rename to src/f32-gemm/4x8-neonfma-lane-ld64.c
index 6c19c67..6a7b6f9 100644
--- a/src/f32-gemm/4x8-neon-ld64.c
+++ b/src/f32-gemm/4x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_4x8__neon_ld64(
+void xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -76,25 +77,25 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -105,14 +106,14 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+      vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+      vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemm/4x8-neonfma-ld128.c b/src/f32-gemm/4x8-neonfma-ld128.c
deleted file mode 100644
index 08888d6..0000000
--- a/src/f32-gemm/4x8-neonfma-ld128.c
+++ /dev/null
@@ -1,285 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld128.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemm_ukernel_4x8__neonfma_ld128(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 4);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
-    float32x4_t vacc1x0123 = vacc0x0123;
-    float32x4_t vacc1x4567 = vacc0x4567;
-    float32x4_t vacc2x0123 = vacc0x0123;
-    float32x4_t vacc2x4567 = vacc0x4567;
-    float32x4_t vacc3x0123 = vacc0x0123;
-    float32x4_t vacc3x4567 = vacc0x4567;
-
-    size_t k = kc;
-    for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
-      const float32x4_t va0 = vld1q_f32(a0); a0 += 4;
-      const float32x4_t va1 = vld1q_f32(a1); a1 += 4;
-      const float32x4_t va2 = vld1q_f32(a2); a2 += 4;
-      const float32x4_t va3 = vld1q_f32(a3); a3 += 4;
-
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c0, va1, 0);
-        vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c0, va2, 0);
-        vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c0, va3, 0);
-        vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c0, va0, 0);
-        vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c0, va1, 0);
-        vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c0, va2, 0);
-        vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c0, va3, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0);
-        const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0);
-        const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0);
-        const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c0, vb0123c0);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c0, vb0123c0);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c0, vb4567c0);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c0, vb4567c0);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c0, vb4567c0);
-      #endif
-
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c1, va1, 1);
-        vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c1, va2, 1);
-        vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c1, va3, 1);
-        vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c1, va0, 1);
-        vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c1, va1, 1);
-        vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c1, va2, 1);
-        vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c1, va3, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1);
-        const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1);
-        const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1);
-        const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c1, vb0123c1);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c1, vb0123c1);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c1, vb4567c1);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c1, vb4567c1);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c1, vb4567c1);
-      #endif
-
-      const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c2, va0, 2);
-        vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c2, va1, 2);
-        vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c2, va2, 2);
-        vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c2, va3, 2);
-        vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c2, va0, 2);
-        vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c2, va1, 2);
-        vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c2, va2, 2);
-        vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c2, va3, 2);
-      #else
-        const float32x4_t va0c2 = vdupq_lane_f32(vget_high_f32(va0), 0);
-        const float32x4_t va1c2 = vdupq_lane_f32(vget_high_f32(va1), 0);
-        const float32x4_t va2c2 = vdupq_lane_f32(vget_high_f32(va2), 0);
-        const float32x4_t va3c2 = vdupq_lane_f32(vget_high_f32(va3), 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c2, vb0123c2);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c2, vb0123c2);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c2, vb0123c2);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c2, vb0123c2);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c2, vb4567c2);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c2, vb4567c2);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c2, vb4567c2);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c2, vb4567c2);
-      #endif
-
-      const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c3, va0, 3);
-        vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c3, va1, 3);
-        vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c3, va2, 3);
-        vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c3, va3, 3);
-        vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c3, va0, 3);
-        vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c3, va1, 3);
-        vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c3, va2, 3);
-        vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c3, va3, 3);
-      #else
-        const float32x4_t va0c3 = vdupq_lane_f32(vget_high_f32(va0), 1);
-        const float32x4_t va1c3 = vdupq_lane_f32(vget_high_f32(va1), 1);
-        const float32x4_t va2c3 = vdupq_lane_f32(vget_high_f32(va2), 1);
-        const float32x4_t va3c3 = vdupq_lane_f32(vget_high_f32(va3), 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c3, vb0123c3);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c3, vb0123c3);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c3, vb0123c3);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c3, vb0123c3);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c3, vb4567c3);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c3, vb4567c3);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c3, vb4567c3);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c3, vb4567c3);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      do {
-        const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-        const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
-        const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
-        const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;
-
-        const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-        vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-        vacc1x0123 = vfmaq_f32(vacc1x0123, va1,   vb0123);
-        vacc2x0123 = vfmaq_f32(vacc2x0123, va2,   vb0123);
-        vacc3x0123 = vfmaq_f32(vacc3x0123, va3,   vb0123);
-        vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-        vacc1x4567 = vfmaq_f32(vacc1x4567, va1,   vb4567);
-        vacc2x4567 = vfmaq_f32(vacc2x4567, va2,   vb4567);
-        vacc3x4567 = vfmaq_f32(vacc3x4567, va3,   vb4567);
-
-        k -= sizeof(float);
-      } while (k != 0);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a3 = (const float*) ((uintptr_t) a3 - kc);
-      a2 = (const float*) ((uintptr_t) a2 - kc);
-      a1 = (const float*) ((uintptr_t) a1 - kc);
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/f32-gemm/4x8-neonfma-ld64.c b/src/f32-gemm/4x8-neonfma-ld64.c
deleted file mode 100644
index c0f9dd5..0000000
--- a/src/f32-gemm/4x8-neonfma-ld64.c
+++ /dev/null
@@ -1,225 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemm_ukernel_4x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 4);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
-    float32x4_t vacc1x0123 = vacc0x0123;
-    float32x4_t vacc1x4567 = vacc0x4567;
-    float32x4_t vacc2x0123 = vacc0x0123;
-    float32x4_t vacc2x4567 = vacc0x4567;
-    float32x4_t vacc3x0123 = vacc0x0123;
-    float32x4_t vacc3x4567 = vacc0x4567;
-
-    size_t k = kc;
-    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-      const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-      const float32x2_t va1 = vld1_f32(a1); a1 += 2;
-      const float32x2_t va2 = vld1_f32(a2); a2 += 2;
-      const float32x2_t va3 = vld1_f32(a3); a3 += 2;
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-        const float32x4_t va1c0 = vdupq_lane_f32(va1, 0);
-        const float32x4_t va2c0 = vdupq_lane_f32(va2, 0);
-        const float32x4_t va3c0 = vdupq_lane_f32(va3, 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c0, vb0123c0);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c0, vb0123c0);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c0, vb4567c0);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c0, vb4567c0);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c0, vb4567c0);
-      #endif
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-        const float32x4_t va1c1 = vdupq_lane_f32(va1, 1);
-        const float32x4_t va2c1 = vdupq_lane_f32(va2, 1);
-        const float32x4_t va3c1 = vdupq_lane_f32(va3, 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c1, vb0123c1);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c1, vb0123c1);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c1, vb4567c1);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c1, vb4567c1);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c1, vb4567c1);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-      const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
-      const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
-      const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;
-
-      const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-      vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vfmaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vfmaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vfmaq_f32(vacc3x0123, va3,   vb0123);
-      vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vfmaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vfmaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vfmaq_f32(vacc3x4567, va3,   vb4567);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a3 = (const float*) ((uintptr_t) a3 - kc);
-      a2 = (const float*) ((uintptr_t) a2 - kc);
-      a1 = (const float*) ((uintptr_t) a1 - kc);
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/f32-gemm/5x8-neon-ld64.c b/src/f32-gemm/5x8-neon-lane-ld64.c
similarity index 76%
rename from src/f32-gemm/5x8-neon-ld64.c
rename to src/f32-gemm/5x8-neon-lane-ld64.c
index 4ca0a20..1c46d30 100644
--- a/src/f32-gemm/5x8-neon-ld64.c
+++ b/src/f32-gemm/5x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_5x8__neon_ld64(
+void xnn_f32_gemm_ukernel_5x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -85,29 +86,29 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+      vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+      vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -119,16 +120,16 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vmlaq_f32(vacc4x0123, va4,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vmlaq_f32(vacc4x4567, va4,   vb4567);
+      vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+      vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123);
+      vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
+      vacc4x4567 = vmlaq_f32(vacc4x4567, va4, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemm/5x8-neon-ld64.c b/src/f32-gemm/5x8-neonfma-lane-ld64.c
similarity index 76%
copy from src/f32-gemm/5x8-neon-ld64.c
copy to src/f32-gemm/5x8-neonfma-lane-ld64.c
index 4ca0a20..4b2d2b8 100644
--- a/src/f32-gemm/5x8-neon-ld64.c
+++ b/src/f32-gemm/5x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_5x8__neon_ld64(
+void xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -85,29 +86,29 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+      vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+      vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -119,16 +120,16 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vmlaq_f32(vacc4x0123, va4,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vmlaq_f32(vacc4x4567, va4,   vb4567);
+      vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+      vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123);
+      vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
+      vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemm/5x8-neonfma-ld64.c b/src/f32-gemm/5x8-neonfma-ld64.c
deleted file mode 100644
index 515db66..0000000
--- a/src/f32-gemm/5x8-neonfma-ld64.c
+++ /dev/null
@@ -1,261 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemm_ukernel_5x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 5);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
-  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 4) {
-    a4 = a3;
-    c4 = c3;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
-    float32x4_t vacc1x0123 = vacc0x0123;
-    float32x4_t vacc1x4567 = vacc0x4567;
-    float32x4_t vacc2x0123 = vacc0x0123;
-    float32x4_t vacc2x4567 = vacc0x4567;
-    float32x4_t vacc3x0123 = vacc0x0123;
-    float32x4_t vacc3x4567 = vacc0x4567;
-    float32x4_t vacc4x0123 = vacc0x0123;
-    float32x4_t vacc4x4567 = vacc0x4567;
-
-    size_t k = kc;
-    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-      const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-      const float32x2_t va1 = vld1_f32(a1); a1 += 2;
-      const float32x2_t va2 = vld1_f32(a2); a2 += 2;
-      const float32x2_t va3 = vld1_f32(a3); a3 += 2;
-      const float32x2_t va4 = vld1_f32(a4); a4 += 2;
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-        vacc4x0123 = vfmaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-        vacc4x4567 = vfmaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-        const float32x4_t va1c0 = vdupq_lane_f32(va1, 0);
-        const float32x4_t va2c0 = vdupq_lane_f32(va2, 0);
-        const float32x4_t va3c0 = vdupq_lane_f32(va3, 0);
-        const float32x4_t va4c0 = vdupq_lane_f32(va4, 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c0, vb0123c0);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c0, vb0123c0);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c0, vb0123c0);
-        vacc4x0123 = vfmaq_f32(vacc4x0123,   va4c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c0, vb4567c0);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c0, vb4567c0);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c0, vb4567c0);
-        vacc4x4567 = vfmaq_f32(vacc4x4567,   va4c0, vb4567c0);
-      #endif
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-        vacc4x0123 = vfmaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-        vacc4x4567 = vfmaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-        const float32x4_t va1c1 = vdupq_lane_f32(va1, 1);
-        const float32x4_t va2c1 = vdupq_lane_f32(va2, 1);
-        const float32x4_t va3c1 = vdupq_lane_f32(va3, 1);
-        const float32x4_t va4c1 = vdupq_lane_f32(va4, 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c1, vb0123c1);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c1, vb0123c1);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c1, vb0123c1);
-        vacc4x0123 = vfmaq_f32(vacc4x0123,   va4c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c1, vb4567c1);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c1, vb4567c1);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c1, vb4567c1);
-        vacc4x4567 = vfmaq_f32(vacc4x4567,   va4c1, vb4567c1);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-      const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
-      const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
-      const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;
-      const float32x4_t va4 = vld1q_dup_f32(a4); a4 += 1;
-
-      const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-      vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vfmaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vfmaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vfmaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vfmaq_f32(vacc4x0123, va4,   vb0123);
-      vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vfmaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vfmaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vfmaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vfmaq_f32(vacc4x4567, va4,   vb4567);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc4x0123 = vminq_f32(vacc4x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-    vacc4x4567 = vminq_f32(vacc4x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc4x0123 = vmaxq_f32(vacc4x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-    vacc4x4567 = vmaxq_f32(vacc4x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c4, vacc4x0123);
-      vst1q_f32(c4 + 4, vacc4x4567);
-      c4 = (float*) ((uintptr_t) c4 + cn_stride);
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a4 = (const float*) ((uintptr_t) a4 - kc);
-      a3 = (const float*) ((uintptr_t) a3 - kc);
-      a2 = (const float*) ((uintptr_t) a2 - kc);
-      a1 = (const float*) ((uintptr_t) a1 - kc);
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c4, vacc4x0123); c4 += 4;
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc4x0123 = vacc4x4567;
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc4x01 = vget_low_f32(vacc4x0123);
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c4, vacc4x01); c4 += 2;
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc4x01 = vget_high_f32(vacc4x0123);
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c4, vacc4x01, 0);
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/f32-gemm/6x8-neon-ld64.c b/src/f32-gemm/6x8-neon-lane-ld64.c
similarity index 75%
rename from src/f32-gemm/6x8-neon-ld64.c
rename to src/f32-gemm/6x8-neon-lane-ld64.c
index b2322d8..e6ca249 100644
--- a/src/f32-gemm/6x8-neon-ld64.c
+++ b/src/f32-gemm/6x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_6x8__neon_ld64(
+void xnn_f32_gemm_ukernel_6x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -94,33 +95,33 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-      vacc5x0123 = vmlaq_lane_f32(vacc5x0123,   vb0123c0, va5, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
-      vacc5x4567 = vmlaq_lane_f32(vacc5x4567,   vb4567c0, va5, 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
+      vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, va5, 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+      vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
+      vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-      vacc5x0123 = vmlaq_lane_f32(vacc5x0123,   vb0123c1, va5, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
-      vacc5x4567 = vmlaq_lane_f32(vacc5x4567,   vb4567c1, va5, 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
+      vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, va5, 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+      vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
+      vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -133,18 +134,18 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vmlaq_f32(vacc4x0123, va4,   vb0123);
-      vacc5x0123 = vmlaq_f32(vacc5x0123, va5,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vmlaq_f32(vacc4x4567, va4,   vb4567);
-      vacc5x4567 = vmlaq_f32(vacc5x4567, va5,   vb4567);
+      vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+      vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123);
+      vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123);
+      vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
+      vacc4x4567 = vmlaq_f32(vacc4x4567, va4, vb4567);
+      vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemm/6x8-neon-ld64.c b/src/f32-gemm/6x8-neonfma-lane-ld64.c
similarity index 75%
copy from src/f32-gemm/6x8-neon-ld64.c
copy to src/f32-gemm/6x8-neonfma-lane-ld64.c
index b2322d8..0a34f44 100644
--- a/src/f32-gemm/6x8-neon-ld64.c
+++ b/src/f32-gemm/6x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm_ukernel_6x8__neon_ld64(
+void xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -94,33 +95,33 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-      vacc5x0123 = vmlaq_lane_f32(vacc5x0123,   vb0123c0, va5, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
-      vacc5x4567 = vmlaq_lane_f32(vacc5x4567,   vb4567c0, va5, 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
+      vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, va5, 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+      vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
+      vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-      vacc5x0123 = vmlaq_lane_f32(vacc5x0123,   vb0123c1, va5, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
-      vacc5x4567 = vmlaq_lane_f32(vacc5x4567,   vb4567c1, va5, 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
+      vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, va5, 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+      vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
+      vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -133,18 +134,18 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vmlaq_f32(vacc4x0123, va4,   vb0123);
-      vacc5x0123 = vmlaq_f32(vacc5x0123, va5,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vmlaq_f32(vacc4x4567, va4,   vb4567);
-      vacc5x4567 = vmlaq_f32(vacc5x4567, va5,   vb4567);
+      vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+      vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123);
+      vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123);
+      vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
+      vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567);
+      vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemm/6x8-neonfma-ld64.c b/src/f32-gemm/6x8-neonfma-ld64.c
deleted file mode 100644
index 682dd07..0000000
--- a/src/f32-gemm/6x8-neonfma-ld64.c
+++ /dev/null
@@ -1,297 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemm_ukernel_6x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 6);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
-  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 4) {
-    a4 = a3;
-    c4 = c3;
-  }
-  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
-  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 6) {
-    a5 = a4;
-    c5 = c4;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
-    float32x4_t vacc1x0123 = vacc0x0123;
-    float32x4_t vacc1x4567 = vacc0x4567;
-    float32x4_t vacc2x0123 = vacc0x0123;
-    float32x4_t vacc2x4567 = vacc0x4567;
-    float32x4_t vacc3x0123 = vacc0x0123;
-    float32x4_t vacc3x4567 = vacc0x4567;
-    float32x4_t vacc4x0123 = vacc0x0123;
-    float32x4_t vacc4x4567 = vacc0x4567;
-    float32x4_t vacc5x0123 = vacc0x0123;
-    float32x4_t vacc5x4567 = vacc0x4567;
-
-    size_t k = kc;
-    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-      const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-      const float32x2_t va1 = vld1_f32(a1); a1 += 2;
-      const float32x2_t va2 = vld1_f32(a2); a2 += 2;
-      const float32x2_t va3 = vld1_f32(a3); a3 += 2;
-      const float32x2_t va4 = vld1_f32(a4); a4 += 2;
-      const float32x2_t va5 = vld1_f32(a5); a5 += 2;
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-        vacc4x0123 = vfmaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-        vacc5x0123 = vfmaq_lane_f32(vacc5x0123,   vb0123c0, va5, 0);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-        vacc4x4567 = vfmaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
-        vacc5x4567 = vfmaq_lane_f32(vacc5x4567,   vb4567c0, va5, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-        const float32x4_t va1c0 = vdupq_lane_f32(va1, 0);
-        const float32x4_t va2c0 = vdupq_lane_f32(va2, 0);
-        const float32x4_t va3c0 = vdupq_lane_f32(va3, 0);
-        const float32x4_t va4c0 = vdupq_lane_f32(va4, 0);
-        const float32x4_t va5c0 = vdupq_lane_f32(va5, 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c0, vb0123c0);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c0, vb0123c0);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c0, vb0123c0);
-        vacc4x0123 = vfmaq_f32(vacc4x0123,   va4c0, vb0123c0);
-        vacc5x0123 = vfmaq_f32(vacc5x0123,   va5c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c0, vb4567c0);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c0, vb4567c0);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c0, vb4567c0);
-        vacc4x4567 = vfmaq_f32(vacc4x4567,   va4c0, vb4567c0);
-        vacc5x4567 = vfmaq_f32(vacc5x4567,   va5c0, vb4567c0);
-      #endif
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-        vacc4x0123 = vfmaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-        vacc5x0123 = vfmaq_lane_f32(vacc5x0123,   vb0123c1, va5, 1);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-        vacc4x4567 = vfmaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
-        vacc5x4567 = vfmaq_lane_f32(vacc5x4567,   vb4567c1, va5, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-        const float32x4_t va1c1 = vdupq_lane_f32(va1, 1);
-        const float32x4_t va2c1 = vdupq_lane_f32(va2, 1);
-        const float32x4_t va3c1 = vdupq_lane_f32(va3, 1);
-        const float32x4_t va4c1 = vdupq_lane_f32(va4, 1);
-        const float32x4_t va5c1 = vdupq_lane_f32(va5, 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c1, vb0123c1);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c1, vb0123c1);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c1, vb0123c1);
-        vacc4x0123 = vfmaq_f32(vacc4x0123,   va4c1, vb0123c1);
-        vacc5x0123 = vfmaq_f32(vacc5x0123,   va5c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c1, vb4567c1);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c1, vb4567c1);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c1, vb4567c1);
-        vacc4x4567 = vfmaq_f32(vacc4x4567,   va4c1, vb4567c1);
-        vacc5x4567 = vfmaq_f32(vacc5x4567,   va5c1, vb4567c1);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-      const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
-      const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
-      const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;
-      const float32x4_t va4 = vld1q_dup_f32(a4); a4 += 1;
-      const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1;
-
-      const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-      vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vfmaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vfmaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vfmaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vfmaq_f32(vacc4x0123, va4,   vb0123);
-      vacc5x0123 = vfmaq_f32(vacc5x0123, va5,   vb0123);
-      vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vfmaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vfmaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vfmaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vfmaq_f32(vacc4x4567, va4,   vb4567);
-      vacc5x4567 = vfmaq_f32(vacc5x4567, va5,   vb4567);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc4x0123 = vminq_f32(vacc4x0123, vmax);
-    vacc5x0123 = vminq_f32(vacc5x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-    vacc4x4567 = vminq_f32(vacc4x4567, vmax);
-    vacc5x4567 = vminq_f32(vacc5x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc4x0123 = vmaxq_f32(vacc4x0123, vmin);
-    vacc5x0123 = vmaxq_f32(vacc5x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-    vacc4x4567 = vmaxq_f32(vacc4x4567, vmin);
-    vacc5x4567 = vmaxq_f32(vacc5x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c5, vacc5x0123);
-      vst1q_f32(c5 + 4, vacc5x4567);
-      c5 = (float*) ((uintptr_t) c5 + cn_stride);
-      vst1q_f32(c4, vacc4x0123);
-      vst1q_f32(c4 + 4, vacc4x4567);
-      c4 = (float*) ((uintptr_t) c4 + cn_stride);
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a5 = (const float*) ((uintptr_t) a5 - kc);
-      a4 = (const float*) ((uintptr_t) a4 - kc);
-      a3 = (const float*) ((uintptr_t) a3 - kc);
-      a2 = (const float*) ((uintptr_t) a2 - kc);
-      a1 = (const float*) ((uintptr_t) a1 - kc);
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c5, vacc5x0123); c5 += 4;
-        vst1q_f32(c4, vacc4x0123); c4 += 4;
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc5x0123 = vacc5x4567;
-        vacc4x0123 = vacc4x4567;
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc5x01 = vget_low_f32(vacc5x0123);
-      float32x2_t vacc4x01 = vget_low_f32(vacc4x0123);
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c5, vacc5x01); c5 += 2;
-        vst1_f32(c4, vacc4x01); c4 += 2;
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc5x01 = vget_high_f32(vacc5x0123);
-        vacc4x01 = vget_high_f32(vacc4x0123);
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c5, vacc5x01, 0);
-        vst1_lane_f32(c4, vacc4x01, 0);
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
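Aside on the store epilogue shared by every kernel in this diff (unchanged by the rename): the nc & 4 / nc & 2 / nc & 1 ladder peels the remaining columns with progressively narrower stores. A self-contained sketch for a single output row, using the hypothetical helper name store_tail:

#include <arm_neon.h>
#include <stddef.h>

// Stores the low nc (< 8) lanes of the row held in (vlo, vhi) to c.
static void store_tail(float* c, float32x4_t vlo, float32x4_t vhi, size_t nc) {
  if (nc & 4) { vst1q_f32(c, vlo); c += 4; vlo = vhi; }
  float32x2_t v01 = vget_low_f32(vlo);
  if (nc & 2) { vst1_f32(c, v01); c += 2; v01 = vget_high_f32(vlo); }
  if (nc & 1) { vst1_lane_f32(c, v01, 0); }
}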
diff --git a/src/f32-gemm/MRx2-neon-ld64.c.in b/src/f32-gemm/MRx2-neon-ld64.c.in
index 0b94074..718d410 100644
--- a/src/f32-gemm/MRx2-neon-ld64.c.in
+++ b/src/f32-gemm/MRx2-neon-ld64.c.in
@@ -11,7 +11,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm${"inc" if INC else ""}_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}_ld64(
+void xnn_f32_gemm${"inc" if INC else ""}_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/f32-gemm/neon-ld128.c.in b/src/f32-gemm/neon-ld128.c.in
index b6cad3c..ecee857 100644
--- a/src/f32-gemm/neon-ld128.c.in
+++ b/src/f32-gemm/neon-ld128.c.in
@@ -5,6 +5,9 @@
 
 $assert NR % 4 == 0
 $ABC = "0123456789ABCDEFGHIJKLMN"
+$VMULADDQ_F32 = "vfmaq_f32" if FMA else "vmlaq_f32"
+$VMULADDQ_LANE_F32 = "vfmaq_lane_f32" if FMA else "vmlaq_lane_f32"
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -12,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm${"inc" if INC else ""}_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}_ld128(
+void xnn_f32_gemm${"inc" if INC else ""}_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}_${"dup" if DUP else "lane"}_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -81,22 +84,16 @@
         $for N in range(0, NR, 4):
           const float32x4_t vb${ABC[N:N+4]}c${L} = vld1q_f32(w); w += 4;
 
-        $if FMA:
-          #if defined(__aarch64__)
-            $for N in range(0, NR, 4):
-              $for M in range(MR):
-                vacc${M}x${ABC[N:N+4]} = vfmaq_laneq_f32(vacc${M}x${ABC[N:N+4]},   vb${ABC[N:N+4]}c${L}, va${M}, ${L});
-          #else
+        $if DUP:
+          $for M in range(MR):
+            const float32x4_t va${M}c${L} = vdupq_lane_f32(${VGET_PART_F32}(va${M}), ${L % 2});
+          $for N in range(0, NR, 4):
             $for M in range(MR):
-              const float32x4_t va${M}c${L} = vdupq_lane_f32(${VGET_PART_F32}(va${M}), ${L %   2});
-            $for N in range(0, NR, 4):
-              $for M in range(MR):
-                vacc${M}x${ABC[N:N+4]} = vfmaq_f32(vacc${M}x${ABC[N:N+4]},   va${M}c${L}, vb${ABC[N:N+4]}c${L});
-          #endif
+              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}c${L}, vb${ABC[N:N+4]}c${L});
         $else:
           $for N in range(0, NR, 4):
             $for M in range(MR):
-              vacc${M}x${ABC[N:N+4]} = vmlaq_lane_f32(vacc${M}x${ABC[N:N+4]},   vb${ABC[N:N+4]}c${L}, ${VGET_PART_F32}(va${M}), ${L % 2});
+              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_LANE_F32}(vacc${M}x${ABC[N:N+4]}, vb${ABC[N:N+4]}c${L}, ${VGET_PART_F32}(va${M}), ${L % 2});
     }
     if XNN_UNLIKELY(k != 0) {
       do {
@@ -108,10 +105,7 @@
 
         $for N in range(0, NR, 4):
           $for M in range(MR):
-            $if FMA:
-              vacc${M}x${ABC[N:N+4]} = vfmaq_f32(vacc${M}x${ABC[N:N+4]}, va${M},   vb${ABC[N:N+4]});
-            $else:
-              vacc${M}x${ABC[N:N+4]} = vmlaq_f32(vacc${M}x${ABC[N:N+4]}, va${M},   vb${ABC[N:N+4]});
+            vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}, vb${ABC[N:N+4]});
 
         k -= sizeof(float);
       } while (k != 0);
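The two $VMULADDQ_* template variables introduced above replace the old $if FMA / #if defined(__aarch64__) branching: the intrinsic is now chosen once at generation time. A sketch of what the lane path expands to for a single accumulator update, shown for lane 0 as an illustration:

#include <arm_neon.h>

// FMA=1 (neonfma kernels): acc += b * a[0], fused.
static inline float32x4_t lane_step_fma(float32x4_t acc, float32x4_t b, float32x2_t a) {
  return vfmaq_lane_f32(acc, b, a, 0);
}
// FMA=0 (neon kernels): the same computation via multiply-accumulate.
static inline float32x4_t lane_step_mla(float32x4_t acc, float32x4_t b, float32x2_t a) {
  return vmlaq_lane_f32(acc, b, a, 0);
}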
diff --git a/src/f32-gemm/neon-ld64.c.in b/src/f32-gemm/neon-ld64.c.in
index 5a89619..76e0d67 100644
--- a/src/f32-gemm/neon-ld64.c.in
+++ b/src/f32-gemm/neon-ld64.c.in
@@ -5,6 +5,9 @@
 
 $assert NR % 4 == 0
 $ABC = "0123456789ABCDEFGHIJKLMN"
+$VMULADDQ_F32 = "vfmaq_f32" if FMA else "vmlaq_f32"
+$VMULADDQ_LANE_F32 = "vfmaq_lane_f32" if FMA else "vmlaq_lane_f32"
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -12,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemm${"inc" if INC else ""}_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}_ld64(
+void xnn_f32_gemm${"inc" if INC else ""}_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}_${"dup" if DUP else "lane"}_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -79,22 +82,16 @@
         $for N in range(0, NR, 4):
           const float32x4_t vb${ABC[N:N+4]}c${L} = vld1q_f32(w); w += 4;
 
-        $if FMA:
-          #if defined(__aarch64__)
-            $for N in range(0, NR, 4):
-              $for M in range(MR):
-                vacc${M}x${ABC[N:N+4]} = vfmaq_lane_f32(vacc${M}x${ABC[N:N+4]},   vb${ABC[N:N+4]}c${L}, va${M}, ${L});
-          #else
-            $for M in range(MR):
-              const float32x4_t va${M}c${L} = vdupq_lane_f32(va${M}, ${L});
-            $for N in range(0, NR, 4):
-              $for M in range(MR):
-                vacc${M}x${ABC[N:N+4]} = vfmaq_f32(vacc${M}x${ABC[N:N+4]},   va${M}c${L}, vb${ABC[N:N+4]}c${L});
-          #endif
-        $else:
+        $if DUP:
+          $for M in range(MR):
+            const float32x4_t va${M}c${L} = vdupq_lane_f32(va${M}, ${L});
           $for N in range(0, NR, 4):
             $for M in range(MR):
-              vacc${M}x${ABC[N:N+4]} = vmlaq_lane_f32(vacc${M}x${ABC[N:N+4]},   vb${ABC[N:N+4]}c${L}, va${M}, ${L});
+              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}c${L}, vb${ABC[N:N+4]}c${L});
+        $else:
+          $for N in range(0, NR, 4):
+            $for M in range(MR):
+              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_LANE_F32}(vacc${M}x${ABC[N:N+4]}, vb${ABC[N:N+4]}c${L}, va${M}, ${L});
     }
     if XNN_UNLIKELY(k != 0) {
       $for M in range(MR):
@@ -105,10 +102,7 @@
 
       $for N in range(0, NR, 4):
         $for M in range(MR):
-          $if FMA:
-            vacc${M}x${ABC[N:N+4]} = vfmaq_f32(vacc${M}x${ABC[N:N+4]}, va${M},   vb${ABC[N:N+4]});
-          $else:
-            vacc${M}x${ABC[N:N+4]} = vmlaq_f32(vacc${M}x${ABC[N:N+4]}, va${M},   vb${ABC[N:N+4]});
+          vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}, vb${ABC[N:N+4]});
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     $for N in range(0, NR, 4):
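The new DUP flag selects between the two equivalent addressing styles that the old #else branch hard-coded: broadcasting the scalar first versus indexing the lane inside the multiply-add itself. A sketch of both, assuming lane 0:

#include <arm_neon.h>

// DUP style: broadcast the lane, then use the plain q-form multiply-add.
static inline float32x4_t dup_step(float32x4_t acc, float32x2_t a, float32x4_t b) {
  const float32x4_t a_c0 = vdupq_lane_f32(a, 0);
  return vfmaq_f32(acc, a_c0, b);
}
// LANE style: index the lane directly in the intrinsic.
static inline float32x4_t lane_step(float32x4_t acc, float32x2_t a, float32x4_t b) {
  return vfmaq_lane_f32(acc, b, a, 0);
}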
diff --git a/src/f32-gemminc/1x8-neon-ld64.c b/src/f32-gemminc/1x8-neon-lane-ld64.c
similarity index 85%
rename from src/f32-gemminc/1x8-neon-ld64.c
rename to src/f32-gemminc/1x8-neon-lane-ld64.c
index a5d04dd..5ddcd4f 100644
--- a/src/f32-gemminc/1x8-neon-ld64.c
+++ b/src/f32-gemminc/1x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_1x8__neon_ld64(
+void xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -51,13 +52,13 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -65,8 +66,8 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
+      vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+      vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemminc/1x8-neon-ld64.c b/src/f32-gemminc/1x8-neonfma-lane-ld64.c
similarity index 85%
copy from src/f32-gemminc/1x8-neon-ld64.c
copy to src/f32-gemminc/1x8-neonfma-lane-ld64.c
index a5d04dd..ff189db 100644
--- a/src/f32-gemminc/1x8-neon-ld64.c
+++ b/src/f32-gemminc/1x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_1x8__neon_ld64(
+void xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -51,13 +52,13 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -65,8 +66,8 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
+      vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+      vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
diff --git a/src/f32-gemminc/1x8-neonfma-ld64.c b/src/f32-gemminc/1x8-neonfma-ld64.c
deleted file mode 100644
index d67a419..0000000
--- a/src/f32-gemminc/1x8-neonfma-ld64.c
+++ /dev/null
@@ -1,119 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemminc_ukernel_1x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const float*restrict acc,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 1);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-  assert(acc != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(acc); acc += 4;
-
-    size_t k = kc;
-    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-      const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-      #endif
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-
-      const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-      vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-      vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
diff --git a/src/f32-gemminc/4x8-neon-ld128.c b/src/f32-gemminc/4x8-neon-lane-ld128.c
similarity index 65%
copy from src/f32-gemminc/4x8-neon-ld128.c
copy to src/f32-gemminc/4x8-neon-lane-ld128.c
index 07d7562..9117e4f 100644
--- a/src/f32-gemminc/4x8-neon-ld128.c
+++ b/src/f32-gemminc/4x8-neon-lane-ld128.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_4x8__neon_ld128(
+void xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -79,50 +80,50 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, vget_low_f32(va0), 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, vget_low_f32(va1), 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, vget_low_f32(va2), 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, vget_low_f32(va3), 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, vget_low_f32(va0), 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, vget_low_f32(va1), 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, vget_low_f32(va2), 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, vget_low_f32(va3), 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0);
 
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, vget_low_f32(va0), 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, vget_low_f32(va1), 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, vget_low_f32(va2), 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, vget_low_f32(va3), 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, vget_low_f32(va0), 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, vget_low_f32(va1), 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, vget_low_f32(va2), 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, vget_low_f32(va3), 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, vget_low_f32(va2), 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, vget_low_f32(va0), 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, vget_low_f32(va1), 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, vget_low_f32(va2), 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1);
 
       const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c2, vget_high_f32(va0), 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c2, vget_high_f32(va1), 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c2, vget_high_f32(va2), 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c2, vget_high_f32(va3), 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c2, vget_high_f32(va0), 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c2, vget_high_f32(va1), 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c2, vget_high_f32(va2), 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c2, vget_high_f32(va3), 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c2, vget_high_f32(va0), 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c2, vget_high_f32(va1), 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c2, vget_high_f32(va2), 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0);
 
       const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c3, vget_high_f32(va0), 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c3, vget_high_f32(va1), 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c3, vget_high_f32(va2), 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c3, vget_high_f32(va3), 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c3, vget_high_f32(va0), 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c3, vget_high_f32(va1), 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c3, vget_high_f32(va2), 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c3, vget_high_f32(va3), 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c3, vget_high_f32(va0), 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c3, vget_high_f32(va1), 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c3, vget_high_f32(va2), 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c3, vget_high_f32(va0), 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c3, vget_high_f32(va1), 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c3, vget_high_f32(va2), 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1);
     }
     if XNN_UNLIKELY(k != 0) {
       do {
@@ -134,14 +135,14 @@
         const float32x4_t vb0123 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-        vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-        vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-        vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-        vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-        vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-        vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-        vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+        vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+        vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
+        vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
+        vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+        vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+        vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
+        vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
+        vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
 
         k -= sizeof(float);
       } while (k != 0);
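
Note: the `vmlaq_lane_f32` calls above compute acc[i] += b[i] * a[lane], with the multiply and the add each rounded separately. A self-contained check against that scalar model (values are illustrative):

#include <arm_neon.h>
#include <assert.h>

int main(void) {
  const float acc_in[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  const float b_in[4]   = {0.5f, 0.5f, 0.5f, 0.5f};
  const float a_in[2]   = {10.0f, 20.0f};

  float32x4_t acc = vld1q_f32(acc_in);
  const float32x4_t b = vld1q_f32(b_in);
  const float32x2_t a = vld1_f32(a_in);

  acc = vmlaq_lane_f32(acc, b, a, 1);  // uses a[1] == 20.0f

  float out[4];
  vst1q_f32(out, acc);
  for (int i = 0; i < 4; i++) {
    assert(out[i] == acc_in[i] + b_in[i] * a_in[1]);  // 11, 12, 13, 14
  }
  return 0;
}
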
diff --git a/src/f32-gemminc/4x8-neon-ld64.c b/src/f32-gemminc/4x8-neon-lane-ld64.c
similarity index 77%
copy from src/f32-gemminc/4x8-neon-ld64.c
copy to src/f32-gemminc/4x8-neon-lane-ld64.c
index f7677c3..a301d1e 100644
--- a/src/f32-gemminc/4x8-neon-ld64.c
+++ b/src/f32-gemminc/4x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_4x8__neon_ld64(
+void xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -78,25 +79,25 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -107,14 +108,14 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+      vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+      vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
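
Note: per main-loop iteration, the ld64 variant above consumes 64 bits (two floats) from each row of A and two 8-wide columns of the packed weight panel — hence one `a0 += 2` against four `w += 4` per step. A sketch of that bookkeeping (names are illustrative):

#include <stddef.h>
#include <assert.h>

int main(void) {
  const size_t nr = 8;                   // kernel tile is 4x8
  const size_t k_step = 2;               // ld64: two floats of A per step
  const size_t a_advance = k_step;       // floats consumed per A row
  const size_t w_advance = k_step * nr;  // floats consumed from packed W
  assert(a_advance == 2);                // matches `a0 += 2`
  assert(w_advance == 16);               // matches four `w += 4`
  return 0;
}
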
diff --git a/src/f32-gemminc/4x8-neon-ld128.c b/src/f32-gemminc/4x8-neonfma-lane-ld128.c
similarity index 65%
rename from src/f32-gemminc/4x8-neon-ld128.c
rename to src/f32-gemminc/4x8-neonfma-lane-ld128.c
index 07d7562..b73a0fa 100644
--- a/src/f32-gemminc/4x8-neon-ld128.c
+++ b/src/f32-gemminc/4x8-neonfma-lane-ld128.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_4x8__neon_ld128(
+void xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -79,50 +80,50 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, vget_low_f32(va0), 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, vget_low_f32(va1), 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, vget_low_f32(va2), 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, vget_low_f32(va3), 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, vget_low_f32(va0), 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, vget_low_f32(va1), 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, vget_low_f32(va2), 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, vget_low_f32(va3), 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0);
 
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, vget_low_f32(va0), 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, vget_low_f32(va1), 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, vget_low_f32(va2), 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, vget_low_f32(va3), 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, vget_low_f32(va0), 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, vget_low_f32(va1), 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, vget_low_f32(va2), 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, vget_low_f32(va3), 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, vget_low_f32(va2), 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, vget_low_f32(va0), 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, vget_low_f32(va1), 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, vget_low_f32(va2), 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1);
 
       const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c2, vget_high_f32(va0), 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c2, vget_high_f32(va1), 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c2, vget_high_f32(va2), 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c2, vget_high_f32(va3), 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c2, vget_high_f32(va0), 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c2, vget_high_f32(va1), 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c2, vget_high_f32(va2), 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c2, vget_high_f32(va3), 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c2, vget_high_f32(va0), 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c2, vget_high_f32(va1), 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c2, vget_high_f32(va2), 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0);
 
       const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c3, vget_high_f32(va0), 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c3, vget_high_f32(va1), 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c3, vget_high_f32(va2), 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c3, vget_high_f32(va3), 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c3, vget_high_f32(va0), 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c3, vget_high_f32(va1), 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c3, vget_high_f32(va2), 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c3, vget_high_f32(va3), 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c3, vget_high_f32(va0), 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c3, vget_high_f32(va1), 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c3, vget_high_f32(va2), 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c3, vget_high_f32(va0), 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c3, vget_high_f32(va1), 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c3, vget_high_f32(va2), 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1);
     }
     if XNN_UNLIKELY(k != 0) {
       do {
@@ -134,14 +135,14 @@
         const float32x4_t vb0123 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-        vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-        vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-        vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-        vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-        vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-        vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-        vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+        vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+        vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+        vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+        vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+        vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+        vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+        vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
 
         k -= sizeof(float);
       } while (k != 0);
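
Note: the substantive difference between the `neon` and `neonfma` variants is rounding: `vmlaq_f32` rounds the product before the add, while `vfmaq_f32` rounds once at the end. A small demonstration, assuming the compiler does not re-contract `vmlaq_f32` into an FMA:

#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  const float x = 1.0f + 0x1p-23f;           // 1 + one ulp
  const float32x4_t vx = vdupq_n_f32(x);
  const float32x4_t vy = vmulq_f32(vx, vx);  // round(x*x) == 1 + 2^-22
  const float32x4_t vneg_y = vnegq_f32(vy);

  // Unfused: -y + round(x*x) == 0; the 2^-46 residual is lost.
  const float32x4_t unfused = vmlaq_f32(vneg_y, vx, vx);
  // Fused: -y + x*x computed exactly, then rounded once == 2^-46.
  const float32x4_t fused = vfmaq_f32(vneg_y, vx, vx);

  printf("unfused: %a\nfused:   %a\n",
         vgetq_lane_f32(unfused, 0), vgetq_lane_f32(fused, 0));
  return 0;
}
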
diff --git a/src/f32-gemminc/4x8-neon-ld64.c b/src/f32-gemminc/4x8-neonfma-lane-ld64.c
similarity index 77%
rename from src/f32-gemminc/4x8-neon-ld64.c
rename to src/f32-gemminc/4x8-neonfma-lane-ld64.c
index f7677c3..6e15065 100644
--- a/src/f32-gemminc/4x8-neon-ld64.c
+++ b/src/f32-gemminc/4x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_4x8__neon_ld64(
+void xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -78,25 +79,25 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -107,14 +108,14 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+      vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+      vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
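
Note: when `kc` is odd, the ld64 kernels fall into the remainder block above, which broadcasts the last float of each A row with `vld1q_dup_f32` and uses the plain (non-lane) form. A minimal sketch with illustrative values:

#include <arm_neon.h>
#include <assert.h>

int main(void) {
  const float a_tail = 3.0f;
  const float w_tail[4] = {1.0f, 2.0f, 3.0f, 4.0f};

  const float32x4_t va = vld1q_dup_f32(&a_tail);  // {3, 3, 3, 3}
  const float32x4_t vb = vld1q_f32(w_tail);
  float32x4_t acc = vdupq_n_f32(0.0f);

  acc = vfmaq_f32(acc, va, vb);                   // acc += a_tail * w

  assert(vgetq_lane_f32(acc, 3) == 12.0f);
  return 0;
}
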
diff --git a/src/f32-gemminc/4x8-neonfma-ld128.c b/src/f32-gemminc/4x8-neonfma-ld128.c
deleted file mode 100644
index 49c074c..0000000
--- a/src/f32-gemminc/4x8-neonfma-ld128.c
+++ /dev/null
@@ -1,287 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld128.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemminc_ukernel_4x8__neonfma_ld128(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const float*restrict acc,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 4);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-  assert(acc != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc1x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc1x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc2x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc2x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc3x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc3x4567 = vld1q_f32(acc); acc += 4;
-
-    size_t k = kc;
-    for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
-      const float32x4_t va0 = vld1q_f32(a0); a0 += 4;
-      const float32x4_t va1 = vld1q_f32(a1); a1 += 4;
-      const float32x4_t va2 = vld1q_f32(a2); a2 += 4;
-      const float32x4_t va3 = vld1q_f32(a3); a3 += 4;
-
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c0, va1, 0);
-        vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c0, va2, 0);
-        vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c0, va3, 0);
-        vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c0, va0, 0);
-        vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c0, va1, 0);
-        vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c0, va2, 0);
-        vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c0, va3, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0);
-        const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0);
-        const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0);
-        const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c0, vb0123c0);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c0, vb0123c0);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c0, vb4567c0);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c0, vb4567c0);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c0, vb4567c0);
-      #endif
-
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c1, va1, 1);
-        vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c1, va2, 1);
-        vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c1, va3, 1);
-        vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c1, va0, 1);
-        vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c1, va1, 1);
-        vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c1, va2, 1);
-        vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c1, va3, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1);
-        const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1);
-        const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1);
-        const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c1, vb0123c1);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c1, vb0123c1);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c1, vb4567c1);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c1, vb4567c1);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c1, vb4567c1);
-      #endif
-
-      const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c2, va0, 2);
-        vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c2, va1, 2);
-        vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c2, va2, 2);
-        vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c2, va3, 2);
-        vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c2, va0, 2);
-        vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c2, va1, 2);
-        vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c2, va2, 2);
-        vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c2, va3, 2);
-      #else
-        const float32x4_t va0c2 = vdupq_lane_f32(vget_high_f32(va0), 0);
-        const float32x4_t va1c2 = vdupq_lane_f32(vget_high_f32(va1), 0);
-        const float32x4_t va2c2 = vdupq_lane_f32(vget_high_f32(va2), 0);
-        const float32x4_t va3c2 = vdupq_lane_f32(vget_high_f32(va3), 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c2, vb0123c2);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c2, vb0123c2);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c2, vb0123c2);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c2, vb0123c2);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c2, vb4567c2);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c2, vb4567c2);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c2, vb4567c2);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c2, vb4567c2);
-      #endif
-
-      const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c3, va0, 3);
-        vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c3, va1, 3);
-        vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c3, va2, 3);
-        vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c3, va3, 3);
-        vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c3, va0, 3);
-        vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c3, va1, 3);
-        vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c3, va2, 3);
-        vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c3, va3, 3);
-      #else
-        const float32x4_t va0c3 = vdupq_lane_f32(vget_high_f32(va0), 1);
-        const float32x4_t va1c3 = vdupq_lane_f32(vget_high_f32(va1), 1);
-        const float32x4_t va2c3 = vdupq_lane_f32(vget_high_f32(va2), 1);
-        const float32x4_t va3c3 = vdupq_lane_f32(vget_high_f32(va3), 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c3, vb0123c3);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c3, vb0123c3);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c3, vb0123c3);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c3, vb0123c3);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c3, vb4567c3);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c3, vb4567c3);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c3, vb4567c3);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c3, vb4567c3);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      do {
-        const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-        const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
-        const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
-        const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;
-
-        const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-        vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-        vacc1x0123 = vfmaq_f32(vacc1x0123, va1,   vb0123);
-        vacc2x0123 = vfmaq_f32(vacc2x0123, va2,   vb0123);
-        vacc3x0123 = vfmaq_f32(vacc3x0123, va3,   vb0123);
-        vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-        vacc1x4567 = vfmaq_f32(vacc1x4567, va1,   vb4567);
-        vacc2x4567 = vfmaq_f32(vacc2x4567, va2,   vb4567);
-        vacc3x4567 = vfmaq_f32(vacc3x4567, va3,   vb4567);
-
-        k -= sizeof(float);
-      } while (k != 0);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a3 = (const float*) ((uintptr_t) a3 - kc);
-      a2 = (const float*) ((uintptr_t) a2 - kc);
-      a1 = (const float*) ((uintptr_t) a1 - kc);
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
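
Note: the deleted ld128 kernel above used `vfmaq_laneq_f32`, the AArch64-only variant that indexes all four lanes of a 128-bit register; the renamed `neonfma-lane` ld128 kernels instead split the register with `vget_low_f32`/`vget_high_f32` and use the two-lane `*_lane_f32` form. A sketch for lane 2 (helper name is illustrative):

#include <arm_neon.h>

// acc += b * a[2], selecting the lane two different ways.
static inline float32x4_t madd_lane2(float32x4_t acc, float32x4_t b, float32x4_t a) {
#if defined(__aarch64__)
  return vfmaq_laneq_f32(acc, b, a, 2);                // quad-lane form
#else
  return vmlaq_lane_f32(acc, b, vget_high_f32(a), 0);  // high half, lane 0 == a[2]
#endif
}
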
diff --git a/src/f32-gemminc/4x8-neonfma-ld64.c b/src/f32-gemminc/4x8-neonfma-ld64.c
deleted file mode 100644
index f0eefcf..0000000
--- a/src/f32-gemminc/4x8-neonfma-ld64.c
+++ /dev/null
@@ -1,227 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemminc_ukernel_4x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const float*restrict acc,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 4);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-  assert(acc != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc1x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc1x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc2x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc2x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc3x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc3x4567 = vld1q_f32(acc); acc += 4;
-
-    size_t k = kc;
-    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-      const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-      const float32x2_t va1 = vld1_f32(a1); a1 += 2;
-      const float32x2_t va2 = vld1_f32(a2); a2 += 2;
-      const float32x2_t va3 = vld1_f32(a3); a3 += 2;
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-        const float32x4_t va1c0 = vdupq_lane_f32(va1, 0);
-        const float32x4_t va2c0 = vdupq_lane_f32(va2, 0);
-        const float32x4_t va3c0 = vdupq_lane_f32(va3, 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c0, vb0123c0);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c0, vb0123c0);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c0, vb4567c0);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c0, vb4567c0);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c0, vb4567c0);
-      #endif
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-        const float32x4_t va1c1 = vdupq_lane_f32(va1, 1);
-        const float32x4_t va2c1 = vdupq_lane_f32(va2, 1);
-        const float32x4_t va3c1 = vdupq_lane_f32(va3, 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c1, vb0123c1);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c1, vb0123c1);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c1, vb4567c1);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c1, vb4567c1);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c1, vb4567c1);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-      const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
-      const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
-      const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;
-
-      const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-      vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vfmaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vfmaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vfmaq_f32(vacc3x0123, va3,   vb0123);
-      vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vfmaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vfmaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vfmaq_f32(vacc3x4567, va3,   vb4567);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a3 = (const float*) ((uintptr_t) a3 - kc);
-      a2 = (const float*) ((uintptr_t) a2 - kc);
-      a1 = (const float*) ((uintptr_t) a1 - kc);
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
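
Note: all of these kernels share the same epilogue for partial output tiles: when fewer than 8 columns remain, the 8-wide row result is narrowed in halves via the `nc & 4` / `nc & 2` / `nc & 1` cascade. Distilled to one row (helper name is illustrative):

#include <stddef.h>
#include <arm_neon.h>

static void store_tail(float* c, float32x4_t lo, float32x4_t hi, size_t nc) {
  if (nc & 4) {
    vst1q_f32(c, lo); c += 4;
    lo = hi;                       // shift the remaining columns down
  }
  float32x2_t pair = vget_low_f32(lo);
  if (nc & 2) {
    vst1_f32(c, pair); c += 2;
    pair = vget_high_f32(lo);
  }
  if (nc & 1) {
    vst1_lane_f32(c, pair, 0);
  }
}
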
diff --git a/src/f32-gemminc/5x8-neon-ld64.c b/src/f32-gemminc/5x8-neon-lane-ld64.c
similarity index 76%
rename from src/f32-gemminc/5x8-neon-ld64.c
rename to src/f32-gemminc/5x8-neon-lane-ld64.c
index f2fe644..cb07b18 100644
--- a/src/f32-gemminc/5x8-neon-ld64.c
+++ b/src/f32-gemminc/5x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_5x8__neon_ld64(
+void xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -87,29 +88,29 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+      vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+      vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -121,16 +122,16 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vmlaq_f32(vacc4x0123, va4,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vmlaq_f32(vacc4x4567, va4,   vb4567);
+      vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+      vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123);
+      vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
+      vacc4x4567 = vmlaq_f32(vacc4x4567, va4, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
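
Note: the 5x8 kernels guard against `mr < 5` by aliasing out-of-range row pointers to the previous row, so short tiles read valid memory and merely write duplicate rows through the equally aliased `c` pointers. A simplified loop form of that setup (the kernels unroll it with `XNN_UNPREDICTABLE` hints):

#include <stddef.h>
#include <stdint.h>

static void setup_rows(const float* a, size_t a_stride, size_t mr,
                       const float* a_ptr[5]) {
  a_ptr[0] = a;
  for (size_t i = 1; i < 5; i++) {
    a_ptr[i] = (const float*) ((uintptr_t) a_ptr[i - 1] + a_stride);
    if (i >= mr) {
      a_ptr[i] = a_ptr[i - 1];  // clamp: alias to the previous row
    }
  }
}
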
diff --git a/src/f32-gemminc/5x8-neon-ld64.c b/src/f32-gemminc/5x8-neonfma-lane-ld64.c
similarity index 76%
copy from src/f32-gemminc/5x8-neon-ld64.c
copy to src/f32-gemminc/5x8-neonfma-lane-ld64.c
index f2fe644..1efd94c 100644
--- a/src/f32-gemminc/5x8-neon-ld64.c
+++ b/src/f32-gemminc/5x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_5x8__neon_ld64(
+void xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -87,29 +88,29 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+      vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+      vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -121,16 +122,16 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vmlaq_f32(vacc4x0123, va4,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vmlaq_f32(vacc4x4567, va4,   vb4567);
+      vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+      vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123);
+      vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
+      vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
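
Note: every kernel in this patch ends with the same clamping epilogue: broadcast `params->scalar.max` and `params->scalar.min` once per tile, then apply `vminq_f32` before `vmaxq_f32`. As a free function (name is illustrative):

#include <arm_neon.h>

static float32x4_t clamp_output(float32x4_t acc, const float* pmin, const float* pmax) {
  const float32x4_t vmax = vld1q_dup_f32(pmax);
  const float32x4_t vmin = vld1q_dup_f32(pmin);
  acc = vminq_f32(acc, vmax);   // upper bound first, matching the kernels
  return vmaxq_f32(acc, vmin);  // then the lower bound
}
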
diff --git a/src/f32-gemminc/5x8-neonfma-ld64.c b/src/f32-gemminc/5x8-neonfma-ld64.c
deleted file mode 100644
index 6bbada0..0000000
--- a/src/f32-gemminc/5x8-neonfma-ld64.c
+++ /dev/null
@@ -1,263 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemminc_ukernel_5x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const float*restrict acc,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 5);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-  assert(acc != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
-  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 4) {
-    a4 = a3;
-    c4 = c3;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc1x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc1x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc2x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc2x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc3x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc3x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc4x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc4x4567 = vld1q_f32(acc); acc += 4;
-
-    size_t k = kc;
-    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-      const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-      const float32x2_t va1 = vld1_f32(a1); a1 += 2;
-      const float32x2_t va2 = vld1_f32(a2); a2 += 2;
-      const float32x2_t va3 = vld1_f32(a3); a3 += 2;
-      const float32x2_t va4 = vld1_f32(a4); a4 += 2;
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-        vacc4x0123 = vfmaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-        vacc4x4567 = vfmaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-        const float32x4_t va1c0 = vdupq_lane_f32(va1, 0);
-        const float32x4_t va2c0 = vdupq_lane_f32(va2, 0);
-        const float32x4_t va3c0 = vdupq_lane_f32(va3, 0);
-        const float32x4_t va4c0 = vdupq_lane_f32(va4, 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c0, vb0123c0);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c0, vb0123c0);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c0, vb0123c0);
-        vacc4x0123 = vfmaq_f32(vacc4x0123,   va4c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c0, vb4567c0);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c0, vb4567c0);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c0, vb4567c0);
-        vacc4x4567 = vfmaq_f32(vacc4x4567,   va4c0, vb4567c0);
-      #endif
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-        vacc4x0123 = vfmaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-        vacc4x4567 = vfmaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-        const float32x4_t va1c1 = vdupq_lane_f32(va1, 1);
-        const float32x4_t va2c1 = vdupq_lane_f32(va2, 1);
-        const float32x4_t va3c1 = vdupq_lane_f32(va3, 1);
-        const float32x4_t va4c1 = vdupq_lane_f32(va4, 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c1, vb0123c1);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c1, vb0123c1);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c1, vb0123c1);
-        vacc4x0123 = vfmaq_f32(vacc4x0123,   va4c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c1, vb4567c1);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c1, vb4567c1);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c1, vb4567c1);
-        vacc4x4567 = vfmaq_f32(vacc4x4567,   va4c1, vb4567c1);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-      const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
-      const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
-      const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;
-      const float32x4_t va4 = vld1q_dup_f32(a4); a4 += 1;
-
-      const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-      vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vfmaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vfmaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vfmaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vfmaq_f32(vacc4x0123, va4,   vb0123);
-      vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vfmaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vfmaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vfmaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vfmaq_f32(vacc4x4567, va4,   vb4567);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc4x0123 = vminq_f32(vacc4x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-    vacc4x4567 = vminq_f32(vacc4x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc4x0123 = vmaxq_f32(vacc4x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-    vacc4x4567 = vmaxq_f32(vacc4x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c4, vacc4x0123);
-      vst1q_f32(c4 + 4, vacc4x4567);
-      c4 = (float*) ((uintptr_t) c4 + cn_stride);
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a4 = (const float*) ((uintptr_t) a4 - kc);
-      a3 = (const float*) ((uintptr_t) a3 - kc);
-      a2 = (const float*) ((uintptr_t) a2 - kc);
-      a1 = (const float*) ((uintptr_t) a1 - kc);
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c4, vacc4x0123); c4 += 4;
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc4x0123 = vacc4x4567;
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc4x01 = vget_low_f32(vacc4x0123);
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c4, vacc4x01); c4 += 2;
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc4x01 = vget_high_f32(vacc4x0123);
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c4, vacc4x01, 0);
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
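[Editor's note: the file deleted above shows what the rename replaces. The old neonfma kernels chose between two formulations at compile time: the lane-indexed FMA on AArch64, and a broadcast-then-FMA fallback elsewhere, because A32 has no lane form of VFMA. A condensed sketch of that dispatch, with an illustrative helper name; the new neonfma-lane files keep only the first branch:

    #include <arm_neon.h>

    static inline float32x4_t fma_col0(float32x4_t acc, float32x4_t b, float32x2_t a) {
    #if defined(__aarch64__)
      // A64 FMLA (by element): indexes the lane directly, no broadcast needed.
      return vfmaq_lane_f32(acc, b, a, 0);
    #else
      // No lane form of VFMA on A32, so broadcast the lane first.
      const float32x4_t a_dup = vdupq_lane_f32(a, 0);
      return vfmaq_f32(acc, a_dup, b);
    #endif
    }
]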
diff --git a/src/f32-gemminc/6x8-neon-ld64.c b/src/f32-gemminc/6x8-neon-lane-ld64.c
similarity index 76%
copy from src/f32-gemminc/6x8-neon-ld64.c
copy to src/f32-gemminc/6x8-neon-lane-ld64.c
index f657dc4..0dad0fd 100644
--- a/src/f32-gemminc/6x8-neon-ld64.c
+++ b/src/f32-gemminc/6x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_6x8__neon_ld64(
+void xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -96,33 +97,33 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-      vacc5x0123 = vmlaq_lane_f32(vacc5x0123,   vb0123c0, va5, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
-      vacc5x4567 = vmlaq_lane_f32(vacc5x4567,   vb4567c0, va5, 0);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
+      vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, va5, 0);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+      vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
+      vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-      vacc5x0123 = vmlaq_lane_f32(vacc5x0123,   vb0123c1, va5, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
-      vacc5x4567 = vmlaq_lane_f32(vacc5x4567,   vb4567c1, va5, 1);
+      vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
+      vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, va5, 1);
+      vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+      vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
+      vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -135,18 +136,18 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vmlaq_f32(vacc4x0123, va4,   vb0123);
-      vacc5x0123 = vmlaq_f32(vacc5x0123, va5,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vmlaq_f32(vacc4x4567, va4,   vb4567);
-      vacc5x4567 = vmlaq_f32(vacc5x4567, va5,   vb4567);
+      vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+      vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123);
+      vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123);
+      vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
+      vacc4x4567 = vmlaq_f32(vacc4x4567, va4, vb4567);
+      vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
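[Editor's note, for orientation: the gemminc kernels touched here differ from their gemm counterparts only in how the accumulators start. As in the 5x8 file removed above, they resume from a caller-provided partial-accumulation buffer rather than from fresh bias values loaded out of w. A minimal sketch of that initialization, condensed to one row:

    #include <arm_neon.h>

    // Resume one row's eight output columns from the acc buffer.
    static inline void load_partial_acc(const float **acc,
                                        float32x4_t *v0123, float32x4_t *v4567) {
      *v0123 = vld1q_f32(*acc); *acc += 4;  // columns 0-3
      *v4567 = vld1q_f32(*acc); *acc += 4;  // columns 4-7
    }
]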
diff --git a/src/f32-gemminc/6x8-neon-ld64.c b/src/f32-gemminc/6x8-neonfma-lane-ld64.c
similarity index 76%
rename from src/f32-gemminc/6x8-neon-ld64.c
rename to src/f32-gemminc/6x8-neonfma-lane-ld64.c
index f657dc4..7c2fe58 100644
--- a/src/f32-gemminc/6x8-neon-ld64.c
+++ b/src/f32-gemminc/6x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/gemm.h>
 
 
-void xnn_f32_gemminc_ukernel_6x8__neon_ld64(
+void xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -96,33 +97,33 @@
       const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-      vacc5x0123 = vmlaq_lane_f32(vacc5x0123,   vb0123c0, va5, 0);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
-      vacc5x4567 = vmlaq_lane_f32(vacc5x4567,   vb4567c0, va5, 0);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+      vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
+      vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, va5, 0);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+      vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
+      vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
       const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-      vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-      vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-      vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-      vacc4x0123 = vmlaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-      vacc5x0123 = vmlaq_lane_f32(vacc5x0123,   vb0123c1, va5, 1);
-      vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-      vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-      vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-      vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-      vacc4x4567 = vmlaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
-      vacc5x4567 = vmlaq_lane_f32(vacc5x4567,   vb4567c1, va5, 1);
+      vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+      vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+      vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+      vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+      vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
+      vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, va5, 1);
+      vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+      vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+      vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+      vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+      vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
+      vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
     }
     if XNN_UNLIKELY(k != 0) {
       const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
@@ -135,18 +136,18 @@
       const float32x4_t vb0123 = vld1q_f32(w); w += 4;
       const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-      vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vmlaq_f32(vacc4x0123, va4,   vb0123);
-      vacc5x0123 = vmlaq_f32(vacc5x0123, va5,   vb0123);
-      vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vmlaq_f32(vacc4x4567, va4,   vb4567);
-      vacc5x4567 = vmlaq_f32(vacc5x4567, va5,   vb4567);
+      vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+      vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+      vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+      vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+      vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123);
+      vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123);
+      vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+      vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+      vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+      vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
+      vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567);
+      vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567);
     }
     const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
     vacc0x0123 = vminq_f32(vacc0x0123, vmax);
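[Editor's note: the ld64 suffix describes the main-loop schedule visible in the hunks above. Each iteration loads 64 bits (two floats) of every A row, and each of the two k positions consumes two 128-bit B loads. A sketch of that step for a single accumulator row, with the pointer handling simplified for illustration:

    #include <arm_neon.h>

    static void ld64_step(const float **a0, const float **w,
                          float32x4_t *acc0123, float32x4_t *acc4567) {
      const float32x2_t va = vld1_f32(*a0); *a0 += 2;       // A: k+0 and k+1
      const float32x4_t vb0123c0 = vld1q_f32(*w); *w += 4;  // B columns 0-3, k+0
      const float32x4_t vb4567c0 = vld1q_f32(*w); *w += 4;  // B columns 4-7, k+0
      *acc0123 = vfmaq_lane_f32(*acc0123, vb0123c0, va, 0);
      *acc4567 = vfmaq_lane_f32(*acc4567, vb4567c0, va, 0);
      const float32x4_t vb0123c1 = vld1q_f32(*w); *w += 4;  // B columns 0-3, k+1
      const float32x4_t vb4567c1 = vld1q_f32(*w); *w += 4;  // B columns 4-7, k+1
      *acc0123 = vfmaq_lane_f32(*acc0123, vb0123c1, va, 1);
      *acc4567 = vfmaq_lane_f32(*acc4567, vb4567c1, va, 1);
    }
]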
diff --git a/src/f32-gemminc/6x8-neonfma-ld64.c b/src/f32-gemminc/6x8-neonfma-ld64.c
deleted file mode 100644
index 22519ef..0000000
--- a/src/f32-gemminc/6x8-neonfma-ld64.c
+++ /dev/null
@@ -1,299 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-gemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/gemm.h>
-
-
-void xnn_f32_gemminc_ukernel_6x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    const float* restrict a,
-    size_t a_stride,
-    const float* restrict w,
-    float* restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    const float*restrict acc,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 6);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-  assert(acc != NULL);
-
-  const float* a0 = a;
-  float* c0 = c;
-  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    a1 = a0;
-    c1 = c0;
-  }
-  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    a2 = a1;
-    c2 = c1;
-  }
-  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 4) {
-    a3 = a2;
-    c3 = c2;
-  }
-  const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
-  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 4) {
-    a4 = a3;
-    c4 = c3;
-  }
-  const float* a5 = (const float*) ((uintptr_t) a4 + a_stride);
-  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 6) {
-    a5 = a4;
-    c5 = c4;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc1x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc1x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc2x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc2x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc3x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc3x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc4x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc4x4567 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc5x0123 = vld1q_f32(acc); acc += 4;
-    float32x4_t vacc5x4567 = vld1q_f32(acc); acc += 4;
-
-    size_t k = kc;
-    for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-      const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-      const float32x2_t va1 = vld1_f32(a1); a1 += 2;
-      const float32x2_t va2 = vld1_f32(a2); a2 += 2;
-      const float32x2_t va3 = vld1_f32(a3); a3 += 2;
-      const float32x2_t va4 = vld1_f32(a4); a4 += 2;
-      const float32x2_t va5 = vld1_f32(a5); a5 += 2;
-
-      const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c0, va0, 0);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c0, va1, 0);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c0, va2, 0);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c0, va3, 0);
-        vacc4x0123 = vfmaq_lane_f32(vacc4x0123,   vb0123c0, va4, 0);
-        vacc5x0123 = vfmaq_lane_f32(vacc5x0123,   vb0123c0, va5, 0);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c0, va0, 0);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c0, va1, 0);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c0, va2, 0);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c0, va3, 0);
-        vacc4x4567 = vfmaq_lane_f32(vacc4x4567,   vb4567c0, va4, 0);
-        vacc5x4567 = vfmaq_lane_f32(vacc5x4567,   vb4567c0, va5, 0);
-      #else
-        const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-        const float32x4_t va1c0 = vdupq_lane_f32(va1, 0);
-        const float32x4_t va2c0 = vdupq_lane_f32(va2, 0);
-        const float32x4_t va3c0 = vdupq_lane_f32(va3, 0);
-        const float32x4_t va4c0 = vdupq_lane_f32(va4, 0);
-        const float32x4_t va5c0 = vdupq_lane_f32(va5, 0);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c0, vb0123c0);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c0, vb0123c0);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c0, vb0123c0);
-        vacc4x0123 = vfmaq_f32(vacc4x0123,   va4c0, vb0123c0);
-        vacc5x0123 = vfmaq_f32(vacc5x0123,   va5c0, vb0123c0);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c0, vb4567c0);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c0, vb4567c0);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c0, vb4567c0);
-        vacc4x4567 = vfmaq_f32(vacc4x4567,   va4c0, vb4567c0);
-        vacc5x4567 = vfmaq_f32(vacc5x4567,   va5c0, vb4567c0);
-      #endif
-      const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-      #if defined(__aarch64__)
-        vacc0x0123 = vfmaq_lane_f32(vacc0x0123,   vb0123c1, va0, 1);
-        vacc1x0123 = vfmaq_lane_f32(vacc1x0123,   vb0123c1, va1, 1);
-        vacc2x0123 = vfmaq_lane_f32(vacc2x0123,   vb0123c1, va2, 1);
-        vacc3x0123 = vfmaq_lane_f32(vacc3x0123,   vb0123c1, va3, 1);
-        vacc4x0123 = vfmaq_lane_f32(vacc4x0123,   vb0123c1, va4, 1);
-        vacc5x0123 = vfmaq_lane_f32(vacc5x0123,   vb0123c1, va5, 1);
-        vacc0x4567 = vfmaq_lane_f32(vacc0x4567,   vb4567c1, va0, 1);
-        vacc1x4567 = vfmaq_lane_f32(vacc1x4567,   vb4567c1, va1, 1);
-        vacc2x4567 = vfmaq_lane_f32(vacc2x4567,   vb4567c1, va2, 1);
-        vacc3x4567 = vfmaq_lane_f32(vacc3x4567,   vb4567c1, va3, 1);
-        vacc4x4567 = vfmaq_lane_f32(vacc4x4567,   vb4567c1, va4, 1);
-        vacc5x4567 = vfmaq_lane_f32(vacc5x4567,   vb4567c1, va5, 1);
-      #else
-        const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-        const float32x4_t va1c1 = vdupq_lane_f32(va1, 1);
-        const float32x4_t va2c1 = vdupq_lane_f32(va2, 1);
-        const float32x4_t va3c1 = vdupq_lane_f32(va3, 1);
-        const float32x4_t va4c1 = vdupq_lane_f32(va4, 1);
-        const float32x4_t va5c1 = vdupq_lane_f32(va5, 1);
-        vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-        vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c1, vb0123c1);
-        vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c1, vb0123c1);
-        vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c1, vb0123c1);
-        vacc4x0123 = vfmaq_f32(vacc4x0123,   va4c1, vb0123c1);
-        vacc5x0123 = vfmaq_f32(vacc5x0123,   va5c1, vb0123c1);
-        vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-        vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c1, vb4567c1);
-        vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c1, vb4567c1);
-        vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c1, vb4567c1);
-        vacc4x4567 = vfmaq_f32(vacc4x4567,   va4c1, vb4567c1);
-        vacc5x4567 = vfmaq_f32(vacc5x4567,   va5c1, vb4567c1);
-      #endif
-    }
-    if XNN_UNLIKELY(k != 0) {
-      const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-      const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
-      const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
-      const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;
-      const float32x4_t va4 = vld1q_dup_f32(a4); a4 += 1;
-      const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1;
-
-      const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-      const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-      vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-      vacc1x0123 = vfmaq_f32(vacc1x0123, va1,   vb0123);
-      vacc2x0123 = vfmaq_f32(vacc2x0123, va2,   vb0123);
-      vacc3x0123 = vfmaq_f32(vacc3x0123, va3,   vb0123);
-      vacc4x0123 = vfmaq_f32(vacc4x0123, va4,   vb0123);
-      vacc5x0123 = vfmaq_f32(vacc5x0123, va5,   vb0123);
-      vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-      vacc1x4567 = vfmaq_f32(vacc1x4567, va1,   vb4567);
-      vacc2x4567 = vfmaq_f32(vacc2x4567, va2,   vb4567);
-      vacc3x4567 = vfmaq_f32(vacc3x4567, va3,   vb4567);
-      vacc4x4567 = vfmaq_f32(vacc4x4567, va4,   vb4567);
-      vacc5x4567 = vfmaq_f32(vacc5x4567, va5,   vb4567);
-    }
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc4x0123 = vminq_f32(vacc4x0123, vmax);
-    vacc5x0123 = vminq_f32(vacc5x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-    vacc4x4567 = vminq_f32(vacc4x4567, vmax);
-    vacc5x4567 = vminq_f32(vacc5x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc4x0123 = vmaxq_f32(vacc4x0123, vmin);
-    vacc5x0123 = vmaxq_f32(vacc5x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-    vacc4x4567 = vmaxq_f32(vacc4x4567, vmin);
-    vacc5x4567 = vmaxq_f32(vacc5x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c5, vacc5x0123);
-      vst1q_f32(c5 + 4, vacc5x4567);
-      c5 = (float*) ((uintptr_t) c5 + cn_stride);
-      vst1q_f32(c4, vacc4x0123);
-      vst1q_f32(c4 + 4, vacc4x4567);
-      c4 = (float*) ((uintptr_t) c4 + cn_stride);
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a5 = (const float*) ((uintptr_t) a5 - kc);
-      a4 = (const float*) ((uintptr_t) a4 - kc);
-      a3 = (const float*) ((uintptr_t) a3 - kc);
-      a2 = (const float*) ((uintptr_t) a2 - kc);
-      a1 = (const float*) ((uintptr_t) a1 - kc);
-      a0 = (const float*) ((uintptr_t) a0 - kc);
-
-      nc -= 8;
-
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c5, vacc5x0123); c5 += 4;
-        vst1q_f32(c4, vacc4x0123); c4 += 4;
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc5x0123 = vacc5x4567;
-        vacc4x0123 = vacc4x4567;
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc5x01 = vget_low_f32(vacc5x0123);
-      float32x2_t vacc4x01 = vget_low_f32(vacc4x0123);
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c5, vacc5x01); c5 += 2;
-        vst1_f32(c4, vacc4x01); c4 += 2;
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc5x01 = vget_high_f32(vacc5x0123);
-        vacc4x01 = vget_high_f32(vacc4x0123);
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c5, vacc5x01, 0);
-        vst1_lane_f32(c4, vacc4x01, 0);
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
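[Editor's note: every kernel in this change shares the column-remainder epilogue seen at the end of the file deleted above. nc is peeled bit by bit (4, then 2, then 1), and the surviving accumulator halves shift down after each partial store. One row of it, condensed into a sketch with an illustrative helper name:

    #include <arm_neon.h>
    #include <stddef.h>

    static void store_row_tail(float *c, size_t nc, float32x4_t lo, float32x4_t hi) {
      if (nc & 4) { vst1q_f32(c, lo); c += 4; lo = hi; }       // columns 0-3
      float32x2_t pair = vget_low_f32(lo);
      if (nc & 2) { vst1_f32(c, pair); c += 2; pair = vget_high_f32(lo); }
      if (nc & 1) { vst1_lane_f32(c, pair, 0); }               // last odd column
    }
]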
diff --git a/src/f32-igemm/1x8-neon-ld64.c b/src/f32-igemm/1x8-neon-lane-ld64.c
similarity index 98%
rename from src/f32-igemm/1x8-neon-ld64.c
rename to src/f32-igemm/1x8-neon-lane-ld64.c
index 72768ed..157b73b 100644
--- a/src/f32-igemm/1x8-neon-ld64.c
+++ b/src/f32-igemm/1x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_1x8__neon_ld64(
+void xnn_f32_igemm_ukernel_1x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -69,6 +70,7 @@
 
         vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
         vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+
       }
       if XNN_UNLIKELY(k != 0) {
         const float32x4_t va0 = vld1q_dup_f32(a0);
diff --git a/src/f32-igemm/1x8-neon-ld64.c b/src/f32-igemm/1x8-neonfma-lane-ld64.c
similarity index 86%
copy from src/f32-igemm/1x8-neon-ld64.c
copy to src/f32-igemm/1x8-neonfma-lane-ld64.c
index 72768ed..78095b9 100644
--- a/src/f32-igemm/1x8-neon-ld64.c
+++ b/src/f32-igemm/1x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_1x8__neon_ld64(
+void xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -62,13 +63,14 @@
         const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
         const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+
       }
       if XNN_UNLIKELY(k != 0) {
         const float32x4_t va0 = vld1q_dup_f32(a0);
@@ -76,8 +78,8 @@
         const float32x4_t vb0123 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
-        vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+        vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
       }
       p -= 1 * sizeof(void*);
     } while (p != 0);
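[Editor's note: what separates these igemm kernels from the gemm ones is the A indirection. Here a is an array of row pointers, refreshed every ks step, and rows that point at the shared zero buffer skip the a_offset adjustment so that padding rows contribute zeros. The pointer-selection logic from the kernels above, condensed into an illustrative helper:

    #include <stddef.h>
    #include <stdint.h>

    static const float *next_a_row(const float *const *a, const float *zero,
                                   size_t a_offset) {
      const float *a0 = a[0];
      if (a0 != zero) {
        // Real row: rebase it by the batch offset.
        a0 = (const float *) ((uintptr_t) a0 + a_offset);
      }
      return a0;  // zero-buffer rows are read as-is and add nothing
    }
]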
diff --git a/src/f32-igemm/1x8-neonfma-ld64.c b/src/f32-igemm/1x8-neonfma-ld64.c
deleted file mode 100644
index b10885c..0000000
--- a/src/f32-igemm/1x8-neonfma-ld64.c
+++ /dev/null
@@ -1,131 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-igemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/igemm.h>
-
-
-void xnn_f32_igemm_ukernel_1x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    size_t ks,
-    const float**restrict a,
-    const float*restrict w,
-    float*restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    size_t a_offset,
-    const float* zero,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 1);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(ks != 0);
-  assert(ks % (1 * sizeof(void*)) == 0);
-  assert(a_offset % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  float* c0 = c;
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
-
-    size_t p = ks;
-    do {
-      const float* restrict a0 = a[0];
-      assert(a0 != NULL);
-      if XNN_UNPREDICTABLE(a0 != zero) {
-        a0 = (const float*) ((uintptr_t) a0 + a_offset);
-      }
-      a += 1;
-
-      size_t k = kc;
-      for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-        const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-
-        const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
-          vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
-        #else
-          const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-          vacc0x0123 = vfmaq_f32(vacc0x0123, va0c0, vb0123c0);
-          vacc0x4567 = vfmaq_f32(vacc0x4567, va0c0, vb4567c0);
-        #endif
-        const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
-          vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
-        #else
-          const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-          vacc0x0123 = vfmaq_f32(vacc0x0123, va0c1, vb0123c1);
-          vacc0x4567 = vfmaq_f32(vacc0x4567, va0c1, vb4567c1);
-        #endif
-      }
-      if XNN_UNLIKELY(k != 0) {
-        const float32x4_t va0 = vld1q_dup_f32(a0);
-
-        const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
-        vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
-      }
-      p -= 1 * sizeof(void*);
-    } while (p != 0);
-
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a = (const float**restrict) ((uintptr_t) a - ks);
-      nc -= 8;
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
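[Editor's note: the output clamping is identical across all of these kernels. The scalar min and max from params are broadcast once per tile and applied min-then-max, which also clamps the activation range for fused relu/relu6-style outputs. Condensed to one accumulator as a sketch:

    #include <arm_neon.h>

    static float32x4_t clamp_acc(float32x4_t acc,
                                 const float *min_ptr, const float *max_ptr) {
      const float32x4_t vmax = vld1q_dup_f32(max_ptr);  // broadcast scalar max
      const float32x4_t vmin = vld1q_dup_f32(min_ptr);  // broadcast scalar min
      acc = vminq_f32(acc, vmax);   // clamp from above
      return vmaxq_f32(acc, vmin);  // clamp from below
    }
]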
diff --git a/src/f32-igemm/4x2-neon-ld64.c b/src/f32-igemm/4x2-neon-lane-ld64.c
similarity index 98%
rename from src/f32-igemm/4x2-neon-ld64.c
rename to src/f32-igemm/4x2-neon-lane-ld64.c
index f6ca89c..cbd75b8 100644
--- a/src/f32-igemm/4x2-neon-ld64.c
+++ b/src/f32-igemm/4x2-neon-lane-ld64.c
@@ -14,7 +14,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_4x2__neon_ld64(
+void xnn_f32_igemm_ukernel_4x2__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/f32-igemm/4x2-neonfma-ld64.c b/src/f32-igemm/4x2-neonfma-lane-ld64.c
similarity index 98%
rename from src/f32-igemm/4x2-neonfma-ld64.c
rename to src/f32-igemm/4x2-neonfma-lane-ld64.c
index c675e2e..080f4b5 100644
--- a/src/f32-igemm/4x2-neonfma-ld64.c
+++ b/src/f32-igemm/4x2-neonfma-lane-ld64.c
@@ -14,7 +14,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_4x2__neonfma_ld64(
+void xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/f32-igemm/4x4-neon-ld64.c b/src/f32-igemm/4x4-neon-lane-ld64.c
similarity index 98%
rename from src/f32-igemm/4x4-neon-ld64.c
rename to src/f32-igemm/4x4-neon-lane-ld64.c
index 71b37ca..b379470 100644
--- a/src/f32-igemm/4x4-neon-ld64.c
+++ b/src/f32-igemm/4x4-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_4x4__neon_ld64(
+void xnn_f32_igemm_ukernel_4x4__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -103,6 +104,7 @@
         vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
         vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
         vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+
       }
       if XNN_UNLIKELY(k != 0) {
         const float32x4_t va0 = vld1q_dup_f32(a0);
diff --git a/src/f32-igemm/4x4-neon-ld64.c b/src/f32-igemm/4x4-neonfma-lane-ld64.c
similarity index 85%
copy from src/f32-igemm/4x4-neon-ld64.c
copy to src/f32-igemm/4x4-neonfma-lane-ld64.c
index 71b37ca..10992a5 100644
--- a/src/f32-igemm/4x4-neon-ld64.c
+++ b/src/f32-igemm/4x4-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_4x4__neon_ld64(
+void xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -93,16 +94,17 @@
 
         const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
         const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+
       }
       if XNN_UNLIKELY(k != 0) {
         const float32x4_t va0 = vld1q_dup_f32(a0);
@@ -112,10 +114,10 @@
 
         const float32x4_t vb0123 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
-        vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
-        vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
-        vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+        vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+        vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+        vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
       }
       p -= 4 * sizeof(void*);
     } while (p != 0);
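[Editor's note: the k-remainder path in the hunk above uses a broadcast load rather than a lane index. vld1q_dup_f32 splats the final A element of each row across all four lanes, so the plain (non-lane) FMA form handles the odd k. One row of it, as a sketch:

    #include <arm_neon.h>

    static float32x4_t remainder_step(float32x4_t acc, const float *a0,
                                      const float **w) {
      const float32x4_t va = vld1q_dup_f32(a0);       // broadcast a0[0] to all lanes
      const float32x4_t vb = vld1q_f32(*w); *w += 4;  // next packed B row
      return vfmaq_f32(acc, va, vb);                  // acc += a0[0] * vb
    }
]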
diff --git a/src/f32-igemm/4x4-neonfma-ld64.c b/src/f32-igemm/4x4-neonfma-ld64.c
deleted file mode 100644
index e1649f3..0000000
--- a/src/f32-igemm/4x4-neonfma-ld64.c
+++ /dev/null
@@ -1,195 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-igemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/igemm.h>
-
-
-void xnn_f32_igemm_ukernel_4x4__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    size_t ks,
-    const float**restrict a,
-    const float*restrict w,
-    float*restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    size_t a_offset,
-    const float* zero,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 4);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(ks != 0);
-  assert(ks % (4 * sizeof(void*)) == 0);
-  assert(a_offset % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  float* c0 = c;
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    c1 = c0;
-  }
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    c2 = c1;
-  }
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 4) {
-    c3 = c2;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc1x0123 = vacc0x0123;
-    float32x4_t vacc2x0123 = vacc0x0123;
-    float32x4_t vacc3x0123 = vacc0x0123;
-
-    size_t p = ks;
-    do {
-      const float* restrict a0 = a[0];
-      assert(a0 != NULL);
-      if XNN_UNPREDICTABLE(a0 != zero) {
-        a0 = (const float*) ((uintptr_t) a0 + a_offset);
-      }
-      const float* restrict a1 = a[1];
-      assert(a1 != NULL);
-      if XNN_UNPREDICTABLE(a1 != zero) {
-        a1 = (const float*) ((uintptr_t) a1 + a_offset);
-      }
-      const float* restrict a2 = a[2];
-      assert(a2 != NULL);
-      if XNN_UNPREDICTABLE(a2 != zero) {
-        a2 = (const float*) ((uintptr_t) a2 + a_offset);
-      }
-      const float* restrict a3 = a[3];
-      assert(a3 != NULL);
-      if XNN_UNPREDICTABLE(a3 != zero) {
-        a3 = (const float*) ((uintptr_t) a3 + a_offset);
-      }
-      a += 4;
-
-      size_t k = kc;
-      for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-        const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-        const float32x2_t va1 = vld1_f32(a1); a1 += 2;
-        const float32x2_t va2 = vld1_f32(a2); a2 += 2;
-        const float32x2_t va3 = vld1_f32(a3); a3 += 2;
-
-        const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
-          vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
-          vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
-          vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
-        #else
-          const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-          const float32x4_t va1c0 = vdupq_lane_f32(va1, 0);
-          const float32x4_t va2c0 = vdupq_lane_f32(va2, 0);
-          const float32x4_t va3c0 = vdupq_lane_f32(va3, 0);
-          vacc0x0123 = vfmaq_f32(vacc0x0123, va0c0, vb0123c0);
-          vacc1x0123 = vfmaq_f32(vacc1x0123, va1c0, vb0123c0);
-          vacc2x0123 = vfmaq_f32(vacc2x0123, va2c0, vb0123c0);
-          vacc3x0123 = vfmaq_f32(vacc3x0123, va3c0, vb0123c0);
-        #endif
-        const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
-          vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
-          vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
-          vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
-        #else
-          const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-          const float32x4_t va1c1 = vdupq_lane_f32(va1, 1);
-          const float32x4_t va2c1 = vdupq_lane_f32(va2, 1);
-          const float32x4_t va3c1 = vdupq_lane_f32(va3, 1);
-          vacc0x0123 = vfmaq_f32(vacc0x0123, va0c1, vb0123c1);
-          vacc1x0123 = vfmaq_f32(vacc1x0123, va1c1, vb0123c1);
-          vacc2x0123 = vfmaq_f32(vacc2x0123, va2c1, vb0123c1);
-          vacc3x0123 = vfmaq_f32(vacc3x0123, va3c1, vb0123c1);
-        #endif
-      }
-      if XNN_UNLIKELY(k != 0) {
-        const float32x4_t va0 = vld1q_dup_f32(a0);
-        const float32x4_t va1 = vld1q_dup_f32(a1);
-        const float32x4_t va2 = vld1q_dup_f32(a2);
-        const float32x4_t va3 = vld1q_dup_f32(a3);
-
-        const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-
-        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
-        vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
-        vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
-        vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
-      }
-      p -= 4 * sizeof(void*);
-    } while (p != 0);
-
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-
-    if XNN_LIKELY(nc >= 4) {
-      vst1q_f32(c3, vacc3x0123);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a = (const float**restrict) ((uintptr_t) a - ks);
-      nc -= 4;
-    } else {
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
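[Editor's side note on the mr checks at the top of every kernel in this change, deleted and surviving alike: output rows beyond mr are aliased to the row above them, so the unconditional full-tile stores later in the kernel rewrite an already-valid address instead of running out of bounds. The setup pattern, condensed into a sketch with an illustrative helper name:

    #include <stddef.h>
    #include <stdint.h>

    static float *next_c_row(float *prev, size_t cm_stride, size_t mr, size_t row) {
      float *cur = (float *) ((uintptr_t) prev + cm_stride);
      if (mr <= row) {
        cur = prev;  // row not produced: alias upward, stores become harmless
      }
      return cur;
    }
]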
diff --git a/src/f32-igemm/4x8-neon-ld128.c b/src/f32-igemm/4x8-neon-lane-ld128.c
similarity index 66%
rename from src/f32-igemm/4x8-neon-ld128.c
rename to src/f32-igemm/4x8-neon-lane-ld128.c
index cc3e3e3..31aa833 100644
--- a/src/f32-igemm/4x8-neon-ld128.c
+++ b/src/f32-igemm/4x8-neon-lane-ld128.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_4x8__neon_ld128(
+void xnn_f32_igemm_ukernel_4x8__neon_lane_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -99,50 +100,50 @@
         const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, vget_low_f32(va0), 0);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, vget_low_f32(va1), 0);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, vget_low_f32(va2), 0);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, vget_low_f32(va3), 0);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, vget_low_f32(va0), 0);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, vget_low_f32(va1), 0);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, vget_low_f32(va2), 0);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, vget_low_f32(va3), 0);
+        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0);
+        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0);
+        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0);
+        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0);
+        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0);
+        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0);
+        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0);
+        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0);
 
         const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, vget_low_f32(va0), 1);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, vget_low_f32(va1), 1);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, vget_low_f32(va2), 1);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, vget_low_f32(va3), 1);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, vget_low_f32(va0), 1);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, vget_low_f32(va1), 1);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, vget_low_f32(va2), 1);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, vget_low_f32(va3), 1);
+        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1);
+        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1);
+        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, vget_low_f32(va2), 1);
+        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1);
+        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, vget_low_f32(va0), 1);
+        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, vget_low_f32(va1), 1);
+        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, vget_low_f32(va2), 1);
+        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1);
 
         const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c2, vget_high_f32(va0), 0);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c2, vget_high_f32(va1), 0);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c2, vget_high_f32(va2), 0);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c2, vget_high_f32(va3), 0);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c2, vget_high_f32(va0), 0);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c2, vget_high_f32(va1), 0);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c2, vget_high_f32(va2), 0);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c2, vget_high_f32(va3), 0);
+        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
+        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
+        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
+        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
+        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c2, vget_high_f32(va0), 0);
+        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c2, vget_high_f32(va1), 0);
+        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c2, vget_high_f32(va2), 0);
+        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0);
 
         const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c3, vget_high_f32(va0), 1);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c3, vget_high_f32(va1), 1);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c3, vget_high_f32(va2), 1);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c3, vget_high_f32(va3), 1);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c3, vget_high_f32(va0), 1);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c3, vget_high_f32(va1), 1);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c3, vget_high_f32(va2), 1);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c3, vget_high_f32(va3), 1);
+        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c3, vget_high_f32(va0), 1);
+        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c3, vget_high_f32(va1), 1);
+        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c3, vget_high_f32(va2), 1);
+        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1);
+        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c3, vget_high_f32(va0), 1);
+        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c3, vget_high_f32(va1), 1);
+        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c3, vget_high_f32(va2), 1);
+        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1);
       }
       if XNN_UNLIKELY(k != 0) {
         do {
@@ -154,14 +155,14 @@
           const float32x4_t vb0123 = vld1q_f32(w); w += 4;
           const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-          vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-          vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-          vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-          vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-          vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-          vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-          vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-          vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+          vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
+          vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
+          vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
+          vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
+          vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
+          vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
+          vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
+          vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
 
           k -= sizeof(float);
         } while (k != 0);
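
Aside: the hunk above is whitespace-only (the generator no longer pads the second argument), so this kernel stays on vmlaq. For orientation, the ld128 scheme loads four activations per row at once and issues one lane multiply-accumulate per k; since vmlaq_lane_f32 takes a 64-bit float32x2_t lane source, the 128-bit activation vector is split with vget_low_f32/vget_high_f32 first. A minimal sketch for one row and one 4-wide column block (hypothetical helper, not part of XNNPACK):

    #include <arm_neon.h>

    // One ld128 step for one row: load 4 activations, then one lane
    // multiply-accumulate per k. Weights are assumed packed 4 per k step.
    static inline float32x4_t ld128_row_step(
        float32x4_t vacc, const float* a, const float* w) {
      const float32x4_t va = vld1q_f32(a);  // activations k .. k+3
      vacc = vmlaq_lane_f32(vacc, vld1q_f32(w +  0), vget_low_f32(va),  0);
      vacc = vmlaq_lane_f32(vacc, vld1q_f32(w +  4), vget_low_f32(va),  1);
      vacc = vmlaq_lane_f32(vacc, vld1q_f32(w +  8), vget_high_f32(va), 0);
      vacc = vmlaq_lane_f32(vacc, vld1q_f32(w + 12), vget_high_f32(va), 1);
      return vacc;
    }
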
diff --git a/src/f32-igemm/4x8-neon-ld64.c b/src/f32-igemm/4x8-neon-lane-ld64.c
similarity index 99%
rename from src/f32-igemm/4x8-neon-ld64.c
rename to src/f32-igemm/4x8-neon-lane-ld64.c
index 0997247..2fbc77a 100644
--- a/src/f32-igemm/4x8-neon-ld64.c
+++ b/src/f32-igemm/4x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_4x8__neon_ld64(
+void xnn_f32_igemm_ukernel_4x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -117,6 +118,7 @@
         vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
         vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
         vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+
       }
       if XNN_UNLIKELY(k != 0) {
         const float32x4_t va0 = vld1q_dup_f32(a0);
diff --git a/src/f32-igemm/4x8-neon-ld128.c b/src/f32-igemm/4x8-neonfma-lane-ld128.c
similarity index 66%
copy from src/f32-igemm/4x8-neon-ld128.c
copy to src/f32-igemm/4x8-neonfma-lane-ld128.c
index cc3e3e3..86d2fc3 100644
--- a/src/f32-igemm/4x8-neon-ld128.c
+++ b/src/f32-igemm/4x8-neonfma-lane-ld128.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_4x8__neon_ld128(
+void xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -99,50 +100,50 @@
         const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c0, vget_low_f32(va0), 0);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c0, vget_low_f32(va1), 0);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c0, vget_low_f32(va2), 0);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c0, vget_low_f32(va3), 0);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c0, vget_low_f32(va0), 0);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c0, vget_low_f32(va1), 0);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c0, vget_low_f32(va2), 0);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c0, vget_low_f32(va3), 0);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, vget_low_f32(va0), 0);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, vget_low_f32(va1), 0);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, vget_low_f32(va2), 0);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, vget_low_f32(va3), 0);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, vget_low_f32(va0), 0);
+        vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, vget_low_f32(va1), 0);
+        vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, vget_low_f32(va2), 0);
+        vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, vget_low_f32(va3), 0);
 
         const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c1, vget_low_f32(va0), 1);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c1, vget_low_f32(va1), 1);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c1, vget_low_f32(va2), 1);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c1, vget_low_f32(va3), 1);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c1, vget_low_f32(va0), 1);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c1, vget_low_f32(va1), 1);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c1, vget_low_f32(va2), 1);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c1, vget_low_f32(va3), 1);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, vget_low_f32(va0), 1);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, vget_low_f32(va1), 1);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, vget_low_f32(va2), 1);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, vget_low_f32(va3), 1);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, vget_low_f32(va0), 1);
+        vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, vget_low_f32(va1), 1);
+        vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, vget_low_f32(va2), 1);
+        vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, vget_low_f32(va3), 1);
 
         const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c2, vget_high_f32(va0), 0);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c2, vget_high_f32(va1), 0);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c2, vget_high_f32(va2), 0);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c2, vget_high_f32(va3), 0);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c2, vget_high_f32(va0), 0);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c2, vget_high_f32(va1), 0);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c2, vget_high_f32(va2), 0);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c2, vget_high_f32(va3), 0);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c2, vget_high_f32(va0), 0);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c2, vget_high_f32(va1), 0);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c2, vget_high_f32(va2), 0);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c2, vget_high_f32(va3), 0);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c2, vget_high_f32(va0), 0);
+        vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c2, vget_high_f32(va1), 0);
+        vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c2, vget_high_f32(va2), 0);
+        vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c2, vget_high_f32(va3), 0);
 
         const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123,   vb0123c3, vget_high_f32(va0), 1);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123,   vb0123c3, vget_high_f32(va1), 1);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123,   vb0123c3, vget_high_f32(va2), 1);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123,   vb0123c3, vget_high_f32(va3), 1);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567,   vb4567c3, vget_high_f32(va0), 1);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567,   vb4567c3, vget_high_f32(va1), 1);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567,   vb4567c3, vget_high_f32(va2), 1);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567,   vb4567c3, vget_high_f32(va3), 1);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c3, vget_high_f32(va0), 1);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c3, vget_high_f32(va1), 1);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c3, vget_high_f32(va2), 1);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c3, vget_high_f32(va3), 1);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c3, vget_high_f32(va0), 1);
+        vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c3, vget_high_f32(va1), 1);
+        vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c3, vget_high_f32(va2), 1);
+        vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c3, vget_high_f32(va3), 1);
       }
       if XNN_UNLIKELY(k != 0) {
         do {
@@ -154,14 +155,14 @@
           const float32x4_t vb0123 = vld1q_f32(w); w += 4;
           const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-          vacc0x0123 = vmlaq_f32(vacc0x0123, va0,   vb0123);
-          vacc1x0123 = vmlaq_f32(vacc1x0123, va1,   vb0123);
-          vacc2x0123 = vmlaq_f32(vacc2x0123, va2,   vb0123);
-          vacc3x0123 = vmlaq_f32(vacc3x0123, va3,   vb0123);
-          vacc0x4567 = vmlaq_f32(vacc0x4567, va0,   vb4567);
-          vacc1x4567 = vmlaq_f32(vacc1x4567, va1,   vb4567);
-          vacc2x4567 = vmlaq_f32(vacc2x4567, va2,   vb4567);
-          vacc3x4567 = vmlaq_f32(vacc3x4567, va3,   vb4567);
+          vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+          vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+          vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+          vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+          vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+          vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+          vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+          vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
 
           k -= sizeof(float);
         } while (k != 0);
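
The copy above is the mechanical vmlaq_* to vfmaq_* substitution that turns the NEON kernel into its NEONFMA counterpart. Both intrinsics take the accumulator first and compute vacc + va * vb elementwise, but vmlaq_f32 rounds the product and the sum separately while vfmaq_f32 fuses them with a single rounding, so the two kernels may differ in the last bit of each result. A minimal sketch of the pair:

    #include <arm_neon.h>

    // Plain NEON multiply-accumulate: product and sum rounded separately.
    float32x4_t mac_neon(float32x4_t vacc, float32x4_t va, float32x4_t vb) {
      return vmlaq_f32(vacc, va, vb);  // vacc + va * vb
    }

    // NEONFMA fused multiply-accumulate: single rounding; needs VFPv4 or ARMv8.
    float32x4_t mac_neonfma(float32x4_t vacc, float32x4_t va, float32x4_t vb) {
      return vfmaq_f32(vacc, va, vb);  // vacc + va * vb, fused
    }
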
diff --git a/src/f32-igemm/4x8-neon-ld64.c b/src/f32-igemm/4x8-neonfma-lane-ld64.c
similarity index 79%
copy from src/f32-igemm/4x8-neon-ld64.c
copy to src/f32-igemm/4x8-neonfma-lane-ld64.c
index 0997247..5503926 100644
--- a/src/f32-igemm/4x8-neon-ld64.c
+++ b/src/f32-igemm/4x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_4x8__neon_ld64(
+void xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -98,25 +99,26 @@
         const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+        vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+        vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+        vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
         const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+        vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+        vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+        vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+
       }
       if XNN_UNLIKELY(k != 0) {
         const float32x4_t va0 = vld1q_dup_f32(a0);
@@ -127,14 +129,14 @@
         const float32x4_t vb0123 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
-        vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
-        vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
-        vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
-        vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
-        vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
-        vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
-        vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
+        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+        vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+        vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+        vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+        vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+        vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+        vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+        vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
       }
       p -= 4 * sizeof(void*);
     } while (p != 0);
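
In the ld64 variant each row loads two activations as a float32x2_t, and each activation feeds one lane multiply-accumulate per 4-wide column block. Note the argument order of the lane forms: the weight vector comes second and the lane source third, so vfmaq_lane_f32(vacc, vb, va, l) computes vacc + vb * va[l]. A sketch of one such step for a single row of an 8-wide kernel (hypothetical helper; vfmaq_lane_f32 is AArch64-only in ACLE):

    #include <arm_neon.h>

    // One ld64 step for one row: 2 activations, 4 lane multiply-accumulates
    // over the 0123 and 4567 column blocks (weights assumed packed 8 per k).
    static inline void ld64_row_step(
        float32x4_t vacc[2], const float* a, const float* w) {
      const float32x2_t va = vld1_f32(a);  // activations k, k+1
      vacc[0] = vfmaq_lane_f32(vacc[0], vld1q_f32(w +  0), va, 0);
      vacc[1] = vfmaq_lane_f32(vacc[1], vld1q_f32(w +  4), va, 0);
      vacc[0] = vfmaq_lane_f32(vacc[0], vld1q_f32(w +  8), va, 1);
      vacc[1] = vfmaq_lane_f32(vacc[1], vld1q_f32(w + 12), va, 1);
    }
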
diff --git a/src/f32-igemm/4x8-neonfma-ld128.c b/src/f32-igemm/4x8-neonfma-ld128.c
deleted file mode 100644
index a090a06..0000000
--- a/src/f32-igemm/4x8-neonfma-ld128.c
+++ /dev/null
@@ -1,306 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-igemm/neon-ld128.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/igemm.h>
-
-
-void xnn_f32_igemm_ukernel_4x8__neonfma_ld128(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    size_t ks,
-    const float**restrict a,
-    const float*restrict w,
-    float*restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    size_t a_offset,
-    const float* zero,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 4);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(ks != 0);
-  assert(ks % (4 * sizeof(void*)) == 0);
-  assert(a_offset % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  float* c0 = c;
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    c1 = c0;
-  }
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    c2 = c1;
-  }
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 4) {
-    c3 = c2;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
-    float32x4_t vacc1x0123 = vacc0x0123;
-    float32x4_t vacc1x4567 = vacc0x4567;
-    float32x4_t vacc2x0123 = vacc0x0123;
-    float32x4_t vacc2x4567 = vacc0x4567;
-    float32x4_t vacc3x0123 = vacc0x0123;
-    float32x4_t vacc3x4567 = vacc0x4567;
-
-    size_t p = ks;
-    do {
-      const float* restrict a0 = a[0];
-      assert(a0 != NULL);
-      if XNN_UNPREDICTABLE(a0 != zero) {
-        a0 = (const float*) ((uintptr_t) a0 + a_offset);
-      }
-      const float* restrict a1 = a[1];
-      assert(a1 != NULL);
-      if XNN_UNPREDICTABLE(a1 != zero) {
-        a1 = (const float*) ((uintptr_t) a1 + a_offset);
-      }
-      const float* restrict a2 = a[2];
-      assert(a2 != NULL);
-      if XNN_UNPREDICTABLE(a2 != zero) {
-        a2 = (const float*) ((uintptr_t) a2 + a_offset);
-      }
-      const float* restrict a3 = a[3];
-      assert(a3 != NULL);
-      if XNN_UNPREDICTABLE(a3 != zero) {
-        a3 = (const float*) ((uintptr_t) a3 + a_offset);
-      }
-      a += 4;
-
-      size_t k = kc;
-      for (; k >= 4 * sizeof(float); k -= 4 * sizeof(float)) {
-        const float32x4_t va0 = vld1q_f32(a0); a0 += 4;
-        const float32x4_t va1 = vld1q_f32(a1); a1 += 4;
-        const float32x4_t va2 = vld1q_f32(a2); a2 += 4;
-        const float32x4_t va3 = vld1q_f32(a3); a3 += 4;
-
-
-        const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c0, va0, 0);
-          vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c0, va1, 0);
-          vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c0, va2, 0);
-          vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c0, va3, 0);
-          vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c0, va0, 0);
-          vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c0, va1, 0);
-          vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c0, va2, 0);
-          vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c0, va3, 0);
-        #else
-          const float32x4_t va0c0 = vdupq_lane_f32(vget_low_f32(va0), 0);
-          const float32x4_t va1c0 = vdupq_lane_f32(vget_low_f32(va1), 0);
-          const float32x4_t va2c0 = vdupq_lane_f32(vget_low_f32(va2), 0);
-          const float32x4_t va3c0 = vdupq_lane_f32(vget_low_f32(va3), 0);
-          vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c0, vb0123c0);
-          vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c0, vb0123c0);
-          vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c0, vb0123c0);
-          vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c0, vb0123c0);
-          vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c0, vb4567c0);
-          vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c0, vb4567c0);
-          vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c0, vb4567c0);
-          vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c0, vb4567c0);
-        #endif
-
-        const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c1, va0, 1);
-          vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c1, va1, 1);
-          vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c1, va2, 1);
-          vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c1, va3, 1);
-          vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c1, va0, 1);
-          vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c1, va1, 1);
-          vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c1, va2, 1);
-          vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c1, va3, 1);
-        #else
-          const float32x4_t va0c1 = vdupq_lane_f32(vget_low_f32(va0), 1);
-          const float32x4_t va1c1 = vdupq_lane_f32(vget_low_f32(va1), 1);
-          const float32x4_t va2c1 = vdupq_lane_f32(vget_low_f32(va2), 1);
-          const float32x4_t va3c1 = vdupq_lane_f32(vget_low_f32(va3), 1);
-          vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c1, vb0123c1);
-          vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c1, vb0123c1);
-          vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c1, vb0123c1);
-          vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c1, vb0123c1);
-          vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c1, vb4567c1);
-          vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c1, vb4567c1);
-          vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c1, vb4567c1);
-          vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c1, vb4567c1);
-        #endif
-
-        const float32x4_t vb0123c2 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c2 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c2, va0, 2);
-          vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c2, va1, 2);
-          vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c2, va2, 2);
-          vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c2, va3, 2);
-          vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c2, va0, 2);
-          vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c2, va1, 2);
-          vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c2, va2, 2);
-          vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c2, va3, 2);
-        #else
-          const float32x4_t va0c2 = vdupq_lane_f32(vget_high_f32(va0), 0);
-          const float32x4_t va1c2 = vdupq_lane_f32(vget_high_f32(va1), 0);
-          const float32x4_t va2c2 = vdupq_lane_f32(vget_high_f32(va2), 0);
-          const float32x4_t va3c2 = vdupq_lane_f32(vget_high_f32(va3), 0);
-          vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c2, vb0123c2);
-          vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c2, vb0123c2);
-          vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c2, vb0123c2);
-          vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c2, vb0123c2);
-          vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c2, vb4567c2);
-          vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c2, vb4567c2);
-          vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c2, vb4567c2);
-          vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c2, vb4567c2);
-        #endif
-
-        const float32x4_t vb0123c3 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c3 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_laneq_f32(vacc0x0123,   vb0123c3, va0, 3);
-          vacc1x0123 = vfmaq_laneq_f32(vacc1x0123,   vb0123c3, va1, 3);
-          vacc2x0123 = vfmaq_laneq_f32(vacc2x0123,   vb0123c3, va2, 3);
-          vacc3x0123 = vfmaq_laneq_f32(vacc3x0123,   vb0123c3, va3, 3);
-          vacc0x4567 = vfmaq_laneq_f32(vacc0x4567,   vb4567c3, va0, 3);
-          vacc1x4567 = vfmaq_laneq_f32(vacc1x4567,   vb4567c3, va1, 3);
-          vacc2x4567 = vfmaq_laneq_f32(vacc2x4567,   vb4567c3, va2, 3);
-          vacc3x4567 = vfmaq_laneq_f32(vacc3x4567,   vb4567c3, va3, 3);
-        #else
-          const float32x4_t va0c3 = vdupq_lane_f32(vget_high_f32(va0), 1);
-          const float32x4_t va1c3 = vdupq_lane_f32(vget_high_f32(va1), 1);
-          const float32x4_t va2c3 = vdupq_lane_f32(vget_high_f32(va2), 1);
-          const float32x4_t va3c3 = vdupq_lane_f32(vget_high_f32(va3), 1);
-          vacc0x0123 = vfmaq_f32(vacc0x0123,   va0c3, vb0123c3);
-          vacc1x0123 = vfmaq_f32(vacc1x0123,   va1c3, vb0123c3);
-          vacc2x0123 = vfmaq_f32(vacc2x0123,   va2c3, vb0123c3);
-          vacc3x0123 = vfmaq_f32(vacc3x0123,   va3c3, vb0123c3);
-          vacc0x4567 = vfmaq_f32(vacc0x4567,   va0c3, vb4567c3);
-          vacc1x4567 = vfmaq_f32(vacc1x4567,   va1c3, vb4567c3);
-          vacc2x4567 = vfmaq_f32(vacc2x4567,   va2c3, vb4567c3);
-          vacc3x4567 = vfmaq_f32(vacc3x4567,   va3c3, vb4567c3);
-        #endif
-      }
-      if XNN_UNLIKELY(k != 0) {
-        do {
-          const float32x4_t va0 = vld1q_dup_f32(a0); a0 += 1;
-          const float32x4_t va1 = vld1q_dup_f32(a1); a1 += 1;
-          const float32x4_t va2 = vld1q_dup_f32(a2); a2 += 1;
-          const float32x4_t va3 = vld1q_dup_f32(a3); a3 += 1;
-
-          const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-          const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-          vacc0x0123 = vfmaq_f32(vacc0x0123, va0,   vb0123);
-          vacc1x0123 = vfmaq_f32(vacc1x0123, va1,   vb0123);
-          vacc2x0123 = vfmaq_f32(vacc2x0123, va2,   vb0123);
-          vacc3x0123 = vfmaq_f32(vacc3x0123, va3,   vb0123);
-          vacc0x4567 = vfmaq_f32(vacc0x4567, va0,   vb4567);
-          vacc1x4567 = vfmaq_f32(vacc1x4567, va1,   vb4567);
-          vacc2x4567 = vfmaq_f32(vacc2x4567, va2,   vb4567);
-          vacc3x4567 = vfmaq_f32(vacc3x4567, va3,   vb4567);
-
-          k -= sizeof(float);
-        } while (k != 0);
-      }
-
-      p -= 4 * sizeof(void*);
-    } while (p != 0);
-
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a = (const float**restrict) ((uintptr_t) a - ks);
-      nc -= 8;
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
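
The deleted generated file above dispatched at compile time: on AArch64 it indexed a lane of the 128-bit activation vector directly with vfmaq_laneq_f32, and elsewhere it broadcast the lane with vdupq_lane_f32 and fell back to plain vfmaq_f32. Both spellings perform the same fused operation; a minimal sketch of the pair it switched between:

    #include <arm_neon.h>

    // Both branches compute vacc + vb * va[2].
    float32x4_t mac_lane2(float32x4_t vacc, float32x4_t vb, float32x4_t va) {
    #if defined(__aarch64__)
      return vfmaq_laneq_f32(vacc, vb, va, 2);  // direct 128-bit lane index
    #else
      // Broadcast element 2 (lane 0 of the high half), then fuse; this path
      // still needs FMA support (VFPv4) on 32-bit ARM.
      const float32x4_t va2 = vdupq_lane_f32(vget_high_f32(va), 0);
      return vfmaq_f32(vacc, va2, vb);
    #endif
    }
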
diff --git a/src/f32-igemm/4x8-neonfma-ld64.c b/src/f32-igemm/4x8-neonfma-ld64.c
deleted file mode 100644
index 5aff360..0000000
--- a/src/f32-igemm/4x8-neonfma-ld64.c
+++ /dev/null
@@ -1,245 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-igemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/igemm.h>
-
-
-void xnn_f32_igemm_ukernel_4x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    size_t ks,
-    const float**restrict a,
-    const float*restrict w,
-    float*restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    size_t a_offset,
-    const float* zero,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 4);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(ks != 0);
-  assert(ks % (4 * sizeof(void*)) == 0);
-  assert(a_offset % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  float* c0 = c;
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    c1 = c0;
-  }
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    c2 = c1;
-  }
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 4) {
-    c3 = c2;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
-    float32x4_t vacc1x0123 = vacc0x0123;
-    float32x4_t vacc1x4567 = vacc0x4567;
-    float32x4_t vacc2x0123 = vacc0x0123;
-    float32x4_t vacc2x4567 = vacc0x4567;
-    float32x4_t vacc3x0123 = vacc0x0123;
-    float32x4_t vacc3x4567 = vacc0x4567;
-
-    size_t p = ks;
-    do {
-      const float* restrict a0 = a[0];
-      assert(a0 != NULL);
-      if XNN_UNPREDICTABLE(a0 != zero) {
-        a0 = (const float*) ((uintptr_t) a0 + a_offset);
-      }
-      const float* restrict a1 = a[1];
-      assert(a1 != NULL);
-      if XNN_UNPREDICTABLE(a1 != zero) {
-        a1 = (const float*) ((uintptr_t) a1 + a_offset);
-      }
-      const float* restrict a2 = a[2];
-      assert(a2 != NULL);
-      if XNN_UNPREDICTABLE(a2 != zero) {
-        a2 = (const float*) ((uintptr_t) a2 + a_offset);
-      }
-      const float* restrict a3 = a[3];
-      assert(a3 != NULL);
-      if XNN_UNPREDICTABLE(a3 != zero) {
-        a3 = (const float*) ((uintptr_t) a3 + a_offset);
-      }
-      a += 4;
-
-      size_t k = kc;
-      for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-        const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-        const float32x2_t va1 = vld1_f32(a1); a1 += 2;
-        const float32x2_t va2 = vld1_f32(a2); a2 += 2;
-        const float32x2_t va3 = vld1_f32(a3); a3 += 2;
-
-        const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
-          vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
-          vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
-          vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
-          vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
-          vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
-          vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
-          vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
-        #else
-          const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-          const float32x4_t va1c0 = vdupq_lane_f32(va1, 0);
-          const float32x4_t va2c0 = vdupq_lane_f32(va2, 0);
-          const float32x4_t va3c0 = vdupq_lane_f32(va3, 0);
-          vacc0x0123 = vfmaq_f32(vacc0x0123, va0c0, vb0123c0);
-          vacc1x0123 = vfmaq_f32(vacc1x0123, va1c0, vb0123c0);
-          vacc2x0123 = vfmaq_f32(vacc2x0123, va2c0, vb0123c0);
-          vacc3x0123 = vfmaq_f32(vacc3x0123, va3c0, vb0123c0);
-          vacc0x4567 = vfmaq_f32(vacc0x4567, va0c0, vb4567c0);
-          vacc1x4567 = vfmaq_f32(vacc1x4567, va1c0, vb4567c0);
-          vacc2x4567 = vfmaq_f32(vacc2x4567, va2c0, vb4567c0);
-          vacc3x4567 = vfmaq_f32(vacc3x4567, va3c0, vb4567c0);
-        #endif
-        const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
-          vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
-          vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
-          vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
-          vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
-          vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
-          vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
-          vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
-        #else
-          const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-          const float32x4_t va1c1 = vdupq_lane_f32(va1, 1);
-          const float32x4_t va2c1 = vdupq_lane_f32(va2, 1);
-          const float32x4_t va3c1 = vdupq_lane_f32(va3, 1);
-          vacc0x0123 = vfmaq_f32(vacc0x0123, va0c1, vb0123c1);
-          vacc1x0123 = vfmaq_f32(vacc1x0123, va1c1, vb0123c1);
-          vacc2x0123 = vfmaq_f32(vacc2x0123, va2c1, vb0123c1);
-          vacc3x0123 = vfmaq_f32(vacc3x0123, va3c1, vb0123c1);
-          vacc0x4567 = vfmaq_f32(vacc0x4567, va0c1, vb4567c1);
-          vacc1x4567 = vfmaq_f32(vacc1x4567, va1c1, vb4567c1);
-          vacc2x4567 = vfmaq_f32(vacc2x4567, va2c1, vb4567c1);
-          vacc3x4567 = vfmaq_f32(vacc3x4567, va3c1, vb4567c1);
-        #endif
-      }
-      if XNN_UNLIKELY(k != 0) {
-        const float32x4_t va0 = vld1q_dup_f32(a0);
-        const float32x4_t va1 = vld1q_dup_f32(a1);
-        const float32x4_t va2 = vld1q_dup_f32(a2);
-        const float32x4_t va3 = vld1q_dup_f32(a3);
-
-        const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
-        vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
-        vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
-        vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
-        vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
-        vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
-        vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
-        vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
-      }
-      p -= 4 * sizeof(void*);
-    } while (p != 0);
-
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a = (const float**restrict) ((uintptr_t) a - ks);
-      nc -= 8;
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
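
Same story for the ld64 file: its #if defined(__aarch64__) branches existed because the lane FMA forms (vfmaq_lane_f32, vfmaq_laneq_f32) are AArch64-only in ACLE, while the vdupq_lane_f32 + vfmaq_f32 fallback also builds on 32-bit targets with FMA. The new neonfma-lane copies above call vfmaq_lane_f32 unconditionally, so they target AArch64 outright. A quick equivalence check of the two forms (sketch; the lane form only compiles for AArch64):

    #include <arm_neon.h>
    #include <assert.h>

    int main(void) {
    #if defined(__aarch64__)
      const float adata[2] = {0.5f, -2.0f};
      const float bdata[4] = {1.0f, 2.0f, 3.0f, 4.0f};
      const float32x2_t va = vld1_f32(adata);
      const float32x4_t vb = vld1q_f32(bdata);
      const float32x4_t vacc = vdupq_n_f32(1.0f);
      // Lane form vs. broadcast form: both are vacc + vb * va[1], fused.
      const float32x4_t r1 = vfmaq_lane_f32(vacc, vb, va, 1);
      const float32x4_t r2 = vfmaq_f32(vacc, vdupq_lane_f32(va, 1), vb);
      assert(vgetq_lane_f32(r1, 3) == vgetq_lane_f32(r2, 3));
    #endif
      return 0;
    }
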
diff --git a/src/f32-igemm/6x8-neon-ld64.c b/src/f32-igemm/6x8-neon-lane-ld64.c
similarity index 99%
rename from src/f32-igemm/6x8-neon-ld64.c
rename to src/f32-igemm/6x8-neon-lane-ld64.c
index a4ca689..ec13852 100644
--- a/src/f32-igemm/6x8-neon-ld64.c
+++ b/src/f32-igemm/6x8-neon-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_6x8__neon_ld64(
+void xnn_f32_igemm_ukernel_6x8__neon_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -149,6 +150,7 @@
         vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
         vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
         vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
+
       }
       if XNN_UNLIKELY(k != 0) {
         const float32x4_t va0 = vld1q_dup_f32(a0);
diff --git a/src/f32-igemm/6x8-neon-ld64.c b/src/f32-igemm/6x8-neonfma-lane-ld64.c
similarity index 77%
copy from src/f32-igemm/6x8-neon-ld64.c
copy to src/f32-igemm/6x8-neonfma-lane-ld64.c
index a4ca689..9fd0562 100644
--- a/src/f32-igemm/6x8-neon-ld64.c
+++ b/src/f32-igemm/6x8-neonfma-lane-ld64.c
@@ -7,6 +7,7 @@
 // This source code is licensed under the BSD-style license found in the
 // LICENSE file in the root directory of this source tree.
 
+
 #include <assert.h>
 
 #include <arm_neon.h>
@@ -14,7 +15,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_6x8__neon_ld64(
+void xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -122,33 +123,34 @@
         const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
-        vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
-        vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, va5, 0);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
-        vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
-        vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
+        vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
+        vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, va5, 0);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
+        vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
+        vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
+        vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
+        vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
+        vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
         const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
-        vacc1x0123 = vmlaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
-        vacc2x0123 = vmlaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
-        vacc3x0123 = vmlaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
-        vacc4x0123 = vmlaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
-        vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, va5, 1);
-        vacc0x4567 = vmlaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
-        vacc1x4567 = vmlaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
-        vacc2x4567 = vmlaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
-        vacc3x4567 = vmlaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
-        vacc4x4567 = vmlaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
-        vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
+        vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
+        vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
+        vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
+        vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
+        vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
+        vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, va5, 1);
+        vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
+        vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
+        vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
+        vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
+        vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
+        vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
+
       }
       if XNN_UNLIKELY(k != 0) {
         const float32x4_t va0 = vld1q_dup_f32(a0);
@@ -161,18 +163,18 @@
         const float32x4_t vb0123 = vld1q_f32(w); w += 4;
         const float32x4_t vb4567 = vld1q_f32(w); w += 4;
 
-        vacc0x0123 = vmlaq_f32(vacc0x0123, va0, vb0123);
-        vacc1x0123 = vmlaq_f32(vacc1x0123, va1, vb0123);
-        vacc2x0123 = vmlaq_f32(vacc2x0123, va2, vb0123);
-        vacc3x0123 = vmlaq_f32(vacc3x0123, va3, vb0123);
-        vacc4x0123 = vmlaq_f32(vacc4x0123, va4, vb0123);
-        vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123);
-        vacc0x4567 = vmlaq_f32(vacc0x4567, va0, vb4567);
-        vacc1x4567 = vmlaq_f32(vacc1x4567, va1, vb4567);
-        vacc2x4567 = vmlaq_f32(vacc2x4567, va2, vb4567);
-        vacc3x4567 = vmlaq_f32(vacc3x4567, va3, vb4567);
-        vacc4x4567 = vmlaq_f32(vacc4x4567, va4, vb4567);
-        vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567);
+        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
+        vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
+        vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
+        vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
+        vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123);
+        vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123);
+        vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
+        vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
+        vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
+        vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
+        vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567);
+        vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567);
       }
       p -= 6 * sizeof(void*);
     } while (p != 0);
diff --git a/src/f32-igemm/6x8-neonfma-ld64.c b/src/f32-igemm/6x8-neonfma-ld64.c
deleted file mode 100644
index 2f64b47..0000000
--- a/src/f32-igemm/6x8-neonfma-ld64.c
+++ /dev/null
@@ -1,321 +0,0 @@
-// Auto-generated file. Do not edit!
-//   Template: src/f32-igemm/neon-ld64.c.in
-//   Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/igemm.h>
-
-
-void xnn_f32_igemm_ukernel_6x8__neonfma_ld64(
-    size_t mr,
-    size_t nc,
-    size_t kc,
-    size_t ks,
-    const float**restrict a,
-    const float*restrict w,
-    float*restrict c,
-    size_t cm_stride,
-    size_t cn_stride,
-    size_t a_offset,
-    const float* zero,
-    const union xnn_f32_output_params params[restrict static 1])
-{
-  assert(mr != 0);
-  assert(mr <= 6);
-  assert(nc != 0);
-  assert(kc != 0);
-  assert(kc % sizeof(float) == 0);
-  assert(ks != 0);
-  assert(ks % (6 * sizeof(void*)) == 0);
-  assert(a_offset % sizeof(float) == 0);
-  assert(a != NULL);
-  assert(w != NULL);
-  assert(c != NULL);
-
-  float* c0 = c;
-  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 2) {
-    c1 = c0;
-  }
-  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 2) {
-    c2 = c1;
-  }
-  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
-  if XNN_UNPREDICTABLE(mr < 4) {
-    c3 = c2;
-  }
-  float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
-  if XNN_UNPREDICTABLE(mr <= 4) {
-    c4 = c3;
-  }
-  float* c5 = (float*) ((uintptr_t) c4 + cm_stride);
-  if XNN_UNPREDICTABLE(mr != 6) {
-    c5 = c4;
-  }
-
-  do {
-    float32x4_t vacc0x0123 = vld1q_f32(w); w += 4;
-    float32x4_t vacc0x4567 = vld1q_f32(w); w += 4;
-    float32x4_t vacc1x0123 = vacc0x0123;
-    float32x4_t vacc1x4567 = vacc0x4567;
-    float32x4_t vacc2x0123 = vacc0x0123;
-    float32x4_t vacc2x4567 = vacc0x4567;
-    float32x4_t vacc3x0123 = vacc0x0123;
-    float32x4_t vacc3x4567 = vacc0x4567;
-    float32x4_t vacc4x0123 = vacc0x0123;
-    float32x4_t vacc4x4567 = vacc0x4567;
-    float32x4_t vacc5x0123 = vacc0x0123;
-    float32x4_t vacc5x4567 = vacc0x4567;
-
-    size_t p = ks;
-    do {
-      const float* restrict a0 = a[0];
-      assert(a0 != NULL);
-      if XNN_UNPREDICTABLE(a0 != zero) {
-        a0 = (const float*) ((uintptr_t) a0 + a_offset);
-      }
-      const float* restrict a1 = a[1];
-      assert(a1 != NULL);
-      if XNN_UNPREDICTABLE(a1 != zero) {
-        a1 = (const float*) ((uintptr_t) a1 + a_offset);
-      }
-      const float* restrict a2 = a[2];
-      assert(a2 != NULL);
-      if XNN_UNPREDICTABLE(a2 != zero) {
-        a2 = (const float*) ((uintptr_t) a2 + a_offset);
-      }
-      const float* restrict a3 = a[3];
-      assert(a3 != NULL);
-      if XNN_UNPREDICTABLE(a3 != zero) {
-        a3 = (const float*) ((uintptr_t) a3 + a_offset);
-      }
-      const float* restrict a4 = a[4];
-      assert(a4 != NULL);
-      if XNN_UNPREDICTABLE(a4 != zero) {
-        a4 = (const float*) ((uintptr_t) a4 + a_offset);
-      }
-      const float* restrict a5 = a[5];
-      assert(a5 != NULL);
-      if XNN_UNPREDICTABLE(a5 != zero) {
-        a5 = (const float*) ((uintptr_t) a5 + a_offset);
-      }
-      a += 6;
-
-      size_t k = kc;
-      for (; k >= 2 * sizeof(float); k -= 2 * sizeof(float)) {
-        const float32x2_t va0 = vld1_f32(a0); a0 += 2;
-        const float32x2_t va1 = vld1_f32(a1); a1 += 2;
-        const float32x2_t va2 = vld1_f32(a2); a2 += 2;
-        const float32x2_t va3 = vld1_f32(a3); a3 += 2;
-        const float32x2_t va4 = vld1_f32(a4); a4 += 2;
-        const float32x2_t va5 = vld1_f32(a5); a5 += 2;
-
-        const float32x4_t vb0123c0 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c0 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c0, va0, 0);
-          vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c0, va1, 0);
-          vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c0, va2, 0);
-          vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c0, va3, 0);
-          vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c0, va4, 0);
-          vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, va5, 0);
-          vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c0, va0, 0);
-          vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c0, va1, 0);
-          vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c0, va2, 0);
-          vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c0, va3, 0);
-          vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c0, va4, 0);
-          vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, va5, 0);
-        #else
-          const float32x4_t va0c0 = vdupq_lane_f32(va0, 0);
-          const float32x4_t va1c0 = vdupq_lane_f32(va1, 0);
-          const float32x4_t va2c0 = vdupq_lane_f32(va2, 0);
-          const float32x4_t va3c0 = vdupq_lane_f32(va3, 0);
-          const float32x4_t va4c0 = vdupq_lane_f32(va4, 0);
-          const float32x4_t va5c0 = vdupq_lane_f32(va5, 0);
-          vacc0x0123 = vfmaq_f32(vacc0x0123, va0c0, vb0123c0);
-          vacc1x0123 = vfmaq_f32(vacc1x0123, va1c0, vb0123c0);
-          vacc2x0123 = vfmaq_f32(vacc2x0123, va2c0, vb0123c0);
-          vacc3x0123 = vfmaq_f32(vacc3x0123, va3c0, vb0123c0);
-          vacc4x0123 = vfmaq_f32(vacc4x0123, va4c0, vb0123c0);
-          vacc5x0123 = vfmaq_f32(vacc5x0123, va5c0, vb0123c0);
-          vacc0x4567 = vfmaq_f32(vacc0x4567, va0c0, vb4567c0);
-          vacc1x4567 = vfmaq_f32(vacc1x4567, va1c0, vb4567c0);
-          vacc2x4567 = vfmaq_f32(vacc2x4567, va2c0, vb4567c0);
-          vacc3x4567 = vfmaq_f32(vacc3x4567, va3c0, vb4567c0);
-          vacc4x4567 = vfmaq_f32(vacc4x4567, va4c0, vb4567c0);
-          vacc5x4567 = vfmaq_f32(vacc5x4567, va5c0, vb4567c0);
-        #endif
-        const float32x4_t vb0123c1 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567c1 = vld1q_f32(w); w += 4;
-
-        #if defined(__aarch64__)
-          vacc0x0123 = vfmaq_lane_f32(vacc0x0123, vb0123c1, va0, 1);
-          vacc1x0123 = vfmaq_lane_f32(vacc1x0123, vb0123c1, va1, 1);
-          vacc2x0123 = vfmaq_lane_f32(vacc2x0123, vb0123c1, va2, 1);
-          vacc3x0123 = vfmaq_lane_f32(vacc3x0123, vb0123c1, va3, 1);
-          vacc4x0123 = vfmaq_lane_f32(vacc4x0123, vb0123c1, va4, 1);
-          vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, va5, 1);
-          vacc0x4567 = vfmaq_lane_f32(vacc0x4567, vb4567c1, va0, 1);
-          vacc1x4567 = vfmaq_lane_f32(vacc1x4567, vb4567c1, va1, 1);
-          vacc2x4567 = vfmaq_lane_f32(vacc2x4567, vb4567c1, va2, 1);
-          vacc3x4567 = vfmaq_lane_f32(vacc3x4567, vb4567c1, va3, 1);
-          vacc4x4567 = vfmaq_lane_f32(vacc4x4567, vb4567c1, va4, 1);
-          vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, va5, 1);
-        #else
-          const float32x4_t va0c1 = vdupq_lane_f32(va0, 1);
-          const float32x4_t va1c1 = vdupq_lane_f32(va1, 1);
-          const float32x4_t va2c1 = vdupq_lane_f32(va2, 1);
-          const float32x4_t va3c1 = vdupq_lane_f32(va3, 1);
-          const float32x4_t va4c1 = vdupq_lane_f32(va4, 1);
-          const float32x4_t va5c1 = vdupq_lane_f32(va5, 1);
-          vacc0x0123 = vfmaq_f32(vacc0x0123, va0c1, vb0123c1);
-          vacc1x0123 = vfmaq_f32(vacc1x0123, va1c1, vb0123c1);
-          vacc2x0123 = vfmaq_f32(vacc2x0123, va2c1, vb0123c1);
-          vacc3x0123 = vfmaq_f32(vacc3x0123, va3c1, vb0123c1);
-          vacc4x0123 = vfmaq_f32(vacc4x0123, va4c1, vb0123c1);
-          vacc5x0123 = vfmaq_f32(vacc5x0123, va5c1, vb0123c1);
-          vacc0x4567 = vfmaq_f32(vacc0x4567, va0c1, vb4567c1);
-          vacc1x4567 = vfmaq_f32(vacc1x4567, va1c1, vb4567c1);
-          vacc2x4567 = vfmaq_f32(vacc2x4567, va2c1, vb4567c1);
-          vacc3x4567 = vfmaq_f32(vacc3x4567, va3c1, vb4567c1);
-          vacc4x4567 = vfmaq_f32(vacc4x4567, va4c1, vb4567c1);
-          vacc5x4567 = vfmaq_f32(vacc5x4567, va5c1, vb4567c1);
-        #endif
-      }
-      if XNN_UNLIKELY(k != 0) {
-        const float32x4_t va0 = vld1q_dup_f32(a0);
-        const float32x4_t va1 = vld1q_dup_f32(a1);
-        const float32x4_t va2 = vld1q_dup_f32(a2);
-        const float32x4_t va3 = vld1q_dup_f32(a3);
-        const float32x4_t va4 = vld1q_dup_f32(a4);
-        const float32x4_t va5 = vld1q_dup_f32(a5);
-
-        const float32x4_t vb0123 = vld1q_f32(w); w += 4;
-        const float32x4_t vb4567 = vld1q_f32(w); w += 4;
-
-        vacc0x0123 = vfmaq_f32(vacc0x0123, va0, vb0123);
-        vacc1x0123 = vfmaq_f32(vacc1x0123, va1, vb0123);
-        vacc2x0123 = vfmaq_f32(vacc2x0123, va2, vb0123);
-        vacc3x0123 = vfmaq_f32(vacc3x0123, va3, vb0123);
-        vacc4x0123 = vfmaq_f32(vacc4x0123, va4, vb0123);
-        vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123);
-        vacc0x4567 = vfmaq_f32(vacc0x4567, va0, vb4567);
-        vacc1x4567 = vfmaq_f32(vacc1x4567, va1, vb4567);
-        vacc2x4567 = vfmaq_f32(vacc2x4567, va2, vb4567);
-        vacc3x4567 = vfmaq_f32(vacc3x4567, va3, vb4567);
-        vacc4x4567 = vfmaq_f32(vacc4x4567, va4, vb4567);
-        vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567);
-      }
-      p -= 6 * sizeof(void*);
-    } while (p != 0);
-
-    const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
-    vacc0x0123 = vminq_f32(vacc0x0123, vmax);
-    vacc1x0123 = vminq_f32(vacc1x0123, vmax);
-    vacc2x0123 = vminq_f32(vacc2x0123, vmax);
-    vacc3x0123 = vminq_f32(vacc3x0123, vmax);
-    vacc4x0123 = vminq_f32(vacc4x0123, vmax);
-    vacc5x0123 = vminq_f32(vacc5x0123, vmax);
-    vacc0x4567 = vminq_f32(vacc0x4567, vmax);
-    vacc1x4567 = vminq_f32(vacc1x4567, vmax);
-    vacc2x4567 = vminq_f32(vacc2x4567, vmax);
-    vacc3x4567 = vminq_f32(vacc3x4567, vmax);
-    vacc4x4567 = vminq_f32(vacc4x4567, vmax);
-    vacc5x4567 = vminq_f32(vacc5x4567, vmax);
-
-    const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
-    vacc0x0123 = vmaxq_f32(vacc0x0123, vmin);
-    vacc1x0123 = vmaxq_f32(vacc1x0123, vmin);
-    vacc2x0123 = vmaxq_f32(vacc2x0123, vmin);
-    vacc3x0123 = vmaxq_f32(vacc3x0123, vmin);
-    vacc4x0123 = vmaxq_f32(vacc4x0123, vmin);
-    vacc5x0123 = vmaxq_f32(vacc5x0123, vmin);
-    vacc0x4567 = vmaxq_f32(vacc0x4567, vmin);
-    vacc1x4567 = vmaxq_f32(vacc1x4567, vmin);
-    vacc2x4567 = vmaxq_f32(vacc2x4567, vmin);
-    vacc3x4567 = vmaxq_f32(vacc3x4567, vmin);
-    vacc4x4567 = vmaxq_f32(vacc4x4567, vmin);
-    vacc5x4567 = vmaxq_f32(vacc5x4567, vmin);
-
-    if XNN_LIKELY(nc >= 8) {
-      vst1q_f32(c5, vacc5x0123);
-      vst1q_f32(c5 + 4, vacc5x4567);
-      c5 = (float*) ((uintptr_t) c5 + cn_stride);
-      vst1q_f32(c4, vacc4x0123);
-      vst1q_f32(c4 + 4, vacc4x4567);
-      c4 = (float*) ((uintptr_t) c4 + cn_stride);
-      vst1q_f32(c3, vacc3x0123);
-      vst1q_f32(c3 + 4, vacc3x4567);
-      c3 = (float*) ((uintptr_t) c3 + cn_stride);
-      vst1q_f32(c2, vacc2x0123);
-      vst1q_f32(c2 + 4, vacc2x4567);
-      c2 = (float*) ((uintptr_t) c2 + cn_stride);
-      vst1q_f32(c1, vacc1x0123);
-      vst1q_f32(c1 + 4, vacc1x4567);
-      c1 = (float*) ((uintptr_t) c1 + cn_stride);
-      vst1q_f32(c0, vacc0x0123);
-      vst1q_f32(c0 + 4, vacc0x4567);
-      c0 = (float*) ((uintptr_t) c0 + cn_stride);
-
-      a = (const float**restrict) ((uintptr_t) a - ks);
-      nc -= 8;
-    } else {
-      if (nc & 4) {
-        vst1q_f32(c5, vacc5x0123); c5 += 4;
-        vst1q_f32(c4, vacc4x0123); c4 += 4;
-        vst1q_f32(c3, vacc3x0123); c3 += 4;
-        vst1q_f32(c2, vacc2x0123); c2 += 4;
-        vst1q_f32(c1, vacc1x0123); c1 += 4;
-        vst1q_f32(c0, vacc0x0123); c0 += 4;
-
-        vacc5x0123 = vacc5x4567;
-        vacc4x0123 = vacc4x4567;
-        vacc3x0123 = vacc3x4567;
-        vacc2x0123 = vacc2x4567;
-        vacc1x0123 = vacc1x4567;
-        vacc0x0123 = vacc0x4567;
-      }
-      float32x2_t vacc5x01 = vget_low_f32(vacc5x0123);
-      float32x2_t vacc4x01 = vget_low_f32(vacc4x0123);
-      float32x2_t vacc3x01 = vget_low_f32(vacc3x0123);
-      float32x2_t vacc2x01 = vget_low_f32(vacc2x0123);
-      float32x2_t vacc1x01 = vget_low_f32(vacc1x0123);
-      float32x2_t vacc0x01 = vget_low_f32(vacc0x0123);
-      if (nc & 2) {
-        vst1_f32(c5, vacc5x01); c5 += 2;
-        vst1_f32(c4, vacc4x01); c4 += 2;
-        vst1_f32(c3, vacc3x01); c3 += 2;
-        vst1_f32(c2, vacc2x01); c2 += 2;
-        vst1_f32(c1, vacc1x01); c1 += 2;
-        vst1_f32(c0, vacc0x01); c0 += 2;
-
-        vacc5x01 = vget_high_f32(vacc5x0123);
-        vacc4x01 = vget_high_f32(vacc4x0123);
-        vacc3x01 = vget_high_f32(vacc3x0123);
-        vacc2x01 = vget_high_f32(vacc2x0123);
-        vacc1x01 = vget_high_f32(vacc1x0123);
-        vacc0x01 = vget_high_f32(vacc0x0123);
-      }
-      if (nc & 1) {
-        vst1_lane_f32(c5, vacc5x01, 0);
-        vst1_lane_f32(c4, vacc4x01, 0);
-        vst1_lane_f32(c3, vacc3x01, 0);
-        vst1_lane_f32(c2, vacc2x01, 0);
-        vst1_lane_f32(c1, vacc1x01, 0);
-        vst1_lane_f32(c0, vacc0x01, 0);
-      }
-
-      nc = 0;
-    }
-  } while (nc != 0);
-}
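
For readers skimming the tail above: the deleted kernel's remainder logic stores the last nc < 8 columns of each output row in 4-, 2-, and 1-column chunks, shifting the upper accumulator half down as columns are consumed. A minimal standalone sketch of that pattern for a single row (hypothetical store_tail helper; illustration only, not part of this patch):

#include <arm_neon.h>
#include <stddef.h>

// Write the final nc (< 8) floats of one output row from two accumulators.
static void store_tail(float* c, float32x4_t vacc0123, float32x4_t vacc4567,
                       size_t nc) {
  if (nc & 4) {
    vst1q_f32(c, vacc0123); c += 4;
    vacc0123 = vacc4567;  // remaining columns now sit in the upper half
  }
  float32x2_t vacc01 = vget_low_f32(vacc0123);
  if (nc & 2) {
    vst1_f32(c, vacc01); c += 2;
    vacc01 = vget_high_f32(vacc0123);
  }
  if (nc & 1) {
    vst1_lane_f32(c, vacc01, 0);
  }
}
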
diff --git a/src/f32-igemm/MRx2-neon-ld64.c.in b/src/f32-igemm/MRx2-neon-ld64.c.in
index 9159f95..10bdeab 100644
--- a/src/f32-igemm/MRx2-neon-ld64.c.in
+++ b/src/f32-igemm/MRx2-neon-ld64.c.in
@@ -11,7 +11,7 @@
 #include <xnnpack/igemm.h>
 
 
-void xnn_f32_igemm_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}_ld64(
+void xnn_f32_igemm_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}_lane_ld64(
     size_t mr,
     size_t nc,
     size_t kc,
diff --git a/src/f32-igemm/neon-ld128.c.in b/src/f32-igemm/neon-ld128.c.in
index b4b4b8a..472a863 100644
--- a/src/f32-igemm/neon-ld128.c.in
+++ b/src/f32-igemm/neon-ld128.c.in
@@ -5,8 +5,8 @@
 
 $assert NR % 4 == 0
 $ABC = "0123456789ABCDEFGHIJKLMN"
-$VMULADDQ_F32 = "${VMULADDQ_F32}" if FMA else "${VMULADDQ_F32}"
-$VMULADDQ_LANE_F32 = "${VMULADDQ_LANE_F32}" if FMA else "${VMULADDQ_LANE_F32}"
+$VMULADDQ_F32 = "vfmaq_f32" if FMA else "vmlaq_f32"
+$VMULADDQ_LANE_F32 = "vfmaq_lane_f32" if FMA else "vmlaq_lane_f32"
 
 #include <assert.h>
 
@@ -85,16 +85,16 @@
           $for N in range(0, NR, 4):
             const float32x4_t vb${ABC[N:N+4]}c${L} = vld1q_f32(w); w += 4;
 
-        $if DUP:
-          $for M in range(MR):
-            const float32x4_t va${M}c${L} = vdupq_lane_f32(${VGET_PART_F32}(va${M}), ${L % 2});
-          $for N in range(0, NR, 4):
+          $if DUP:
             $for M in range(MR):
-              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}c${L}, vb${ABC[N:N+4]}c${L});
-        $else:
-          $for N in range(0, NR, 4):
-            $for M in range(MR):
-              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_LANE_F32}(vacc${M}x${ABC[N:N+4]}, vb${ABC[N:N+4]}c${L}, va${M}, ${L});
+              const float32x4_t va${M}c${L} = vdupq_lane_f32(${VGET_PART_F32}(va${M}), ${L % 2});
+            $for N in range(0, NR, 4):
+              $for M in range(MR):
+                vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}c${L}, vb${ABC[N:N+4]}c${L});
+          $else:
+            $for N in range(0, NR, 4):
+              $for M in range(MR):
+                vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_LANE_F32}(vacc${M}x${ABC[N:N+4]}, vb${ABC[N:N+4]}c${L}, ${VGET_PART_F32}(va${M}), ${L % 2});
       }
       if XNN_UNLIKELY(k != 0) {
         do {
@@ -106,10 +106,7 @@
 
           $for N in range(0, NR, 4):
             $for M in range(MR):
-              $if FMA:
-                vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}, vb${ABC[N:N+4]});
-              $else:
-                vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}, vb${ABC[N:N+4]});
+              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}, vb${ABC[N:N+4]});
 
           k -= sizeof(float);
         } while (k != 0);
diff --git a/src/f32-igemm/neon-ld64.c.in b/src/f32-igemm/neon-ld64.c.in
index b816af4..ca93e3e 100644
--- a/src/f32-igemm/neon-ld64.c.in
+++ b/src/f32-igemm/neon-ld64.c.in
@@ -5,8 +5,8 @@
 
 $assert NR % 4 == 0
 $ABC = "0123456789ABCDEFGHIJKLMN"
-$VMULADDQ_F32 = "${VMULADDQ_F32}" if FMA else "${VMULADDQ_F32}"
-$VMULADDQ_LANE_F32 = "${VMULADDQ_LANE_F32}" if FMA else "${VMULADDQ_LANE_F32}"
+$VMULADDQ_F32 = "vfmaq_f32" if FMA else "vmlaq_f32"
+$VMULADDQ_LANE_F32 = "vfmaq_lane_f32" if FMA else "vmlaq_lane_f32"
 
 #include <assert.h>
 
@@ -83,16 +83,16 @@
           $for N in range(0, NR, 4):
             const float32x4_t vb${ABC[N:N+4]}c${L} = vld1q_f32(w); w += 4;
 
-        $if DUP:
-          $for M in range(MR):
-            const float32x4_t va${M}c${L} = vdupq_lane_f32(va${M}, ${L});
-          $for N in range(0, NR, 4):
+          $if DUP:
             $for M in range(MR):
-              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}c${L}, vb${ABC[N:N+4]}c${L});
-        $else:
-           $for N in range(0, NR, 4):
-             $for M in range(MR):
-               vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_LANE_F32}(vacc${M}x${ABC[N:N+4]}, vb${ABC[N:N+4]}c${L}, va${M}, ${L});
+              const float32x4_t va${M}c${L} = vdupq_lane_f32(va${M}, ${L});
+            $for N in range(0, NR, 4):
+              $for M in range(MR):
+                vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}c${L}, vb${ABC[N:N+4]}c${L});
+          $else:
+            $for N in range(0, NR, 4):
+              $for M in range(MR):
+                vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_LANE_F32}(vacc${M}x${ABC[N:N+4]}, vb${ABC[N:N+4]}c${L}, va${M}, ${L});
 
       }
       if XNN_UNLIKELY(k != 0) {
@@ -104,10 +104,7 @@
 
         $for N in range(0, NR, 4):
           $for M in range(MR):
-            $if FMA:
-              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}, vb${ABC[N:N+4]});
-            $else:
-              vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}, vb${ABC[N:N+4]});
+            vacc${M}x${ABC[N:N+4]} = ${VMULADDQ_F32}(vacc${M}x${ABC[N:N+4]}, va${M}, vb${ABC[N:N+4]});
       }
       p -= ${MR} * sizeof(void*);
     } while (p != 0);
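
The templates above pick between two expansions of the inner multiply-accumulate: the lane form passes the lane index straight to the intrinsic, while the DUP form broadcasts the lane into a full vector first and uses the plain vector intrinsic. With FMA set, the vfmaq_* variants are emitted and fuse the multiply and add into a single rounding; the vmlaq_* variants round twice. A minimal sketch of the two generated shapes (standalone illustration under the non-FMA names; not part of this patch):

#include <arm_neon.h>

// Lane expansion: vacc += vb * va[0], selecting the scalar by lane index.
static float32x4_t madd_lane(float32x4_t vacc, float32x4_t vb, float32x2_t va) {
  return vmlaq_lane_f32(vacc, vb, va, 0);  // vfmaq_lane_f32 when FMA is set
}

// DUP expansion: broadcast the lane, then use the plain vector form.
static float32x4_t madd_dup(float32x4_t vacc, float32x4_t vb, float32x2_t va) {
  const float32x4_t va_c0 = vdupq_lane_f32(va, 0);
  return vmlaq_f32(vacc, va_c0, vb);  // vfmaq_f32 when FMA is set
}
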
diff --git a/src/init.c b/src/init.c
index b5795af..bbc162f 100644
--- a/src/init.c
+++ b/src/init.c
@@ -132,16 +132,16 @@
   /**************************** F32 micro-kernels ****************************/
   #ifndef XNN_NO_F32_OPERATORS
     xnn_params.f32.gemm = (struct gemm_parameters) {
-      .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__neon_ld128,
-      .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__neon_ld128,
-      .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neon_ld64,
-      .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neon_ld64,
+      .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__neon_lane_ld128,
+      .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__neon_lane_ld128,
+      .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
+      .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
       .mr = 4,
       .nr = 8,
     };
     xnn_params.f32.gemm2 = (struct gemm_parameters) {
       .gemm = NULL,
-      .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__neon_ld64,
+      .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__neon_lane_ld64,
       .mr = 4,
       .nr = 2,
     };
@@ -372,8 +372,8 @@
           break;
         default:
           xnn_params.f32.gemm = (struct gemm_parameters) {
-            .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__neonfma_ld64,
-            .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__neonfma_ld64,
+            .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64,
+            .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64,
             .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
             .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
             .mr = 6,
@@ -383,10 +383,10 @@
       }
     #else  // XNN_ENABLE_ASSEMBLY
       xnn_params.f32.gemm = (struct gemm_parameters) {
-        .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__neonfma_ld64,
-        .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__neonfma_ld64,
-        .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neonfma_ld64,
-        .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neonfma_ld64,
+        .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64,
+        .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64,
+        .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
+        .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
         .mr = 6,
         .nr = 8,
       };
@@ -394,7 +394,7 @@
 
     xnn_params.f32.gemm2 = (struct gemm_parameters) {
       .gemm = NULL,
-      .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__neonfma_ld64,
+      .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64,
       .mr = 4,
       .nr = 2,
     };
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 4f16309..bf801e1 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -39,8 +39,8 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__avx_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__fma3_broadcast)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neon_ld64)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__psimd_splat)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__sse_dup)
@@ -51,8 +51,8 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8s4__sse)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_2x4__scalar)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__neon_ld64)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__neonfma_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__scalar)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x4__scalar)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53)
@@ -62,10 +62,10 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__avx_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__fma3_broadcast)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_ld128)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_ld64)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_ld128)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__psimd_splat)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__sse_dup)
@@ -77,8 +77,8 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__avx_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__fma3_broadcast)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__neon_ld64)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73)
@@ -87,8 +87,8 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__avx_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__fma3_broadcast)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neon_ld64)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neonfma_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__psimd_splat)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8s4__neon)
@@ -122,8 +122,8 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__avx_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neon_ld64)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__psimd_splat)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__sse_dup)
@@ -142,10 +142,10 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__avx_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_ld128)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_ld64)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_ld128)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__psimd_splat)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__sse_dup)
@@ -157,8 +157,8 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__avx_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__neon_ld64)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73)
@@ -167,8 +167,8 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__avx_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neon_ld64)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neonfma_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__psimd_splat)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8s4__neon)
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 5faf766..6ce4e92 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -41,8 +41,8 @@
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__avx_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__fma3_broadcast)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neon_ld64)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__psimd_splat)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__sse_dup)
@@ -53,22 +53,22 @@
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8s4__sse)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_2x4__scalar)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__neon_ld64)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__scalar)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2c4__psimd)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2c4__sse)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__neon_ld64)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__scalar)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__avx_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__fma3_broadcast)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_ld128)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_ld64)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_ld128)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__psimd_splat)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__sse_dup)
@@ -86,8 +86,8 @@
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__avx_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__fma3_broadcast)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neon_ld64)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neonfma_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__psimd_splat)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8s4__neon)