4x4 LD64 GEMM microkernel in AArch32+VFP assembly
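
This kernel computes a 4x4 output tile of C = bias + A*B, loading 64 bits (two
floats) of each A row per main-loop iteration and accumulating in VFP
single-precision registers. Unlike the existing 4x4-minmax variant it applies
no output clamping (it takes xnn_f32_default_params). As a rough sketch of the
contract the new tests exercise (illustration only, not the shipped code), a
scalar reference follows, assuming the usual XNNPACK packed-weight layout of
four biases followed by four B values per k step; kc and all strides are in
bytes, nc in elements:

  #include <stddef.h>
  #include <stdint.h>

  // Hypothetical scalar reference for the microkernel contract (illustration
  // only). w is packed per 4-column block as {bias[4], then 4 B values per k}.
  static void gemm_4x4_reference(
      size_t mr, size_t nc, size_t kc,
      const float* a, size_t a_stride,
      const float* w,
      float* c, size_t cm_stride, size_t cn_stride)
  {
    const size_t k = kc / sizeof(float);
    do {
      const size_t n = nc < 4 ? nc : 4;
      for (size_t i = 0; i < mr; i++) {
        const float* a_row = (const float*) ((uintptr_t) a + i * a_stride);
        float* c_row = (float*) ((uintptr_t) c + i * cm_stride);
        for (size_t j = 0; j < n; j++) {
          float acc = w[j];                      // bias
          for (size_t p = 0; p < k; p++) {
            acc += a_row[p] * w[4 + 4 * p + j];  // B, 4 values per k step
          }
          c_row[j] = acc;
        }
      }
      w += 4 + 4 * k;                            // next packed column block
      c = (float*) ((uintptr_t) c + cn_stride);
      nc -= n;
    } while (nc != 0);
  }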

PiperOrigin-RevId: 312008451
diff --git a/BUILD.bazel b/BUILD.bazel
index 275ff2a..7ae306b 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1561,6 +1561,7 @@
 ]
 
 AARCH32_ASM_UKERNELS = [
+    "src/f32-gemm/4x4-aarch32-vfp-ld64.S",
     "src/f32-gemm/4x4-minmax-aarch32-vfp-ld64.S",
     "src/f32-gemm/4x8-minmax-aarch32-neon-cortex-a53.S",
     "src/f32-gemm/4x8-minmax-aarch32-neon-cortex-a55.S",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5766b2c..282c0f4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1560,6 +1560,7 @@
   src/math/extexp-avx512f-p5.c)
 
 SET(XNNPACK_AARCH32_ASM_MICROKERNEL_SRCS
+  src/f32-gemm/4x4-aarch32-vfp-ld64.S
   src/f32-gemm/4x4-minmax-aarch32-vfp-ld64.S
   src/f32-gemm/4x8-minmax-aarch32-neon-cortex-a53.S
   src/f32-gemm/4x8-minmax-aarch32-neon-cortex-a55.S
diff --git a/src/f32-gemm/4x4-aarch32-vfp-ld64.S b/src/f32-gemm/4x4-aarch32-vfp-ld64.S
new file mode 100644
index 0000000..8f013bc
--- /dev/null
+++ b/src/f32-gemm/4x4-aarch32-vfp-ld64.S
@@ -0,0 +1,223 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+.syntax unified
+
+// void xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64(
+//     size_t mr,                            r0
+//     size_t nc,                            r1
+//     size_t kc,                            r2 -> r5
+//     const uint8_t*restrict a,             r3
+//     size_t a_stride,          sp + 96  -> (r11)
+//     const void*restrict w,    sp + 100 -> r9
+//     uint8_t*restrict c,       sp + 104 -> r6
+//     size_t cm_stride,         sp + 108 -> (r7)
+//     size_t cn_stride,         sp + 112 -> r11
+//     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])  sp + 116 -> (r11)
+
+
+// inner loop registers
+
+// A0   r3  s0-s1  d0
+// A1  r12  s2-s3  d1
+// A2  r10  s4-s5  d2
+// A3   r0  s6-s7  d3
+
+// B    r9   s8,  s9, s10, s11 d4-d5
+// B        s12, s13, s14, s15 d6-d7
+
+// C0   r6 s16-s17  d8  s18-s19  d9
+// C1   r4 s20-s21 d10  s22-s23 d11
+// C2   r8 s24-s25 d12  s26-s27 d13
+// C3   r7 s28-s29 d14  s30-s31 d15
+
+BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64
+        .arm
+#ifndef __APPLE__
+        .arch armv6
+        .fpu vfp
+#endif
+        // Push 96 bytes
+        PUSH   {r4, r5, r6, r7, r8, r9, r10, r11}  // 32
+        VPUSH  {d8-d15}                            // +64 = 96
+
+        LDR     r11, [sp, 96]         // Load a_stride
+        LDRD    r6, r7, [sp, 104]     // Load c and cm_stride
+
+        // Clamp A and C pointers
+        CMP    r0, 2                 // if mr >= 2
+        ADD    r12, r3, r11          //   a1 = a0 + a_stride
+        ADD    r4, r6, r7            //   c1 = c0 + cm_stride
+        MOVLO  r12, r3               // a1
+        MOVLO  r4, r6                // c1
+
+        LDR     r9, [sp, 100]        // Load w
+
+                                     // if mr > 2
+        ADD    r10, r12, r11         //   a2 = a1 + a_stride
+        ADD    r8, r4, r7            //   c2 = c1 + cm_stride
+        MOVLS  r10, r12              // a2
+        MOVLS  r8, r4                // c2
+
+        CMP    r0, 4                 // if mr >= 4
+        ADD    r0, r10, r11          //   a3 = a2 + a_stride
+        ADD    r7, r8, r7            //   c3 = c2 + cm_stride
+        LDR    r11, [sp, 112]        // Load cn_stride
+        MOVLO  r0, r10               // a3
+        MOVLO  r7, r8                // c3
+
+1:
+        // Load initial bias from w into accumulators
+        VLDM        r9!, {d8-d9}   // Bias
+        SUBS        r5, r2, 8
+        VMOV.F64    d10, d8
+        VMOV.F64    d12, d8
+        VMOV.F64    d14, d8
+        VMOV.F64    d11, d9
+        VMOV.F64    d13, d9
+        VMOV.F64    d15, d9
+        BLO         5f               // less than 2 channels?
+
+        // Main loop - 2 floats of A (8 bytes)
+2:
+        VLDM        r3!, {d0}        // A0
+        VLDM        r9!, {d4-d5}     // B0
+        VLDM       r12!, {d1}        // A1
+        VLDM       r10!, {d2}        // A2
+        VLDM        r0!, {d3}        // A3
+
+        VMLA.F32    s16, s8, s0
+        VMLA.F32    s17, s9, s0
+        VMLA.F32    s20, s8, s2
+        VMLA.F32    s21, s9, s2
+        VMLA.F32    s24, s8, s4
+        VMLA.F32    s25, s9, s4
+        VMLA.F32    s28, s8, s6
+        VMLA.F32    s29, s9, s6
+
+        VLDM        r9!, {d6-d7}     // B1
+
+        VMLA.F32    s18, s10, s0
+        VMLA.F32    s19, s11, s0
+        VMLA.F32    s22, s10, s2
+        VMLA.F32    s23, s11, s2
+        VMLA.F32    s26, s10, s4
+        VMLA.F32    s27, s11, s4
+        VMLA.F32    s30, s10, s6
+        VMLA.F32    s31, s11, s6
+
+        VMLA.F32    s16, s12, s1
+        VMLA.F32    s17, s13, s1
+        VMLA.F32    s20, s12, s3
+        VMLA.F32    s21, s13, s3
+        VMLA.F32    s24, s12, s5
+        VMLA.F32    s25, s13, s5
+        VMLA.F32    s28, s12, s7
+        VMLA.F32    s29, s13, s7
+
+        SUBS        r5, r5, 8
+
+        VMLA.F32    s18, s14, s1
+        VMLA.F32    s19, s15, s1
+        VMLA.F32    s22, s14, s3
+        VMLA.F32    s23, s15, s3
+        VMLA.F32    s26, s14, s5
+        VMLA.F32    s27, s15, s5
+        VMLA.F32    s30, s14, s7
+        VMLA.F32    s31, s15, s7
+
+        BHS         2b
+
+        // Is there a remainder? - 1 float of A (4 bytes)
+        TST         r5, 4
+        BNE         5f
+
+4:
+
+        SUBS        r1, r1, 4
+        BLO         10f
+
+        // Store full 4 x 4
+        VSTM        r6, {d8-d9}
+        SUB         r0, r0, r2
+        ADD         r6, r11
+        VSTM        r4, {d10-d11}
+        SUB         r10, r10, r2
+        ADD         r4, r11
+        VSTM        r8, {d12-d13}
+        SUB         r12, r12, r2
+        ADD         r8, r11
+        VSTM        r7, {d14-d15}
+        SUB         r3, r3, r2
+        ADD         r7, r11
+        BHI         1b
+
+        VPOP        {d8-d15}
+        POP         {r4, r5, r6, r7, r8, r9, r10, r11}
+        BX          lr
+
+5:
+        // Remainder - 1 float of A (4 bytes)
+        VLDM         r3!, {s0}       // A0
+        VLDM         r9!, {d6-d7}    // B
+        VLDM        r12!, {s1}       // A1
+        VLDM        r10!, {s2}       // A2
+        VLDM         r0!, {s3}       // A3
+
+        VMLA.F32    s16, s12, s0
+        VMLA.F32    s17, s13, s0
+        VMLA.F32    s18, s14, s0
+        VMLA.F32    s19, s15, s0
+
+        VMLA.F32    s20, s12, s1
+        VMLA.F32    s21, s13, s1
+        VMLA.F32    s22, s14, s1
+        VMLA.F32    s23, s15, s1
+
+        VMLA.F32    s24, s12, s2
+        VMLA.F32    s25, s13, s2
+        VMLA.F32    s26, s14, s2
+        VMLA.F32    s27, s15, s2
+
+        VMLA.F32    s28, s12, s3
+        VMLA.F32    s29, s13, s3
+        VMLA.F32    s30, s14, s3
+        VMLA.F32    s31, s15, s3
+
+        B           4b
+
+        // Store odd width
+10:
+        TST        r1, 2
+        BEQ        11f
+        VSTM       r6!, {d8}
+        VMOV.F32   s16, s18
+        VSTM       r4!, {d10}
+        VMOV.F32   s20, s22
+        VSTM       r8!, {d12}
+        VMOV.F32   s24, s26
+        VSTM       r7!, {d14}
+        VMOV.F32   s28, s30
+
+11:
+        TST         r1, 1
+        BEQ         12f
+        VSTR        s16, [r6]
+        VSTR        s20, [r4]
+        VSTR        s24, [r8]
+        VSTR        s28, [r7]
+
+12:
+        VPOP        {d8-d15}
+        POP         {r4, r5, r6, r7, r8, r9, r10, r11}
+        BX          lr
+
+END_FUNCTION xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
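
The prologue above clamps the per-row A and C pointers so that, when mr < 4,
the extra row pointers alias the previous row; the kernel then unconditionally
computes and stores four rows without reading or writing outside the caller's
buffers. A rough C equivalent of that clamping (hypothetical helper, not part
of this patch):

  #include <stddef.h>
  #include <stdint.h>

  // Sketch of the prologue's row-pointer setup: rows past mr alias the
  // previous row (MOVLO/MOVLS in the assembly), so four rows can always be
  // processed safely.
  static void clamp_row_pointers(
      size_t mr, const float* a, size_t a_stride, float* c, size_t cm_stride,
      const float* a_row[4], float* c_row[4])
  {
    a_row[0] = a;
    c_row[0] = c;
    a_row[1] = mr >= 2 ? (const float*) ((uintptr_t) a_row[0] + a_stride) : a_row[0];
    c_row[1] = mr >= 2 ? (float*) ((uintptr_t) c_row[0] + cm_stride) : c_row[0];
    a_row[2] = mr >  2 ? (const float*) ((uintptr_t) a_row[1] + a_stride) : a_row[1];
    c_row[2] = mr >  2 ? (float*) ((uintptr_t) c_row[1] + cm_stride) : c_row[1];
    a_row[3] = mr >= 4 ? (const float*) ((uintptr_t) a_row[2] + a_stride) : a_row[2];
    c_row[3] = mr >= 4 ? (float*) ((uintptr_t) c_row[2] + cm_stride) : c_row[2];
  }
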
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 6376487..c3a71b7 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -83,6 +83,8 @@
 DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma)
 DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma)
 
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64)
+
 DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64)
 DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_ld64)
 DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53)
diff --git a/test/f32-gemm-minmax.cc b/test/f32-gemm-minmax.cc
index 070d189..7b565d4 100644
--- a/test/f32-gemm-minmax.cc
+++ b/test/f32-gemm-minmax.cc
@@ -11455,7 +11455,7 @@
 #endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
 
 
-#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+#if XNN_ARCH_ARM
   TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_LD64, k_eq_2) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
@@ -11908,7 +11908,7 @@
       .cm_stride(11)
       .Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_ld64);
   }
-#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+#endif  // XNN_ARCH_ARM
 
 
 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
diff --git a/test/f32-gemm.cc b/test/f32-gemm.cc
index 7e28ca9..a0650b4 100644
--- a/test/f32-gemm.cc
+++ b/test/f32-gemm.cc
@@ -22,6 +22,409 @@
 #include "gemm-microkernel-tester.h"
 
 
+#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_eq_2) {
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(2)
+      .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, strided_cn) {
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(2)
+      .cn_stride(7)
+      .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_eq_2_strided_a) {
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(2)
+      .a_stride(5)
+      .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_eq_2_subtile) {
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 4; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(2)
+          .iterations(1)
+          .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+      }
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_eq_2_subtile_m) {
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(4)
+        .k(2)
+        .iterations(1)
+        .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_eq_2_subtile_n) {
+    for (uint32_t n = 1; n <= 4; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(2)
+        .iterations(1)
+        .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_lt_2) {
+    for (size_t k = 1; k < 2; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_lt_2_strided_a) {
+    for (size_t k = 1; k < 2; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(5)
+        .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_lt_2_subtile) {
+    for (size_t k = 1; k < 2; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+        }
+      }
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_gt_2) {
+    for (size_t k = 3; k < 4; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_gt_2_strided_a) {
+    for (size_t k = 3; k < 4; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(7)
+        .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_gt_2_subtile) {
+    for (size_t k = 3; k < 4; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+        }
+      }
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_div_2) {
+    for (size_t k = 4; k <= 20; k += 2) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_div_2_strided_a) {
+    for (size_t k = 4; k <= 20; k += 2) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(4)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(4)
+        .k(k)
+        .a_stride(23)
+        .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_div_2_subtile) {
+    for (size_t k = 4; k <= 20; k += 2) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+        }
+      }
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, n_gt_4) {
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 10; k += 3) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+      }
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, n_gt_4_strided_cn) {
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 10; k += 3) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+      }
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, n_gt_4_strided_a) {
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 10; k += 3) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(13)
+          .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+      }
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, n_gt_4_subtile) {
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 10; k += 3) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+        }
+      }
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, n_div_4) {
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 10; k += 3) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+      }
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, n_div_4_strided_cn) {
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 10; k += 3) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+      }
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, n_div_4_strided_a) {
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 10; k += 3) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(4)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(13)
+          .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+      }
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, n_div_4_subtile) {
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 10; k += 3) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+        }
+      }
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, strided_cm_subtile) {
+    for (size_t k = 1; k <= 10; k += 3) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 4; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(4)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(7)
+            .iterations(1)
+            .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+        }
+      }
+    }
+  }
+
+  TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, strided_cm) {
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(4)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(4)
+      .k(2)
+      .cm_stride(7)
+      .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+  }
+#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+
+
 #if XNN_ARCH_WASM
   TEST(F32_GEMM_1X4__WASM, k_eq_1) {
     GemmMicrokernelTester()
diff --git a/test/f32-gemm.yaml b/test/f32-gemm.yaml
index a109785..102f9d7 100644
--- a/test/f32-gemm.yaml
+++ b/test/f32-gemm.yaml
@@ -2,6 +2,9 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+- name: xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64
+  k-block: 2
+  assembly: true
 - name: xnn_f32_gemm_ukernel_1x4__wasm
   k-block: 1
 - name: xnn_f32_gemm_ukernel_2x4__wasm