4x4 LD64 GEMM microkernel in AArch32+VFP assembly
PiperOrigin-RevId: 312008451
diff --git a/BUILD.bazel b/BUILD.bazel
index 275ff2a..7ae306b 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1561,6 +1561,7 @@
]
AARCH32_ASM_UKERNELS = [
+ "src/f32-gemm/4x4-aarch32-vfp-ld64.S",
"src/f32-gemm/4x4-minmax-aarch32-vfp-ld64.S",
"src/f32-gemm/4x8-minmax-aarch32-neon-cortex-a53.S",
"src/f32-gemm/4x8-minmax-aarch32-neon-cortex-a55.S",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5766b2c..282c0f4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1560,6 +1560,7 @@
src/math/extexp-avx512f-p5.c)
SET(XNNPACK_AARCH32_ASM_MICROKERNEL_SRCS
+ src/f32-gemm/4x4-aarch32-vfp-ld64.S
src/f32-gemm/4x4-minmax-aarch32-vfp-ld64.S
src/f32-gemm/4x8-minmax-aarch32-neon-cortex-a53.S
src/f32-gemm/4x8-minmax-aarch32-neon-cortex-a55.S
diff --git a/src/f32-gemm/4x4-aarch32-vfp-ld64.S b/src/f32-gemm/4x4-aarch32-vfp-ld64.S
new file mode 100644
index 0000000..8f013bc
--- /dev/null
+++ b/src/f32-gemm/4x4-aarch32-vfp-ld64.S
@@ -0,0 +1,223 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+.syntax unified
+
+// void xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64(
+// size_t mr, r0
+// size_t nc, r1
+// size_t kc, r2 -> r5
+// const uint8_t*restrict a, r3
+// size_t a_stride, sp + 96 -> (r11)
+// const void*restrict w, sp + 100 -> r9
+// uint8_t*restrict c, sp + 104 -> r6
+// size_t cm_stride, sp + 108 -> (r7)
+// size_t cn_stride, sp + 112 -> r11
+// const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) sp + 116 -> (r11)
+
+
+// inner loop registers
+
+// A0 r3 s0-s1 d0
+// A1 r12 s2-s3 d1
+// A2 r10 s4-s5 d2
+// A3 r0 s6-s7 d3
+
+// B r9 s8, s9, s10, s11 d4-d5
+// B s12, s13, s14, s15 d6-d7
+
+// C0 r6 s16-s17 d8 s18-s19 d9
+// C1 r4 s20-s21 d10 s22-s23 d11
+// C2 r8 s24-s25 d12 s26-s27 d13
+// C3 r7 s28-s29 d14 s30-s31 d15
+
+BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64
+ .arm
+#ifndef __APPLE__
+ .arch armv6
+ .fpu vfp
+#endif
+ // Push 96 bytes: 8 GPRs (32) + 8 callee-saved D registers (64)
+ PUSH {r4, r5, r6, r7, r8, r9, r10, r11} // 32
+ VPUSH {d8-d15} // +64 = 96
+
+ LDR r11, [sp, 96] // Load a_stride (5th arg, past the 96 pushed bytes)
+ LDRD r6, r7, [sp, 104] // Load c and cm_stride
+
+ // Clamp A and C pointers: rows beyond mr alias the previous row
+ CMP r0, 2 // if mr >= 2
+ ADD r12, r3, r11 // a1 = a0 + a_stride
+ ADD r4, r6, r7 // c1 = c0 + cm_stride
+ MOVLO r12, r3 // a1 = a0 when mr < 2
+ MOVLO r4, r6 // c1 = c0 when mr < 2
+
+ LDR r9, [sp, 100] // Load w
+
+ // if mr > 2 (flags still from CMP r0, 2)
+ ADD r10, r12, r11 // a2 = a1 + a_stride
+ ADD r8, r4, r7 // c2 = c1 + cm_stride
+ MOVLS r10, r12 // a2 = a1 when mr <= 2
+ MOVLS r8, r4 // c2 = c1 when mr <= 2
+
+ CMP r0, 4 // if mr >=4
+ ADD r0, r10, r11 // a3 = a2 + a_stride
+ ADD r7, r8, r7 // c3 = c2 + cm_stride
+ LDR r11, [sp, 112] // Load cn_stride
+ MOVLO r0, r10 // a3 = a2 when mr < 4
+ MOVLO r7, r8 // c3 = c2 when mr < 4
+
+1:
+ // Load initial bias from w into accumulators
+ VLDM r9!, {d8-d9} // Bias -> row 0 accumulators (s16-s19)
+ SUBS r5, r2, 8 // k = kc - 8 bytes (2 floats)
+ VMOV.F64 d10, d8 // replicate bias into rows 1-3
+ VMOV.F64 d12, d8
+ VMOV.F64 d14, d8
+ VMOV.F64 d11, d9
+ VMOV.F64 d13, d9
+ VMOV.F64 d15, d9
+ BLO 5f // fewer than 2 floats of K? handle 1-float remainder
+
+ // Main loop - 2 floats of A (8 bytes)
+2:
+ VLDM r3!, {d0} // A0
+ VLDM r9!, {d4-d5} // B0
+ VLDM r12!, {d1} // A1
+ VLDM r10!, {d2} // A2
+ VLDM r0!, {d3} // A3
+
+ VMLA.F32 s16, s8, s0
+ VMLA.F32 s17, s9, s0
+ VMLA.F32 s20, s8, s2
+ VMLA.F32 s21, s9, s2
+ VMLA.F32 s24, s8, s4
+ VMLA.F32 s25, s9, s4
+ VMLA.F32 s28, s8, s6
+ VMLA.F32 s29, s9, s6
+
+ VLDM r9!, {d6-d7} // B1
+
+ VMLA.F32 s18, s10, s0
+ VMLA.F32 s19, s11, s0
+ VMLA.F32 s22, s10, s2
+ VMLA.F32 s23, s11, s2
+ VMLA.F32 s26, s10, s4
+ VMLA.F32 s27, s11, s4
+ VMLA.F32 s30, s10, s6
+ VMLA.F32 s31, s11, s6
+
+ VMLA.F32 s16, s12, s1
+ VMLA.F32 s17, s13, s1
+ VMLA.F32 s20, s12, s3
+ VMLA.F32 s21, s13, s3
+ VMLA.F32 s24, s12, s5
+ VMLA.F32 s25, s13, s5
+ VMLA.F32 s28, s12, s7
+ VMLA.F32 s29, s13, s7
+
+ SUBS r5, r5, 8 // k -= 2 floats
+
+ VMLA.F32 s18, s14, s1
+ VMLA.F32 s19, s15, s1
+ VMLA.F32 s22, s14, s3
+ VMLA.F32 s23, s15, s3
+ VMLA.F32 s26, s14, s5
+ VMLA.F32 s27, s15, s5
+ VMLA.F32 s30, s14, s7
+ VMLA.F32 s31, s15, s7
+
+ BHS 2b // loop while k >= 0 (no borrow)
+
+ // Is there a remainder of 1 float of A (4 bytes)?
+ TST r5, 4 // bit 2 of negative k == kc mod 8 bit
+ BNE 5f
+
+4:
+
+ SUBS r1, r1, 4 // nc -= 4
+ BLO 10f // nc < 4: store partial tile
+
+ // Store full 4 x 4
+ VSTM r6, {d8-d9}
+ SUB r0, r0, r2 // rewind a3 by kc
+ ADD r6, r11 // c0 += cn_stride
+ VSTM r4, {d10-d11}
+ SUB r10, r10, r2 // rewind a2 by kc
+ ADD r4, r11 // c1 += cn_stride
+ VSTM r8, {d12-d13}
+ SUB r12, r12, r2 // rewind a1 by kc
+ ADD r8, r11 // c2 += cn_stride
+ VSTM r7, {d14-d15}
+ SUB r3, r3, r2 // rewind a0 by kc
+ ADD r7, r11 // c3 += cn_stride
+ BHI 1b // more columns remain (nc > 0)
+
+ VPOP {d8-d15}
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ BX lr
+
+5:
+ // Remainder: 1 float of A (4 bytes)
+ VLDM r3!, {s0} // A0
+ VLDM r9!, {d6-d7} // B
+ VLDM r12!, {s1} // A1
+ VLDM r10!, {s2} // A2
+ VLDM r0!, {s3} // A3
+
+ VMLA.F32 s16, s12, s0
+ VMLA.F32 s17, s13, s0
+ VMLA.F32 s18, s14, s0
+ VMLA.F32 s19, s15, s0
+
+ VMLA.F32 s20, s12, s1
+ VMLA.F32 s21, s13, s1
+ VMLA.F32 s22, s14, s1
+ VMLA.F32 s23, s15, s1
+
+ VMLA.F32 s24, s12, s2
+ VMLA.F32 s25, s13, s2
+ VMLA.F32 s26, s14, s2
+ VMLA.F32 s27, s15, s2
+
+ VMLA.F32 s28, s12, s3
+ VMLA.F32 s29, s13, s3
+ VMLA.F32 s30, s14, s3
+ VMLA.F32 s31, s15, s3
+
+ B 4b
+
+ // Store odd width
+10:
+ TST r1, 2 // nc & 2: store 2 floats per row
+ BEQ 11f
+ VSTM r6!, {d8}
+ VMOV.F32 s16, s18 // shift remaining value into store position
+ VSTM r4!, {d10}
+ VMOV.F32 s20, s22
+ VSTM r8!, {d12}
+ VMOV.F32 s24, s26
+ VSTM r7!, {d14}
+ VMOV.F32 s28, s30
+
+11:
+ TST r1, 1 // nc & 1: store last float per row
+ BEQ 12f
+ VSTR s16, [r6]
+ VSTR s20, [r4]
+ VSTR s24, [r8]
+ VSTR s28, [r7]
+
+12:
+ VPOP {d8-d15}
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ BX lr
+
+END_FUNCTION xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 6376487..c3a71b7 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -83,6 +83,8 @@
DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma)
DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64)
+
DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64)
DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_ld64)
DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53)
diff --git a/test/f32-gemm-minmax.cc b/test/f32-gemm-minmax.cc
index 070d189..7b565d4 100644
--- a/test/f32-gemm-minmax.cc
+++ b/test/f32-gemm-minmax.cc
@@ -11455,7 +11455,7 @@
#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
-#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+#if XNN_ARCH_ARM
TEST(F32_GEMM_MINMAX_4X8__AARCH32_NEON_LD64, k_eq_2) {
TEST_REQUIRES_ARM_NEON;
GemmMicrokernelTester()
@@ -11908,7 +11908,7 @@
.cm_stride(11)
.Test(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_ld64);
}
-#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+#endif // XNN_ARCH_ARM
#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
diff --git a/test/f32-gemm.cc b/test/f32-gemm.cc
index 7e28ca9..a0650b4 100644
--- a/test/f32-gemm.cc
+++ b/test/f32-gemm.cc
@@ -22,6 +22,409 @@
#include "gemm-microkernel-tester.h"
+#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_eq_2) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(2)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, strided_cn) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(2)
+ .cn_stride(7)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_eq_2_strided_a) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(2)
+ .a_stride(5)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_eq_2_subtile) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(2)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_eq_2_subtile_m) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(2)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_eq_2_subtile_n) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(2)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_lt_2) {
+ for (size_t k = 1; k < 2; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_lt_2_strided_a) {
+ for (size_t k = 1; k < 2; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .a_stride(5)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_lt_2_subtile) {
+ for (size_t k = 1; k < 2; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_gt_2) {
+ for (size_t k = 3; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_gt_2_strided_a) {
+ for (size_t k = 3; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .a_stride(7)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_gt_2_subtile) {
+ for (size_t k = 3; k < 4; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_div_2) {
+ for (size_t k = 4; k <= 20; k += 2) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_div_2_strided_a) {
+ for (size_t k = 4; k <= 20; k += 2) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, k_div_2_subtile) {
+ for (size_t k = 4; k <= 20; k += 2) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, n_gt_4) {
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 10; k += 3) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n) // was .n(4): loop variable must be used so n > nr is exercised
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, n_gt_4_strided_cn) {
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 10; k += 3) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n) // was .n(4): loop variable must be used so n > nr is exercised
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, n_gt_4_strided_a) {
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 10; k += 3) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(13)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, n_gt_4_subtile) {
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 10; k += 3) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, n_div_4) {
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 10; k += 3) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n) // was .n(4): loop variable must be used so n divisible by nr is exercised
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, n_div_4_strided_cn) {
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 10; k += 3) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, n_div_4_strided_a) {
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 10; k += 3) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(13)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, n_div_4_subtile) {
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 10; k += 3) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, strided_cm_subtile) {
+ for (size_t k = 1; k <= 10; k += 3) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X4__AARCH32_VFP_LD64, strided_cm) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(2)
+ .cm_stride(7)
+ .Test(xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64);
+ }
+#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+
+
#if XNN_ARCH_WASM
TEST(F32_GEMM_1X4__WASM, k_eq_1) {
GemmMicrokernelTester()
diff --git a/test/f32-gemm.yaml b/test/f32-gemm.yaml
index a109785..102f9d7 100644
--- a/test/f32-gemm.yaml
+++ b/test/f32-gemm.yaml
@@ -2,6 +2,9 @@
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+- name: xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64
+ k-block: 2
+ assembly: true
- name: xnn_f32_gemm_ukernel_1x4__wasm
k-block: 1
- name: xnn_f32_gemm_ukernel_2x4__wasm