4x4 LD64 GEMM+MINMAX microkernel in AArch32+VFP assembly
PiperOrigin-RevId: 311971308
diff --git a/BUILD.bazel b/BUILD.bazel
index 11984cc..275ff2a 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1561,6 +1561,7 @@
]
AARCH32_ASM_UKERNELS = [
+ "src/f32-gemm/4x4-minmax-aarch32-vfp-ld64.S",
"src/f32-gemm/4x8-minmax-aarch32-neon-cortex-a53.S",
"src/f32-gemm/4x8-minmax-aarch32-neon-cortex-a55.S",
"src/f32-gemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5a49159..5766b2c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1560,6 +1560,7 @@
src/math/extexp-avx512f-p5.c)
SET(XNNPACK_AARCH32_ASM_MICROKERNEL_SRCS
+ src/f32-gemm/4x4-minmax-aarch32-vfp-ld64.S
src/f32-gemm/4x8-minmax-aarch32-neon-cortex-a53.S
src/f32-gemm/4x8-minmax-aarch32-neon-cortex-a55.S
src/f32-gemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S
diff --git a/bench/f32-gemm.cc b/bench/f32-gemm.cc
index 48575f9..d2caec8 100644
--- a/bench/f32-gemm.cc
+++ b/bench/f32-gemm.cc
@@ -463,6 +463,9 @@
#endif // XNN_ARCH_ARM64
#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+ static void f32_gemm_4x4__aarch32_vfp_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, 4, 4, 1, 1, benchmark::utils::CheckVFP);
+ }
static void f32_gemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1, benchmark::utils::CheckNEON);
}
@@ -479,6 +482,7 @@
GEMMBenchmark(state, xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_pld_cortex_a75, 4, 8, 1, 1, benchmark::utils::CheckNEON);
}
+ BENCHMARK_GEMM(f32_gemm_4x4__aarch32_vfp_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a53)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a55)
diff --git a/bench/utils.cc b/bench/utils.cc
index bae254a..b7b7dd5 100644
--- a/bench/utils.cc
+++ b/bench/utils.cc
@@ -165,6 +165,14 @@
}
+bool CheckVFP(benchmark::State& state) {
+ if (!cpuinfo_initialize() || !(cpuinfo_has_arm_vfpv2() || cpuinfo_has_arm_vfpv3())) {
+ state.SkipWithError("no VFP extension");
+ return false;
+ }
+ return true;
+}
+
bool CheckNEON(benchmark::State& state) {
if (!cpuinfo_initialize() || !cpuinfo_has_arm_neon()) {
state.SkipWithError("no NEON extension");
diff --git a/bench/utils.h b/bench/utils.h
index 072b14d..6ab662a 100644
--- a/bench/utils.h
+++ b/bench/utils.h
@@ -31,6 +31,10 @@
typedef bool (*IsaCheckFunction)(benchmark::State& state);
+// Check if either ARM VFPv2 or VFPv3 extension is supported.
+// If VFP is unsupported, report error in benchmark state, and return false.
+bool CheckVFP(benchmark::State& state);
+
// Check if ARM NEON extension is supported.
// If NEON is unsupported, report error in benchmark state, and return false.
bool CheckNEON(benchmark::State& state);
diff --git a/src/f32-gemm/4x4-minmax-aarch32-vfp-ld64.S b/src/f32-gemm/4x4-minmax-aarch32-vfp-ld64.S
new file mode 100644
index 0000000..1711a87
--- /dev/null
+++ b/src/f32-gemm/4x4-minmax-aarch32-vfp-ld64.S
@@ -0,0 +1,350 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+.syntax unified
+
+// void xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64(
+// size_t mr, r0
+// size_t nc, r1
+// size_t kc, r2 -> r5
+// const uint8_t*restrict a, r3
+// size_t a_stride, sp + 96 -> (r11)
+// const void*restrict w, sp + 100 -> r9
+// uint8_t*restrict c, sp + 104 -> r6
+// size_t cm_stride, sp + 108 -> (r7)
+// size_t cn_stride, sp + 112 -> r11
+// const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) sp + 116 -> (r11)
+
+
+// inner loop registers
+
+// A0 r3 s0-s1 d0
+// A1 r12 s2-s3 d1
+// A2 r10 s4-s5 d2
+// A3 r0 s6-s7 d3
+
+// B r9 s12, s13, s14, s15 d6-d7
+// B s10, s11, s12, s13 d5-d6
+
+// C0 r6 s16-s17 d8 s18-s19 d9
+// C1 r4 s20-s21 d10 s22-s23 d11
+// C2 r8 s24-s25 d12 s26-s27 d13
+// C3 r7 s28-s29 d14 s30-s31 d15
+
+// Clamp (r5) s8, s9 d4
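+
+// Illustrative sketch of the tile this kernel computes (w is packed as 4 bias
+// floats followed by 4 floats of B per k step; min/max come from params):
+//   for (m = 0; m < 4; m++)
+//     for (n = 0; n < 4; n++)
+//       c[m][n] = clamp(bias[n] + sum over k of a[m][k] * b[k][n], min, max)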
+
+BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64
+ .arm
+#ifndef __APPLE__
+ .arch armv6
+ .fpu vfp
+#endif
+ // Push 96 bytes
+ PUSH {r4, r5, r6, r7, r8, r9, r10, r11} // 32
+ VPUSH {d8-d15} // +64 = 96
+
+ LDR r11, [sp, 96] // Load a_stride
+ LDRD r6, r7, [sp, 104] // Load c and cm_stride
+ LDR r5, [sp, 116] // Load params
+
+ // Clamp A and C pointers
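+ // The row pointers are computed unconditionally; the conditional MOVs
+ // below fall back to the previous row when mr is too small, so the
+ // setup stays branch-free.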
+ CMP r0, 2 // if mr >= 2
+ ADD r12, r3, r11 // a1 = a0 + a_stride
+ ADD r4, r6, r7 // c1 = c0 + cm_stride
+ MOVLO r12, r3 // a1
+ MOVLO r4, r6 // c1
+
+ LDR r9, [sp, 100] // Load w
+
+ // if mr > 2
+ ADD r10, r12, r11 // a2 = a1 + a_stride
+ ADD r8, r4, r7 // c2 = c1 + cm_stride
+ MOVLS r10, r12 // a2
+ MOVLS r8, r4 // c2
+
+ VLDR d4, [r5] // Load min/max values
+
+ CMP r0, 4 // if mr >= 4
+ ADD r0, r10, r11 // a3 = a2 + a_stride
+ ADD r7, r8, r7 // c3 = c2 + cm_stride
+ LDR r11, [sp, 112] // Load cn_stride
+ MOVLO r0, r10 // a3
+ MOVLO r7, r8 // c3
+
+
+1:
+ // Load initial bias from w into accumulators
+ VLDM r9!, {d8-d9} // Bias
+ SUBS r5, r2, 8
+ VMOV.F64 d10, d8
+ VMOV.F64 d12, d8
+ VMOV.F64 d14, d8
+ VMOV.F64 d11, d9
+ VMOV.F64 d13, d9
+ VMOV.F64 d15, d9
+ BLO 5f // less than 2 channels?
+
+ // Main loop - 2 floats of A (8 bytes)
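+ // Each iteration consumes 2 k-steps: one 2-float load per A row (d0-d3)
+ // and two 4-float blocks of B; the second B load is placed mid-loop so
+ // it overlaps with the multiply-accumulates.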
+2:
+ VLDM r3!, {d0} // A0
+ VLDM r9!, {d6-d7} // B0
+ VLDM r12!, {d1} // A1
+ VLDM r10!, {d2} // A2
+ VLDM r0!, {d3} // A3
+
+ VMLA.F32 s16, s12, s0
+ VMLA.F32 s17, s13, s0
+ VMLA.F32 s20, s12, s2
+ VMLA.F32 s21, s13, s2
+ VMLA.F32 s24, s12, s4
+ VMLA.F32 s25, s13, s4
+ VMLA.F32 s28, s12, s6
+ VMLA.F32 s29, s13, s6
+
+ VMLA.F32 s18, s14, s0
+ VMLA.F32 s19, s15, s0
+ VMLA.F32 s22, s14, s2
+ VMLA.F32 s23, s15, s2
+ VLDM r9!, {d5-d6} // B1
+ VMLA.F32 s26, s14, s4
+ VMLA.F32 s27, s15, s4
+ VMLA.F32 s30, s14, s6
+ VMLA.F32 s31, s15, s6
+
+ VMLA.F32 s16, s10, s1
+ VMLA.F32 s17, s11, s1
+ VMLA.F32 s20, s10, s3
+ VMLA.F32 s21, s11, s3
+ VMLA.F32 s24, s10, s5
+ VMLA.F32 s25, s11, s5
+ VMLA.F32 s28, s10, s7
+ VMLA.F32 s29, s11, s7
+
+ SUBS r5, r5, 8
+
+ VMLA.F32 s18, s12, s1
+ VMLA.F32 s19, s13, s1
+ VMLA.F32 s22, s12, s3
+ VMLA.F32 s23, s13, s3
+ VMLA.F32 s26, s12, s5
+ VMLA.F32 s27, s13, s5
+ VMLA.F32 s30, s12, s7
+ VMLA.F32 s31, s13, s7
+
+ BHS 2b
+
+ // Is there a remainder? - 1 float of A (4 bytes)
+ TST r5, 4
+ BNE 5f
+
+4:
+ // Clamp
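+ // s8 holds min and s9 holds max. Each VCMPE sets the FPSCR flags, VMRS
+ // copies them to APSR, and the conditional VMOV applies the clamp:
+ // VMOVPL replaces an accumulator with min when min >= acc, VMOVMI
+ // replaces it with max when max < acc. The compare for the next element
+ // is issued before the conditional move for the current one to hide the
+ // VMRS latency.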
+ VCMPE.F32 s8, s16
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s8, s17
+ VMOVPL.F32 s16, s8
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s8, s18
+ VMOVPL.F32 s17, s8
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s8, s19
+ VMOVPL.F32 s18, s8
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s8, s20
+ VMOVPL.F32 s19, s8
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s8, s21
+ VMOVPL.F32 s20, s8
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s8, s22
+ VMOVPL.F32 s21, s8
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s8, s23
+ VMOVPL.F32 s22, s8
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s8, s24
+ VMOVPL.F32 s23, s8
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s8, s25
+ VMOVPL.F32 s24, s8
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s8, s26
+ VMOVPL.F32 s25, s8
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s8, s27
+ VMOVPL.F32 s26, s8
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s8, s28
+ VMOVPL.F32 s27, s8
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s8, s29
+ VMOVPL.F32 s28, s8
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s8, s30
+ VMOVPL.F32 s29, s8
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s8, s31
+ VMOVPL.F32 s30, s8
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s9, s16
+ VMOVPL.F32 s31, s8
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s9, s17
+ VMOVMI.F32 s16, s9
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s9, s18
+ VMOVMI.F32 s17, s9
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s9, s19
+ VMOVMI.F32 s18, s9
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s9, s20
+ VMOVMI.F32 s19, s9
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s9, s21
+ VMOVMI.F32 s20, s9
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s9, s22
+ VMOVMI.F32 s21, s9
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s9, s23
+ VMOVMI.F32 s22, s9
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s9, s24
+ VMOVMI.F32 s23, s9
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s9, s25
+ VMOVMI.F32 s24, s9
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s9, s26
+ VMOVMI.F32 s25, s9
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s9, s27
+ VMOVMI.F32 s26, s9
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s9, s28
+ VMOVMI.F32 s27, s9
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s9, s29
+ VMOVMI.F32 s28, s9
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s9, s30
+ VMOVMI.F32 s29, s9
+ VMRS APSR_nzcv, FPSCR
+ VCMPE.F32 s9, s31
+ VMOVMI.F32 s30, s9
+ VMRS APSR_nzcv, FPSCR
+ VMOVMI.F32 s31, s9
+
+ SUBS r1, r1, 4
+ BLO 10f
+
+ // Store full 4 x 4
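+ // Write 4 floats per row, rewind each A pointer by kc so the same rows
+ // are reused for the next 4-column block, advance each C pointer by
+ // cn_stride, and loop back (BHI) while columns remain.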
+ VSTM r6, {d8-d9}
+ SUB r0, r0, r2
+ ADD r6, r11
+ VSTM r4, {d10-d11}
+ SUB r10, r10, r2
+ ADD r4, r11
+ VSTM r8, {d12-d13}
+ SUB r12, r12, r2
+ ADD r8, r11
+ VSTM r7, {d14-d15}
+ SUB r3, r3, r2
+ ADD r7, r11
+ BHI 1b
+
+ VPOP {d8-d15}
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ BX lr
+
+5:
+ // Remainder - 1 float of A (4 bytes)
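+ // Handles the last k step when kc is an odd number of floats, then
+ // rejoins the clamp at label 4.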
+ VLDM r3!, {s0} // A0
+ VLDM r9!, {d6-d7} // B
+ VLDM r12!, {s1} // A1
+ VLDM r10!, {s2} // A2
+ VLDM r0!, {s3} // A3
+
+ VMLA.F32 s16, s12, s0
+ VMLA.F32 s17, s13, s0
+ VMLA.F32 s18, s14, s0
+ VMLA.F32 s19, s15, s0
+
+ VMLA.F32 s20, s12, s1
+ VMLA.F32 s21, s13, s1
+ VMLA.F32 s22, s14, s1
+ VMLA.F32 s23, s15, s1
+
+ VMLA.F32 s24, s12, s2
+ VMLA.F32 s25, s13, s2
+ VMLA.F32 s26, s14, s2
+ VMLA.F32 s27, s15, s2
+
+ VMLA.F32 s28, s12, s3
+ VMLA.F32 s29, s13, s3
+ VMLA.F32 s30, s14, s3
+ VMLA.F32 s31, s15, s3
+
+ B 4b
+
+ // Store odd width
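+ // nc & 2: store the first two floats of each row and move the third
+ // into the first lane; nc & 1: store one more float.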
+10:
+ TST r1, 2
+ BEQ 11f
+ VSTM r6!, {d8}
+ VMOV.F32 s16, s18
+ VSTM r4!, {d10}
+ VMOV.F32 s20, s22
+ VSTM r8!, {d12}
+ VMOV.F32 s24, s26
+ VSTM r7!, {d14}
+ VMOV.F32 s28, s30
+
+11:
+ TST r1, 1
+ BEQ 12f
+ VSTR s16, [r6]
+ VSTR s20, [r4]
+ VSTR s24, [r8]
+ VSTR s28, [r7]
+
+12:
+ VPOP {d8-d15}
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ BX lr
+
+END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index b1d3d7a..6376487 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -83,6 +83,7 @@
DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma)
DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma)
+DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64)
DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_ld64)
DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53)
DECLARE_F32_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55)
diff --git a/test/f32-gemm-minmax.cc b/test/f32-gemm-minmax.cc
index c17e74f..070d189 100644
--- a/test/f32-gemm-minmax.cc
+++ b/test/f32-gemm-minmax.cc
@@ -11911,6 +11911,435 @@
#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, k_eq_2) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(2)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, strided_cn) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(2)
+ .cn_stride(7)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, k_eq_2_strided_a) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(2)
+ .a_stride(5)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, k_eq_2_subtile) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(2)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, k_eq_2_subtile_m) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(2)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, k_eq_2_subtile_n) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(2)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, k_lt_2) {
+ for (size_t k = 1; k < 2; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, k_lt_2_strided_a) {
+ for (size_t k = 1; k < 2; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .a_stride(5)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, k_lt_2_subtile) {
+ for (size_t k = 1; k < 2; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, k_gt_2) {
+ for (size_t k = 3; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, k_gt_2_strided_a) {
+ for (size_t k = 3; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .a_stride(7)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, k_gt_2_subtile) {
+ for (size_t k = 3; k < 4; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, k_div_2) {
+ for (size_t k = 4; k <= 20; k += 2) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, k_div_2_strided_a) {
+ for (size_t k = 4; k <= 20; k += 2) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, k_div_2_subtile) {
+ for (size_t k = 4; k <= 20; k += 2) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, n_gt_4) {
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 10; k += 3) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, n_gt_4_strided_cn) {
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 10; k += 3) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, n_gt_4_strided_a) {
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 10; k += 3) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(13)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, n_gt_4_subtile) {
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 10; k += 3) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, n_div_4) {
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 10; k += 3) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, n_div_4_strided_cn) {
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 10; k += 3) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, n_div_4_strided_a) {
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 10; k += 3) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(13)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, n_div_4_subtile) {
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 10; k += 3) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, strided_cm_subtile) {
+ for (size_t k = 1; k <= 10; k += 3) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, qmin) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(2)
+ .qmin(128)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, qmax) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(2)
+ .qmax(128)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+
+ TEST(F32_GEMM_MINMAX_4X4__AARCH32_VFP_LD64, strided_cm) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(4)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(2)
+ .cm_stride(7)
+ .Test(xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64, GemmMicrokernelTester::Variant::Scalar);
+ }
+#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
+
+
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
TEST(F32_GEMM_MINMAX_4X8__AARCH64_NEONFMA_LD128, k_eq_4) {
TEST_REQUIRES_ARM_NEON_FMA;
diff --git a/test/f32-gemm-minmax.yaml b/test/f32-gemm-minmax.yaml
index b0ecf53..80c90e9 100644
--- a/test/f32-gemm-minmax.yaml
+++ b/test/f32-gemm-minmax.yaml
@@ -94,6 +94,9 @@
assembly: true
- name: xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_ld64
  k-block: 2
  assembly: true
+- name: xnn_f32_gemm_minmax_ukernel_4x4__aarch32_vfp_ld64
+  k-block: 2
+  assembly: true
- name: xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_ld128
k-block: 4