4x8 GEMM and IGEMM microkernels for AArch32 Cortex-A55. 11.5% faster end-to-end:
Was f32_gemm_4x8__aarch32_neon_cortex_a53/mobilenet_v2/real_time 154006 us
Now f32_gemm_4x8__aarch32_neon_cortex_a55/mobilenet_v2/real_time 138030 us
23% faster GEMM on mobilenet_v2
Was f32_gemm_4x8__aarch32_neon_cortex_a53 59909460
Now f32_gemm_4x8__aarch32_neon_cortex_a55 48681160
19.2% faster IGEMM on mobilenet_v2
Was f32_igemm_4x8__aarch32_neon_cortex_a53 67209225
Now f32_igemm_4x8__aarch32_neon_cortex_a55 56380323
End-to-end benchmark:
Was
MobileNetV1/T:1/real_time 236793 us
MobileNetV2/T:1/real_time 154689 us
MobileNetV3Large/T:1/real_time 130964 us
MobileNetV3Small/T:1/real_time 42383 us
Now
MobileNetV1/T:1/real_time 199053 us
MobileNetV2/T:1/real_time 140262 us
MobileNetV3Large/T:1/real_time 120468 us
MobileNetV3Small/T:1/real_time 39952 us
The rev 1 version of Cortex-A55 can co-issue a 64-bit vector load
with each FMA, so the Cortex-A53 microkernel is rearranged so that
each block pairs 3 FMAs with 2 loads.
The basic code block is 3 VMLA.F32 interleaved with 2 64-bit loads:
// BLOCK 0
VMLA.F32 q8, q4, d4[0]
VLD1.32 {d0}, [r3]! // A0
VMLA.F32 q10, q4, d5[0]
VLD1.32 {d1}, [r12]! // A1
VMLA.F32 q12, q4, d6[0]
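
For reference, independent of the A55 scheduling, the arithmetic one 4x8 tile
performs is just bias initialization, a multiply-accumulate loop over kc, and
clamping. The plain-C sketch below is illustrative only: the simplified
signature, float-counted kc/strides and the helper name are assumptions, not
the XNNPACK prototype (the real kernels take byte-counted sizes and a params
union, and handle mr/nc remainders).

#include <stddef.h>

// Illustrative scalar model of one 4x8 f32 GEMM tile (mr = 4, nr = 8).
// Assumed packing: w holds 8 bias floats followed by 8 B floats per k step.
static void gemm_4x8_reference(
    size_t kc,                        // k steps, counted in floats of A per row
    const float* a, size_t a_stride,  // A: 4 rows, a_stride floats apart
    const float* w,                   // packed weights: bias[8], then B[kc][8]
    float* c, size_t cm_stride,       // C: 4 rows, cm_stride floats apart
    float output_min, float output_max)
{
  float acc[4][8];
  // Bias initialization (VLDM r9!, {d16-d19} plus the VMOV broadcasts).
  for (size_t m = 0; m < 4; m++) {
    for (size_t n = 0; n < 8; n++) {
      acc[m][n] = w[n];
    }
  }
  w += 8;
  // k loop: each step is 4x8 = 32 scalar multiply-accumulates; the assembly
  // unrolls 4 k steps (16 bytes of A per row) per main-loop iteration.
  for (size_t k = 0; k < kc; k++) {
    for (size_t m = 0; m < 4; m++) {
      const float a_mk = a[m * a_stride + k];
      for (size_t n = 0; n < 8; n++) {
        acc[m][n] += a_mk * w[n];     // VMLA.F32 qACC, qB, dA[lane]
      }
    }
    w += 8;
  }
  // Clamp (VMIN.F32 / VMAX.F32 against the broadcast params) and store.
  for (size_t m = 0; m < 4; m++) {
    for (size_t n = 0; n < 8; n++) {
      float v = acc[m][n];
      v = v < output_max ? v : output_max;
      v = v > output_min ? v : output_min;
      c[m * cm_stride + n] = v;
    }
  }
}

The Cortex-A53 and Cortex-A55 kernels compute exactly this; they differ only
in how the loads and VMLAs of the unrolled k loop are interleaved for each
pipeline.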
PiperOrigin-RevId: 300384515
diff --git a/BUILD.bazel b/BUILD.bazel
index 49ef0b1..5caa15a 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1407,6 +1407,7 @@
AARCH32_ASM_UKERNELS = [
"src/q8-dwconv/up8x9-aarch32-neon.S",
"src/f32-gemm/4x8-aarch32-neon-cortex-a53.S",
+ "src/f32-gemm/4x8-aarch32-neon-cortex-a55.S",
"src/f32-gemm/gen/4x8-aarch32-neon-cortex-a75.S",
"src/f32-gemm/gen/4x8-aarch32-neon-pld-cortex-a75.S",
"src/f32-gemm/4x8-aarch32-neon-ld64.S",
@@ -1414,6 +1415,7 @@
"src/f32-igemm/gen/4x8-aarch32-neon-cortex-a75.S",
"src/f32-igemm/gen/4x8-aarch32-neon-pld-cortex-a75.S",
"src/f32-igemm/4x8-aarch32-neon-cortex-a53.S",
+ "src/f32-igemm/4x8-aarch32-neon-cortex-a55.S",
]
AARCH64_ASM_UKERNELS = [
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 90fb349..4bac898 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1398,13 +1398,15 @@
SET(XNNPACK_AARCH32_ASM_MICROKERNEL_SRCS
src/q8-dwconv/up8x9-aarch32-neon.S
src/f32-gemm/4x8-aarch32-neon-cortex-a53.S
+ src/f32-gemm/4x8-aarch32-neon-cortex-a55.S
src/f32-gemm/gen/4x8-aarch32-neon-cortex-a75.S
src/f32-gemm/gen/4x8-aarch32-neon-pld-cortex-a75.S
src/f32-gemm/4x8-aarch32-neon-ld64.S
src/f32-igemm/4x8-aarch32-neon-ld64.S
src/f32-igemm/gen/4x8-aarch32-neon-cortex-a75.S
src/f32-igemm/gen/4x8-aarch32-neon-pld-cortex-a75.S
- src/f32-igemm/4x8-aarch32-neon-cortex-a53.S)
+ src/f32-igemm/4x8-aarch32-neon-cortex-a53.S
+ src/f32-igemm/4x8-aarch32-neon-cortex-a55.S)
SET(XNNPACK_AARCH64_ASM_MICROKERNEL_SRCS
src/f32-dwconv/up4x9-aarch64-neonfma-cortex-a55.S
diff --git a/bench/f32-gemm-e2e.cc b/bench/f32-gemm-e2e.cc
index 2523fda..190d55c 100644
--- a/bench/f32-gemm-e2e.cc
+++ b/bench/f32-gemm-e2e.cc
@@ -283,6 +283,15 @@
4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
benchmark::utils::CheckNEON);
}
+ static void f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55,
+ xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55,
+ xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
+ xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckNEON);
+ }
static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
GEMMEnd2EndBenchmark(state, model,
xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
@@ -304,6 +313,7 @@
BENCHMARK_END2END(f32_gemm_4x8__aarch32_neon_ld64);
BENCHMARK_END2END(f32_gemm_4x8__aarch32_neon_cortex_a53);
+ BENCHMARK_END2END(f32_gemm_4x8__aarch32_neon_cortex_a55);
BENCHMARK_END2END(f32_gemm_4x8__aarch32_neon_cortex_a75);
BENCHMARK_END2END(f32_gemm_4x8__aarch32_neon_pld_cortex_a75);
#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
diff --git a/bench/f32-gemm.cc b/bench/f32-gemm.cc
index 8788f23..c68578f 100644
--- a/bench/f32-gemm.cc
+++ b/bench/f32-gemm.cc
@@ -457,6 +457,9 @@
static void f32_gemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1, benchmark::utils::CheckNEON);
}
+ static void f32_gemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1, benchmark::utils::CheckNEON);
+ }
static void f32_gemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1, benchmark::utils::CheckNEON);
}
@@ -466,6 +469,7 @@
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_ld64)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a53)
+ BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a55)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_cortex_a75)
BENCHMARK_GEMM(f32_gemm_4x8__aarch32_neon_pld_cortex_a75)
#endif // XNN_ARCH_ARM
diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc
index 10bcbee..1aa6fcd 100644
--- a/bench/f32-igemm.cc
+++ b/bench/f32-igemm.cc
@@ -262,8 +262,24 @@
static void f32_igemm_4x8__aarch32_neon_ld64(benchmark::State& state, const char* net) {
IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64, 4, 8, 1, 1);
}
+ static void f32_igemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1);
+ }
+ static void f32_igemm_4x8__aarch32_neon_cortex_a55(benchmark::State& state, const char* net) {
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55, 4, 8, 1, 1);
+ }
+ static void f32_igemm_4x8__aarch32_neon_pld_cortex_a75(benchmark::State& state, const char* net) {
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75, 4, 8, 1, 1);
+ }
+ static void f32_igemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1);
+ }
BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_ld64)
+ BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a53)
+ BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a55)
+ BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_pld_cortex_a75)
+ BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a75)
#endif /* XNN_ARCH_ARM */
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
@@ -379,22 +395,6 @@
BENCHMARK_CONV(f32_igemm_6x8__neonfma_lane_ld128)
#endif /* XNN_ARCH_ARM64 */
-#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
- static void f32_igemm_4x8__aarch32_neon_pld_cortex_a75(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75, 4, 8, 1, 1);
- }
- static void f32_igemm_4x8__aarch32_neon_cortex_a75(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75, 4, 8, 1, 1);
- }
- static void f32_igemm_4x8__aarch32_neon_cortex_a53(benchmark::State& state, const char* net) {
- IGEMMBenchmark(state, xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53, 4, 8, 1, 1);
- }
-
- BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_pld_cortex_a75)
- BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a75)
- BENCHMARK_CONV(f32_igemm_4x8__aarch32_neon_cortex_a53)
-#endif /* XNN_ARCH_ARM */
-
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
static void f32_igemm_1x8__sse_load1(benchmark::State& state, const char* net) {
IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__sse_load1, 1, 8, 1, 1);
diff --git a/src/f32-gemm/4x8-aarch32-neon-cortex-a55.S b/src/f32-gemm/4x8-aarch32-neon-cortex-a55.S
new file mode 100644
index 0000000..3f7b64e
--- /dev/null
+++ b/src/f32-gemm/4x8-aarch32-neon-cortex-a55.S
@@ -0,0 +1,426 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+.syntax unified
+
+// void xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55(
+// size_t mr, r0
+// size_t nc, r1
+// size_t kc, r2 -> r5 -> sp + 0
+// const uint8_t*restrict a, r3
+// size_t a_stride, sp + 100 -> (r7)
+// const void*restrict w, sp + 104 -> r9
+// uint8_t*restrict c, sp + 108 -> r11
+// size_t cm_stride, sp + 112 -> (r6)
+// size_t cn_stride, sp + 116 -> (r0)
+// const union xnn_f32_output_params params[restrict static 1]) sp + 120 -> (r5)
+
+
+// inner loop registers
+// r0, r2 scratch temporaries for loads
+// r14 (lr) unused
+
+// A0 r3 d0
+// A1 r12 d1
+// A2 r10 d2
+// A3 r7 d3
+
+// B r9 d8, d9, d10, d11
+// B d12, d13, d14, d15
+
+// C0 r11 d16-d17 q8 d18-d19 q9
+// C1 r4 d20-d21 q10 d22-d23 q11
+// C2 r8 d24-d25 q12 d26-d27 q13
+// C3 r6 d28-d29 q14 d30-d31 q15
+
+// Clamp (r5) d4 d5 d6 d7
+
+BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55
+ .arm
+#ifndef __APPLE__
+ .arch armv7-a
+ .fpu neon
+#endif
+ // Push 100 bytes
+ // r2 will be reloaded in outer loop
+ VPUSH {d8-d15} // 64
+ PUSH {r2, r4, r5, r6, r7, r8, r9, r10, r11} // +36 = 100
+
+ LDR r7, [sp, 100] // a_stride
+ LDR r11, [sp, 108] // c
+ LDR r6, [sp, 112] // cm_stride
+ LDR r9, [sp, 104] // w
+
+ // Clamp A and C pointers
+ CMP r0, 2 // if mr >= 2
+ ADD r12, r3, r7 // a1 = a0 + a_stride
+ ADD r4, r11, r6 // c1 = c0 + cm_stride
+ MOVLO r12, r3 // a1
+ MOVLO r4, r11 // c1
+ // if mr > 2
+ ADD r10, r12, r7 // a2 = a1 + a_stride
+ ADD r8, r4, r6 // c2 = c1 + cm_stride
+ MOVLS r10, r12 // a2
+ MOVLS r8, r4 // c2
+
+ CMP r0, 4 // if mr >=4
+ ADD r7, r10, r7 // a3 = a2 + a_stride
+ ADD r6, r8, r6 // c3 = c2 + cm_stride
+ MOVLO r7, r10 // a3
+ MOVLO r6, r8 // c3
+
+ .p2align 3
+1:
+ # Load initial bias from w into accumulators
+ VLDM r9!, {d16-d19} // Bias
+
+ SUBS r5, r2, 16 // kc - 16
+ PLD [r3, 0] // Prefetch A
+ PLD [r3, 64]
+ VMOV q10, q8
+ PLD [r12, 0]
+ PLD [r12, 64]
+ VMOV q11, q9
+ PLD [r10, 0]
+ PLD [r10, 64]
+ VMOV q12, q8
+ PLD [r7, 0]
+ PLD [r7, 64]
+ VMOV q13, q9
+ PLD [r9, 0] // Prefetch B
+ PLD [r9, 64]
+ VMOV q14, q8
+ PLD [r9, 128]
+ PLD [r9, 192]
+ VMOV q15, q9
+ PLD [r9, 256]
+ PLD [r9, 320]
+ BLO 5f // less than 4 channels?
+
+ // Prologue
+ VLD1.32 {d0}, [r3]! // A0
+ VLD1.32 {d1}, [r12]! // A1
+ VLD1.32 {d2}, [r10]! // A2
+ VLD1.32 {d3}, [r7]! // A3
+ SUBS r5, r5, 16
+ VLDM r9, {d8-d11} // B0
+ VLDR d15, [r9, 56] // B1
+ VLDR d13, [r9, 40] // B1
+ BLO 3f // less than 4 channels? skip main loop
+
+ # Main loop - 4 floats of A (16 bytes)
+ # 32 FMA + 8 LD64 A + 8 LDR B
+ .p2align 3
+2:
+ # First group of 16 FMA, Second group loads
+ // BLOCK 0
+ VMLA.F32 q8, q4, d0[0]
+ VLD1.32 {d4}, [r3]! // A0
+ VMLA.F32 q10, q4, d1[0]
+ VLD1.32 {d5}, [r12]! // A1
+ VMLA.F32 q12, q4, d2[0]
+
+ // BLOCK 1
+ VMLA.F32 q14, q4, d3[0]
+ VLDR d12, [r9, 32] // B1
+ VMLA.F32 q9, q5, d0[0]
+ VLDR d9, [r9, 72] // B0
+ VMLA.F32 q11, q5, d1[0]
+
+ // BLOCK 2
+ VMLA.F32 q13, q5, d2[0]
+ VLD1.32 {d6}, [r10]! // A2
+ VMLA.F32 q15, q5, d3[0]
+ VLD1.32 {d7}, [r7]! // A3
+ VMLA.F32 q8, q6, d0[1]
+
+ // BLOCK 3
+ VMLA.F32 q10, q6, d1[1]
+ VLDR d14, [r9, 48] // B1
+ VMLA.F32 q12, q6, d2[1]
+ VLDR d11, [r9, 88] // B0
+ VMLA.F32 q14, q6, d3[1]
+
+ // BLOCK 4
+ VMLA.F32 q9, q7, d0[1]
+ VLDR d8, [r9, 64] // B0
+ VMLA.F32 q11, q7, d1[1]
+ VLDR d13, [r9, 104] // B1
+ VMLA.F32 q13, q7, d2[1]
+ VLDR d10, [r9, 80] // B0
+
+ // BLOCK 5
+ VMLA.F32 q15, q7, d3[1]
+ VLDR d15, [r9, 120] // B1
+
+ # Second group of 16 FMA, First group of loads
+ // BLOCK 0
+ VMLA.F32 q8, q4, d4[0]
+ VLD1.32 {d0}, [r3]! // A0
+ VMLA.F32 q10, q4, d5[0]
+ VLD1.32 {d1}, [r12]! // A1
+ VMLA.F32 q12, q4, d6[0]
+
+ // BLOCK 1
+ VMLA.F32 q14, q4, d7[0]
+ VLDR d12, [r9, 96] // B1
+ VMLA.F32 q9, q5, d4[0]
+ VLDR d9, [r9, 136] // B0
+ VMLA.F32 q11, q5, d5[0]
+
+ // BLOCK 2
+ VMLA.F32 q13, q5, d6[0]
+ VLD1.32 {d2}, [r10]! // A2
+ VMLA.F32 q15, q5, d7[0]
+ VLD1.32 {d3}, [r7]! // A3
+ VMLA.F32 q8, q6, d4[1]
+
+ // BLOCK 3
+ VMLA.F32 q10, q6, d5[1]
+ VLDR d14, [r9, 112] // B1
+ VMLA.F32 q12, q6, d6[1]
+ VLDR d11, [r9, 152] // B0
+ VMLA.F32 q14, q6, d7[1]
+ SUBS r5, r5, 16
+
+ // BLOCK 4
+ VMLA.F32 q9, q7, d4[1]
+ VLDR d8, [r9, 128] // B0
+ VMLA.F32 q11, q7, d5[1]
+ VLDR d13, [r9, 168] // B1
+ VMLA.F32 q13, q7, d6[1]
+ VLDR d10, [r9, 144] // B0
+
+ // BLOCK 5
+ VMLA.F32 q15, q7, d7[1]
+ VLDR d15, [r9, 184] // B1
+ ADD r9, r9, 128 // B++
+ BHS 2b
+
+
+ # Epilogue - 4 floats of A (16 bytes)
+3:
+ # First group of 16 FMA, Second group loads
+ // BLOCK 0
+ VMLA.F32 q8, q4, d0[0]
+ VLD1.32 {d4}, [r3]! // A0
+ VMLA.F32 q10, q4, d1[0]
+ VLD1.32 {d5}, [r12]! // A1
+ VMLA.F32 q12, q4, d2[0]
+
+ // BLOCK 1
+ VMLA.F32 q14, q4, d3[0]
+ VLDR d12, [r9, 32] // B1
+ VMLA.F32 q9, q5, d0[0]
+ VLDR d9, [r9, 72] // B0
+ VMLA.F32 q11, q5, d1[0]
+
+ // BLOCK 2
+ VMLA.F32 q13, q5, d2[0]
+ VLD1.32 {d6}, [r10]! // A2
+ VMLA.F32 q15, q5, d3[0]
+ VLD1.32 {d7}, [r7]! // A3
+ VMLA.F32 q8, q6, d0[1]
+
+ // BLOCK 3
+ VMLA.F32 q10, q6, d1[1]
+ VLDR d14, [r9, 48] // B1
+ VMLA.F32 q12, q6, d2[1]
+ VLDR d11, [r9, 88] // B0
+ VMLA.F32 q14, q6, d3[1]
+
+ // BLOCK 4
+ VMLA.F32 q9, q7, d0[1]
+ VLDR d8, [r9, 64] // B0
+ VMLA.F32 q11, q7, d1[1]
+ VLDR d13, [r9, 104] // B1
+ VMLA.F32 q13, q7, d2[1]
+ VLDR d10, [r9, 80] // B0
+
+ // BLOCK 5
+ VMLA.F32 q15, q7, d3[1]
+ VLDR d15, [r9, 120] // B1
+
+ # Second group of 16 FMA, First group of loads
+ // BLOCK 0
+ VMLA.F32 q8, q4, d4[0]
+ VLDR d12, [r9, 96] // B1
+ VMLA.F32 q10, q4, d5[0]
+ VMLA.F32 q12, q4, d6[0]
+
+ // BLOCK 1
+ VMLA.F32 q14, q4, d7[0]
+ VLDR d14, [r9, 112] // B1
+ VMLA.F32 q9, q5, d4[0]
+ VMLA.F32 q11, q5, d5[0]
+
+ // BLOCK 2
+ VMLA.F32 q13, q5, d6[0]
+ VMLA.F32 q15, q5, d7[0]
+ VMLA.F32 q8, q6, d4[1]
+ ADD r9, r9, 128 // B++
+
+ // BLOCK 3
+ VMLA.F32 q10, q6, d5[1]
+ VMLA.F32 q12, q6, d6[1]
+ VMLA.F32 q14, q6, d7[1]
+ TST r5, 15
+
+ // BLOCK 4
+ VMLA.F32 q9, q7, d4[1]
+ VMLA.F32 q11, q7, d5[1]
+ VMLA.F32 q13, q7, d6[1]
+
+ // BLOCK 5
+ VMLA.F32 q15, q7, d7[1]
+
+ // Is there a remainder?- 1 to 3 floats of A (4, 8 or 12 bytes)
+ BNE 5f
+
+ .p2align 3
+4:
+ // Load params pointer
+ LDR r0, [sp, 116] // cn_stride
+ LDR r5, [sp, 120] // clamping_params
+ LDR r2, [sp, 0] // kc
+ SUBS r1, r1, 8
+
+ // Load clamping_params values
+ VLD1.32 {d4[],d5[]}, [r5]!
+ VLD1.32 {d6[],d7[]}, [r5]
+
+ // Clamp
+ VMIN.F32 q8, q8, q2
+ VMIN.F32 q9, q9, q2
+ VMIN.F32 q10, q10, q2
+ VMIN.F32 q11, q11, q2
+ VMIN.F32 q12, q12, q2
+ VMIN.F32 q13, q13, q2
+ VMIN.F32 q14, q14, q2
+ VMIN.F32 q15, q15, q2
+ VMAX.F32 q8, q8, q3
+ VMAX.F32 q9, q9, q3
+ VMAX.F32 q10, q10, q3
+ VMAX.F32 q11, q11, q3
+ VMAX.F32 q12, q12, q3
+ VMAX.F32 q13, q13, q3
+ VMAX.F32 q14, q14, q3
+ VMAX.F32 q15, q15, q3
+
+ // Store full 4 x 8
+ BLO 10f
+ VST1.32 {d16-d19}, [r11], r0
+ SUB r7, r7, r2
+ VST1.32 {d20-d23}, [r4], r0
+ SUB r10, r10, r2
+ VST1.32 {d24-d27}, [r8], r0
+ SUB r12, r12, r2
+ VST1.32 {d28-d31}, [r6], r0
+ SUB r3, r3, r2
+ BHI 1b
+
+ ADD sp, sp, 4
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ VPOP {d8-d15}
+ BX lr
+
+ .p2align 3
+5:
+ // Is there a remainder?- 2 floats of A (8 bytes)
+ TST r5, 8
+ BEQ 6f
+
+ // Remainder - 2 floats of A (8 bytes)
+ VLD1.32 {d0}, [r3]! // A0
+ VLDM r9!, {d8-d11} // B0
+ VLD1.32 {d1}, [r12]! // A1
+ VLD1.32 {d2}, [r10]! // A2
+ VLD1.32 {d3}, [ r7]! // A3
+
+ VMLA.F32 q8, q4, d0[0]
+ VMLA.F32 q9, q5, d0[0]
+ VMLA.F32 q10, q4, d1[0]
+ VMLA.F32 q11, q5, d1[0]
+ VLDM r9!, {d12-d15} // B1
+ VMLA.F32 q12, q4, d2[0]
+ VMLA.F32 q13, q5, d2[0]
+ VMLA.F32 q14, q4, d3[0]
+ VMLA.F32 q15, q5, d3[0]
+ VMLA.F32 q8, q6, d0[1]
+ VMLA.F32 q9, q7, d0[1]
+ VMLA.F32 q10, q6, d1[1]
+ VMLA.F32 q11, q7, d1[1]
+ VMLA.F32 q12, q6, d2[1]
+ VMLA.F32 q13, q7, d2[1]
+ VMLA.F32 q14, q6, d3[1]
+ VMLA.F32 q15, q7, d3[1]
+
+ // Is there a remainder?- 1 floats of A (4 bytes)
+ TST r5, 4
+ BEQ 4b
+
+6:
+ // Remainder- 1 floats of A (4 bytes)
+ VLDM r3!, {s0} // A0
+ VLDM r9!, {d8-d11} // B0
+ VLDM r12!, {s2} // A1
+ VLDM r10!, {s4} // A2
+ VLDM r7!, {s6} // A3
+ VMLA.F32 q8, q4, d0[0]
+ VMLA.F32 q9, q5, d0[0]
+ VMLA.F32 q10, q4, d1[0]
+ VMLA.F32 q11, q5, d1[0]
+ VMLA.F32 q12, q4, d2[0]
+ VMLA.F32 q13, q5, d2[0]
+ VMLA.F32 q14, q4, d3[0]
+ VMLA.F32 q15, q5, d3[0]
+ B 4b
+
+ // Store odd width
+10:
+ TST r1, 4
+ BEQ 11f
+ VST1.32 {d16-d17}, [r11]!
+ VMOV q8, q9
+ VST1.32 {d20-d21}, [r4]!
+ VMOV q10, q11
+ VST1.32 {d24-d25}, [r8]!
+ VMOV q12, q13
+ VST1.32 {d28-d29}, [r6]!
+ VMOV q14, q15
+
+11:
+ TST r1, 2
+ BEQ 12f
+ VST1.32 {d16}, [r11]!
+ VMOV d16, d17
+ VST1.32 {d20}, [r4]!
+ VMOV d20, d21
+ VST1.32 {d24}, [r8]!
+ VMOV d24, d25
+ VST1.32 {d28}, [r6]!
+ VMOV d28, d29
+
+12:
+ TST r1, 1
+ BEQ 13f
+ VST1.32 {d16[0]}, [r11]
+ VST1.32 {d20[0]}, [r4]
+ VST1.32 {d24[0]}, [r8]
+ VST1.32 {d28[0]}, [r6]
+
+13:
+ ADD sp, sp, 4
+ POP {r4, r5, r6, r7, r8, r9, r10, r11}
+ VPOP {d8-d15}
+ BX lr
+
+END_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
\ No newline at end of file
diff --git a/src/f32-igemm/4x8-aarch32-neon-cortex-a55.S b/src/f32-igemm/4x8-aarch32-neon-cortex-a55.S
new file mode 100644
index 0000000..2c01d72
--- /dev/null
+++ b/src/f32-igemm/4x8-aarch32-neon-cortex-a55.S
@@ -0,0 +1,452 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+.syntax unified
+
+// void xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55(
+// size_t mr, r0
+// size_t nc, r1
+// size_t kc, r2 -> r5 -> sp + 68
+// size_t ks, r3 -> sp + 72 -> r14
+// const float**restrict a, sp + 112 -> (r5)
+// const void*restrict w, sp + 116 -> r9
+// uint8_t*restrict c, sp + 120 -> r11
+// size_t cm_stride, sp + 124 -> (r6)
+// size_t cn_stride, sp + 128 -> (r0)
+// size_t a_offset, sp + 132 -> (r5)
+// const float* zero, sp + 136 -> (r0)
+// output_params*params, sp + 140 -> (r5)
+
+// inner loop registers
+// r0, r2 scratch temporaries for loads
+
+// A0 r3 d0
+// A1 r12 d1
+// A2 r10 d2
+// A3 r7 d3
+
+// B r9 d8, d9, d10, d11
+// B d12, d13, d14, d15
+
+// C0 r11 d16-d17 q8 d18-d19 q9
+// C1 r4 d20-d21 q10 d22-d23 q11
+// C2 r8 d24-d25 q12 d26-d27 q13
+// C3 r6 d28-d29 q14 d30-d31 q15
+
+// Clamp (r5) d4 d5 d6 d7
+
+BEGIN_FUNCTION xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55
+ .arm
+#ifndef __APPLE__
+ .arch armv7-a
+ .fpu neon
+#endif
+ // Push 112 bytes
+ // r2 will be reloaded in outer loop. r3 is ks
+ PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14} // +44
+ SUB sp, sp, 4 // 4
+ VPUSH {d8-d15} // +64 = 112
+
+ LDR r11, [sp, 120] // c
+ LDR r6, [sp, 124] // cm_stride
+ LDR r5, [sp, 112] // a
+ LDR r9, [sp, 116] // w
+ MOV r14, r3 // p = ks
+
+ // Clamp C pointers
+ CMP r0, 2 // if mr >= 2
+ ADD r4, r11, r6 // c1 = c0 + cm_stride
+ MOVLO r4, r11 // c1
+ // if mr > 2
+ ADD r8, r4, r6 // c2 = c1 + cm_stride
+ MOVLS r8, r4 // c2
+ CMP r0, 4 // if mr >=4
+ ADD r6, r8, r6 // c3 = c2 + cm_stride
+ MOVLO r6, r8 // c3
+
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ VLDM r9!, {d16-d19} // Bias
+
+ VMOV q10, q8
+ VMOV q11, q9
+ VMOV q12, q8
+ VMOV q13, q9
+ PLD [r9, 0] // Prefetch B
+ PLD [r9, 64]
+ VMOV q14, q8
+ PLD [r9, 128]
+ PLD [r9, 192]
+ VMOV q15, q9
+ PLD [r9, 256]
+ PLD [r9, 320]
+
+1:
+ # Load next 4 A pointers
+ LDR r3, [r5, 0]
+ LDR r12, [r5, 4]
+ LDR r10, [r5, 8]
+ LDR r7, [r5, 12]
+ ADD r5, r5, 16
+ PLD [r3, 0] // Prefetch A
+ STR r5, [sp, 112] // a
+ PLD [r3, 64]
+ LDR r0, [sp, 136] // zero
+ PLD [r12, 0]
+ LDR r5, [sp, 132] // a_offset
+ PLD [r12, 64]
+ LDR r2, [sp, 68] // kc
+ PLD [r10, 0]
+ PLD [r10, 64]
+ PLD [r7, 0]
+ PLD [r7, 64]
+
+ // Add a_offset
+ CMP r3, r0 // if a0 == zero
+ ADD r3, r3, r5 // a0 += a_offset
+ MOVEQ r3, r0 // a0 = zero, else += a0 + a_offset
+ CMP r12, r0 // if a1 == zero
+ ADD r12, r12, r5 // a1 += a_offset
+ MOVEQ r12, r0 // a1 = zero, else += a1 + a_offset
+ CMP r10, r0 // if a2 == zero
+ ADD r10, r10, r5 // a2 += a_offset
+ MOVEQ r10, r0 // a2 = zero, else += a2 + a_offset
+ CMP r7, r0 // if a3 == zero
+ ADD r7, r7, r5 // a3 += a_offset
+ MOVEQ r7, r0 // a3 = zero, else += a3 + a_offset
+
+ SUBS r5, r2, 16 // kc - 16
+ BLO 5f // less than 4 channels?
+
+ // Prologue
+ VLD1.32 {d0}, [r3]! // A0
+ VLD1.32 {d1}, [r12]! // A1
+ VLD1.32 {d2}, [r10]! // A2
+ VLD1.32 {d3}, [r7]! // A3
+ SUBS r5, r5, 16
+ VLDM r9, {d8-d11} // B0
+ VLDR d15, [r9, 56] // B1
+ VLDR d13, [r9, 40] // B1
+
+ BLO 3f // less than 4 channels? skip main loop
+
+ # Main loop - 4 floats of A (16 bytes)
+ # 32 FMA + 8 LD64 A + 8 LDR B
+ .p2align 3
+2:
+ # First group of 16 FMA, Second group loads
+ // BLOCK 0
+ VMLA.F32 q8, q4, d0[0]
+ VLD1.32 {d4}, [r3]! // A0
+ VMLA.F32 q10, q4, d1[0]
+ VLD1.32 {d5}, [r12]! // A1
+ VMLA.F32 q12, q4, d2[0]
+
+ // BLOCK 1
+ VMLA.F32 q14, q4, d3[0]
+ VLDR d12, [r9, 32] // B1
+ VMLA.F32 q9, q5, d0[0]
+ VLDR d9, [r9, 72] // B0
+ VMLA.F32 q11, q5, d1[0]
+
+ // BLOCK 2
+ VMLA.F32 q13, q5, d2[0]
+ VLD1.32 {d6}, [r10]! // A2
+ VMLA.F32 q15, q5, d3[0]
+ VLD1.32 {d7}, [r7]! // A3
+ VMLA.F32 q8, q6, d0[1]
+
+ // BLOCK 3
+ VMLA.F32 q10, q6, d1[1]
+ VLDR d14, [r9, 48] // B1
+ VMLA.F32 q12, q6, d2[1]
+ VLDR d11, [r9, 88] // B0
+ VMLA.F32 q14, q6, d3[1]
+
+ // BLOCK 4
+ VMLA.F32 q9, q7, d0[1]
+ VLDR d8, [r9, 64] // B0
+ VMLA.F32 q11, q7, d1[1]
+ VLDR d13, [r9, 104] // B1
+ VMLA.F32 q13, q7, d2[1]
+ VLDR d10, [r9, 80] // B0
+
+ // BLOCK 5
+ VMLA.F32 q15, q7, d3[1]
+ VLDR d15, [r9, 120] // B1
+
+ # Second group of 16 FMA, First group of loads
+ // BLOCK 0
+ VMLA.F32 q8, q4, d4[0]
+ VLD1.32 {d0}, [r3]! // A0
+ VMLA.F32 q10, q4, d5[0]
+ VLD1.32 {d1}, [r12]! // A1
+ VMLA.F32 q12, q4, d6[0]
+
+ // BLOCK 1
+ VMLA.F32 q14, q4, d7[0]
+ VLDR d12, [r9, 96] // B1
+ VMLA.F32 q9, q5, d4[0]
+ VLDR d9, [r9, 136] // B0
+ VMLA.F32 q11, q5, d5[0]
+
+ // BLOCK 2
+ VMLA.F32 q13, q5, d6[0]
+ VLD1.32 {d2}, [r10]! // A2
+ VMLA.F32 q15, q5, d7[0]
+ VLD1.32 {d3}, [r7]! // A3
+ VMLA.F32 q8, q6, d4[1]
+ SUBS r5, r5, 16
+
+ // BLOCK 3
+ VMLA.F32 q10, q6, d5[1]
+ VLDR d14, [r9, 112] // B1
+ VMLA.F32 q12, q6, d6[1]
+ VLDR d11, [r9, 152] // B0
+ VMLA.F32 q14, q6, d7[1]
+
+ // BLOCK 4
+ VMLA.F32 q9, q7, d4[1]
+ VLDR d8, [r9, 128] // B0
+ VMLA.F32 q11, q7, d5[1]
+ VLDR d13, [r9, 168] // B1
+ VMLA.F32 q13, q7, d6[1]
+ VLDR d10, [r9, 144] // B0
+
+ // BLOCK 5
+ VMLA.F32 q15, q7, d7[1]
+ VLDR d15, [r9, 184] // B1
+ ADD r9, r9, 128 // B++
+ BHS 2b
+
+ # Epilogue - 4 floats of A (16 bytes)
+3:
+ # First group of 16 FMA, Second group loads
+ // BLOCK 0
+ VMLA.F32 q8, q4, d0[0]
+ VLD1.32 {d4}, [r3]! // A0
+ VMLA.F32 q10, q4, d1[0]
+ VLD1.32 {d5}, [r12]! // A1
+ VMLA.F32 q12, q4, d2[0]
+
+ // BLOCK 1
+ VMLA.F32 q14, q4, d3[0]
+ VLDR d12, [r9, 32] // B1
+ VMLA.F32 q9, q5, d0[0]
+ VLDR d9, [r9, 72] // B0
+ VMLA.F32 q11, q5, d1[0]
+
+ // BLOCK 2
+ VMLA.F32 q13, q5, d2[0]
+ VLD1.32 {d6}, [r10]! // A2
+ VMLA.F32 q15, q5, d3[0]
+ VLD1.32 {d7}, [r7]! // A3
+ VMLA.F32 q8, q6, d0[1]
+
+ // BLOCK 3
+ VMLA.F32 q10, q6, d1[1]
+ VLDR d14, [r9, 48] // B1
+ VMLA.F32 q12, q6, d2[1]
+ VLDR d11, [r9, 88] // B0
+ VMLA.F32 q14, q6, d3[1]
+
+ // BLOCK 4
+ VMLA.F32 q9, q7, d0[1]
+ VLDR d8, [r9, 64] // B0
+ VMLA.F32 q11, q7, d1[1]
+ VLDR d13, [r9, 104] // B1
+ VMLA.F32 q13, q7, d2[1]
+ VLDR d10, [r9, 80] // B0
+
+ // BLOCK 5
+ VMLA.F32 q15, q7, d3[1]
+ VLDR d15, [r9, 120] // B1
+
+ # Second group of 16 FMA, First group of loads
+ // BLOCK 0
+ VMLA.F32 q8, q4, d4[0]
+ VLDR d12, [r9, 96] // B1
+ VMLA.F32 q10, q4, d5[0]
+ VMLA.F32 q12, q4, d6[0]
+
+ // BLOCK 1
+ VMLA.F32 q14, q4, d7[0]
+ VLDR d14, [r9, 112] // B1
+ VMLA.F32 q9, q5, d4[0]
+ VMLA.F32 q11, q5, d5[0]
+
+ // BLOCK 2
+ VMLA.F32 q13, q5, d6[0]
+ VMLA.F32 q15, q5, d7[0]
+ VMLA.F32 q8, q6, d4[1]
+ ADD r9, r9, 128 // B++
+
+ // BLOCK 3
+ VMLA.F32 q10, q6, d5[1]
+ VMLA.F32 q12, q6, d6[1]
+ VMLA.F32 q14, q6, d7[1]
+ TST r5, 15
+
+ // BLOCK 4
+ VMLA.F32 q9, q7, d4[1]
+ VMLA.F32 q11, q7, d5[1]
+ VMLA.F32 q13, q7, d6[1]
+
+ // BLOCK 5
+ VMLA.F32 q15, q7, d7[1]
+
+ // Is there a remainder?- 1 to 3 floats of A (4, 8 or 12 bytes)
+ BNE 5f
+
+ .p2align 3
+4:
+ LDR r5, [sp, 112] // a
+ SUBS r14, r14, 16 // ks -= MR * sizeof(void*)
+
+ # ks loop
+ BHI 1b
+
+ // Load params pointer
+ LDR r0, [sp, 128] // cn_stride
+ LDR r2, [sp, 140] // clamping_params
+ LDR r14, [sp, 72] // p = ks
+ SUBS r1, r1, 8
+
+ // Load clamping_params values
+ VLD1.32 {d4[],d5[]}, [r2]!
+ VLD1.32 {d6[],d7[]}, [r2]
+
+ // Clamp
+ VMIN.F32 q8, q8, q2
+ VMIN.F32 q9, q9, q2
+ VMIN.F32 q10, q10, q2
+ VMIN.F32 q11, q11, q2
+ VMIN.F32 q12, q12, q2
+ VMIN.F32 q13, q13, q2
+ VMIN.F32 q14, q14, q2
+ VMIN.F32 q15, q15, q2
+ VMAX.F32 q8, q8, q3
+ VMAX.F32 q9, q9, q3
+ VMAX.F32 q10, q10, q3
+ VMAX.F32 q11, q11, q3
+ VMAX.F32 q12, q12, q3
+ VMAX.F32 q13, q13, q3
+ VMAX.F32 q14, q14, q3
+ VMAX.F32 q15, q15, q3
+
+ // Store full 4 x 8
+ BLO 10f
+ VST1.32 {d28-d31}, [r6], r0
+ VST1.32 {d24-d27}, [r8], r0
+ VST1.32 {d20-d23}, [r4], r0
+ VST1.32 {d16-d19}, [r11], r0
+
+ SUB r5, r5, r14 // a -= ks
+
+ BHI 0b
+
+ VPOP {d8-d15}
+ ADD sp, sp, 12 // skip pad, r2, r3
+ POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+ .p2align 3
+5:
+ // Is there a remainder?- 2 floats of A (8 bytes)
+ TST r5, 8
+ BEQ 6f
+
+ // Remainder - 2 floats of A (8 bytes)
+ VLD1.32 {d0}, [r3]! // A0
+ VLDM r9!, {d8-d11} // B0
+ VLD1.32 {d1}, [r12]! // A1
+ VLD1.32 {d2}, [r10]! // A2
+ VLD1.32 {d3}, [ r7]! // A3
+
+ VMLA.F32 q8, q4, d0[0]
+ VMLA.F32 q9, q5, d0[0]
+ VMLA.F32 q10, q4, d1[0]
+ VMLA.F32 q11, q5, d1[0]
+ VLDM r9!, {d12-d15} // B1
+ VMLA.F32 q12, q4, d2[0]
+ VMLA.F32 q13, q5, d2[0]
+ VMLA.F32 q14, q4, d3[0]
+ VMLA.F32 q15, q5, d3[0]
+ VMLA.F32 q8, q6, d0[1]
+ VMLA.F32 q9, q7, d0[1]
+ VMLA.F32 q10, q6, d1[1]
+ VMLA.F32 q11, q7, d1[1]
+ VMLA.F32 q12, q6, d2[1]
+ VMLA.F32 q13, q7, d2[1]
+ VMLA.F32 q14, q6, d3[1]
+ VMLA.F32 q15, q7, d3[1]
+
+ // Is there a remainder?- 1 floats of A (4 bytes)
+ TST r5, 4
+ BEQ 4b
+
+6:
+ // Remainder- 1 floats of A (4 bytes)
+ VLDM r3!, {s0} // A0
+ VLDM r9!, {d8-d11} // B0
+ VLDM r12!, {s2} // A1
+ VLDM r10!, {s4} // A2
+ VLDM r7!, {s6} // A3
+ VMLA.F32 q8, q4, d0[0]
+ VMLA.F32 q9, q5, d0[0]
+ VMLA.F32 q10, q4, d1[0]
+ VMLA.F32 q11, q5, d1[0]
+ VMLA.F32 q12, q4, d2[0]
+ VMLA.F32 q13, q5, d2[0]
+ VMLA.F32 q14, q4, d3[0]
+ VMLA.F32 q15, q5, d3[0]
+ B 4b
+
+ // Store odd width
+10:
+ TST r1, 4
+ BEQ 11f
+ VST1.32 {d28-d29}, [r6]!
+ VMOV q14, q15
+ VST1.32 {d24-d25}, [r8]!
+ VMOV q12, q13
+ VST1.32 {d20-d21}, [r4]!
+ VMOV q10, q11
+ VST1.32 {d16-d17}, [r11]!
+ VMOV q8, q9
+
+11:
+ TST r1, 2
+ BEQ 12f
+ VST1.32 {d28}, [r6]!
+ VMOV d28, d29
+ VST1.32 {d24}, [r8]!
+ VMOV d24, d25
+ VST1.32 {d20}, [r4]!
+ VMOV d20, d21
+ VST1.32 {d16}, [r11]!
+ VMOV d16, d17
+
+12:
+ TST r1, 1
+ BEQ 13f
+ VST1.32 {d28[0]}, [r6]!
+ VST1.32 {d24[0]}, [r8]!
+ VST1.32 {d20[0]}, [r4]!
+ VST1.32 {d16[0]}, [r11]!
+
+13:
+ VPOP {d8-d15}
+ ADD sp, sp, 12 // skip pad, r2, r3
+ POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
+
+END_FUNCTION xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
\ No newline at end of file
diff --git a/src/init.c b/src/init.c
index 5ba5efb..2fa8caf 100644
--- a/src/init.c
+++ b/src/init.c
@@ -147,7 +147,6 @@
break;
case cpuinfo_uarch_cortex_a53:
- case cpuinfo_uarch_cortex_a55:
xnn_params.f32.gemm = (struct gemm_parameters) {
.gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53,
.igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53,
@@ -158,6 +157,17 @@
};
break;
+ case cpuinfo_uarch_cortex_a55:
+ xnn_params.f32.gemm = (struct gemm_parameters) {
+ .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55,
+ .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55,
+ .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
+ .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
+ .mr = 4,
+ .nr = 8,
+ };
+ break;
+
case cpuinfo_uarch_cortex_a57:
case cpuinfo_uarch_cortex_a72:
case cpuinfo_uarch_cortex_a73:
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index f33dbb1..ab31d3b 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -72,6 +72,7 @@
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75)
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 3698aa7..a873c14 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -93,6 +93,7 @@
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__sse_load1)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__sse_load1)
diff --git a/test/f32-gemm.cc b/test/f32-gemm.cc
index 66c07a3..9383b50 100644
--- a/test/f32-gemm.cc
+++ b/test/f32-gemm.cc
@@ -8495,6 +8495,507 @@
#if XNN_ARCH_ARM
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(4)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(4)
+ .cn_stride(11)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(4)
+ .a_stride(7)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_lt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_gt_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_gt_4_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_div_4) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 12; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_div_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 12; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_div_4_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 12; k <= 40; k += 4) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(4)
+ .qmin(128)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(4)
+ .qmax(128)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+
+ TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(4)
+ .cm_stride(11)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+#endif // XNN_ARCH_ARM
+
+
+#if XNN_ARCH_ARM
TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4) {
TEST_REQUIRES_ARM_NEON;
GemmMicrokernelTester()
diff --git a/test/f32-gemm.yaml b/test/f32-gemm.yaml
index 4327996..8b73eee 100644
--- a/test/f32-gemm.yaml
+++ b/test/f32-gemm.yaml
@@ -68,6 +68,9 @@
- name: xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53
k-block: 4
pipelined: true
+- name: xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55
+ k-block: 4
+ pipelined: true
- name: xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75
k-block: 4
pipelined: true
diff --git a/test/f32-igemm.cc b/test/f32-igemm.cc
index a4d062b..a9acf29 100644
--- a/test/f32-igemm.cc
+++ b/test/f32-igemm.cc
@@ -4981,6 +4981,505 @@
#endif // XNN_ARCH_ARM
+#if XNN_ARCH_ARM
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(4)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(4)
+ .cn_stride(11)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_gt_4_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_div_4) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 12; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_div_4_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 12; k <= 40; k += 4) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, a_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, zero) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .zero_index(mz)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(4)
+ .qmin(128)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(4)
+ .qmax(128)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+
+ TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(4)
+ .cm_stride(11)
+ .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
+ }
+#endif // XNN_ARCH_ARM
+
+
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
TEST_REQUIRES_ARM_NEON_FMA;
diff --git a/test/f32-igemm.yaml b/test/f32-igemm.yaml
index 68db6fd..2801f18 100644
--- a/test/f32-igemm.yaml
+++ b/test/f32-igemm.yaml
@@ -38,6 +38,9 @@
- name: xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53
k-block: 4
pipelined: true
+- name: xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55
+ k-block: 4
+ pipelined: true
- name: xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57
k-block: 8
pipelined: true