iOS 6x8 microkernel based on Cortex-A75 but with X18 avoided.
Use x14 for the c3 pointer, and reload cn_stride into x0 during clamping.
PiperOrigin-RevId: 302555155
diff --git a/BUILD.bazel b/BUILD.bazel
index fc0ea18..9bc3ce1 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1439,6 +1439,7 @@
"src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a73.S",
"src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a57.S",
"src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a75.S",
+ "src/f32-gemm/gen/6x8-aarch64-neonfma-ios.S",
"src/f32-gemm/gen/6x8-aarch64-neonfma-ld128.S",
"src/f32-gemm/gen/6x8-aarch64-neonfma-ld64.S",
"src/f32-gemm/gen-inc/1x12-aarch64-neonfma-cortex-a53.S",
@@ -1459,6 +1460,7 @@
"src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a73.S",
"src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a57.S",
"src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a75.S",
+ "src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ios.S",
"src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S",
"src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld64.S",
"src/f32-igemm/1x12-aarch64-neonfma-cortex-a53.S",
@@ -1477,6 +1479,7 @@
"src/f32-igemm/6x8-aarch64-neonfma-cortex-a73.S",
"src/f32-igemm/gen/6x8-aarch64-neonfma-cortex-a57.S",
"src/f32-igemm/gen/6x8-aarch64-neonfma-cortex-a75.S",
+ "src/f32-igemm/gen/6x8-aarch64-neonfma-ios.S",
]
INTERNAL_MICROKERNEL_HDRS = [
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9bcaa58..372d2f6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1429,6 +1429,7 @@
src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a73.S
src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a57.S
src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a75.S
+ src/f32-gemm/gen/6x8-aarch64-neonfma-ios.S
src/f32-gemm/gen/6x8-aarch64-neonfma-ld128.S
src/f32-gemm/gen/6x8-aarch64-neonfma-ld64.S
src/f32-gemm/gen-inc/1x12-aarch64-neonfma-cortex-a53.S
@@ -1449,6 +1450,7 @@
src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a73.S
src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a57.S
src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a75.S
+ src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ios.S
src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld128.S
src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld64.S
src/f32-igemm/1x12-aarch64-neonfma-cortex-a53.S
@@ -1466,7 +1468,8 @@
src/f32-igemm/6x8-aarch64-neonfma-cortex-a55.S
src/f32-igemm/6x8-aarch64-neonfma-cortex-a73.S
src/f32-igemm/gen/6x8-aarch64-neonfma-cortex-a57.S
- src/f32-igemm/gen/6x8-aarch64-neonfma-cortex-a75.S)
+ src/f32-igemm/gen/6x8-aarch64-neonfma-cortex-a75.S
+ src/f32-igemm/gen/6x8-aarch64-neonfma-ios.S)
SET(XNNPACK_MICROKERNEL_SRCS ${XNNPACK_SCALAR_MICROKERNEL_SRCS})
IF(NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
diff --git a/bench/f32-gemm-e2e.cc b/bench/f32-gemm-e2e.cc
index 6effdbb..f248514 100644
--- a/bench/f32-gemm-e2e.cc
+++ b/bench/f32-gemm-e2e.cc
@@ -198,6 +198,15 @@
6 /* mr */, 8 /* nr */);
}
+ static void f32_gemm_6x8__aarch64_neonfma_ios(benchmark::State& state, models::ExecutionPlanFactory model) {
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios,
+ xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios,
+ xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
+ xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
+ 6 /* mr */, 8 /* nr */);
+ }
+
static void f32_gemm_6x8__aarch64_neonfma_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
GEMMEnd2EndBenchmark(state, model,
xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64,
@@ -267,6 +276,7 @@
BENCHMARK_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a73);
BENCHMARK_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a57);
BENCHMARK_END2END(f32_gemm_6x8__aarch64_neonfma_cortex_a75);
+ BENCHMARK_END2END(f32_gemm_6x8__aarch64_neonfma_ios);
BENCHMARK_END2END(f32_gemm_4x12__aarch64_neonfma_cortex_a53)
BENCHMARK_END2END(f32_gemm_4x8__neonfma_lane_ld64);
diff --git a/bench/f32-gemm.cc b/bench/f32-gemm.cc
index fcea12f..0496770 100644
--- a/bench/f32-gemm.cc
+++ b/bench/f32-gemm.cc
@@ -408,6 +408,9 @@
static void f32_gemm_6x8__aarch64_neonfma_cortex_a75(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1);
}
+ static void f32_gemm_6x8__aarch64_neonfma_ios(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios, 6, 8, 1, 1);
+ }
static void f32_gemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1);
}
@@ -444,6 +447,7 @@
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a73)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a57)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_cortex_a75)
+ BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ios)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld64)
BENCHMARK_GEMM(f32_gemm_6x8__aarch64_neonfma_ld128)
BENCHMARK_GEMM(f32_gemm_1x8__neonfma_lane_ld64)
diff --git a/bench/f32-igemm.cc b/bench/f32-igemm.cc
index 408073e..67f1142 100644
--- a/bench/f32-igemm.cc
+++ b/bench/f32-igemm.cc
@@ -347,6 +347,10 @@
IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75, 6, 8, 1, 1);
}
+ static void f32_igemm_6x8__aarch64_neonfma_ios(benchmark::State& state, const char* net) {
+ IGEMMBenchmark(state, xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios, 6, 8, 1, 1);
+ }
+
static void f32_igemm_1x8__neonfma_lane_ld64(benchmark::State& state, const char* net) {
IGEMMBenchmark(state, xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64, 1, 8, 1, 1);
}
@@ -391,6 +395,7 @@
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a73)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a57)
BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_cortex_a75)
+ BENCHMARK_CONV(f32_igemm_6x8__aarch64_neonfma_ios)
BENCHMARK_CONV(f32_igemm_1x8__neonfma_lane_ld64)
BENCHMARK_CONV(f32_igemm_4x2__neonfma_lane_ld64)
BENCHMARK_CONV(f32_igemm_4x4__neonfma_lane_ld64)
diff --git a/scripts/generate-f32-gemm.sh b/scripts/generate-f32-gemm.sh
index ff15722..34ab0f0 100755
--- a/scripts/generate-f32-gemm.sh
+++ b/scripts/generate-f32-gemm.sh
@@ -84,6 +84,9 @@
tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in -D INC=0 -D PREFETCH=1 -o src/f32-gemm/gen/6x8-aarch64-neonfma-cortex-a75.S
tools/xngen src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in -D INC=1 -D PREFETCH=1 -o src/f32-gemm/gen-inc/6x8-aarch64-neonfma-cortex-a75.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-ios.S.in -D INC=0 -D PREFETCH=0 -o src/f32-gemm/gen/6x8-aarch64-neonfma-ios.S
+tools/xngen src/f32-gemm/6x8-aarch64-neonfma-ios.S.in -D INC=1 -D PREFETCH=0 -o src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ios.S
+
tools/xngen src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in -D INC=0 -o src/f32-gemm/gen/6x8-aarch64-neonfma-ld64.S
tools/xngen src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in -D INC=1 -o src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ld64.S
diff --git a/scripts/generate-f32-igemm.sh b/scripts/generate-f32-igemm.sh
index ceed443..ff03fea 100755
--- a/scripts/generate-f32-igemm.sh
+++ b/scripts/generate-f32-igemm.sh
@@ -27,6 +27,7 @@
tools/xngen src/f32-igemm/5x8-aarch64-neonfma-cortex-a75.S.in -D INC=0 -D PREFETCH=1 -o src/f32-igemm/gen/5x8-aarch64-neonfma-cortex-a75.S
tools/xngen src/f32-igemm/6x8-aarch64-neonfma-cortex-a75.S.in -D INC=0 -D PREFETCH=0 -o src/f32-igemm/gen/6x8-aarch64-neonfma-cortex-a57.S
tools/xngen src/f32-igemm/6x8-aarch64-neonfma-cortex-a75.S.in -D INC=0 -D PREFETCH=1 -o src/f32-igemm/gen/6x8-aarch64-neonfma-cortex-a75.S
+tools/xngen src/f32-igemm/6x8-aarch64-neonfma-ios.S.in -D INC=0 -D PREFETCH=0 -o src/f32-igemm/gen/6x8-aarch64-neonfma-ios.S
############################### AArch32 assembly ##############################
tools/xngen src/f32-igemm/4x8-aarch32-neon-cortex-a75.S.in -D INC=0 -D PREFETCH=0 -o src/f32-igemm/gen/4x8-aarch32-neon-cortex-a75.S
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-ios.S.in b/src/f32-gemm/6x8-aarch64-neonfma-ios.S.in
new file mode 100644
index 0000000..07f9156
--- /dev/null
+++ b/src/f32-gemm/6x8-aarch64-neonfma-ios.S.in
@@ -0,0 +1,758 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ios(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const uint8_t*restrict a, x3
+# size_t a_stride, x4
+# const void*restrict w, x5
+# uint8_t*restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> (x0)
+$if INC:
+ # const float*restrict acc, [sp + 8] -> x15
+ # const union xnn_f32_output_params params[restrict static 1]) [sp + 16] -> x8
+$else:
+ # const union xnn_f32_output_params params[restrict static 1]) [sp + 8] -> x8
+
+# d8-d15 need to be preserved if used.
+# x19-30 need to be preserved if used.
+
+# A pointers
+# x3 a0
+# x9 a1
+# x10 a2
+# x11 a3
+# x12 a4
+# x4 a5
+
+# C pointers
+# x6 c0
+# x16 c1
+# x17 c2
+# x14 c3
+# x13 c4
+# x7 c5
+
+# Vector register usage
+# A0 v0 v6
+# A1 v1 v7
+# A2 v2 v8
+# A3 v3 v9
+# A4 v4 v10
+# A5 v5 v11
+# B v12 v13 v14 v15
+# B v16 v17 v18 v19
+# C v20 v21
+# C v22 v23
+# C v24 v25
+# C v26 v27
+# C v28 v29
+# C v30 v31
+# Clamp v6 v7
+
+# The iOS microkernel is based on the Cortex-A75 kernel, but avoids the
+# reserved register X18 by using X14 for c3 and reloading cn_stride into x0.
+
+BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ios
+
+ # Clamp A and C pointers / Save d8-d15 on stack
+ STP d8, d9, [sp, -64]!
+ CMP x0, 2 // if mr < 2
+ ADD x9, x3, x4 // a1 = a0 + a_stride
+ ADD x16, x6, x7 // c1 = c0 + cm_stride
+ CSEL x9, x3, x9, LO // a1 = a0
+ CSEL x16, x6, x16, LO // c1 = c0
+
+ STP d10, d11, [sp, 16]
+ ADD x10, x9, x4 // a2 = a1 + a_stride
+ ADD x17, x16, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ CSEL x10, x9, x10, LS // a2 = a1
+ CSEL x17, x16, x17, LS // c2 = c1
+
+ STP d12, d13, [sp, 32]
+ CMP x0, 4 // if mr < 4
+ ADD x11, x10, x4 // a3 = a2 + a_stride
+ ADD x14, x17, x7 // c3 = c2 + cm_stride
+ CSEL x11, x10, x11, LO // a3 = a2
+ CSEL x14, x17, x14, LO // c3 = c2
+
+ STP d14, d15, [sp, 48]
+ ADD x12, x11, x4 // a4 = a3 + a_stride
+ ADD x13, x14, x7 // c4 = c3 + cm_stride
+ // if mr <= 4
+ CSEL x12, x11, x12, LS // a4 = a3
+ CSEL x13, x14, x13, LS // c4 = c3
+
+ $if INC:
+ # Load acc, params pointer
+ LDP x15, x8, [sp, 72]
+ $else:
+ # Load params pointer
+ LDR x8, [sp, 72]
+
+ CMP x0, 6 // if mr < 6
+ ADD x4, x12, x4 // a5 = a4 + a_stride
+ ADD x7, x13, x7 // c5 = c4 + cm_stride
+ CSEL x4, x12, x4, LO // a5 = a4
+ CSEL x7, x13, x7, LO // c5 = c4
+
+0:
+ $if INC:
+ # Load initial accumulators
+ LDP q20, q21, [x15], 32
+ LDP q22, q23, [x15], 32
+ LDP q24, q25, [x15], 32
+ LDP q26, q27, [x15], 32
+ LDP q28, q29, [x15], 32
+ LDP q30, q31, [x15], 32
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 0] // Prefetch B
+ PRFM PLDL1KEEP, [x5, 64]
+ PRFM PLDL1KEEP, [x5, 128]
+ PRFM PLDL1KEEP, [x5, 192]
+ PRFM PLDL1KEEP, [x3] // Prefetch A
+ PRFM PLDL1KEEP, [x9]
+ PRFM PLDL1KEEP, [x10]
+ PRFM PLDL1KEEP, [x11]
+ PRFM PLDL1KEEP, [x12]
+ PRFM PLDL1KEEP, [x4]
+ $else:
+ # Load initial bias from w into accumulators
+ LDP q20, q21, [x5], 32
+ MOV v22.16b, v20.16b
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 0] // Prefetch B
+ MOV v23.16b, v21.16b
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 64]
+ MOV v24.16b, v20.16b
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 128]
+ MOV v25.16b, v21.16b
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 192]
+ MOV v26.16b, v20.16b
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x3] // Prefetch A
+ MOV v27.16b, v21.16b
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x9]
+ MOV v28.16b, v20.16b
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x10]
+ MOV v29.16b, v21.16b
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x11]
+ MOV v30.16b, v20.16b
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x12]
+ MOV v31.16b, v21.16b
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x4]
+
+ # Is there at least 8 floats (32 bytes) for prologue + epilogue?
+ SUBS x0, x2, 32 // k = kc - 32
+ B.LO 4f
+
+ # Prologue - loads for main loop of 96 FMA
+ LDR q0, [x3], 16
+ LDR q1, [x9], 16
+ LDR q2, [x10], 16
+ LDR q3, [x11], 16
+ LDR q4, [x12], 16
+ LDR q5, [x4], 16
+ LDP q12, q13, [x5], 32 // Fetch 3 B (4th deferred)
+ LDP q14, q15, [x5], 32
+ LDP q16, q17, [x5], 32
+
+ # Is there at least 8 floats (32 bytes) for main loop?
+ SUBS x0, x0, 32
+ B.LO 2f
+
+ # Main loop - 8 floats of A (32 bytes)
+ # 96 FMA + 6 LDP A + 8 LDP B
+1:
+ # First group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v0.s[0]
+ LDP q18, q19, [x5], 32 // Load last B
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+
+ FMLA v31.4s, v13.4s, v5.s[0]
+ FMLA v20.4s, v14.4s, v0.s[1]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 128] // Prefetch B
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 256]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ LDR q6, [x3], 16 // Load next 6 A
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+ LDR q7, [x9], 16
+
+ FMLA v20.4s, v16.4s, v0.s[2]
+ FMLA v22.4s, v16.4s, v1.s[2]
+ FMLA v24.4s, v16.4s, v2.s[2]
+ LDR q8, [x10], 16
+ FMLA v26.4s, v16.4s, v3.s[2]
+ FMLA v28.4s, v16.4s, v4.s[2]
+ FMLA v30.4s, v16.4s, v5.s[2]
+ LDR q9, [x11], 16
+ FMLA v21.4s, v17.4s, v0.s[2]
+ FMLA v23.4s, v17.4s, v1.s[2]
+ FMLA v25.4s, v17.4s, v2.s[2]
+ LDR q10, [x12], 16
+ FMLA v27.4s, v17.4s, v3.s[2]
+ FMLA v29.4s, v17.4s, v4.s[2]
+ FMLA v31.4s, v17.4s, v5.s[2]
+ LDR q11, [x4], 16
+
+ FMLA v20.4s, v18.4s, v0.s[3]
+ FMLA v22.4s, v18.4s, v1.s[3]
+ FMLA v24.4s, v18.4s, v2.s[3]
+ LDP q12, q13, [x5], 32 // Load 4 B
+ FMLA v26.4s, v18.4s, v3.s[3]
+ FMLA v28.4s, v18.4s, v4.s[3]
+ FMLA v30.4s, v18.4s, v5.s[3]
+ LDP q14, q15, [x5], 32
+ FMLA v21.4s, v19.4s, v0.s[3]
+ FMLA v23.4s, v19.4s, v1.s[3]
+ FMLA v25.4s, v19.4s, v2.s[3]
+ LDP q16, q17, [x5], 32
+ FMLA v27.4s, v19.4s, v3.s[3]
+ FMLA v29.4s, v19.4s, v4.s[3]
+ FMLA v31.4s, v19.4s, v5.s[3]
+ LDP q18, q19, [x5], 32
+
+ # Second group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v6.s[0]
+ FMLA v22.4s, v12.4s, v7.s[0]
+ FMLA v24.4s, v12.4s, v8.s[0]
+ LDR q0, [x3], 16 // Load next 6 A
+ FMLA v26.4s, v12.4s, v9.s[0]
+ FMLA v28.4s, v12.4s, v10.s[0]
+ FMLA v30.4s, v12.4s, v11.s[0]
+ LDR q1, [x9], 16
+ FMLA v21.4s, v13.4s, v6.s[0]
+ FMLA v23.4s, v13.4s, v7.s[0]
+ FMLA v25.4s, v13.4s, v8.s[0]
+ LDR q2, [x10], 16
+ FMLA v27.4s, v13.4s, v9.s[0]
+ FMLA v29.4s, v13.4s, v10.s[0]
+ FMLA v31.4s, v13.4s, v11.s[0]
+ LDR q3, [x11], 16
+
+ FMLA v20.4s, v14.4s, v6.s[1]
+ FMLA v22.4s, v14.4s, v7.s[1]
+ FMLA v24.4s, v14.4s, v8.s[1]
+ LDR q4, [x12], 16
+ FMLA v26.4s, v14.4s, v9.s[1]
+ FMLA v28.4s, v14.4s, v10.s[1]
+ FMLA v30.4s, v14.4s, v11.s[1]
+ LDR q5, [x4], 16
+ FMLA v21.4s, v15.4s, v6.s[1]
+ FMLA v23.4s, v15.4s, v7.s[1]
+ FMLA v25.4s, v15.4s, v8.s[1]
+ LDP q12, q13, [x5], 32 // Load next 3 B (not last)
+ FMLA v27.4s, v15.4s, v9.s[1]
+ FMLA v29.4s, v15.4s, v10.s[1]
+ FMLA v31.4s, v15.4s, v11.s[1]
+ LDP q14, q15, [x5], 32
+
+ FMLA v20.4s, v16.4s, v6.s[2]
+ FMLA v22.4s, v16.4s, v7.s[2]
+ FMLA v24.4s, v16.4s, v8.s[2]
+ FMLA v26.4s, v16.4s, v9.s[2]
+ FMLA v28.4s, v16.4s, v10.s[2]
+ FMLA v30.4s, v16.4s, v11.s[2]
+ FMLA v21.4s, v17.4s, v6.s[2]
+ FMLA v23.4s, v17.4s, v7.s[2]
+ FMLA v25.4s, v17.4s, v8.s[2]
+ FMLA v27.4s, v17.4s, v9.s[2]
+ FMLA v29.4s, v17.4s, v10.s[2]
+ FMLA v31.4s, v17.4s, v11.s[2]
+ LDP q16, q17, [x5], 32
+
+ FMLA v20.4s, v18.4s, v6.s[3]
+ FMLA v22.4s, v18.4s, v7.s[3]
+ SUBS x0, x0, 32
+ FMLA v24.4s, v18.4s, v8.s[3]
+ FMLA v26.4s, v18.4s, v9.s[3]
+ FMLA v28.4s, v18.4s, v10.s[3]
+ FMLA v30.4s, v18.4s, v11.s[3]
+ FMLA v21.4s, v19.4s, v6.s[3]
+ FMLA v23.4s, v19.4s, v7.s[3]
+ FMLA v25.4s, v19.4s, v8.s[3]
+ FMLA v27.4s, v19.4s, v9.s[3]
+ FMLA v29.4s, v19.4s, v10.s[3]
+ FMLA v31.4s, v19.4s, v11.s[3]
+ B.HS 1b
+
+ # Epilogue - 8 floats of A (32 bytes)
+ # 96 FMA + 6 LDP A + 8 LDP B
+ # First block same as main loop. Second block has no preloads.
+2:
+ # First group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v0.s[0]
+ LDP q18, q19, [x5], 32 // Load last B
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+
+ FMLA v31.4s, v13.4s, v5.s[0]
+ FMLA v20.4s, v14.4s, v0.s[1]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 128] // Prefetch B
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 256]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ LDR q6, [x3], 16 // Load next 6 A
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+ LDR q7, [x9], 16
+
+ FMLA v20.4s, v16.4s, v0.s[2]
+ FMLA v22.4s, v16.4s, v1.s[2]
+ FMLA v24.4s, v16.4s, v2.s[2]
+ LDR q8, [x10], 16
+ FMLA v26.4s, v16.4s, v3.s[2]
+ FMLA v28.4s, v16.4s, v4.s[2]
+ FMLA v30.4s, v16.4s, v5.s[2]
+ LDR q9, [x11], 16
+ FMLA v21.4s, v17.4s, v0.s[2]
+ FMLA v23.4s, v17.4s, v1.s[2]
+ FMLA v25.4s, v17.4s, v2.s[2]
+ LDR q10, [x12], 16
+ FMLA v27.4s, v17.4s, v3.s[2]
+ FMLA v29.4s, v17.4s, v4.s[2]
+ FMLA v31.4s, v17.4s, v5.s[2]
+ LDR q11, [x4], 16
+
+ FMLA v20.4s, v18.4s, v0.s[3]
+ FMLA v22.4s, v18.4s, v1.s[3]
+ FMLA v24.4s, v18.4s, v2.s[3]
+ LDP q12, q13, [x5], 32 // Load 4 B
+ FMLA v26.4s, v18.4s, v3.s[3]
+ FMLA v28.4s, v18.4s, v4.s[3]
+ FMLA v30.4s, v18.4s, v5.s[3]
+ LDP q14, q15, [x5], 32
+ FMLA v21.4s, v19.4s, v0.s[3]
+ FMLA v23.4s, v19.4s, v1.s[3]
+ FMLA v25.4s, v19.4s, v2.s[3]
+ LDP q16, q17, [x5], 32
+ FMLA v27.4s, v19.4s, v3.s[3]
+ FMLA v29.4s, v19.4s, v4.s[3]
+ FMLA v31.4s, v19.4s, v5.s[3]
+ LDP q18, q19, [x5], 32
+
+ # Second group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v6.s[0]
+ FMLA v22.4s, v12.4s, v7.s[0]
+ FMLA v24.4s, v12.4s, v8.s[0]
+ FMLA v26.4s, v12.4s, v9.s[0]
+ FMLA v28.4s, v12.4s, v10.s[0]
+ FMLA v30.4s, v12.4s, v11.s[0]
+ FMLA v21.4s, v13.4s, v6.s[0]
+ FMLA v23.4s, v13.4s, v7.s[0]
+ FMLA v25.4s, v13.4s, v8.s[0]
+ FMLA v27.4s, v13.4s, v9.s[0]
+ FMLA v29.4s, v13.4s, v10.s[0]
+ FMLA v31.4s, v13.4s, v11.s[0]
+
+ FMLA v20.4s, v14.4s, v6.s[1]
+ FMLA v22.4s, v14.4s, v7.s[1]
+ FMLA v24.4s, v14.4s, v8.s[1]
+ FMLA v26.4s, v14.4s, v9.s[1]
+ FMLA v28.4s, v14.4s, v10.s[1]
+ FMLA v30.4s, v14.4s, v11.s[1]
+ FMLA v21.4s, v15.4s, v6.s[1]
+ FMLA v23.4s, v15.4s, v7.s[1]
+ FMLA v25.4s, v15.4s, v8.s[1]
+ FMLA v27.4s, v15.4s, v9.s[1]
+ FMLA v29.4s, v15.4s, v10.s[1]
+ FMLA v31.4s, v15.4s, v11.s[1]
+
+ FMLA v20.4s, v16.4s, v6.s[2]
+ FMLA v22.4s, v16.4s, v7.s[2]
+ FMLA v24.4s, v16.4s, v8.s[2]
+ FMLA v26.4s, v16.4s, v9.s[2]
+ FMLA v28.4s, v16.4s, v10.s[2]
+ FMLA v30.4s, v16.4s, v11.s[2]
+ FMLA v21.4s, v17.4s, v6.s[2]
+ FMLA v23.4s, v17.4s, v7.s[2]
+ FMLA v25.4s, v17.4s, v8.s[2]
+ FMLA v27.4s, v17.4s, v9.s[2]
+ FMLA v29.4s, v17.4s, v10.s[2]
+ FMLA v31.4s, v17.4s, v11.s[2]
+
+ FMLA v20.4s, v18.4s, v6.s[3]
+ FMLA v22.4s, v18.4s, v7.s[3]
+ FMLA v24.4s, v18.4s, v8.s[3]
+ FMLA v26.4s, v18.4s, v9.s[3]
+ FMLA v28.4s, v18.4s, v10.s[3]
+ FMLA v30.4s, v18.4s, v11.s[3]
+ FMLA v21.4s, v19.4s, v6.s[3]
+ FMLA v23.4s, v19.4s, v7.s[3]
+
+ # Load clamping_params values
+ LD2R {v6.4s, v7.4s}, [x8]
+
+ FMLA v25.4s, v19.4s, v8.s[3]
+ FMLA v27.4s, v19.4s, v9.s[3]
+ # Is there a remainder?- 4 floats of A (16 bytes) or less
+ TST x0, 31
+ FMLA v29.4s, v19.4s, v10.s[3]
+ FMLA v31.4s, v19.4s, v11.s[3]
+ B.NE 4f
+
+ # Clamp
+3:
+ FMIN v20.4s, v20.4s, v6.4s
+ SUBS x1, x1, 8
+ FMIN v21.4s, v21.4s, v6.4s
+ FMIN v22.4s, v22.4s, v6.4s
+ FMIN v23.4s, v23.4s, v6.4s
+ FMIN v24.4s, v24.4s, v6.4s
+ FMIN v25.4s, v25.4s, v6.4s
+ FMIN v26.4s, v26.4s, v6.4s
+ FMIN v27.4s, v27.4s, v6.4s
+ FMIN v28.4s, v28.4s, v6.4s
+ FMIN v29.4s, v29.4s, v6.4s
+ FMIN v30.4s, v30.4s, v6.4s
+ FMIN v31.4s, v31.4s, v6.4s
+ # Load cn_stride
+ LDR x0, [sp, 64]
+ FMAX v20.4s, v20.4s, v7.4s
+ FMAX v21.4s, v21.4s, v7.4s
+ FMAX v22.4s, v22.4s, v7.4s
+ FMAX v23.4s, v23.4s, v7.4s
+ FMAX v24.4s, v24.4s, v7.4s
+ FMAX v25.4s, v25.4s, v7.4s
+ FMAX v26.4s, v26.4s, v7.4s
+ FMAX v27.4s, v27.4s, v7.4s
+ FMAX v28.4s, v28.4s, v7.4s
+ FMAX v29.4s, v29.4s, v7.4s
+ FMAX v30.4s, v30.4s, v7.4s
+ FMAX v31.4s, v31.4s, v7.4s
+
+ # Store full 6 x 8
+ B.LO 7f
+
+ $if INC:
+ STP q30, q31, [x7]
+ ADD x7, x7, x0
+ SUB x3, x3, x2 // a0 -= kc
+ STP q28, q29, [x13]
+ ADD x13, x13, x0
+ SUB x9, x9, x2 // a1 -= kc
+ STP q26, q27, [x14]
+ ADD x14, x14, x0
+ SUB x10, x10, x2 // a2 -= kc
+ STP q24, q25, [x17]
+ ADD x17, x17, x0
+ SUB x11, x11, x2 // a3 -= kc
+ STP q22, q23, [x16]
+ ADD x16, x16, x0
+ SUB x12, x12, x2 // a4 -= kc
+ STP q20, q21, [x6]
+ ADD x6, x6, x0
+ SUB x4, x4, x2 // a5 -= kc
+ $else:
+ STP q20, q21, [x6]
+ ADD x6, x6, x0
+ SUB x3, x3, x2 // a0 -= kc
+ STP q22, q23, [x16]
+ ADD x16, x16, x0
+ SUB x9, x9, x2 // a1 -= kc
+ STP q24, q25, [x17]
+ ADD x17, x17, x0
+ SUB x10, x10, x2 // a2 -= kc
+ STP q26, q27, [x14]
+ ADD x14, x14, x0
+ SUB x11, x11, x2 // a3 -= kc
+ STP q28, q29, [x13]
+ ADD x13, x13, x0
+ SUB x12, x12, x2 // a4 -= kc
+ STP q30, q31, [x7]
+ ADD x7, x7, x0
+ SUB x4, x4, x2 // a5 -= kc
+
+ B.HI 0b
+
+ # Restore d8-d15 from stack
+ LDP d14, d15, [sp, 48]
+ LDP d12, d13, [sp, 32]
+ LDP d10, d11, [sp, 16]
+ LDP d8, d9, [sp], 64
+ RET
+
+4:
+ # Load clamping_params values
+ LD2R {v6.4s, v7.4s}, [x8]
+
+ # Is there a remainder?- 4 floats of A (16 bytes)
+ TBZ x0, 4, 5f
+
+ # Remainder- 4 floats of A (16 bytes)
+ # Load A
+ LDR q0, [x3], 16
+ LDR q1, [x9], 16
+ LDR q2, [x10], 16
+ LDR q3, [x11], 16
+ LDR q4, [x12], 16
+ LDR q5, [x4], 16
+ # Load B
+ LDP q12, q13, [x5], 32
+ LDP q14, q15, [x5], 32
+ LDP q16, q17, [x5], 32
+ LDP q18, q19, [x5], 32
+
+ FMLA v20.4s, v12.4s, v0.s[0]
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+ FMLA v31.4s, v13.4s, v5.s[0]
+
+ FMLA v20.4s, v14.4s, v0.s[1]
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+
+ FMLA v20.4s, v16.4s, v0.s[2]
+ FMLA v22.4s, v16.4s, v1.s[2]
+ FMLA v24.4s, v16.4s, v2.s[2]
+ FMLA v26.4s, v16.4s, v3.s[2]
+ FMLA v28.4s, v16.4s, v4.s[2]
+ FMLA v30.4s, v16.4s, v5.s[2]
+ FMLA v21.4s, v17.4s, v0.s[2]
+ FMLA v23.4s, v17.4s, v1.s[2]
+ FMLA v25.4s, v17.4s, v2.s[2]
+ FMLA v27.4s, v17.4s, v3.s[2]
+ FMLA v29.4s, v17.4s, v4.s[2]
+ FMLA v31.4s, v17.4s, v5.s[2]
+
+ FMLA v20.4s, v18.4s, v0.s[3]
+ FMLA v22.4s, v18.4s, v1.s[3]
+ FMLA v24.4s, v18.4s, v2.s[3]
+ FMLA v26.4s, v18.4s, v3.s[3]
+ FMLA v28.4s, v18.4s, v4.s[3]
+ FMLA v30.4s, v18.4s, v5.s[3]
+ FMLA v21.4s, v19.4s, v0.s[3]
+ FMLA v23.4s, v19.4s, v1.s[3]
+ FMLA v25.4s, v19.4s, v2.s[3]
+ FMLA v27.4s, v19.4s, v3.s[3]
+ FMLA v29.4s, v19.4s, v4.s[3]
+ FMLA v31.4s, v19.4s, v5.s[3]
+
+ # Is there a remainder?- 2 floats of A (8 bytes)
+5:
+ TBZ x0, 3, 6f
+
+ # Remainder- 2 floats of A (8 bytes)
+ # Load A
+ LDR d0, [x3], 8
+ LDR d1, [x9], 8
+ LDR d2, [x10], 8
+ LDR d3, [x11], 8
+ LDR d4, [x12], 8
+ LDR d5, [x4], 8
+ # Load B
+ LDP q12, q13, [x5], 32
+ LDP q14, q15, [x5], 32
+
+ FMLA v20.4s, v12.4s, v0.s[0]
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+ FMLA v31.4s, v13.4s, v5.s[0]
+
+ FMLA v20.4s, v14.4s, v0.s[1]
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+
+ # Is there a remainder?- 1 float of A (4 bytes)
+6:
+ TBZ x0, 2, 3b
+
+ # Remainder- 1 float of A (4 bytes)
+ # Load A
+ LDR s0, [x3], 4
+ LDR s1, [x9], 4
+ LDR s2, [x10], 4
+ LDR s3, [x11], 4
+ LDR s4, [x12], 4
+ LDR s5, [x4], 4
+ # Load B
+ LDP q12, q13, [x5], 32
+
+ FMLA v20.4s, v12.4s, v0.s[0]
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+ FMLA v31.4s, v13.4s, v5.s[0]
+ B 3b
+
+ # Store odd width
+7:
+ TBZ x1, 2, 8f
+ $if INC:
+ STR q30, [x7], 16
+ MOV v30.16b, v31.16b
+ STR q28, [x13], 16
+ MOV v28.16b, v29.16b
+ STR q26, [x14], 16
+ MOV v26.16b, v27.16b
+ STR q24, [x17], 16
+ MOV v24.16b, v25.16b
+ STR q22, [x16], 16
+ MOV v22.16b, v23.16b
+ STR q20, [x6], 16
+ MOV v20.16b, v21.16b
+ $else:
+ STR q20, [x6], 16
+ MOV v20.16b, v21.16b
+ STR q22, [x16], 16
+ MOV v22.16b, v23.16b
+ STR q24, [x17], 16
+ MOV v24.16b, v25.16b
+ STR q26, [x14], 16
+ MOV v26.16b, v27.16b
+ STR q28, [x13], 16
+ MOV v28.16b, v29.16b
+ STR q30, [x7], 16
+ MOV v30.16b, v31.16b
+8:
+ TBZ x1, 1, 9f
+ $if INC:
+ STR d30, [x7], 8
+ DUP d30, v30.d[1]
+ STR d28, [x13], 8
+ DUP d28, v28.d[1]
+ STR d26, [x14], 8
+ DUP d26, v26.d[1]
+ STR d24, [x17], 8
+ DUP d24, v24.d[1]
+ STR d22, [x16], 8
+ DUP d22, v22.d[1]
+ STR d20, [x6], 8
+ DUP d20, v20.d[1]
+ $else:
+ STR d20, [x6], 8
+ DUP d20, v20.d[1]
+ STR d22, [x16], 8
+ DUP d22, v22.d[1]
+ STR d24, [x17], 8
+ DUP d24, v24.d[1]
+ STR d26, [x14], 8
+ DUP d26, v26.d[1]
+ STR d28, [x13], 8
+ DUP d28, v28.d[1]
+ STR d30, [x7], 8
+ DUP d30, v30.d[1]
+
+9:
+ TBZ x1, 0, 10f
+ $if INC:
+ STR s30, [x7]
+ STR s28, [x13]
+ STR s26, [x14]
+ STR s24, [x17]
+ STR s22, [x16]
+ STR s20, [x6]
+ $else:
+ STR s20, [x6]
+ STR s22, [x16]
+ STR s24, [x17]
+ STR s26, [x14]
+ STR s28, [x13]
+ STR s30, [x7]
+10:
+ # Restore d8-d15 from stack
+ LDP d14, d15, [sp, 48]
+ LDP d12, d13, [sp, 32]
+ LDP d10, d11, [sp, 16]
+ LDP d8, d9, [sp], 64
+ RET
+
+END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ios
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ios.S b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ios.S
new file mode 100644
index 0000000..9a7eab7
--- /dev/null
+++ b/src/f32-gemm/gen-inc/6x8-aarch64-neonfma-ios.S
@@ -0,0 +1,646 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/6x8-aarch64-neonfma-ios.S.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+# void xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const uint8_t*restrict a, x3
+# size_t a_stride, x4
+# const void*restrict w, x5
+# uint8_t*restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> (x0)
+# const float*restrict acc, [sp + 8] -> x15
+# const union xnn_f32_output_params params[restrict static 1]) [sp + 16] -> x8
+
+# d8-d15 need to be preserved if used.
+# x19-30 need to be preserved if used.
+
+# A pointers
+# x3 a0
+# x9 a1
+# x10 a2
+# x11 a3
+# x12 a4
+# x4 a5
+
+# C pointers
+# x6 c0
+# x16 c1
+# x17 c2
+# x14 c3
+# x13 c4
+# x7 c5
+
+# Vector register usage
+# A0 v0 v6
+# A1 v1 v7
+# A2 v2 v8
+# A3 v3 v9
+# A4 v4 v10
+# A5 v5 v11
+# B v12 v13 v14 v15
+# B v16 v17 v18 v19
+# C v20 v21
+# C v22 v23
+# C v24 v25
+# C v26 v27
+# C v28 v29
+# C v30 v31
+# Clamp v6 v7
+
+# IOS microkernel is based on Cortex-A75 kernel but avoids X18 by
+# using X14 instead of X18, and reloading cn_stride into x0.
+
+BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios
+
+ # Clamp A and C pointers / Save d8-d15 on stack
+ STP d8, d9, [sp, -64]!
+ CMP x0, 2 // if mr < 2
+ ADD x9, x3, x4 // a1 = a0 + a_stride
+ ADD x16, x6, x7 // c1 = c0 + cm_stride
+ CSEL x9, x3, x9, LO // a1 = a0
+ CSEL x16, x6, x16, LO // c1 = c0
+
+ STP d10, d11, [sp, 16]
+ ADD x10, x9, x4 // a2 = a1 + a_stride
+ ADD x17, x16, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ CSEL x10, x9, x10, LS // a2 = a1
+ CSEL x17, x16, x17, LS // c2 = c1
+
+ STP d12, d13, [sp, 32]
+ CMP x0, 4 // if mr < 4
+ ADD x11, x10, x4 // a3 = a2 + a_stride
+ ADD x14, x17, x7 // c3 = c2 + cm_stride
+ CSEL x11, x10, x11, LO // a3 = a2
+ CSEL x14, x17, x14, LO // c3 = c2
+
+ STP d14, d15, [sp, 48]
+ ADD x12, x11, x4 // a4 = a3 + a_stride
+ ADD x13, x14, x7 // c4 = c3 + cm_stride
+  // if mr <= 4
+ CSEL x12, x11, x12, LS // a4 = a3
+ CSEL x13, x14, x13, LS // c4 = c3
+
+ # Load acc, params pointer
+ LDP x15, x8, [sp, 72]
+
+ CMP x0, 6 // if mr < 6
+ ADD x4, x12, x4 // a5 = a4 + a_stride
+ ADD x7, x13, x7 // c5 = c4 + cm_stride
+ CSEL x4, x12, x4, LO // a5 = a4
+ CSEL x7, x13, x7, LO // c5 = c4
+
+0:
+ # Load initial accumulators
+ LDP q20, q21, [x15], 32
+ LDP q22, q23, [x15], 32
+ LDP q24, q25, [x15], 32
+ LDP q26, q27, [x15], 32
+ LDP q28, q29, [x15], 32
+ LDP q30, q31, [x15], 32
+
+ # Is there at least 8 floats (32 bytes) for prologue + epilogue?
+ SUBS x0, x2, 32 // k = kc - 32
+ B.LO 4f
+
+ # Prologue - loads for main loop of 96 FMA
+ LDR q0, [x3], 16
+ LDR q1, [x9], 16
+ LDR q2, [x10], 16
+ LDR q3, [x11], 16
+ LDR q4, [x12], 16
+ LDR q5, [x4], 16
+ LDP q12, q13, [x5], 32 // Fetch 3 B (4th deferred)
+ LDP q14, q15, [x5], 32
+ LDP q16, q17, [x5], 32
+
+ # Is there at least 8 floats (32 bytes) for main loop?
+ SUBS x0, x0, 32
+ B.LO 2f
+
+ # Main loop - 8 floats of A (32 bytes)
+ # 96 FMA + 6 LDP A + 8 LDP B
+1:
+ # First group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v0.s[0]
+ LDP q18, q19, [x5], 32 // Load last B
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+
+ FMLA v31.4s, v13.4s, v5.s[0]
+ FMLA v20.4s, v14.4s, v0.s[1]
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ LDR q6, [x3], 16 // Load next 6 A
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+ LDR q7, [x9], 16
+
+ FMLA v20.4s, v16.4s, v0.s[2]
+ FMLA v22.4s, v16.4s, v1.s[2]
+ FMLA v24.4s, v16.4s, v2.s[2]
+ LDR q8, [x10], 16
+ FMLA v26.4s, v16.4s, v3.s[2]
+ FMLA v28.4s, v16.4s, v4.s[2]
+ FMLA v30.4s, v16.4s, v5.s[2]
+ LDR q9, [x11], 16
+ FMLA v21.4s, v17.4s, v0.s[2]
+ FMLA v23.4s, v17.4s, v1.s[2]
+ FMLA v25.4s, v17.4s, v2.s[2]
+ LDR q10, [x12], 16
+ FMLA v27.4s, v17.4s, v3.s[2]
+ FMLA v29.4s, v17.4s, v4.s[2]
+ FMLA v31.4s, v17.4s, v5.s[2]
+ LDR q11, [x4], 16
+
+ FMLA v20.4s, v18.4s, v0.s[3]
+ FMLA v22.4s, v18.4s, v1.s[3]
+ FMLA v24.4s, v18.4s, v2.s[3]
+ LDP q12, q13, [x5], 32 // Load 4 B
+ FMLA v26.4s, v18.4s, v3.s[3]
+ FMLA v28.4s, v18.4s, v4.s[3]
+ FMLA v30.4s, v18.4s, v5.s[3]
+ LDP q14, q15, [x5], 32
+ FMLA v21.4s, v19.4s, v0.s[3]
+ FMLA v23.4s, v19.4s, v1.s[3]
+ FMLA v25.4s, v19.4s, v2.s[3]
+ LDP q16, q17, [x5], 32
+ FMLA v27.4s, v19.4s, v3.s[3]
+ FMLA v29.4s, v19.4s, v4.s[3]
+ FMLA v31.4s, v19.4s, v5.s[3]
+ LDP q18, q19, [x5], 32
+
+ # Second group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v6.s[0]
+ FMLA v22.4s, v12.4s, v7.s[0]
+ FMLA v24.4s, v12.4s, v8.s[0]
+ LDR q0, [x3], 16 // Load next 6 A
+ FMLA v26.4s, v12.4s, v9.s[0]
+ FMLA v28.4s, v12.4s, v10.s[0]
+ FMLA v30.4s, v12.4s, v11.s[0]
+ LDR q1, [x9], 16
+ FMLA v21.4s, v13.4s, v6.s[0]
+ FMLA v23.4s, v13.4s, v7.s[0]
+ FMLA v25.4s, v13.4s, v8.s[0]
+ LDR q2, [x10], 16
+ FMLA v27.4s, v13.4s, v9.s[0]
+ FMLA v29.4s, v13.4s, v10.s[0]
+ FMLA v31.4s, v13.4s, v11.s[0]
+ LDR q3, [x11], 16
+
+ FMLA v20.4s, v14.4s, v6.s[1]
+ FMLA v22.4s, v14.4s, v7.s[1]
+ FMLA v24.4s, v14.4s, v8.s[1]
+ LDR q4, [x12], 16
+ FMLA v26.4s, v14.4s, v9.s[1]
+ FMLA v28.4s, v14.4s, v10.s[1]
+ FMLA v30.4s, v14.4s, v11.s[1]
+ LDR q5, [x4], 16
+ FMLA v21.4s, v15.4s, v6.s[1]
+ FMLA v23.4s, v15.4s, v7.s[1]
+ FMLA v25.4s, v15.4s, v8.s[1]
+ LDP q12, q13, [x5], 32 // Load next 3 B (not last)
+ FMLA v27.4s, v15.4s, v9.s[1]
+ FMLA v29.4s, v15.4s, v10.s[1]
+ FMLA v31.4s, v15.4s, v11.s[1]
+ LDP q14, q15, [x5], 32
+
+ FMLA v20.4s, v16.4s, v6.s[2]
+ FMLA v22.4s, v16.4s, v7.s[2]
+ FMLA v24.4s, v16.4s, v8.s[2]
+ FMLA v26.4s, v16.4s, v9.s[2]
+ FMLA v28.4s, v16.4s, v10.s[2]
+ FMLA v30.4s, v16.4s, v11.s[2]
+ FMLA v21.4s, v17.4s, v6.s[2]
+ FMLA v23.4s, v17.4s, v7.s[2]
+ FMLA v25.4s, v17.4s, v8.s[2]
+ FMLA v27.4s, v17.4s, v9.s[2]
+ FMLA v29.4s, v17.4s, v10.s[2]
+ FMLA v31.4s, v17.4s, v11.s[2]
+ LDP q16, q17, [x5], 32
+
+ FMLA v20.4s, v18.4s, v6.s[3]
+ FMLA v22.4s, v18.4s, v7.s[3]
+ SUBS x0, x0, 32
+ FMLA v24.4s, v18.4s, v8.s[3]
+ FMLA v26.4s, v18.4s, v9.s[3]
+ FMLA v28.4s, v18.4s, v10.s[3]
+ FMLA v30.4s, v18.4s, v11.s[3]
+ FMLA v21.4s, v19.4s, v6.s[3]
+ FMLA v23.4s, v19.4s, v7.s[3]
+ FMLA v25.4s, v19.4s, v8.s[3]
+ FMLA v27.4s, v19.4s, v9.s[3]
+ FMLA v29.4s, v19.4s, v10.s[3]
+ FMLA v31.4s, v19.4s, v11.s[3]
+ B.HS 1b
+
+ # Epilogue - 8 floats of A (32 bytes)
+ # 96 FMA + 6 LDP A + 8 LDP B
+ # First block same as main loop. Second block has no preloads.
+2:
+ # First group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v0.s[0]
+ LDP q18, q19, [x5], 32 // Load last B
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+
+ FMLA v31.4s, v13.4s, v5.s[0]
+ FMLA v20.4s, v14.4s, v0.s[1]
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ LDR q6, [x3], 16 // Load next 6 A
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+ LDR q7, [x9], 16
+
+ FMLA v20.4s, v16.4s, v0.s[2]
+ FMLA v22.4s, v16.4s, v1.s[2]
+ FMLA v24.4s, v16.4s, v2.s[2]
+ LDR q8, [x10], 16
+ FMLA v26.4s, v16.4s, v3.s[2]
+ FMLA v28.4s, v16.4s, v4.s[2]
+ FMLA v30.4s, v16.4s, v5.s[2]
+ LDR q9, [x11], 16
+ FMLA v21.4s, v17.4s, v0.s[2]
+ FMLA v23.4s, v17.4s, v1.s[2]
+ FMLA v25.4s, v17.4s, v2.s[2]
+ LDR q10, [x12], 16
+ FMLA v27.4s, v17.4s, v3.s[2]
+ FMLA v29.4s, v17.4s, v4.s[2]
+ FMLA v31.4s, v17.4s, v5.s[2]
+ LDR q11, [x4], 16
+
+ FMLA v20.4s, v18.4s, v0.s[3]
+ FMLA v22.4s, v18.4s, v1.s[3]
+ FMLA v24.4s, v18.4s, v2.s[3]
+ LDP q12, q13, [x5], 32 // Load 4 B
+ FMLA v26.4s, v18.4s, v3.s[3]
+ FMLA v28.4s, v18.4s, v4.s[3]
+ FMLA v30.4s, v18.4s, v5.s[3]
+ LDP q14, q15, [x5], 32
+ FMLA v21.4s, v19.4s, v0.s[3]
+ FMLA v23.4s, v19.4s, v1.s[3]
+ FMLA v25.4s, v19.4s, v2.s[3]
+ LDP q16, q17, [x5], 32
+ FMLA v27.4s, v19.4s, v3.s[3]
+ FMLA v29.4s, v19.4s, v4.s[3]
+ FMLA v31.4s, v19.4s, v5.s[3]
+ LDP q18, q19, [x5], 32
+
+ # Second group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v6.s[0]
+ FMLA v22.4s, v12.4s, v7.s[0]
+ FMLA v24.4s, v12.4s, v8.s[0]
+ FMLA v26.4s, v12.4s, v9.s[0]
+ FMLA v28.4s, v12.4s, v10.s[0]
+ FMLA v30.4s, v12.4s, v11.s[0]
+ FMLA v21.4s, v13.4s, v6.s[0]
+ FMLA v23.4s, v13.4s, v7.s[0]
+ FMLA v25.4s, v13.4s, v8.s[0]
+ FMLA v27.4s, v13.4s, v9.s[0]
+ FMLA v29.4s, v13.4s, v10.s[0]
+ FMLA v31.4s, v13.4s, v11.s[0]
+
+ FMLA v20.4s, v14.4s, v6.s[1]
+ FMLA v22.4s, v14.4s, v7.s[1]
+ FMLA v24.4s, v14.4s, v8.s[1]
+ FMLA v26.4s, v14.4s, v9.s[1]
+ FMLA v28.4s, v14.4s, v10.s[1]
+ FMLA v30.4s, v14.4s, v11.s[1]
+ FMLA v21.4s, v15.4s, v6.s[1]
+ FMLA v23.4s, v15.4s, v7.s[1]
+ FMLA v25.4s, v15.4s, v8.s[1]
+ FMLA v27.4s, v15.4s, v9.s[1]
+ FMLA v29.4s, v15.4s, v10.s[1]
+ FMLA v31.4s, v15.4s, v11.s[1]
+
+ FMLA v20.4s, v16.4s, v6.s[2]
+ FMLA v22.4s, v16.4s, v7.s[2]
+ FMLA v24.4s, v16.4s, v8.s[2]
+ FMLA v26.4s, v16.4s, v9.s[2]
+ FMLA v28.4s, v16.4s, v10.s[2]
+ FMLA v30.4s, v16.4s, v11.s[2]
+ FMLA v21.4s, v17.4s, v6.s[2]
+ FMLA v23.4s, v17.4s, v7.s[2]
+ FMLA v25.4s, v17.4s, v8.s[2]
+ FMLA v27.4s, v17.4s, v9.s[2]
+ FMLA v29.4s, v17.4s, v10.s[2]
+ FMLA v31.4s, v17.4s, v11.s[2]
+
+ FMLA v20.4s, v18.4s, v6.s[3]
+ FMLA v22.4s, v18.4s, v7.s[3]
+ FMLA v24.4s, v18.4s, v8.s[3]
+ FMLA v26.4s, v18.4s, v9.s[3]
+ FMLA v28.4s, v18.4s, v10.s[3]
+ FMLA v30.4s, v18.4s, v11.s[3]
+ FMLA v21.4s, v19.4s, v6.s[3]
+ FMLA v23.4s, v19.4s, v7.s[3]
+
+ # Load clamping_params values
+ LD2R {v6.4s, v7.4s}, [x8]
+
+ FMLA v25.4s, v19.4s, v8.s[3]
+ FMLA v27.4s, v19.4s, v9.s[3]
+ # Is there a remainder?- 4 floats of A (16 bytes) or less
+ TST x0, 31
+ FMLA v29.4s, v19.4s, v10.s[3]
+ FMLA v31.4s, v19.4s, v11.s[3]
+ B.NE 4f
+
+ # Clamp
+3:
+ FMIN v20.4s, v20.4s, v6.4s
+ SUBS x1, x1, 8
+ FMIN v21.4s, v21.4s, v6.4s
+ FMIN v22.4s, v22.4s, v6.4s
+ FMIN v23.4s, v23.4s, v6.4s
+ FMIN v24.4s, v24.4s, v6.4s
+ FMIN v25.4s, v25.4s, v6.4s
+ FMIN v26.4s, v26.4s, v6.4s
+ FMIN v27.4s, v27.4s, v6.4s
+ FMIN v28.4s, v28.4s, v6.4s
+ FMIN v29.4s, v29.4s, v6.4s
+ FMIN v30.4s, v30.4s, v6.4s
+ FMIN v31.4s, v31.4s, v6.4s
+ # Load cn_stride
+ LDR x0, [sp, 64]
+ FMAX v20.4s, v20.4s, v7.4s
+ FMAX v21.4s, v21.4s, v7.4s
+ FMAX v22.4s, v22.4s, v7.4s
+ FMAX v23.4s, v23.4s, v7.4s
+ FMAX v24.4s, v24.4s, v7.4s
+ FMAX v25.4s, v25.4s, v7.4s
+ FMAX v26.4s, v26.4s, v7.4s
+ FMAX v27.4s, v27.4s, v7.4s
+ FMAX v28.4s, v28.4s, v7.4s
+ FMAX v29.4s, v29.4s, v7.4s
+ FMAX v30.4s, v30.4s, v7.4s
+ FMAX v31.4s, v31.4s, v7.4s
+
+ # Store full 6 x 8
+ B.LO 7f
+
+ STP q30, q31, [x7]
+ ADD x7, x7, x0
+ SUB x3, x3, x2 // a0 -= kc
+ STP q28, q29, [x13]
+ ADD x13, x13, x0
+ SUB x9, x9, x2 // a1 -= kc
+ STP q26, q27, [x14]
+ ADD x14, x14, x0
+ SUB x10, x10, x2 // a2 -= kc
+ STP q24, q25, [x17]
+ ADD x17, x17, x0
+ SUB x11, x11, x2 // a3 -= kc
+ STP q22, q23, [x16]
+ ADD x16, x16, x0
+ SUB x12, x12, x2 // a4 -= kc
+ STP q20, q21, [x6]
+ ADD x6, x6, x0
+ SUB x4, x4, x2 // a5 -= kc
+
+ B.HI 0b
+
+ # Restore d8-d15 from stack
+ LDP d14, d15, [sp, 48]
+ LDP d12, d13, [sp, 32]
+ LDP d10, d11, [sp, 16]
+ LDP d8, d9, [sp], 64
+ RET
+
+4:
+ # Load clamping_params values
+ LD2R {v6.4s, v7.4s}, [x8]
+
+ # Is there a remainder?- 4 floats of A (16 bytes)
+ TBZ x0, 4, 5f
+
+ # Remainder- 4 floats of A (16 bytes)
+ # Load A
+ LDR q0, [x3], 16
+ LDR q1, [x9], 16
+ LDR q2, [x10], 16
+ LDR q3, [x11], 16
+ LDR q4, [x12], 16
+ LDR q5, [x4], 16
+ # Load B
+ LDP q12, q13, [x5], 32
+ LDP q14, q15, [x5], 32
+ LDP q16, q17, [x5], 32
+ LDP q18, q19, [x5], 32
+
+ FMLA v20.4s, v12.4s, v0.s[0]
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+ FMLA v31.4s, v13.4s, v5.s[0]
+
+ FMLA v20.4s, v14.4s, v0.s[1]
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+
+ FMLA v20.4s, v16.4s, v0.s[2]
+ FMLA v22.4s, v16.4s, v1.s[2]
+ FMLA v24.4s, v16.4s, v2.s[2]
+ FMLA v26.4s, v16.4s, v3.s[2]
+ FMLA v28.4s, v16.4s, v4.s[2]
+ FMLA v30.4s, v16.4s, v5.s[2]
+ FMLA v21.4s, v17.4s, v0.s[2]
+ FMLA v23.4s, v17.4s, v1.s[2]
+ FMLA v25.4s, v17.4s, v2.s[2]
+ FMLA v27.4s, v17.4s, v3.s[2]
+ FMLA v29.4s, v17.4s, v4.s[2]
+ FMLA v31.4s, v17.4s, v5.s[2]
+
+ FMLA v20.4s, v18.4s, v0.s[3]
+ FMLA v22.4s, v18.4s, v1.s[3]
+ FMLA v24.4s, v18.4s, v2.s[3]
+ FMLA v26.4s, v18.4s, v3.s[3]
+ FMLA v28.4s, v18.4s, v4.s[3]
+ FMLA v30.4s, v18.4s, v5.s[3]
+ FMLA v21.4s, v19.4s, v0.s[3]
+ FMLA v23.4s, v19.4s, v1.s[3]
+ FMLA v25.4s, v19.4s, v2.s[3]
+ FMLA v27.4s, v19.4s, v3.s[3]
+ FMLA v29.4s, v19.4s, v4.s[3]
+ FMLA v31.4s, v19.4s, v5.s[3]
+
+ # Is there a remainder?- 2 floats of A (8 bytes)
+5:
+ TBZ x0, 3, 6f
+
+ # Remainder- 2 floats of A (8 bytes)
+ # Load A
+ LDR d0, [x3], 8
+ LDR d1, [x9], 8
+ LDR d2, [x10], 8
+ LDR d3, [x11], 8
+ LDR d4, [x12], 8
+ LDR d5, [x4], 8
+ # Load B
+ LDP q12, q13, [x5], 32
+ LDP q14, q15, [x5], 32
+
+ FMLA v20.4s, v12.4s, v0.s[0]
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+ FMLA v31.4s, v13.4s, v5.s[0]
+
+ FMLA v20.4s, v14.4s, v0.s[1]
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+
+ # Is there a remainder?- 1 float of A (4 bytes)
+6:
+ TBZ x0, 2, 3b
+
+ # Remainder- 1 float of A (4 bytes)
+ # Load A
+ LDR s0, [x3], 4
+ LDR s1, [x9], 4
+ LDR s2, [x10], 4
+ LDR s3, [x11], 4
+ LDR s4, [x12], 4
+ LDR s5, [x4], 4
+ # Load B
+ LDP q12, q13, [x5], 32
+
+ FMLA v20.4s, v12.4s, v0.s[0]
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+ FMLA v31.4s, v13.4s, v5.s[0]
+ B 3b
+
+ # Store odd width
+7:
+ TBZ x1, 2, 8f
+ STR q30, [x7], 16
+ MOV v30.16b, v31.16b
+ STR q28, [x13], 16
+ MOV v28.16b, v29.16b
+ STR q26, [x14], 16
+ MOV v26.16b, v27.16b
+ STR q24, [x17], 16
+ MOV v24.16b, v25.16b
+ STR q22, [x16], 16
+ MOV v22.16b, v23.16b
+ STR q20, [x6], 16
+ MOV v20.16b, v21.16b
+8:
+ TBZ x1, 1, 9f
+ STR d30, [x7], 8
+ DUP d30, v30.d[1]
+ STR d28, [x13], 8
+ DUP d28, v28.d[1]
+ STR d26, [x14], 8
+ DUP d26, v26.d[1]
+ STR d24, [x17], 8
+ DUP d24, v24.d[1]
+ STR d22, [x16], 8
+ DUP d22, v22.d[1]
+ STR d20, [x6], 8
+ DUP d20, v20.d[1]
+
+9:
+ TBZ x1, 0, 10f
+ STR s30, [x7]
+ STR s28, [x13]
+ STR s26, [x14]
+ STR s24, [x17]
+ STR s22, [x16]
+ STR s20, [x6]
+10:
+ # Restore d8-d15 from stack
+ LDP d14, d15, [sp, 48]
+ LDP d12, d13, [sp, 32]
+ LDP d10, d11, [sp, 16]
+ LDP d8, d9, [sp], 64
+ RET
+
+END_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/f32-gemm/gen/6x8-aarch64-neonfma-ios.S b/src/f32-gemm/gen/6x8-aarch64-neonfma-ios.S
new file mode 100644
index 0000000..aa7b950
--- /dev/null
+++ b/src/f32-gemm/gen/6x8-aarch64-neonfma-ios.S
@@ -0,0 +1,650 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-gemm/6x8-aarch64-neonfma-ios.S.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+# void xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const uint8_t*restrict a, x3
+# size_t a_stride, x4
+# const void*restrict w, x5
+# uint8_t*restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> (x0)
+# const union xnn_f32_output_params params[restrict static 1]) [sp + 8] -> x8
+
+# d8-d15 need to be preserved if used.
+# x19-30 need to be preserved if used.
+
+# A pointers
+# x3 a0
+# x9 a1
+# x10 a2
+# x11 a3
+# x12 a4
+# x4 a5
+
+# C pointers
+# x6 c0
+# x16 c1
+# x17 c2
+# x14 c3
+# x13 c4
+# x7 c5
+
+# Vector register usage
+# A0 v0 v6
+# A1 v1 v7
+# A2 v2 v8
+# A3 v3 v9
+# A4 v4 v10
+# A5 v5 v11
+# B v12 v13 v14 v15
+# B v16 v17 v18 v19
+# C v20 v21
+# C v22 v23
+# C v24 v25
+# C v26 v27
+# C v28 v29
+# C v30 v31
+# Clamp v6 v7
+
+# IOS microkernel is based on Cortex-A75 kernel but avoids X18 by
+# using X14 instead of X18, and reloading cn_stride into x0.
+
+BEGIN_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios
+
+ # Clamp A and C pointers / Save d8-d15 on stack
+ STP d8, d9, [sp, -64]!
+ CMP x0, 2 // if mr < 2
+ ADD x9, x3, x4 // a1 = a0 + a_stride
+ ADD x16, x6, x7 // c1 = c0 + cm_stride
+ CSEL x9, x3, x9, LO // a1 = a0
+ CSEL x16, x6, x16, LO // c1 = c0
+
+ STP d10, d11, [sp, 16]
+ ADD x10, x9, x4 // a2 = a1 + a_stride
+ ADD x17, x16, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ CSEL x10, x9, x10, LS // a2 = a1
+ CSEL x17, x16, x17, LS // c2 = c1
+
+ STP d12, d13, [sp, 32]
+ CMP x0, 4 // if mr < 4
+ ADD x11, x10, x4 // a3 = a2 + a_stride
+ ADD x14, x17, x7 // c3 = c2 + cm_stride
+ CSEL x11, x10, x11, LO // a3 = a2
+ CSEL x14, x17, x14, LO // c3 = c2
+
+ STP d14, d15, [sp, 48]
+ ADD x12, x11, x4 // a4 = a3 + a_stride
+ ADD x13, x14, x7 // c4 = c3 + cm_stride
+  // if mr <= 4
+ CSEL x12, x11, x12, LS // a4 = a3
+ CSEL x13, x14, x13, LS // c4 = c3
+
+ # Load params pointer
+ LDR x8, [sp, 72]
+
+ CMP x0, 6 // if mr < 6
+ ADD x4, x12, x4 // a5 = a4 + a_stride
+ ADD x7, x13, x7 // c5 = c4 + cm_stride
+ CSEL x4, x12, x4, LO // a5 = a4
+ CSEL x7, x13, x7, LO // c5 = c4
+
+0:
+ # Load initial bias from w into accumulators
+ LDP q20, q21, [x5], 32
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v21.16b
+ MOV v24.16b, v20.16b
+ MOV v25.16b, v21.16b
+ MOV v26.16b, v20.16b
+ MOV v27.16b, v21.16b
+ MOV v28.16b, v20.16b
+ MOV v29.16b, v21.16b
+ MOV v30.16b, v20.16b
+ MOV v31.16b, v21.16b
+
+ # Is there at least 8 floats (32 bytes) for prologue + epilogue?
+ SUBS x0, x2, 32 // k = kc - 32
+ B.LO 4f
+
+ # Prologue - loads for main loop of 96 FMA
+ LDR q0, [x3], 16
+ LDR q1, [x9], 16
+ LDR q2, [x10], 16
+ LDR q3, [x11], 16
+ LDR q4, [x12], 16
+ LDR q5, [x4], 16
+ LDP q12, q13, [x5], 32 // Fetch 3 B (4th deferred)
+ LDP q14, q15, [x5], 32
+ LDP q16, q17, [x5], 32
+
+ # Is there at least 8 floats (32 bytes) for main loop?
+ SUBS x0, x0, 32
+ B.LO 2f
+
+ # Main loop - 8 floats of A (32 bytes)
+ # 96 FMA + 6 LDP A + 8 LDP B
+1:
+ # First group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v0.s[0]
+ LDP q18, q19, [x5], 32 // Load last B
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+
+ FMLA v31.4s, v13.4s, v5.s[0]
+ FMLA v20.4s, v14.4s, v0.s[1]
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ LDR q6, [x3], 16 // Load next 6 A
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+ LDR q7, [x9], 16
+
+ FMLA v20.4s, v16.4s, v0.s[2]
+ FMLA v22.4s, v16.4s, v1.s[2]
+ FMLA v24.4s, v16.4s, v2.s[2]
+ LDR q8, [x10], 16
+ FMLA v26.4s, v16.4s, v3.s[2]
+ FMLA v28.4s, v16.4s, v4.s[2]
+ FMLA v30.4s, v16.4s, v5.s[2]
+ LDR q9, [x11], 16
+ FMLA v21.4s, v17.4s, v0.s[2]
+ FMLA v23.4s, v17.4s, v1.s[2]
+ FMLA v25.4s, v17.4s, v2.s[2]
+ LDR q10, [x12], 16
+ FMLA v27.4s, v17.4s, v3.s[2]
+ FMLA v29.4s, v17.4s, v4.s[2]
+ FMLA v31.4s, v17.4s, v5.s[2]
+ LDR q11, [x4], 16
+
+ FMLA v20.4s, v18.4s, v0.s[3]
+ FMLA v22.4s, v18.4s, v1.s[3]
+ FMLA v24.4s, v18.4s, v2.s[3]
+ LDP q12, q13, [x5], 32 // Load 4 B
+ FMLA v26.4s, v18.4s, v3.s[3]
+ FMLA v28.4s, v18.4s, v4.s[3]
+ FMLA v30.4s, v18.4s, v5.s[3]
+ LDP q14, q15, [x5], 32
+ FMLA v21.4s, v19.4s, v0.s[3]
+ FMLA v23.4s, v19.4s, v1.s[3]
+ FMLA v25.4s, v19.4s, v2.s[3]
+ LDP q16, q17, [x5], 32
+ FMLA v27.4s, v19.4s, v3.s[3]
+ FMLA v29.4s, v19.4s, v4.s[3]
+ FMLA v31.4s, v19.4s, v5.s[3]
+ LDP q18, q19, [x5], 32
+
+ # Second group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v6.s[0]
+ FMLA v22.4s, v12.4s, v7.s[0]
+ FMLA v24.4s, v12.4s, v8.s[0]
+ LDR q0, [x3], 16 // Load next 6 A
+ FMLA v26.4s, v12.4s, v9.s[0]
+ FMLA v28.4s, v12.4s, v10.s[0]
+ FMLA v30.4s, v12.4s, v11.s[0]
+ LDR q1, [x9], 16
+ FMLA v21.4s, v13.4s, v6.s[0]
+ FMLA v23.4s, v13.4s, v7.s[0]
+ FMLA v25.4s, v13.4s, v8.s[0]
+ LDR q2, [x10], 16
+ FMLA v27.4s, v13.4s, v9.s[0]
+ FMLA v29.4s, v13.4s, v10.s[0]
+ FMLA v31.4s, v13.4s, v11.s[0]
+ LDR q3, [x11], 16
+
+ FMLA v20.4s, v14.4s, v6.s[1]
+ FMLA v22.4s, v14.4s, v7.s[1]
+ FMLA v24.4s, v14.4s, v8.s[1]
+ LDR q4, [x12], 16
+ FMLA v26.4s, v14.4s, v9.s[1]
+ FMLA v28.4s, v14.4s, v10.s[1]
+ FMLA v30.4s, v14.4s, v11.s[1]
+ LDR q5, [x4], 16
+ FMLA v21.4s, v15.4s, v6.s[1]
+ FMLA v23.4s, v15.4s, v7.s[1]
+ FMLA v25.4s, v15.4s, v8.s[1]
+ LDP q12, q13, [x5], 32 // Load next 3 B (not last)
+ FMLA v27.4s, v15.4s, v9.s[1]
+ FMLA v29.4s, v15.4s, v10.s[1]
+ FMLA v31.4s, v15.4s, v11.s[1]
+ LDP q14, q15, [x5], 32
+
+ FMLA v20.4s, v16.4s, v6.s[2]
+ FMLA v22.4s, v16.4s, v7.s[2]
+ FMLA v24.4s, v16.4s, v8.s[2]
+ FMLA v26.4s, v16.4s, v9.s[2]
+ FMLA v28.4s, v16.4s, v10.s[2]
+ FMLA v30.4s, v16.4s, v11.s[2]
+ FMLA v21.4s, v17.4s, v6.s[2]
+ FMLA v23.4s, v17.4s, v7.s[2]
+ FMLA v25.4s, v17.4s, v8.s[2]
+ FMLA v27.4s, v17.4s, v9.s[2]
+ FMLA v29.4s, v17.4s, v10.s[2]
+ FMLA v31.4s, v17.4s, v11.s[2]
+ LDP q16, q17, [x5], 32
+
+ FMLA v20.4s, v18.4s, v6.s[3]
+ FMLA v22.4s, v18.4s, v7.s[3]
+ SUBS x0, x0, 32
+ FMLA v24.4s, v18.4s, v8.s[3]
+ FMLA v26.4s, v18.4s, v9.s[3]
+ FMLA v28.4s, v18.4s, v10.s[3]
+ FMLA v30.4s, v18.4s, v11.s[3]
+ FMLA v21.4s, v19.4s, v6.s[3]
+ FMLA v23.4s, v19.4s, v7.s[3]
+ FMLA v25.4s, v19.4s, v8.s[3]
+ FMLA v27.4s, v19.4s, v9.s[3]
+ FMLA v29.4s, v19.4s, v10.s[3]
+ FMLA v31.4s, v19.4s, v11.s[3]
+ B.HS 1b
+
+ # Epilogue - 8 floats of A (32 bytes)
+ # 96 FMA + 6 LDP A + 8 LDP B
+ # First block same as main loop. Second block has no preloads.
+2:
+ # First group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v0.s[0]
+ LDP q18, q19, [x5], 32 // Load last B
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+
+ FMLA v31.4s, v13.4s, v5.s[0]
+ FMLA v20.4s, v14.4s, v0.s[1]
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ LDR q6, [x3], 16 // Load next 6 A
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+ LDR q7, [x9], 16
+
+ FMLA v20.4s, v16.4s, v0.s[2]
+ FMLA v22.4s, v16.4s, v1.s[2]
+ FMLA v24.4s, v16.4s, v2.s[2]
+ LDR q8, [x10], 16
+ FMLA v26.4s, v16.4s, v3.s[2]
+ FMLA v28.4s, v16.4s, v4.s[2]
+ FMLA v30.4s, v16.4s, v5.s[2]
+ LDR q9, [x11], 16
+ FMLA v21.4s, v17.4s, v0.s[2]
+ FMLA v23.4s, v17.4s, v1.s[2]
+ FMLA v25.4s, v17.4s, v2.s[2]
+ LDR q10, [x12], 16
+ FMLA v27.4s, v17.4s, v3.s[2]
+ FMLA v29.4s, v17.4s, v4.s[2]
+ FMLA v31.4s, v17.4s, v5.s[2]
+ LDR q11, [x4], 16
+
+ FMLA v20.4s, v18.4s, v0.s[3]
+ FMLA v22.4s, v18.4s, v1.s[3]
+ FMLA v24.4s, v18.4s, v2.s[3]
+ LDP q12, q13, [x5], 32 // Load 4 B
+ FMLA v26.4s, v18.4s, v3.s[3]
+ FMLA v28.4s, v18.4s, v4.s[3]
+ FMLA v30.4s, v18.4s, v5.s[3]
+ LDP q14, q15, [x5], 32
+ FMLA v21.4s, v19.4s, v0.s[3]
+ FMLA v23.4s, v19.4s, v1.s[3]
+ FMLA v25.4s, v19.4s, v2.s[3]
+ LDP q16, q17, [x5], 32
+ FMLA v27.4s, v19.4s, v3.s[3]
+ FMLA v29.4s, v19.4s, v4.s[3]
+ FMLA v31.4s, v19.4s, v5.s[3]
+ LDP q18, q19, [x5], 32
+
+ # Second group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v6.s[0]
+ FMLA v22.4s, v12.4s, v7.s[0]
+ FMLA v24.4s, v12.4s, v8.s[0]
+ FMLA v26.4s, v12.4s, v9.s[0]
+ FMLA v28.4s, v12.4s, v10.s[0]
+ FMLA v30.4s, v12.4s, v11.s[0]
+ FMLA v21.4s, v13.4s, v6.s[0]
+ FMLA v23.4s, v13.4s, v7.s[0]
+ FMLA v25.4s, v13.4s, v8.s[0]
+ FMLA v27.4s, v13.4s, v9.s[0]
+ FMLA v29.4s, v13.4s, v10.s[0]
+ FMLA v31.4s, v13.4s, v11.s[0]
+
+ FMLA v20.4s, v14.4s, v6.s[1]
+ FMLA v22.4s, v14.4s, v7.s[1]
+ FMLA v24.4s, v14.4s, v8.s[1]
+ FMLA v26.4s, v14.4s, v9.s[1]
+ FMLA v28.4s, v14.4s, v10.s[1]
+ FMLA v30.4s, v14.4s, v11.s[1]
+ FMLA v21.4s, v15.4s, v6.s[1]
+ FMLA v23.4s, v15.4s, v7.s[1]
+ FMLA v25.4s, v15.4s, v8.s[1]
+ FMLA v27.4s, v15.4s, v9.s[1]
+ FMLA v29.4s, v15.4s, v10.s[1]
+ FMLA v31.4s, v15.4s, v11.s[1]
+
+ FMLA v20.4s, v16.4s, v6.s[2]
+ FMLA v22.4s, v16.4s, v7.s[2]
+ FMLA v24.4s, v16.4s, v8.s[2]
+ FMLA v26.4s, v16.4s, v9.s[2]
+ FMLA v28.4s, v16.4s, v10.s[2]
+ FMLA v30.4s, v16.4s, v11.s[2]
+ FMLA v21.4s, v17.4s, v6.s[2]
+ FMLA v23.4s, v17.4s, v7.s[2]
+ FMLA v25.4s, v17.4s, v8.s[2]
+ FMLA v27.4s, v17.4s, v9.s[2]
+ FMLA v29.4s, v17.4s, v10.s[2]
+ FMLA v31.4s, v17.4s, v11.s[2]
+
+ FMLA v20.4s, v18.4s, v6.s[3]
+ FMLA v22.4s, v18.4s, v7.s[3]
+ FMLA v24.4s, v18.4s, v8.s[3]
+ FMLA v26.4s, v18.4s, v9.s[3]
+ FMLA v28.4s, v18.4s, v10.s[3]
+ FMLA v30.4s, v18.4s, v11.s[3]
+ FMLA v21.4s, v19.4s, v6.s[3]
+ FMLA v23.4s, v19.4s, v7.s[3]
+
+ # Load clamping_params values
+ LD2R {v6.4s, v7.4s}, [x8]
+
+ FMLA v25.4s, v19.4s, v8.s[3]
+ FMLA v27.4s, v19.4s, v9.s[3]
+ # Is there a remainder?- 4 floats of A (16 bytes) or less
+ TST x0, 31
+ FMLA v29.4s, v19.4s, v10.s[3]
+ FMLA v31.4s, v19.4s, v11.s[3]
+ B.NE 4f
+
+ # Clamp
+3:
+ FMIN v20.4s, v20.4s, v6.4s
+ SUBS x1, x1, 8
+ FMIN v21.4s, v21.4s, v6.4s
+ FMIN v22.4s, v22.4s, v6.4s
+ FMIN v23.4s, v23.4s, v6.4s
+ FMIN v24.4s, v24.4s, v6.4s
+ FMIN v25.4s, v25.4s, v6.4s
+ FMIN v26.4s, v26.4s, v6.4s
+ FMIN v27.4s, v27.4s, v6.4s
+ FMIN v28.4s, v28.4s, v6.4s
+ FMIN v29.4s, v29.4s, v6.4s
+ FMIN v30.4s, v30.4s, v6.4s
+ FMIN v31.4s, v31.4s, v6.4s
+ # Load cn_stride
+ LDR x0, [sp, 64]
+ FMAX v20.4s, v20.4s, v7.4s
+ FMAX v21.4s, v21.4s, v7.4s
+ FMAX v22.4s, v22.4s, v7.4s
+ FMAX v23.4s, v23.4s, v7.4s
+ FMAX v24.4s, v24.4s, v7.4s
+ FMAX v25.4s, v25.4s, v7.4s
+ FMAX v26.4s, v26.4s, v7.4s
+ FMAX v27.4s, v27.4s, v7.4s
+ FMAX v28.4s, v28.4s, v7.4s
+ FMAX v29.4s, v29.4s, v7.4s
+ FMAX v30.4s, v30.4s, v7.4s
+ FMAX v31.4s, v31.4s, v7.4s
+
+ # Store full 6 x 8
+ B.LO 7f
+
+ STP q20, q21, [x6]
+ ADD x6, x6, x0
+ SUB x3, x3, x2 // a0 -= kc
+ STP q22, q23, [x16]
+ ADD x16, x16, x0
+ SUB x9, x9, x2 // a1 -= kc
+ STP q24, q25, [x17]
+ ADD x17, x17, x0
+ SUB x10, x10, x2 // a2 -= kc
+ STP q26, q27, [x14]
+ ADD x14, x14, x0
+ SUB x11, x11, x2 // a3 -= kc
+ STP q28, q29, [x13]
+ ADD x13, x13, x0
+ SUB x12, x12, x2 // a4 -= kc
+ STP q30, q31, [x7]
+ ADD x7, x7, x0
+ SUB x4, x4, x2 // a5 -= kc
+
+ B.HI 0b
+
+ # Restore d8-d15 from stack
+ LDP d14, d15, [sp, 48]
+ LDP d12, d13, [sp, 32]
+ LDP d10, d11, [sp, 16]
+ LDP d8, d9, [sp], 64
+ RET
+
+4:
+ # Load clamping_params values
+ LD2R {v6.4s, v7.4s}, [x8]
+
+ # Is there a remainder?- 4 floats of A (16 bytes)
+ TBZ x0, 4, 5f
+
+ # Remainder- 4 floats of A (16 bytes)
+ # Load A
+ LDR q0, [x3], 16
+ LDR q1, [x9], 16
+ LDR q2, [x10], 16
+ LDR q3, [x11], 16
+ LDR q4, [x12], 16
+ LDR q5, [x4], 16
+ # Load B
+ LDP q12, q13, [x5], 32
+ LDP q14, q15, [x5], 32
+ LDP q16, q17, [x5], 32
+ LDP q18, q19, [x5], 32
+
+ FMLA v20.4s, v12.4s, v0.s[0]
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+ FMLA v31.4s, v13.4s, v5.s[0]
+
+ FMLA v20.4s, v14.4s, v0.s[1]
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+
+ FMLA v20.4s, v16.4s, v0.s[2]
+ FMLA v22.4s, v16.4s, v1.s[2]
+ FMLA v24.4s, v16.4s, v2.s[2]
+ FMLA v26.4s, v16.4s, v3.s[2]
+ FMLA v28.4s, v16.4s, v4.s[2]
+ FMLA v30.4s, v16.4s, v5.s[2]
+ FMLA v21.4s, v17.4s, v0.s[2]
+ FMLA v23.4s, v17.4s, v1.s[2]
+ FMLA v25.4s, v17.4s, v2.s[2]
+ FMLA v27.4s, v17.4s, v3.s[2]
+ FMLA v29.4s, v17.4s, v4.s[2]
+ FMLA v31.4s, v17.4s, v5.s[2]
+
+ FMLA v20.4s, v18.4s, v0.s[3]
+ FMLA v22.4s, v18.4s, v1.s[3]
+ FMLA v24.4s, v18.4s, v2.s[3]
+ FMLA v26.4s, v18.4s, v3.s[3]
+ FMLA v28.4s, v18.4s, v4.s[3]
+ FMLA v30.4s, v18.4s, v5.s[3]
+ FMLA v21.4s, v19.4s, v0.s[3]
+ FMLA v23.4s, v19.4s, v1.s[3]
+ FMLA v25.4s, v19.4s, v2.s[3]
+ FMLA v27.4s, v19.4s, v3.s[3]
+ FMLA v29.4s, v19.4s, v4.s[3]
+ FMLA v31.4s, v19.4s, v5.s[3]
+
+ # Is there a remainder?- 2 floats of A (8 bytes)
+5:
+ TBZ x0, 3, 6f
+
+ # Remainder- 2 floats of A (8 bytes)
+ # Load A
+ LDR d0, [x3], 8
+ LDR d1, [x9], 8
+ LDR d2, [x10], 8
+ LDR d3, [x11], 8
+ LDR d4, [x12], 8
+ LDR d5, [x4], 8
+ # Load B
+ LDP q12, q13, [x5], 32
+ LDP q14, q15, [x5], 32
+
+ FMLA v20.4s, v12.4s, v0.s[0]
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+ FMLA v31.4s, v13.4s, v5.s[0]
+
+ FMLA v20.4s, v14.4s, v0.s[1]
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+
+ # Is there a remainder?- 1 float of A (4 bytes)
+6:
+ TBZ x0, 2, 3b
+
+ # Remainder- 1 float of A (4 bytes)
+ # Load A
+ LDR s0, [x3], 4
+ LDR s1, [x9], 4
+ LDR s2, [x10], 4
+ LDR s3, [x11], 4
+ LDR s4, [x12], 4
+ LDR s5, [x4], 4
+ # Load B
+ LDP q12, q13, [x5], 32
+
+ FMLA v20.4s, v12.4s, v0.s[0]
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+ FMLA v31.4s, v13.4s, v5.s[0]
+ B 3b
+
+ # Store odd width
+7:
+ TBZ x1, 2, 8f
+ STR q20, [x6], 16
+ MOV v20.16b, v21.16b
+ STR q22, [x16], 16
+ MOV v22.16b, v23.16b
+ STR q24, [x17], 16
+ MOV v24.16b, v25.16b
+ STR q26, [x14], 16
+ MOV v26.16b, v27.16b
+ STR q28, [x13], 16
+ MOV v28.16b, v29.16b
+ STR q30, [x7], 16
+ MOV v30.16b, v31.16b
+8:
+ TBZ x1, 1, 9f
+ STR d20, [x6], 8
+ DUP d20, v20.d[1]
+ STR d22, [x16], 8
+ DUP d22, v22.d[1]
+ STR d24, [x17], 8
+ DUP d24, v24.d[1]
+ STR d26, [x14], 8
+ DUP d26, v26.d[1]
+ STR d28, [x13], 8
+ DUP d28, v28.d[1]
+ STR d30, [x7], 8
+ DUP d30, v30.d[1]
+
+9:
+ TBZ x1, 0, 10f
+ STR s20, [x6]
+ STR s22, [x16]
+ STR s24, [x17]
+ STR s26, [x14]
+ STR s28, [x13]
+ STR s30, [x7]
+10:
+ # Restore d8-d15 from stack
+ LDP d14, d15, [sp, 48]
+ LDP d12, d13, [sp, 32]
+ LDP d10, d11, [sp, 16]
+ LDP d8, d9, [sp], 64
+ RET
+
+END_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/f32-igemm/6x8-aarch64-neonfma-ios.S.in b/src/f32-igemm/6x8-aarch64-neonfma-ios.S.in
new file mode 100644
index 0000000..2ab6764
--- /dev/null
+++ b/src/f32-igemm/6x8-aarch64-neonfma-ios.S.in
@@ -0,0 +1,694 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+# void xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# size_t ks, x3 / x9
+# const float**restrict a, x4
+# const void*restrict w, x5
+# uint8_t*restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> (x0)
+# size_t a_offset, [sp + 8] -> x11
+# const float* zero, [sp + 16] -> x12
+# const xnn_f32_output_params params [sp + 24] -> x8
+
+# d8-d15 need to be preserved if used.
+# x19-30 need to be preserved if used.
+
+# A pointers
+# x14 a0
+# x15 a1
+# x20 a2
+# x21 a3
+# x22 a4
+# x23 a5
+
+# C pointers
+# x6 c0
+# x16 c1
+# x17 c2
+# x10 c3
+# x13 c4
+# x7 c5
+
+# Vector register usage
+# A0 v0 v6
+# A1 v1 v7
+# A2 v2 v8
+# A3 v3 v9
+# A4 v4 v10
+# A5 v5 v11
+# B v12 v13 v14 v15
+# B v16 v17 v18 v19
+# C v20 v21
+# C v22 v23
+# C v24 v25
+# C v26 v27
+# C v28 v29
+# C v30 v31
+# Clamp v6 v7
+
+BEGIN_FUNCTION xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios
+
+ # Clamp C pointers / Save d8-d15 on stack
+ STP d8, d9, [sp, -96]!
+ CMP x0, 2 // if mr < 2
+ ADD x16, x6, x7 // c1 = c0 + cm_stride
+ CSEL x16, x6, x16, LO // c1 = c0
+
+ STP d10, d11, [sp, 16]
+ ADD x17, x16, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ CSEL x17, x16, x17, LS // c2 = c1
+
+ STP d12, d13, [sp, 32]
+ CMP x0, 4 // if mr < 4
+ ADD x10, x17, x7 // c3 = c2 + cm_stride
+ CSEL x10, x17, x10, LO // c3 = c2
+
+ STP d14, d15, [sp, 48]
+ ADD x13, x10, x7 // c4 = c3 + cm_stride
+      // if mr <= 4
+ CSEL x13, x10, x13, LS // c4 = c3
+
+ # Save x20,x21,x22,x23 on stack
+ STP x20, x21, [sp, 64]
+ STP x22, x23, [sp, 80]
+
+ CMP x0, 6 // if mr < 6
+ ADD x7, x13, x7 // c5 = c4 + cm_stride
+ CSEL x7, x13, x7, LO // c5 = c4
+
+ # Load a_offset
+ LDR x11, [sp, 104]
+
+ # Load zero, clamping params pointer
+ LDP x12, x8, [sp, 112]
+
+0:
+ # Load initial bias from w into accumulators
+ LDP q20, q21, [x5], 32
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v21.16b
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 0] // Prefetch B
+ MOV v24.16b, v20.16b
+ MOV v25.16b, v21.16b
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 64]
+ MOV v26.16b, v20.16b
+ MOV v27.16b, v21.16b
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 128]
+ MOV v28.16b, v20.16b
+ MOV v29.16b, v21.16b
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 192]
+ MOV v30.16b, v20.16b
+ MOV v31.16b, v21.16b
+
+ MOV x9, x3 // p = ks
+
+1:
+ # Load next 6 A pointers
+ LDP x14, x15, [x4], 16
+ LDP x20, x21, [x4], 16
+ LDP x22, x23, [x4], 16
+
+ CMP x14, x12 // if a0 == zero
+ ADD x14, x14, x11 // a0 += a_offset
+ CSEL x14, x12, x14, EQ // a0 = zero, else a0 += a_offset
+ CMP x15, x12 // if a1 == zero
+ ADD x15, x15, x11 // a1 += a_offset
+ CSEL x15, x12, x15, EQ // a1 = zero, else a1 += a_offset
+ CMP x20, x12 // if a2 == zero
+ ADD x20, x20, x11 // a2 += a_offset
+ CSEL x20, x12, x20, EQ // a2 = zero, else a2 += a_offset
+ CMP x21, x12 // if a3 == zero
+ ADD x21, x21, x11 // a3 += a_offset
+ CSEL x21, x12, x21, EQ // a3 = zero, else a3 += a_offset
+ CMP x22, x12 // if a4 == zero
+ ADD x22, x22, x11 // a4 += a_offset
+ CSEL x22, x12, x22, EQ // a4 = zero, else a4 += a_offset
+ CMP x23, x12 // if a5 == zero
+ ADD x23, x23, x11 // a5 += a_offset
+ CSEL x23, x12, x23, EQ // a5 = zero, else a5 += a_offset
+
+ # Is there at least 8 floats (32 bytes) for prologue + epilogue?
+ SUBS x0, x2, 32 // k = kc - 32
+ B.LO 5f
+
+ # Prologue - loads for main loop of 96 FMA
+ LDR q0, [x14], 16
+ LDR q1, [x15], 16
+ LDR q2, [x20], 16
+ LDR q3, [x21], 16
+ LDR q4, [x22], 16
+ LDR q5, [x23], 16
+ LDP q12, q13, [x5], 32 // Fetch 3 B (4th deferred)
+ LDP q14, q15, [x5], 32
+ LDP q16, q17, [x5], 32
+
+ # Is there at least 8 floats (32 bytes) for main loop?
+ SUBS x0, x0, 32
+ B.LO 3f
+
+ # Main loop - 8 floats of A (32 bytes)
+ # 96 FMA + 6 LDP A + 8 LDP B
+2:
+ # First group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v0.s[0]
+ LDP q18, q19, [x5], 32 // Load last B
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+
+ FMLA v31.4s, v13.4s, v5.s[0]
+ FMLA v20.4s, v14.4s, v0.s[1]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 128] // Prefetch B
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 256]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ LDR q6, [x14], 16 // Load next 6 A
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+ LDR q7, [x15], 16
+
+ FMLA v20.4s, v16.4s, v0.s[2]
+ FMLA v22.4s, v16.4s, v1.s[2]
+ FMLA v24.4s, v16.4s, v2.s[2]
+ LDR q8, [x20], 16
+ FMLA v26.4s, v16.4s, v3.s[2]
+ FMLA v28.4s, v16.4s, v4.s[2]
+ FMLA v30.4s, v16.4s, v5.s[2]
+ LDR q9, [x21], 16
+ FMLA v21.4s, v17.4s, v0.s[2]
+ FMLA v23.4s, v17.4s, v1.s[2]
+ FMLA v25.4s, v17.4s, v2.s[2]
+ LDR q10, [x22], 16
+ FMLA v27.4s, v17.4s, v3.s[2]
+ FMLA v29.4s, v17.4s, v4.s[2]
+ FMLA v31.4s, v17.4s, v5.s[2]
+ LDR q11, [x23], 16
+
+ FMLA v20.4s, v18.4s, v0.s[3]
+ FMLA v22.4s, v18.4s, v1.s[3]
+ FMLA v24.4s, v18.4s, v2.s[3]
+ LDP q12, q13, [x5], 32 // Load 4 B
+ FMLA v26.4s, v18.4s, v3.s[3]
+ FMLA v28.4s, v18.4s, v4.s[3]
+ FMLA v30.4s, v18.4s, v5.s[3]
+ LDP q14, q15, [x5], 32
+ FMLA v21.4s, v19.4s, v0.s[3]
+ FMLA v23.4s, v19.4s, v1.s[3]
+ FMLA v25.4s, v19.4s, v2.s[3]
+ LDP q16, q17, [x5], 32
+ FMLA v27.4s, v19.4s, v3.s[3]
+ FMLA v29.4s, v19.4s, v4.s[3]
+ FMLA v31.4s, v19.4s, v5.s[3]
+ LDP q18, q19, [x5], 32
+
+ # Second group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v6.s[0]
+ FMLA v22.4s, v12.4s, v7.s[0]
+ FMLA v24.4s, v12.4s, v8.s[0]
+ LDR q0, [x14], 16 // Load next 6 A
+ FMLA v26.4s, v12.4s, v9.s[0]
+ FMLA v28.4s, v12.4s, v10.s[0]
+ FMLA v30.4s, v12.4s, v11.s[0]
+ LDR q1, [x15], 16
+ FMLA v21.4s, v13.4s, v6.s[0]
+ FMLA v23.4s, v13.4s, v7.s[0]
+ FMLA v25.4s, v13.4s, v8.s[0]
+ LDR q2, [x20], 16
+ FMLA v27.4s, v13.4s, v9.s[0]
+ FMLA v29.4s, v13.4s, v10.s[0]
+ FMLA v31.4s, v13.4s, v11.s[0]
+ LDR q3, [x21], 16
+
+ FMLA v20.4s, v14.4s, v6.s[1]
+ FMLA v22.4s, v14.4s, v7.s[1]
+ FMLA v24.4s, v14.4s, v8.s[1]
+ LDR q4, [x22], 16
+ FMLA v26.4s, v14.4s, v9.s[1]
+ FMLA v28.4s, v14.4s, v10.s[1]
+ FMLA v30.4s, v14.4s, v11.s[1]
+ LDR q5, [x23], 16
+ FMLA v21.4s, v15.4s, v6.s[1]
+ FMLA v23.4s, v15.4s, v7.s[1]
+ FMLA v25.4s, v15.4s, v8.s[1]
+ LDP q12, q13, [x5], 32 // Load next 3 B (not last)
+ FMLA v27.4s, v15.4s, v9.s[1]
+ FMLA v29.4s, v15.4s, v10.s[1]
+ FMLA v31.4s, v15.4s, v11.s[1]
+ LDP q14, q15, [x5], 32
+
+ FMLA v20.4s, v16.4s, v6.s[2]
+ FMLA v22.4s, v16.4s, v7.s[2]
+ FMLA v24.4s, v16.4s, v8.s[2]
+ FMLA v26.4s, v16.4s, v9.s[2]
+ FMLA v28.4s, v16.4s, v10.s[2]
+ FMLA v30.4s, v16.4s, v11.s[2]
+ FMLA v21.4s, v17.4s, v6.s[2]
+ FMLA v23.4s, v17.4s, v7.s[2]
+ FMLA v25.4s, v17.4s, v8.s[2]
+ FMLA v27.4s, v17.4s, v9.s[2]
+ FMLA v29.4s, v17.4s, v10.s[2]
+ FMLA v31.4s, v17.4s, v11.s[2]
+ LDP q16, q17, [x5], 32
+
+ FMLA v20.4s, v18.4s, v6.s[3]
+ FMLA v22.4s, v18.4s, v7.s[3]
+ SUBS x0, x0, 32
+ FMLA v24.4s, v18.4s, v8.s[3]
+ FMLA v26.4s, v18.4s, v9.s[3]
+ FMLA v28.4s, v18.4s, v10.s[3]
+ FMLA v30.4s, v18.4s, v11.s[3]
+ FMLA v21.4s, v19.4s, v6.s[3]
+ FMLA v23.4s, v19.4s, v7.s[3]
+ FMLA v25.4s, v19.4s, v8.s[3]
+ FMLA v27.4s, v19.4s, v9.s[3]
+ FMLA v29.4s, v19.4s, v10.s[3]
+ FMLA v31.4s, v19.4s, v11.s[3]
+ B.HS 2b
+
+ # Epilogue - 8 floats of A (32 bytes)
+ # 96 FMA + 6 LDP A + 8 LDP B
+ # First block same as main loop. Second block has no preloads.
+3:
+ # First group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v0.s[0]
+ LDP q18, q19, [x5], 32 // Load last B
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+
+ FMLA v31.4s, v13.4s, v5.s[0]
+ FMLA v20.4s, v14.4s, v0.s[1]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 128] // Prefetch B
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 256]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ LDR q6, [x14], 16 // Load next 6 A
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+ LDR q7, [x15], 16
+
+ FMLA v20.4s, v16.4s, v0.s[2]
+ FMLA v22.4s, v16.4s, v1.s[2]
+ FMLA v24.4s, v16.4s, v2.s[2]
+ LDR q8, [x20], 16
+ FMLA v26.4s, v16.4s, v3.s[2]
+ FMLA v28.4s, v16.4s, v4.s[2]
+ FMLA v30.4s, v16.4s, v5.s[2]
+ LDR q9, [x21], 16
+ FMLA v21.4s, v17.4s, v0.s[2]
+ FMLA v23.4s, v17.4s, v1.s[2]
+ FMLA v25.4s, v17.4s, v2.s[2]
+ LDR q10, [x22], 16
+ FMLA v27.4s, v17.4s, v3.s[2]
+ FMLA v29.4s, v17.4s, v4.s[2]
+ FMLA v31.4s, v17.4s, v5.s[2]
+ LDR q11, [x23], 16
+
+ FMLA v20.4s, v18.4s, v0.s[3]
+ FMLA v22.4s, v18.4s, v1.s[3]
+ FMLA v24.4s, v18.4s, v2.s[3]
+ LDP q12, q13, [x5], 32 // Load 4 B
+ FMLA v26.4s, v18.4s, v3.s[3]
+ FMLA v28.4s, v18.4s, v4.s[3]
+ FMLA v30.4s, v18.4s, v5.s[3]
+ LDP q14, q15, [x5], 32
+ FMLA v21.4s, v19.4s, v0.s[3]
+ FMLA v23.4s, v19.4s, v1.s[3]
+ FMLA v25.4s, v19.4s, v2.s[3]
+ LDP q16, q17, [x5], 32
+ FMLA v27.4s, v19.4s, v3.s[3]
+ FMLA v29.4s, v19.4s, v4.s[3]
+ FMLA v31.4s, v19.4s, v5.s[3]
+ LDP q18, q19, [x5], 32
+
+ # Second group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v6.s[0]
+ FMLA v22.4s, v12.4s, v7.s[0]
+ FMLA v24.4s, v12.4s, v8.s[0]
+ FMLA v26.4s, v12.4s, v9.s[0]
+ FMLA v28.4s, v12.4s, v10.s[0]
+ FMLA v30.4s, v12.4s, v11.s[0]
+ FMLA v21.4s, v13.4s, v6.s[0]
+ FMLA v23.4s, v13.4s, v7.s[0]
+ FMLA v25.4s, v13.4s, v8.s[0]
+ FMLA v27.4s, v13.4s, v9.s[0]
+ FMLA v29.4s, v13.4s, v10.s[0]
+ FMLA v31.4s, v13.4s, v11.s[0]
+
+ FMLA v20.4s, v14.4s, v6.s[1]
+ FMLA v22.4s, v14.4s, v7.s[1]
+ FMLA v24.4s, v14.4s, v8.s[1]
+ FMLA v26.4s, v14.4s, v9.s[1]
+ FMLA v28.4s, v14.4s, v10.s[1]
+ FMLA v30.4s, v14.4s, v11.s[1]
+ FMLA v21.4s, v15.4s, v6.s[1]
+ FMLA v23.4s, v15.4s, v7.s[1]
+ FMLA v25.4s, v15.4s, v8.s[1]
+ FMLA v27.4s, v15.4s, v9.s[1]
+ FMLA v29.4s, v15.4s, v10.s[1]
+ FMLA v31.4s, v15.4s, v11.s[1]
+
+ FMLA v20.4s, v16.4s, v6.s[2]
+ FMLA v22.4s, v16.4s, v7.s[2]
+ FMLA v24.4s, v16.4s, v8.s[2]
+ FMLA v26.4s, v16.4s, v9.s[2]
+ FMLA v28.4s, v16.4s, v10.s[2]
+ FMLA v30.4s, v16.4s, v11.s[2]
+ FMLA v21.4s, v17.4s, v6.s[2]
+ FMLA v23.4s, v17.4s, v7.s[2]
+ FMLA v25.4s, v17.4s, v8.s[2]
+ FMLA v27.4s, v17.4s, v9.s[2]
+ FMLA v29.4s, v17.4s, v10.s[2]
+ FMLA v31.4s, v17.4s, v11.s[2]
+
+ FMLA v20.4s, v18.4s, v6.s[3]
+ FMLA v22.4s, v18.4s, v7.s[3]
+ FMLA v24.4s, v18.4s, v8.s[3]
+ FMLA v26.4s, v18.4s, v9.s[3]
+ FMLA v28.4s, v18.4s, v10.s[3]
+ FMLA v30.4s, v18.4s, v11.s[3]
+ FMLA v21.4s, v19.4s, v6.s[3]
+ FMLA v23.4s, v19.4s, v7.s[3]
+
+ # Load clamping_params values
+ LD2R {v6.4s, v7.4s}, [x8]
+
+ FMLA v25.4s, v19.4s, v8.s[3]
+ FMLA v27.4s, v19.4s, v9.s[3]
+ # Is there a remainder?- 4 floats of A (16 bytes) or less
+ TST x0, 31
+ FMLA v29.4s, v19.4s, v10.s[3]
+ FMLA v31.4s, v19.4s, v11.s[3]
+ B.NE 5f
+
+4:
+ # ks loop
+ SUBS x9, x9, 48 // ks -= MR * sizeof(void*)
+ B.HI 1b
+
+ # Clamp
+ FMIN v20.4s, v20.4s, v6.4s
+ FMIN v21.4s, v21.4s, v6.4s
+ FMIN v22.4s, v22.4s, v6.4s
+ FMIN v23.4s, v23.4s, v6.4s
+ FMIN v24.4s, v24.4s, v6.4s
+ FMIN v25.4s, v25.4s, v6.4s
+ FMIN v26.4s, v26.4s, v6.4s
+ FMIN v27.4s, v27.4s, v6.4s
+ FMIN v28.4s, v28.4s, v6.4s
+ FMIN v29.4s, v29.4s, v6.4s
+ FMIN v30.4s, v30.4s, v6.4s
+ FMIN v31.4s, v31.4s, v6.4s
+ # Load cn_stride
+ LDR x0, [sp, 96]
+ FMAX v20.4s, v20.4s, v7.4s
+ FMAX v21.4s, v21.4s, v7.4s
+ FMAX v22.4s, v22.4s, v7.4s
+ FMAX v23.4s, v23.4s, v7.4s
+ FMAX v24.4s, v24.4s, v7.4s
+ FMAX v25.4s, v25.4s, v7.4s
+ FMAX v26.4s, v26.4s, v7.4s
+ FMAX v27.4s, v27.4s, v7.4s
+ FMAX v28.4s, v28.4s, v7.4s
+ FMAX v29.4s, v29.4s, v7.4s
+ FMAX v30.4s, v30.4s, v7.4s
+ FMAX v31.4s, v31.4s, v7.4s
+
+ # Store full 6 x 8
+ SUBS x1, x1, 8
+ B.LO 8f
+
+ STP q30, q31, [x7]
+ ADD x7, x7, x0
+ STP q28, q29, [x13]
+ ADD x13, x13, x0
+ STP q26, q27, [x10]
+ ADD x10, x10, x0
+ STP q24, q25, [x17]
+ ADD x17, x17, x0
+ STP q22, q23, [x16]
+ ADD x16, x16, x0
+ STP q20, q21, [x6]
+ ADD x6, x6, x0
+
+ SUB x4, x4, x3 // a -= ks
+
+ # nc loop
+ B.HI 0b
+
+ # Restore x20,x21,x22,x23 from stack
+ LDP x22, x23, [sp, 80]
+ LDP x20, x21, [sp, 64]
+
+ # Restore d8-d15 from stack
+ LDP d14, d15, [sp, 48]
+ LDP d12, d13, [sp, 32]
+ LDP d10, d11, [sp, 16]
+ LDP d8, d9, [sp], 96
+ RET
+
+5:
+ # Load clamping_params values
+ LD2R {v6.4s, v7.4s}, [x8]
+
+ # Is there a remainder?- 4 floats of A (16 bytes)
+ TBZ x0, 4, 6f
+
+ # Remainder- 4 floats of A (16 bytes)
+ # Load A
+ LDR q0, [x14], 16
+ LDR q1, [x15], 16
+ LDR q2, [x20], 16
+ LDR q3, [x21], 16
+ LDR q4, [x22], 16
+ LDR q5, [x23], 16
+ # Load B
+ LDP q12, q13, [x5], 32
+ LDP q14, q15, [x5], 32
+ LDP q16, q17, [x5], 32
+ LDP q18, q19, [x5], 32
+
+ FMLA v20.4s, v12.4s, v0.s[0]
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+ FMLA v31.4s, v13.4s, v5.s[0]
+
+ FMLA v20.4s, v14.4s, v0.s[1]
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+
+ FMLA v20.4s, v16.4s, v0.s[2]
+ FMLA v22.4s, v16.4s, v1.s[2]
+ FMLA v24.4s, v16.4s, v2.s[2]
+ FMLA v26.4s, v16.4s, v3.s[2]
+ FMLA v28.4s, v16.4s, v4.s[2]
+ FMLA v30.4s, v16.4s, v5.s[2]
+ FMLA v21.4s, v17.4s, v0.s[2]
+ FMLA v23.4s, v17.4s, v1.s[2]
+ FMLA v25.4s, v17.4s, v2.s[2]
+ FMLA v27.4s, v17.4s, v3.s[2]
+ FMLA v29.4s, v17.4s, v4.s[2]
+ FMLA v31.4s, v17.4s, v5.s[2]
+
+ FMLA v20.4s, v18.4s, v0.s[3]
+ FMLA v22.4s, v18.4s, v1.s[3]
+ FMLA v24.4s, v18.4s, v2.s[3]
+ FMLA v26.4s, v18.4s, v3.s[3]
+ FMLA v28.4s, v18.4s, v4.s[3]
+ FMLA v30.4s, v18.4s, v5.s[3]
+ FMLA v21.4s, v19.4s, v0.s[3]
+ FMLA v23.4s, v19.4s, v1.s[3]
+ FMLA v25.4s, v19.4s, v2.s[3]
+ FMLA v27.4s, v19.4s, v3.s[3]
+ FMLA v29.4s, v19.4s, v4.s[3]
+ FMLA v31.4s, v19.4s, v5.s[3]
+
+ # Is there a remainder?- 2 floats of A (8 bytes)
+6:
+ TBZ x0, 3, 7f
+
+ # Remainder- 2 floats of A (8 bytes)
+ # Load A
+ LDR d0, [x14], 8
+ LDR d1, [x15], 8
+ LDR d2, [x20], 8
+ LDR d3, [x21], 8
+ LDR d4, [x22], 8
+ LDR d5, [x23], 8
+ # Load B
+ LDP q12, q13, [x5], 32
+ LDP q14, q15, [x5], 32
+
+ FMLA v20.4s, v12.4s, v0.s[0]
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+ FMLA v31.4s, v13.4s, v5.s[0]
+
+ FMLA v20.4s, v14.4s, v0.s[1]
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+
+ # Is there a remainder?- 1 float of A (4 bytes)
+7:
+ TBZ x0, 2, 4b
+
+ # Remainder- 1 float of A (4 bytes)
+ # Load A
+ LDR s0, [x14], 4
+ LDR s1, [x15], 4
+ LDR s2, [x20], 4
+ LDR s3, [x21], 4
+ LDR s4, [x22], 4
+ LDR s5, [x23], 4
+ # Load B
+ LDP q12, q13, [x5], 32
+
+ FMLA v20.4s, v12.4s, v0.s[0]
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+ FMLA v31.4s, v13.4s, v5.s[0]
+ B 4b
+
+ # Store odd width
+8:
+ TBZ x1, 2, 9f
+ STR q30, [x7], 16
+ MOV v30.16b, v31.16b
+ STR q28, [x13], 16
+ MOV v28.16b, v29.16b
+ STR q26, [x10], 16
+ MOV v26.16b, v27.16b
+ STR q24, [x17], 16
+ MOV v24.16b, v25.16b
+ STR q22, [x16], 16
+ MOV v22.16b, v23.16b
+ STR q20, [x6], 16
+ MOV v20.16b, v21.16b
+9:
+ TBZ x1, 1, 10f
+ STR d30, [x7], 8
+ DUP d30, v30.d[1]
+ STR d28, [x13], 8
+ DUP d28, v28.d[1]
+ STR d26, [x10], 8
+ DUP d26, v26.d[1]
+ STR d24, [x17], 8
+ DUP d24, v24.d[1]
+ STR d22, [x16], 8
+ DUP d22, v22.d[1]
+ STR d20, [x6], 8
+ DUP d20, v20.d[1]
+
+10:
+ TBZ x1, 0, 11f
+ STR s30, [x7]
+ STR s28, [x13]
+ STR s26, [x10]
+ STR s24, [x17]
+ STR s22, [x16]
+ STR s20, [x6]
+11:
+ # Restore x20,x21,x22,x23 from stack
+ LDP x22, x23, [sp, 80]
+ LDP x20, x21, [sp, 64]
+
+ # Restore d8-d15 from stack
+ LDP d14, d15, [sp, 48]
+ LDP d12, d13, [sp, 32]
+ LDP d10, d11, [sp, 16]
+ LDP d8, d9, [sp], 96
+ RET
+
+END_FUNCTION xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/f32-igemm/gen/6x8-aarch64-neonfma-ios.S b/src/f32-igemm/gen/6x8-aarch64-neonfma-ios.S
new file mode 100644
index 0000000..5b30b51
--- /dev/null
+++ b/src/f32-igemm/gen/6x8-aarch64-neonfma-ios.S
@@ -0,0 +1,682 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-igemm/6x8-aarch64-neonfma-ios.S.in
+// Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+# void xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# size_t ks, x3 / x9
+# const float**restrict a, x4
+# const void*restrict w, x5
+# uint8_t*restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> (x0)
+# size_t a_offset, [sp + 8] -> x11
+# const float* zero, [sp + 16] -> x12
+# const xnn_f32_output_params params [sp + 24] -> x8
+
+# d8-d15 need to be preserved if used.
+# x19-30 need to be preserved if used.
+
+# A pointers
+# x14 a0
+# x15 a1
+# x20 a2
+# x21 a3
+# x22 a4
+# x23 a5
+
+# C pointers
+# x6 c0
+# x16 c1
+# x17 c2
+# x10 c3
+# x13 c4
+# x7 c5
+
+# Vector register usage
+# A0 v0 v6
+# A1 v1 v7
+# A2 v2 v8
+# A3 v3 v9
+# A4 v4 v10
+# A5 v5 v11
+# B v12 v13 v14 v15
+# B v16 v17 v18 v19
+# C v20 v21
+# C v22 v23
+# C v24 v25
+# C v26 v27
+# C v28 v29
+# C v30 v31
+# Clamp v6 v7
+
+BEGIN_FUNCTION xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios
+
+ # Clamp C pointers / Save d8-d15 on stack
+ STP d8, d9, [sp, -96]!
+ CMP x0, 2 // if mr < 2
+ ADD x16, x6, x7 // c1 = c0 + cm_stride
+ CSEL x16, x6, x16, LO // c1 = c0
+
+ STP d10, d11, [sp, 16]
+ ADD x17, x16, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ CSEL x17, x16, x17, LS // c2 = c1
+
+ STP d12, d13, [sp, 32]
+ CMP x0, 4 // if mr < 4
+ ADD x10, x17, x7 // c3 = c2 + cm_stride
+ CSEL x10, x17, x10, LO // c3 = c2
+
+ STP d14, d15, [sp, 48]
+ ADD x13, x10, x7 // c4 = c3 + cm_stride
+      // if mr <= 4
+ CSEL x13, x10, x13, LS // c4 = c3
+
+ # Save x20,x21,x22,x23 on stack
+ STP x20, x21, [sp, 64]
+ STP x22, x23, [sp, 80]
+
+ CMP x0, 6 // if mr < 6
+ ADD x7, x13, x7 // c5 = c4 + cm_stride
+ CSEL x7, x13, x7, LO // c5 = c4
+
+ # Load a_offset
+ LDR x11, [sp, 104]
+
+ # Load zero, clamping params pointer
+ LDP x12, x8, [sp, 112]
+
+0:
+ # Load initial bias from w into accumulators
+ LDP q20, q21, [x5], 32
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v21.16b
+ MOV v24.16b, v20.16b
+ MOV v25.16b, v21.16b
+ MOV v26.16b, v20.16b
+ MOV v27.16b, v21.16b
+ MOV v28.16b, v20.16b
+ MOV v29.16b, v21.16b
+ MOV v30.16b, v20.16b
+ MOV v31.16b, v21.16b
+
+ MOV x9, x3 // p = ks
+
+1:
+ # Load next 6 A pointers
+ LDP x14, x15, [x4], 16
+ LDP x20, x21, [x4], 16
+ LDP x22, x23, [x4], 16
+
+ CMP x14, x12 // if a0 == zero
+ ADD x14, x14, x11 // a0 += a_offset
+ CSEL x14, x12, x14, EQ // a0 = zero, else a0 += a_offset
+ CMP x15, x12 // if a1 == zero
+ ADD x15, x15, x11 // a1 += a_offset
+ CSEL x15, x12, x15, EQ // a1 = zero, else a1 += a_offset
+ CMP x20, x12 // if a2 == zero
+ ADD x20, x20, x11 // a2 += a_offset
+ CSEL x20, x12, x20, EQ // a2 = zero, else a2 += a_offset
+ CMP x21, x12 // if a3 == zero
+ ADD x21, x21, x11 // a3 += a_offset
+ CSEL x21, x12, x21, EQ // a3 = zero, else a3 += a_offset
+ CMP x22, x12 // if a4 == zero
+ ADD x22, x22, x11 // a4 += a_offset
+ CSEL x22, x12, x22, EQ // a4 = zero, else a4 += a_offset
+ CMP x23, x12 // if a5 == zero
+ ADD x23, x23, x11 // a5 += a_offset
+ CSEL x23, x12, x23, EQ // a5 = zero, else a5 += a_offset
+
+ # Is there at least 8 floats (32 bytes) for prologue + epilogue?
+ SUBS x0, x2, 32 // k = kc - 32
+ B.LO 5f
+
+ # Prologue - loads for main loop of 96 FMA
+ LDR q0, [x14], 16
+ LDR q1, [x15], 16
+ LDR q2, [x20], 16
+ LDR q3, [x21], 16
+ LDR q4, [x22], 16
+ LDR q5, [x23], 16
+ LDP q12, q13, [x5], 32 // Fetch 3 B (4th deferred)
+ LDP q14, q15, [x5], 32
+ LDP q16, q17, [x5], 32
+
+ # Is there at least 8 floats (32 bytes) for main loop?
+ SUBS x0, x0, 32
+ B.LO 3f
+
+ # Main loop - 8 floats of A (32 bytes)
+ # 96 FMA + 6 LDP A + 8 LDP B
+2:
+ # First group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v0.s[0]
+ LDP q18, q19, [x5], 32 // Load last B
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+
+ FMLA v31.4s, v13.4s, v5.s[0]
+ FMLA v20.4s, v14.4s, v0.s[1]
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ LDR q6, [x14], 16 // Load next 6 A
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+ LDR q7, [x15], 16
+
+ FMLA v20.4s, v16.4s, v0.s[2]
+ FMLA v22.4s, v16.4s, v1.s[2]
+ FMLA v24.4s, v16.4s, v2.s[2]
+ LDR q8, [x20], 16
+ FMLA v26.4s, v16.4s, v3.s[2]
+ FMLA v28.4s, v16.4s, v4.s[2]
+ FMLA v30.4s, v16.4s, v5.s[2]
+ LDR q9, [x21], 16
+ FMLA v21.4s, v17.4s, v0.s[2]
+ FMLA v23.4s, v17.4s, v1.s[2]
+ FMLA v25.4s, v17.4s, v2.s[2]
+ LDR q10, [x22], 16
+ FMLA v27.4s, v17.4s, v3.s[2]
+ FMLA v29.4s, v17.4s, v4.s[2]
+ FMLA v31.4s, v17.4s, v5.s[2]
+ LDR q11, [x23], 16
+
+ FMLA v20.4s, v18.4s, v0.s[3]
+ FMLA v22.4s, v18.4s, v1.s[3]
+ FMLA v24.4s, v18.4s, v2.s[3]
+ LDP q12, q13, [x5], 32 // Load 4 B
+ FMLA v26.4s, v18.4s, v3.s[3]
+ FMLA v28.4s, v18.4s, v4.s[3]
+ FMLA v30.4s, v18.4s, v5.s[3]
+ LDP q14, q15, [x5], 32
+ FMLA v21.4s, v19.4s, v0.s[3]
+ FMLA v23.4s, v19.4s, v1.s[3]
+ FMLA v25.4s, v19.4s, v2.s[3]
+ LDP q16, q17, [x5], 32
+ FMLA v27.4s, v19.4s, v3.s[3]
+ FMLA v29.4s, v19.4s, v4.s[3]
+ FMLA v31.4s, v19.4s, v5.s[3]
+ LDP q18, q19, [x5], 32
+
+ # Second group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v6.s[0]
+ FMLA v22.4s, v12.4s, v7.s[0]
+ FMLA v24.4s, v12.4s, v8.s[0]
+ LDR q0, [x14], 16 // Load next 6 A
+ FMLA v26.4s, v12.4s, v9.s[0]
+ FMLA v28.4s, v12.4s, v10.s[0]
+ FMLA v30.4s, v12.4s, v11.s[0]
+ LDR q1, [x15], 16
+ FMLA v21.4s, v13.4s, v6.s[0]
+ FMLA v23.4s, v13.4s, v7.s[0]
+ FMLA v25.4s, v13.4s, v8.s[0]
+ LDR q2, [x20], 16
+ FMLA v27.4s, v13.4s, v9.s[0]
+ FMLA v29.4s, v13.4s, v10.s[0]
+ FMLA v31.4s, v13.4s, v11.s[0]
+ LDR q3, [x21], 16
+
+ FMLA v20.4s, v14.4s, v6.s[1]
+ FMLA v22.4s, v14.4s, v7.s[1]
+ FMLA v24.4s, v14.4s, v8.s[1]
+ LDR q4, [x22], 16
+ FMLA v26.4s, v14.4s, v9.s[1]
+ FMLA v28.4s, v14.4s, v10.s[1]
+ FMLA v30.4s, v14.4s, v11.s[1]
+ LDR q5, [x23], 16
+ FMLA v21.4s, v15.4s, v6.s[1]
+ FMLA v23.4s, v15.4s, v7.s[1]
+ FMLA v25.4s, v15.4s, v8.s[1]
+ LDP q12, q13, [x5], 32 // Load next 3 B (not last)
+ FMLA v27.4s, v15.4s, v9.s[1]
+ FMLA v29.4s, v15.4s, v10.s[1]
+ FMLA v31.4s, v15.4s, v11.s[1]
+ LDP q14, q15, [x5], 32
+
+ FMLA v20.4s, v16.4s, v6.s[2]
+ FMLA v22.4s, v16.4s, v7.s[2]
+ FMLA v24.4s, v16.4s, v8.s[2]
+ FMLA v26.4s, v16.4s, v9.s[2]
+ FMLA v28.4s, v16.4s, v10.s[2]
+ FMLA v30.4s, v16.4s, v11.s[2]
+ FMLA v21.4s, v17.4s, v6.s[2]
+ FMLA v23.4s, v17.4s, v7.s[2]
+ FMLA v25.4s, v17.4s, v8.s[2]
+ FMLA v27.4s, v17.4s, v9.s[2]
+ FMLA v29.4s, v17.4s, v10.s[2]
+ FMLA v31.4s, v17.4s, v11.s[2]
+ LDP q16, q17, [x5], 32
+
+ FMLA v20.4s, v18.4s, v6.s[3]
+ FMLA v22.4s, v18.4s, v7.s[3]
+ SUBS x0, x0, 32
+ FMLA v24.4s, v18.4s, v8.s[3]
+ FMLA v26.4s, v18.4s, v9.s[3]
+ FMLA v28.4s, v18.4s, v10.s[3]
+ FMLA v30.4s, v18.4s, v11.s[3]
+ FMLA v21.4s, v19.4s, v6.s[3]
+ FMLA v23.4s, v19.4s, v7.s[3]
+ FMLA v25.4s, v19.4s, v8.s[3]
+ FMLA v27.4s, v19.4s, v9.s[3]
+ FMLA v29.4s, v19.4s, v10.s[3]
+ FMLA v31.4s, v19.4s, v11.s[3]
+ B.HS 2b
+
+ # Epilogue - 8 floats of A (32 bytes)
+ # 96 FMA + 6 LDP A + 8 LDP B
+ # First block same as main loop. Second block has no preloads.
+3:
+ # First group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v0.s[0]
+ LDP q18, q19, [x5], 32 // Load last B
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+
+ FMLA v31.4s, v13.4s, v5.s[0]
+ FMLA v20.4s, v14.4s, v0.s[1]
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ LDR q6, [x14], 16 // Load next 6 A
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+ LDR q7, [x15], 16
+
+ FMLA v20.4s, v16.4s, v0.s[2]
+ FMLA v22.4s, v16.4s, v1.s[2]
+ FMLA v24.4s, v16.4s, v2.s[2]
+ LDR q8, [x20], 16
+ FMLA v26.4s, v16.4s, v3.s[2]
+ FMLA v28.4s, v16.4s, v4.s[2]
+ FMLA v30.4s, v16.4s, v5.s[2]
+ LDR q9, [x21], 16
+ FMLA v21.4s, v17.4s, v0.s[2]
+ FMLA v23.4s, v17.4s, v1.s[2]
+ FMLA v25.4s, v17.4s, v2.s[2]
+ LDR q10, [x22], 16
+ FMLA v27.4s, v17.4s, v3.s[2]
+ FMLA v29.4s, v17.4s, v4.s[2]
+ FMLA v31.4s, v17.4s, v5.s[2]
+ LDR q11, [x23], 16
+
+ FMLA v20.4s, v18.4s, v0.s[3]
+ FMLA v22.4s, v18.4s, v1.s[3]
+ FMLA v24.4s, v18.4s, v2.s[3]
+ LDP q12, q13, [x5], 32 // Load 4 B
+ FMLA v26.4s, v18.4s, v3.s[3]
+ FMLA v28.4s, v18.4s, v4.s[3]
+ FMLA v30.4s, v18.4s, v5.s[3]
+ LDP q14, q15, [x5], 32
+ FMLA v21.4s, v19.4s, v0.s[3]
+ FMLA v23.4s, v19.4s, v1.s[3]
+ FMLA v25.4s, v19.4s, v2.s[3]
+ LDP q16, q17, [x5], 32
+ FMLA v27.4s, v19.4s, v3.s[3]
+ FMLA v29.4s, v19.4s, v4.s[3]
+ FMLA v31.4s, v19.4s, v5.s[3]
+ LDP q18, q19, [x5], 32
+
+ # Second group of 4 A. 48 FMA.
+ FMLA v20.4s, v12.4s, v6.s[0]
+ FMLA v22.4s, v12.4s, v7.s[0]
+ FMLA v24.4s, v12.4s, v8.s[0]
+ FMLA v26.4s, v12.4s, v9.s[0]
+ FMLA v28.4s, v12.4s, v10.s[0]
+ FMLA v30.4s, v12.4s, v11.s[0]
+ FMLA v21.4s, v13.4s, v6.s[0]
+ FMLA v23.4s, v13.4s, v7.s[0]
+ FMLA v25.4s, v13.4s, v8.s[0]
+ FMLA v27.4s, v13.4s, v9.s[0]
+ FMLA v29.4s, v13.4s, v10.s[0]
+ FMLA v31.4s, v13.4s, v11.s[0]
+
+ FMLA v20.4s, v14.4s, v6.s[1]
+ FMLA v22.4s, v14.4s, v7.s[1]
+ FMLA v24.4s, v14.4s, v8.s[1]
+ FMLA v26.4s, v14.4s, v9.s[1]
+ FMLA v28.4s, v14.4s, v10.s[1]
+ FMLA v30.4s, v14.4s, v11.s[1]
+ FMLA v21.4s, v15.4s, v6.s[1]
+ FMLA v23.4s, v15.4s, v7.s[1]
+ FMLA v25.4s, v15.4s, v8.s[1]
+ FMLA v27.4s, v15.4s, v9.s[1]
+ FMLA v29.4s, v15.4s, v10.s[1]
+ FMLA v31.4s, v15.4s, v11.s[1]
+
+ FMLA v20.4s, v16.4s, v6.s[2]
+ FMLA v22.4s, v16.4s, v7.s[2]
+ FMLA v24.4s, v16.4s, v8.s[2]
+ FMLA v26.4s, v16.4s, v9.s[2]
+ FMLA v28.4s, v16.4s, v10.s[2]
+ FMLA v30.4s, v16.4s, v11.s[2]
+ FMLA v21.4s, v17.4s, v6.s[2]
+ FMLA v23.4s, v17.4s, v7.s[2]
+ FMLA v25.4s, v17.4s, v8.s[2]
+ FMLA v27.4s, v17.4s, v9.s[2]
+ FMLA v29.4s, v17.4s, v10.s[2]
+ FMLA v31.4s, v17.4s, v11.s[2]
+
+ FMLA v20.4s, v18.4s, v6.s[3]
+ FMLA v22.4s, v18.4s, v7.s[3]
+ FMLA v24.4s, v18.4s, v8.s[3]
+ FMLA v26.4s, v18.4s, v9.s[3]
+ FMLA v28.4s, v18.4s, v10.s[3]
+ FMLA v30.4s, v18.4s, v11.s[3]
+ FMLA v21.4s, v19.4s, v6.s[3]
+ FMLA v23.4s, v19.4s, v7.s[3]
+
+ # Load clamping_params values
+ LD2R {v6.4s, v7.4s}, [x8]
+
+ FMLA v25.4s, v19.4s, v8.s[3]
+ FMLA v27.4s, v19.4s, v9.s[3]
+ # Is there a remainder?- 4 floats of A (16 bytes) or less
+ TST x0, 31
+ FMLA v29.4s, v19.4s, v10.s[3]
+ FMLA v31.4s, v19.4s, v11.s[3]
+ B.NE 5f
+
+4:
+ # ks loop
+ SUBS x9, x9, 48 // ks -= MR * sizeof(void*)
+ B.HI 1b
+
+ # Clamp
+ FMIN v20.4s, v20.4s, v6.4s
+ FMIN v21.4s, v21.4s, v6.4s
+ FMIN v22.4s, v22.4s, v6.4s
+ FMIN v23.4s, v23.4s, v6.4s
+ FMIN v24.4s, v24.4s, v6.4s
+ FMIN v25.4s, v25.4s, v6.4s
+ FMIN v26.4s, v26.4s, v6.4s
+ FMIN v27.4s, v27.4s, v6.4s
+ FMIN v28.4s, v28.4s, v6.4s
+ FMIN v29.4s, v29.4s, v6.4s
+ FMIN v30.4s, v30.4s, v6.4s
+ FMIN v31.4s, v31.4s, v6.4s
+ # Load cn_stride
+ LDR x0, [sp, 96]
+ FMAX v20.4s, v20.4s, v7.4s
+ FMAX v21.4s, v21.4s, v7.4s
+ FMAX v22.4s, v22.4s, v7.4s
+ FMAX v23.4s, v23.4s, v7.4s
+ FMAX v24.4s, v24.4s, v7.4s
+ FMAX v25.4s, v25.4s, v7.4s
+ FMAX v26.4s, v26.4s, v7.4s
+ FMAX v27.4s, v27.4s, v7.4s
+ FMAX v28.4s, v28.4s, v7.4s
+ FMAX v29.4s, v29.4s, v7.4s
+ FMAX v30.4s, v30.4s, v7.4s
+ FMAX v31.4s, v31.4s, v7.4s
+
+ # Store full 6 x 8
+ SUBS x1, x1, 8
+ B.LO 8f
+
+ STP q30, q31, [x7]
+ ADD x7, x7, x0
+ STP q28, q29, [x13]
+ ADD x13, x13, x0
+ STP q26, q27, [x10]
+ ADD x10, x10, x0
+ STP q24, q25, [x17]
+ ADD x17, x17, x0
+ STP q22, q23, [x16]
+ ADD x16, x16, x0
+ STP q20, q21, [x6]
+ ADD x6, x6, x0
+
+ SUB x4, x4, x3 // a -= ks
+
+ # nc loop
+ B.HI 0b
+
+ # Restore x20,x21,x22,x23 from stack
+ LDP x22, x23, [sp, 80]
+ LDP x20, x21, [sp, 64]
+
+ # Restore d8-d15 from stack
+ LDP d14, d15, [sp, 48]
+ LDP d12, d13, [sp, 32]
+ LDP d10, d11, [sp, 16]
+ LDP d8, d9, [sp], 96
+ RET
+
+5:
+ # Load clamping_params values
+ LD2R {v6.4s, v7.4s}, [x8]
+
+ # Is there a remainder?- 4 floats of A (16 bytes)
+ TBZ x0, 4, 6f
+
+ # Remainder- 4 floats of A (16 bytes)
+ # Load A
+ LDR q0, [x14], 16
+ LDR q1, [x15], 16
+ LDR q2, [x20], 16
+ LDR q3, [x21], 16
+ LDR q4, [x22], 16
+ LDR q5, [x23], 16
+ # Load B
+ LDP q12, q13, [x5], 32
+ LDP q14, q15, [x5], 32
+ LDP q16, q17, [x5], 32
+ LDP q18, q19, [x5], 32
+
+ FMLA v20.4s, v12.4s, v0.s[0]
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+ FMLA v31.4s, v13.4s, v5.s[0]
+
+ FMLA v20.4s, v14.4s, v0.s[1]
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+
+ FMLA v20.4s, v16.4s, v0.s[2]
+ FMLA v22.4s, v16.4s, v1.s[2]
+ FMLA v24.4s, v16.4s, v2.s[2]
+ FMLA v26.4s, v16.4s, v3.s[2]
+ FMLA v28.4s, v16.4s, v4.s[2]
+ FMLA v30.4s, v16.4s, v5.s[2]
+ FMLA v21.4s, v17.4s, v0.s[2]
+ FMLA v23.4s, v17.4s, v1.s[2]
+ FMLA v25.4s, v17.4s, v2.s[2]
+ FMLA v27.4s, v17.4s, v3.s[2]
+ FMLA v29.4s, v17.4s, v4.s[2]
+ FMLA v31.4s, v17.4s, v5.s[2]
+
+ FMLA v20.4s, v18.4s, v0.s[3]
+ FMLA v22.4s, v18.4s, v1.s[3]
+ FMLA v24.4s, v18.4s, v2.s[3]
+ FMLA v26.4s, v18.4s, v3.s[3]
+ FMLA v28.4s, v18.4s, v4.s[3]
+ FMLA v30.4s, v18.4s, v5.s[3]
+ FMLA v21.4s, v19.4s, v0.s[3]
+ FMLA v23.4s, v19.4s, v1.s[3]
+ FMLA v25.4s, v19.4s, v2.s[3]
+ FMLA v27.4s, v19.4s, v3.s[3]
+ FMLA v29.4s, v19.4s, v4.s[3]
+ FMLA v31.4s, v19.4s, v5.s[3]
+
+ # Is there a remainder?- 2 floats of A (8 bytes)
+6:
+ TBZ x0, 3, 7f
+
+ # Remainder- 2 floats of A (8 bytes)
+ # Load A
+ LDR d0, [x14], 8
+ LDR d1, [x15], 8
+ LDR d2, [x20], 8
+ LDR d3, [x21], 8
+ LDR d4, [x22], 8
+ LDR d5, [x23], 8
+ # Load B
+ LDP q12, q13, [x5], 32
+ LDP q14, q15, [x5], 32
+
+ FMLA v20.4s, v12.4s, v0.s[0]
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+ FMLA v31.4s, v13.4s, v5.s[0]
+
+ FMLA v20.4s, v14.4s, v0.s[1]
+ FMLA v22.4s, v14.4s, v1.s[1]
+ FMLA v24.4s, v14.4s, v2.s[1]
+ FMLA v26.4s, v14.4s, v3.s[1]
+ FMLA v28.4s, v14.4s, v4.s[1]
+ FMLA v30.4s, v14.4s, v5.s[1]
+ FMLA v21.4s, v15.4s, v0.s[1]
+ FMLA v23.4s, v15.4s, v1.s[1]
+ FMLA v25.4s, v15.4s, v2.s[1]
+ FMLA v27.4s, v15.4s, v3.s[1]
+ FMLA v29.4s, v15.4s, v4.s[1]
+ FMLA v31.4s, v15.4s, v5.s[1]
+
+ # Is there a remainder?- 1 float of A (4 bytes)
+7:
+ TBZ x0, 2, 4b
+
+ # Remainder- 1 float of A (4 bytes)
+ # Load A
+ LDR s0, [x14], 4
+ LDR s1, [x15], 4
+ LDR s2, [x20], 4
+ LDR s3, [x21], 4
+ LDR s4, [x22], 4
+ LDR s5, [x23], 4
+ # Load B
+ LDP q12, q13, [x5], 32
+
+ FMLA v20.4s, v12.4s, v0.s[0]
+ FMLA v22.4s, v12.4s, v1.s[0]
+ FMLA v24.4s, v12.4s, v2.s[0]
+ FMLA v26.4s, v12.4s, v3.s[0]
+ FMLA v28.4s, v12.4s, v4.s[0]
+ FMLA v30.4s, v12.4s, v5.s[0]
+ FMLA v21.4s, v13.4s, v0.s[0]
+ FMLA v23.4s, v13.4s, v1.s[0]
+ FMLA v25.4s, v13.4s, v2.s[0]
+ FMLA v27.4s, v13.4s, v3.s[0]
+ FMLA v29.4s, v13.4s, v4.s[0]
+ FMLA v31.4s, v13.4s, v5.s[0]
+ B 4b
+
+ # Store odd width
+8:
+ TBZ x1, 2, 9f
+ STR q30, [x7], 16
+ MOV v30.16b, v31.16b
+ STR q28, [x13], 16
+ MOV v28.16b, v29.16b
+ STR q26, [x10], 16
+ MOV v26.16b, v27.16b
+ STR q24, [x17], 16
+ MOV v24.16b, v25.16b
+ STR q22, [x16], 16
+ MOV v22.16b, v23.16b
+ STR q20, [x6], 16
+ MOV v20.16b, v21.16b
+9:
+ TBZ x1, 1, 10f
+ STR d30, [x7], 8
+ DUP d30, v30.d[1]
+ STR d28, [x13], 8
+ DUP d28, v28.d[1]
+ STR d26, [x10], 8
+ DUP d26, v26.d[1]
+ STR d24, [x17], 8
+ DUP d24, v24.d[1]
+ STR d22, [x16], 8
+ DUP d22, v22.d[1]
+ STR d20, [x6], 8
+ DUP d20, v20.d[1]
+
+10:
+ TBZ x1, 0, 11f
+ STR s30, [x7]
+ STR s28, [x13]
+ STR s26, [x10]
+ STR s24, [x17]
+ STR s22, [x16]
+ STR s20, [x6]
+11:
+ # Restore x20,x21,x22,x23 from stack
+ LDP x22, x23, [sp, 80]
+ LDP x20, x21, [sp, 64]
+
+ # Restore d8-d15 from stack
+ LDP d14, d15, [sp, 48]
+ LDP d12, d13, [sp, 32]
+ LDP d10, d11, [sp, 16]
+ LDP d8, d9, [sp], 96
+ RET
+
+END_FUNCTION xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/init.c b/src/init.c
index 8aa43bc..18d7522 100644
--- a/src/init.c
+++ b/src/init.c
@@ -80,8 +80,7 @@
.nr = 8,
};
- // TODO: Test assembly for IOS.
- #if XNN_ENABLE_ASSEMBLY && !defined(TARGET_OS_IPHONE)
+ #if XNN_ENABLE_ASSEMBLY
xnn_params.q8.dwconv[0] = (struct dwconv_parameters) {
.up = (xnn_dwconv_up_ukernel_function) xnn_q8_dwconv_ukernel_up8x9__aarch32_neon,
.cr = 8,
@@ -388,120 +387,131 @@
/**************************** F32 micro-kernels ****************************/
#ifndef XNN_NO_F32_OPERATORS
- // TODO: Review assembly for IOS.
- #if defined(TARGET_OS_IPHONE)
- xnn_params.f32.gemm = (struct gemm_parameters) {
- .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64,
- .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64,
- .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
- .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
- .mr = 6,
- .nr = 8,
- };
- #elif XNN_ENABLE_ASSEMBLY
- switch (cpuinfo_get_core(0)->uarch) {
- case cpuinfo_uarch_cortex_a57:
- xnn_params.f32.gemm = (struct gemm_parameters) {
- .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57,
- .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57,
- .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
- .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
- .mr = 6,
- .nr = 8,
- };
- break;
- case cpuinfo_uarch_cortex_a72:
- xnn_params.f32.gemm = (struct gemm_parameters) {
- .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75,
- .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75,
- .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
- .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
- .mr = 4,
- .nr = 8,
- };
- break;
- case cpuinfo_uarch_cortex_a75:
- case cpuinfo_uarch_cortex_a76:
- case cpuinfo_uarch_exynos_m3:
- case cpuinfo_uarch_exynos_m4:
- xnn_params.f32.gemm = (struct gemm_parameters) {
- .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75,
- .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75,
- .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
- .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
- .mr = 6,
- .nr = 8,
- };
- break;
- case cpuinfo_uarch_exynos_m1:
- case cpuinfo_uarch_exynos_m2:
- xnn_params.f32.gemm = (struct gemm_parameters) {
- .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8s4__neonfma,
- .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8s4__neonfma,
- .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8s4__neonfma,
- .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8s4__neonfma,
- .mr = 6,
- .nr = 8,
- .log2_sr = 2,
- };
- break;
+ #if XNN_PLATFORM_IOS
+ #if XNN_ENABLE_ASSEMBLY
+ xnn_params.f32.gemm = (struct gemm_parameters) {
+ .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios,
+ .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios,
+ .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
+ .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
+ .mr = 6,
+ .nr = 8,
+ };
+ #else // !XNN_ENABLE_ASSEMBLY
+ xnn_params.f32.gemm = (struct gemm_parameters) {
+ .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64,
+ .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64,
+ .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
+ .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
+ .mr = 6,
+ .nr = 8,
+ };
+ #endif // XNN_ENABLE_ASSEMBLY
+ #else // !XNN_PLATFORM_IOS
+ #if XNN_ENABLE_ASSEMBLY
+ switch (cpuinfo_get_core(0)->uarch) {
+ case cpuinfo_uarch_cortex_a57:
+ xnn_params.f32.gemm = (struct gemm_parameters) {
+ .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57,
+ .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57,
+ .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
+ .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
+ .mr = 6,
+ .nr = 8,
+ };
+ break;
+ case cpuinfo_uarch_cortex_a72:
+ xnn_params.f32.gemm = (struct gemm_parameters) {
+ .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75,
+ .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75,
+ .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
+ .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
+ .mr = 4,
+ .nr = 8,
+ };
+ break;
+ case cpuinfo_uarch_cortex_a75:
+ case cpuinfo_uarch_cortex_a76:
+ case cpuinfo_uarch_exynos_m3:
+ case cpuinfo_uarch_exynos_m4:
+ xnn_params.f32.gemm = (struct gemm_parameters) {
+ .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75,
+ .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75,
+ .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
+ .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
+ .mr = 6,
+ .nr = 8,
+ };
+ break;
+ case cpuinfo_uarch_exynos_m1:
+ case cpuinfo_uarch_exynos_m2:
+ xnn_params.f32.gemm = (struct gemm_parameters) {
+ .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8s4__neonfma,
+ .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8s4__neonfma,
+ .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8s4__neonfma,
+ .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8s4__neonfma,
+ .mr = 6,
+ .nr = 8,
+ .log2_sr = 2,
+ };
+ break;
- case cpuinfo_uarch_cortex_a53:
- case cpuinfo_uarch_cortex_a55r0:
- xnn_params.f32.gemm = (struct gemm_parameters) {
- .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53,
- .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53,
- .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
- .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
- .mr = 6,
- .nr = 8,
- };
- break;
- case cpuinfo_uarch_cortex_a55:
- xnn_params.f32.gemm = (struct gemm_parameters) {
- .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55,
- .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55,
- .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
- .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
- .mr = 6,
- .nr = 8,
- };
- break;
- case cpuinfo_uarch_cortex_a73:
- xnn_params.f32.gemm = (struct gemm_parameters) {
- .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73,
- .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73,
- .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
- .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
- .mr = 6,
- .nr = 8,
- };
- break;
- default:
- case cpuinfo_uarch_cortex_a77:
- case cpuinfo_uarch_exynos_m5:
- case cpuinfo_uarch_kryo:
- xnn_params.f32.gemm = (struct gemm_parameters) {
- .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57,
- .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57,
- .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
- .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
- .mr = 4,
- .nr = 8,
- };
- break;
- }
- #else // XNN_ENABLE_ASSEMBLY
- xnn_params.f32.gemm = (struct gemm_parameters) {
- .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64,
- .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64,
- .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
- .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
- .mr = 6,
- .nr = 8,
- };
- #endif // XNN_ENABLE_ASSEMBLY
-
+ case cpuinfo_uarch_cortex_a53:
+ case cpuinfo_uarch_cortex_a55r0:
+ xnn_params.f32.gemm = (struct gemm_parameters) {
+ .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53,
+ .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53,
+ .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
+ .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
+ .mr = 6,
+ .nr = 8,
+ };
+ break;
+ case cpuinfo_uarch_cortex_a55:
+ xnn_params.f32.gemm = (struct gemm_parameters) {
+ .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55,
+ .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55,
+ .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
+ .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
+ .mr = 6,
+ .nr = 8,
+ };
+ break;
+ case cpuinfo_uarch_cortex_a73:
+ xnn_params.f32.gemm = (struct gemm_parameters) {
+ .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73,
+ .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73,
+ .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
+ .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
+ .mr = 6,
+ .nr = 8,
+ };
+ break;
+ default:
+ case cpuinfo_uarch_cortex_a77:
+ case cpuinfo_uarch_exynos_m5:
+ case cpuinfo_uarch_kryo:
+ xnn_params.f32.gemm = (struct gemm_parameters) {
+ .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57,
+ .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57,
+ .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
+ .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
+ .mr = 4,
+ .nr = 8,
+ };
+ break;
+ }
+ #else // !XNN_ENABLE_ASSEMBLY
+ xnn_params.f32.gemm = (struct gemm_parameters) {
+ .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64,
+ .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64,
+ .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
+ .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
+ .mr = 6,
+ .nr = 8,
+ };
+ #endif // XNN_ENABLE_ASSEMBLY
+ #endif // XNN_PLATFORM_IOS
xnn_params.f32.gemm2 = (struct gemm_parameters) {
.gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__neonfma_lane_ld64,
.igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64,
@@ -513,33 +523,41 @@
.cr = 4,
.mr = 4,
};
- switch (cpuinfo_get_core(0)->uarch) {
- case cpuinfo_uarch_kryo:
- xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
- .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__neonfma,
- .cr = 4,
- .mr = 9,
- };
- break;
-#if XNN_ENABLE_ASSEMBLY && !defined(TARGET_OS_IPHONE)
- case cpuinfo_uarch_cortex_a53:
- case cpuinfo_uarch_cortex_a55r0:
- case cpuinfo_uarch_cortex_a55:
- xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
- .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma_cortex_a55,
- .cr = 4,
- .mr = 9,
- };
- break;
-#endif
- default:
- xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
- .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up8x9__neonfma,
- .cr = 8,
- .mr = 9,
- };
- break;
- }
+ #if XNN_PLATFORM_IOS
+ xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
+ .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up8x9__neonfma,
+ .cr = 8,
+ .mr = 9,
+ };
+ #else // !XNN_PLATFORM_IOS
+ switch (cpuinfo_get_core(0)->uarch) {
+ case cpuinfo_uarch_kryo:
+ xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
+ .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__neonfma,
+ .cr = 4,
+ .mr = 9,
+ };
+ break;
+ #if XNN_ENABLE_ASSEMBLY
+ case cpuinfo_uarch_cortex_a53:
+ case cpuinfo_uarch_cortex_a55r0:
+ case cpuinfo_uarch_cortex_a55:
+ xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
+ .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma_cortex_a55,
+ .cr = 4,
+ .mr = 9,
+ };
+ break;
+ #endif // XNN_ENABLE_ASSEMBLY
+ default:
+ xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
+ .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up8x9__neonfma,
+ .cr = 8,
+ .mr = 9,
+ };
+ break;
+ }
+ #endif // XNN_PLATFORM_IOS
xnn_params.f32.dwconv[2] = (struct dwconv_parameters) {
.up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x25__psimd,
.cr = 4,
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 7751864..ca8383a 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -94,6 +94,7 @@
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128)
DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64)
@@ -233,6 +234,7 @@
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128)
DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64)
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index d07d53f..154544b 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -87,6 +87,7 @@
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53)
DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53)
diff --git a/test/f32-gemm.cc b/test/f32-gemm.cc
index ec9b4b4..e56931e 100644
--- a/test/f32-gemm.cc
+++ b/test/f32-gemm.cc
@@ -7037,6 +7037,507 @@
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .cn_stride(11)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t m = 1; m <= 6; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(16)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(16)
+ .a_stride(19)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t m = 1; m <= 6; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_lt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 17; k < 32; k++) { // 2*k_block+1 .. 4*k_block-1; was k < 16, which never executed
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 17; k < 32; k++) { // was k < 16: loop body never ran, test was a no-op
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .a_stride(37) // must be >= max k tested (31); 19 would be too small once loop runs
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 17; k < 32; k++) { // was k < 16: loop never entered, subtile cases untested
+ for (uint32_t m = 1; m <= 6; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 24; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 24; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 24; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, n_gt_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, n_gt_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, n_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, n_div_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, n_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, qmin) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, qmax) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .cm_stride(11)
+ .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
+
+
+#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
TEST_REQUIRES_ARM_NEON_FMA;
GemmMicrokernelTester()
diff --git a/test/f32-gemm.yaml b/test/f32-gemm.yaml
index 4ad8646..2408331 100644
--- a/test/f32-gemm.yaml
+++ b/test/f32-gemm.yaml
@@ -58,6 +58,10 @@
k-block: 8
pipelined: true
assembly: true
+- name: xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios
+ k-block: 8
+ pipelined: true
+ assembly: true
- name: xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53
k-block: 4
pipelined: true
diff --git a/test/f32-gemminc.cc b/test/f32-gemminc.cc
index ead6477..0a6da77 100644
--- a/test/f32-gemminc.cc
+++ b/test/f32-gemminc.cc
@@ -6491,6 +6491,507 @@
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .cn_stride(11)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t m = 1; m <= 6; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(16)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_eq_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(16)
+ .a_stride(19)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t m = 1; m <= 6; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_lt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 17; k < 32; k++) { // 2*k_block+1 .. 4*k_block-1; was k < 16, which never executed
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 17; k < 32; k++) { // was k < 16: loop body never ran, test was a no-op
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .a_stride(37) // must be >= max k tested (31); 19 would be too small once loop runs
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 17; k < 32; k++) { // was k < 16: loop never entered, subtile cases untested
+ for (uint32_t m = 1; m <= 6; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 24; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 24; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 24; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, n_gt_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, n_gt_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, n_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, n_div_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, n_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, qmin) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, qmax) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .cm_stride(11)
+ .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
+ }
+#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
+
+
+#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
TEST_REQUIRES_ARM_NEON_FMA;
GemmMicrokernelTester()
diff --git a/test/f32-gemminc.yaml b/test/f32-gemminc.yaml
index cb4670a..08d7c53 100644
--- a/test/f32-gemminc.yaml
+++ b/test/f32-gemminc.yaml
@@ -53,6 +53,10 @@
k-block: 8
pipelined: true
assembly: true
+- name: xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios
+ k-block: 8
+ pipelined: true
+ assembly: true
- name: xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53
k-block: 4
pipelined: true
diff --git a/test/f32-igemm.cc b/test/f32-igemm.cc
index 64d2bb2..fa54ce7 100644
--- a/test/f32-igemm.cc
+++ b/test/f32-igemm.cc
@@ -9473,6 +9473,505 @@
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .cn_stride(11)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t m = 1; m <= 6; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(16)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t m = 1; m <= 6; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+ }
+
+  TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_gt_16) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    // k-block is 8 and the kernel is pipelined (effective block 16), so the
+    // "k greater than block" range is 17..31; the previous bound (k < 16)
+    // made this loop body unreachable and the test a silent no-op.
+    for (size_t k = 17; k < 32; k++) {
+      GemmMicrokernelTester()
+        .mr(6)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(6)
+        .n(8)
+        .k(k)
+        .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+    }
+  }
+
+  TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    // Renamed from k_gt_8_subtile to match the sibling k_gt_16/k_lt_16 tests
+    // (k-block 8, pipelined => effective block 16). The loop previously ran
+    // 17..15, i.e. never executed; the correct range is 17..31.
+    for (size_t k = 17; k < 32; k++) {
+      for (uint32_t m = 1; m <= 6; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(6)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+        }
+      }
+    }
+  }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 24; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 24; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+ }
+
+  TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(6)
+          .n(n)  // was .n(8): loop variable was unused, so n > nr was never exercised
+          .k(k)
+          .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+      }
+    }
+  }
+
+  TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(6)
+          .n(n)  // was .n(8); n_div_8_strided_cn already passes the loop variable
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+      }
+    }
+  }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+ }
+
+  TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, n_div_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(6)
+          .n(n)  // was .n(8): multiples of nr (16, 24) were never actually tested
+          .k(k)
+          .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+      }
+    }
+  }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, small_kernel) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+ }
+
+  TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(6)
+          .n(n)  // was .n(8): loop variable was unused, so n > nr was never exercised
+          .k(k)
+          .ks(3)
+          .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+      }
+    }
+  }
+
+  TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(6)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(6)
+          .n(n)  // was .n(8): multiples of nr (16, 24) were never actually tested
+          .k(k)
+          .ks(3)
+          .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+      }
+    }
+  }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 6; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, a_offset) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(251)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, zero) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t mz = 0; mz < 6; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(251)
+ .zero_index(mz)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+ }
+ }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, qmin) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, qmax) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+
+ TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(6)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(6)
+ .n(8)
+ .k(8)
+ .cm_stride(11)
+ .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
+ }
+#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
+
+
+#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
TEST_REQUIRES_ARM_NEON_FMA;
GemmMicrokernelTester()
diff --git a/test/f32-igemm.yaml b/test/f32-igemm.yaml
index 5d742c3..4801bd2 100644
--- a/test/f32-igemm.yaml
+++ b/test/f32-igemm.yaml
@@ -73,6 +73,10 @@
k-block: 8
pipelined: true
assembly: true
+- name: xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios
+ k-block: 8
+ pipelined: true
+ assembly: true
- name: xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53
k-block: 4
pipelined: true