4x8 GEMM for Cortex-A53

36.6% faster than the previous 4x8 Cortex-A53 kernel.
Based on the 6x8 GEMM kernel, trimmed down to 4 rows.

PiperOrigin-RevId: 280465479
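
Reviewer note (illustrative only, not part of the patch): the new kernel unrolls its reduction loop over K by 4 and software-pipelines it, issuing the loads for the next k-step while the FMAs for the current step execute, which suits the in-order dual-issue Cortex-A53. Because of the pipelining, the test generator sizes its cases around an effective block of 8, which is why the k values below move from 2 to 4 and 8. The following is a minimal scalar C++ sketch of the 4x8 tile computation with the same loop structure; all names are hypothetical, operands are assumed row-major packed, and kc is counted in float elements here, unlike the byte count the real assembly microkernel takes.

    #include <cstddef>

    // Sketch of a 4x8 GEMM tile: acc[m][n] accumulates dot products over K.
    void gemm_4x8_sketch(size_t kc, const float* a, const float* b, float* c) {
      float acc[4][8] = {{0.0f}};
      size_t k = 0;
      // Main loop: K in blocks of 4 (the new k-block). In the assembly kernel
      // the loads for step k+1 are interleaved with the FMAs for step k.
      for (; k + 4 <= kc; k += 4) {
        for (size_t kk = 0; kk < 4; kk++) {
          for (size_t m = 0; m < 4; m++) {
            const float va = a[m * kc + k + kk];  // preloaded a step early in asm
            for (size_t n = 0; n < 8; n++) {
              acc[m][n] += va * b[(k + kk) * 8 + n];
            }
          }
        }
      }
      // Remainder: up to 3 trailing k-steps, handled without pipelining.
      for (; k < kc; k++) {
        for (size_t m = 0; m < 4; m++) {
          for (size_t n = 0; n < 8; n++) {
            acc[m][n] += a[m * kc + k] * b[k * 8 + n];
          }
        }
      }
      // Write out the 4x8 tile (clamping to qmin/qmax omitted for brevity).
      for (size_t m = 0; m < 4; m++) {
        for (size_t n = 0; n < 8; n++) {
          c[m * 8 + n] = acc[m][n];
        }
      }
    }
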
diff --git a/test/f32-gemm.cc b/test/f32-gemm.cc
index 0ddb3c1..5f027a7 100644
--- a/test/f32-gemm.cc
+++ b/test/f32-gemm.cc
@@ -1526,7 +1526,7 @@
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
- TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2) {
+ TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
TEST_REQUIRES_ARM_NEON_FMA;
GemmMicrokernelTester()
.mr(4)
@@ -1535,7 +1535,7 @@
.sr(1)
.m(4)
.n(8)
- .k(2)
+ .k(4)
.Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
@@ -1548,12 +1548,12 @@
.sr(1)
.m(4)
.n(8)
- .k(2)
+ .k(4)
.cn_stride(11)
.Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
- TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2_strided_a) {
+ TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
TEST_REQUIRES_ARM_NEON_FMA;
GemmMicrokernelTester()
.mr(4)
@@ -1562,12 +1562,12 @@
.sr(1)
.m(4)
.n(8)
- .k(2)
- .a_stride(5)
+ .k(4)
+ .a_stride(7)
.Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
- TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2_subtile) {
+ TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t m = 1; m <= 4; m++) {
for (uint32_t n = 1; n <= 8; n++) {
@@ -1578,14 +1578,14 @@
.sr(1)
.m(m)
.n(n)
- .k(2)
+ .k(4)
.iterations(1)
.Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
}
}
- TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2_subtile_m) {
+ TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t m = 1; m <= 4; m++) {
GemmMicrokernelTester()
@@ -1595,13 +1595,13 @@
.sr(1)
.m(m)
.n(8)
- .k(2)
+ .k(4)
.iterations(1)
.Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
}
- TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2_subtile_n) {
+ TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t n = 1; n <= 8; n++) {
GemmMicrokernelTester()
@@ -1611,15 +1611,60 @@
.sr(1)
.m(4)
.n(n)
- .k(2)
+ .k(4)
.iterations(1)
.Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
}
- TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_2) {
+ TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 1; k < 2; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
+ }
+
+ TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
+ }
+
+ TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
+ }
+ }
+ }
+
+ TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k < 8; k++) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1632,9 +1677,9 @@
}
}
- TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_2_strided_a) {
+ TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 1; k < 2; k++) {
+ for (size_t k = 1; k < 8; k++) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1643,14 +1688,14 @@
.m(4)
.n(8)
.k(k)
- .a_stride(5)
+ .a_stride(11)
.Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
}
- TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_2_subtile) {
+ TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 1; k < 2; k++) {
+ for (size_t k = 1; k < 8; k++) {
for (uint32_t m = 1; m <= 4; m++) {
for (uint32_t n = 1; n <= 8; n++) {
GemmMicrokernelTester()
@@ -1668,9 +1713,9 @@
}
}
- TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_2) {
+ TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 3; k < 4; k++) {
+ for (size_t k = 9; k < 16; k++) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1683,9 +1728,9 @@
}
}
- TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_2_strided_a) {
+ TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8_strided_a) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 3; k < 4; k++) {
+ for (size_t k = 9; k < 16; k++) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1694,14 +1739,14 @@
.m(4)
.n(8)
.k(k)
- .a_stride(7)
+ .a_stride(17)
.Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
}
- TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_2_subtile) {
+ TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8_subtile) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 3; k < 4; k++) {
+ for (size_t k = 9; k < 16; k++) {
for (uint32_t m = 1; m <= 4; m++) {
for (uint32_t n = 1; n <= 8; n++) {
GemmMicrokernelTester()
@@ -1719,9 +1764,9 @@
}
}
- TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_2) {
+ TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 4; k <= 20; k += 2) {
+ for (size_t k = 12; k <= 40; k += 4) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1734,9 +1779,9 @@
}
}
- TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_2_strided_a) {
+ TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 4; k <= 20; k += 2) {
+ for (size_t k = 12; k <= 40; k += 4) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1745,14 +1790,14 @@
.m(4)
.n(8)
.k(k)
- .a_stride(23)
+ .a_stride(43)
.Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
}
- TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_2_subtile) {
+ TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 4; k <= 20; k += 2) {
+ for (size_t k = 12; k <= 40; k += 4) {
for (uint32_t m = 1; m <= 4; m++) {
for (uint32_t n = 1; n <= 8; n++) {
GemmMicrokernelTester()
@@ -1773,7 +1818,7 @@
TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 10; k += 3) {
+ for (size_t k = 1; k <= 20; k += 5) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1790,7 +1835,7 @@
TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 10; k += 3) {
+ for (size_t k = 1; k <= 20; k += 5) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1808,7 +1853,7 @@
TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_a) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 10; k += 3) {
+ for (size_t k = 1; k <= 20; k += 5) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1817,7 +1862,7 @@
.m(4)
.n(n)
.k(k)
- .a_stride(13)
+ .a_stride(23)
.Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
}
@@ -1826,7 +1871,7 @@
TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 10; k += 3) {
+ for (size_t k = 1; k <= 20; k += 5) {
for (uint32_t m = 1; m <= 4; m++) {
GemmMicrokernelTester()
.mr(4)
@@ -1846,7 +1891,7 @@
TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 10; k += 3) {
+ for (size_t k = 1; k <= 20; k += 5) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1863,7 +1908,7 @@
TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 10; k += 3) {
+ for (size_t k = 1; k <= 20; k += 5) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1881,7 +1926,7 @@
TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_a) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 10; k += 3) {
+ for (size_t k = 1; k <= 20; k += 5) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1890,7 +1935,7 @@
.m(4)
.n(n)
.k(k)
- .a_stride(13)
+ .a_stride(23)
.Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
}
@@ -1899,7 +1944,7 @@
TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 10; k += 3) {
+ for (size_t k = 1; k <= 20; k += 5) {
for (uint32_t m = 1; m <= 4; m++) {
GemmMicrokernelTester()
.mr(4)
@@ -1918,7 +1963,7 @@
TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 1; k <= 10; k += 3) {
+ for (size_t k = 1; k <= 20; k += 5) {
for (uint32_t m = 1; m <= 4; m++) {
for (uint32_t n = 1; n <= 8; n++) {
GemmMicrokernelTester()
@@ -1946,7 +1991,7 @@
.sr(1)
.m(4)
.n(8)
- .k(2)
+ .k(4)
.qmin(128)
.Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
@@ -1960,7 +2005,7 @@
.sr(1)
.m(4)
.n(8)
- .k(2)
+ .k(4)
.qmax(128)
.Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
@@ -1974,7 +2019,7 @@
.sr(1)
.m(4)
.n(8)
- .k(2)
+ .k(4)
.cm_stride(11)
.Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
diff --git a/test/f32-gemm.yaml b/test/f32-gemm.yaml
index f62d47f..5a98c5e 100644
--- a/test/f32-gemm.yaml
+++ b/test/f32-gemm.yaml
@@ -15,7 +15,8 @@
pipelined: true
assembly: true
- name: xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53
- k-block: 2
+ k-block: 4
+ pipelined: true
assembly: true
- name: xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57
k-block: 8
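
A note on the YAML change above (inferred from the emitted tests, not from the generator script itself): `k-block` is the kernel's K-loop unroll, and `pipelined: true` appears to make the test generator size the k_eq/k_lt/k_gt/k_div cases around an adjusted block of twice that value. A compilable C++ sketch of the assumed relationship:

    #include <cstddef>

    constexpr size_t kb = 4;          // k-block from the YAML
    constexpr bool pipelined = true;  // from the YAML
    constexpr size_t adj_kb = pipelined ? 2 * kb : kb;  // effective block
    static_assert(adj_kb == 8, "the generated ranges assume a doubled block");

    // k_eq:  k = kb (4) and k = adj_kb (8)
    // k_lt:  k = 1 .. adj_kb - 1               -> 1..7
    // k_gt:  k = adj_kb + 1 .. 2*adj_kb - 1    -> 9..15
    // k_div: k = adj_kb + kb .. 10*kb, step kb -> 12, 16, ..., 40
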
diff --git a/test/f32-gemminc.cc b/test/f32-gemminc.cc
index 544dbdf..d65f52c 100644
--- a/test/f32-gemminc.cc
+++ b/test/f32-gemminc.cc
@@ -1526,7 +1526,7 @@
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
- TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2) {
+ TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
TEST_REQUIRES_ARM_NEON_FMA;
GemmMicrokernelTester()
.mr(4)
@@ -1535,7 +1535,7 @@
.sr(1)
.m(4)
.n(8)
- .k(2)
+ .k(4)
.Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
@@ -1548,12 +1548,12 @@
.sr(1)
.m(4)
.n(8)
- .k(2)
+ .k(4)
.cn_stride(11)
.Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
- TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2_strided_a) {
+ TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
TEST_REQUIRES_ARM_NEON_FMA;
GemmMicrokernelTester()
.mr(4)
@@ -1562,12 +1562,12 @@
.sr(1)
.m(4)
.n(8)
- .k(2)
- .a_stride(5)
+ .k(4)
+ .a_stride(7)
.Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
- TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2_subtile) {
+ TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t m = 1; m <= 4; m++) {
for (uint32_t n = 1; n <= 8; n++) {
@@ -1578,14 +1578,14 @@
.sr(1)
.m(m)
.n(n)
- .k(2)
+ .k(4)
.iterations(1)
.Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
}
}
- TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2_subtile_m) {
+ TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t m = 1; m <= 4; m++) {
GemmMicrokernelTester()
@@ -1595,13 +1595,13 @@
.sr(1)
.m(m)
.n(8)
- .k(2)
+ .k(4)
.iterations(1)
.Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
}
- TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2_subtile_n) {
+ TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t n = 1; n <= 8; n++) {
GemmMicrokernelTester()
@@ -1611,15 +1611,60 @@
.sr(1)
.m(4)
.n(n)
- .k(2)
+ .k(4)
.iterations(1)
.Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
}
- TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_2) {
+ TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 1; k < 2; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
+ }
+
+ TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
+ }
+
+ TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
+ }
+ }
+ }
+
+ TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON_FMA;
+ for (size_t k = 1; k < 8; k++) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1632,9 +1677,9 @@
}
}
- TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_2_strided_a) {
+ TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 1; k < 2; k++) {
+ for (size_t k = 1; k < 8; k++) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1643,14 +1688,14 @@
.m(4)
.n(8)
.k(k)
- .a_stride(5)
+ .a_stride(11)
.Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
}
- TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_2_subtile) {
+ TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 1; k < 2; k++) {
+ for (size_t k = 1; k < 8; k++) {
for (uint32_t m = 1; m <= 4; m++) {
for (uint32_t n = 1; n <= 8; n++) {
GemmMicrokernelTester()
@@ -1668,9 +1713,9 @@
}
}
- TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_2) {
+ TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 3; k < 4; k++) {
+ for (size_t k = 9; k < 16; k++) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1683,9 +1728,9 @@
}
}
- TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_2_strided_a) {
+ TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8_strided_a) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 3; k < 4; k++) {
+ for (size_t k = 9; k < 16; k++) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1694,14 +1739,14 @@
.m(4)
.n(8)
.k(k)
- .a_stride(7)
+ .a_stride(17)
.Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
}
- TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_2_subtile) {
+ TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8_subtile) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 3; k < 4; k++) {
+ for (size_t k = 9; k < 16; k++) {
for (uint32_t m = 1; m <= 4; m++) {
for (uint32_t n = 1; n <= 8; n++) {
GemmMicrokernelTester()
@@ -1719,9 +1764,9 @@
}
}
- TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_2) {
+ TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 4; k <= 20; k += 2) {
+ for (size_t k = 12; k <= 40; k += 4) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1734,9 +1779,9 @@
}
}
- TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_2_strided_a) {
+ TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 4; k <= 20; k += 2) {
+ for (size_t k = 12; k <= 40; k += 4) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1745,14 +1790,14 @@
.m(4)
.n(8)
.k(k)
- .a_stride(23)
+ .a_stride(43)
.Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
}
- TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_2_subtile) {
+ TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 4; k <= 20; k += 2) {
+ for (size_t k = 12; k <= 40; k += 4) {
for (uint32_t m = 1; m <= 4; m++) {
for (uint32_t n = 1; n <= 8; n++) {
GemmMicrokernelTester()
@@ -1773,7 +1818,7 @@
TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 10; k += 3) {
+ for (size_t k = 1; k <= 20; k += 5) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1790,7 +1835,7 @@
TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 10; k += 3) {
+ for (size_t k = 1; k <= 20; k += 5) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1808,7 +1853,7 @@
TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_a) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 10; k += 3) {
+ for (size_t k = 1; k <= 20; k += 5) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1817,7 +1862,7 @@
.m(4)
.n(n)
.k(k)
- .a_stride(13)
+ .a_stride(23)
.Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
}
@@ -1826,7 +1871,7 @@
TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 10; k += 3) {
+ for (size_t k = 1; k <= 20; k += 5) {
for (uint32_t m = 1; m <= 4; m++) {
GemmMicrokernelTester()
.mr(4)
@@ -1846,7 +1891,7 @@
TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 10; k += 3) {
+ for (size_t k = 1; k <= 20; k += 5) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1863,7 +1908,7 @@
TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 10; k += 3) {
+ for (size_t k = 1; k <= 20; k += 5) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1881,7 +1926,7 @@
TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_a) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 10; k += 3) {
+ for (size_t k = 1; k <= 20; k += 5) {
GemmMicrokernelTester()
.mr(4)
.nr(8)
@@ -1890,7 +1935,7 @@
.m(4)
.n(n)
.k(k)
- .a_stride(13)
+ .a_stride(23)
.Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
}
@@ -1899,7 +1944,7 @@
TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
TEST_REQUIRES_ARM_NEON_FMA;
for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 10; k += 3) {
+ for (size_t k = 1; k <= 20; k += 5) {
for (uint32_t m = 1; m <= 4; m++) {
GemmMicrokernelTester()
.mr(4)
@@ -1918,7 +1963,7 @@
TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t k = 1; k <= 10; k += 3) {
+ for (size_t k = 1; k <= 20; k += 5) {
for (uint32_t m = 1; m <= 4; m++) {
for (uint32_t n = 1; n <= 8; n++) {
GemmMicrokernelTester()
@@ -1946,7 +1991,7 @@
.sr(1)
.m(4)
.n(8)
- .k(2)
+ .k(4)
.qmin(128)
.Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
@@ -1960,7 +2005,7 @@
.sr(1)
.m(4)
.n(8)
- .k(2)
+ .k(4)
.qmax(128)
.Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
@@ -1974,7 +2019,7 @@
.sr(1)
.m(4)
.n(8)
- .k(2)
+ .k(4)
.cm_stride(11)
.Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
}
diff --git a/test/f32-gemminc.yaml b/test/f32-gemminc.yaml
index 91b4f5b..4e52c3d 100644
--- a/test/f32-gemminc.yaml
+++ b/test/f32-gemminc.yaml
@@ -15,7 +15,8 @@
pipelined: true
assembly: true
- name: xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53
- k-block: 2
+ k-block: 4
+ pipelined: true
assembly: true
- name: xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57
k-block: 8