4x8 GEMM for Cortex A53

36.6% faster than the previous 4x8 Cortex-A53 kernel.

Based on the 6x8 GEMM kernel, trimmed down to 4 rows.

PiperOrigin-RevId: 280465479
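
For reference, the regenerated tests below exercise the contract of a 4x8
f32 GEMM microkernel: a 4-row by 8-column tile of C computed from 4 rows
of A and a packed B panel, with optional output clamping (.qmin()/.qmax()).
A minimal scalar sketch of that contract, assuming a hypothetical function
name and the layouts implied by the tester's .a_stride()/.cm_stride() knobs
(the real kernel is hand-written AArch64 NEONFMA assembly, not this C):

    #include <stddef.h>

    // Scalar reference for the 4x8 tile the assembly kernel produces:
    // c[m][n] = sum over kk < k of a[m][kk] * b[kk][n], for m < 4, n < 8.
    // a_stride/cm_stride are element strides between consecutive rows.
    static void gemm_4x8_reference(
        size_t k,
        const float* a, size_t a_stride,  // 4 rows of A, k floats read per row
        const float* b,                   // packed B: k groups of 8 floats
        float* c, size_t cm_stride)       // 4 rows of C, 8 floats written per row
    {
      for (size_t m = 0; m < 4; m++) {
        for (size_t n = 0; n < 8; n++) {
          float acc = 0.0f;  // the gemminc variant would seed this from
                             // a caller-provided accumulator instead
          for (size_t kk = 0; kk < k; kk++) {
            acc += a[m * a_stride + kk] * b[kk * 8 + n];
          }
          c[m * cm_stride + n] = acc;
        }
      }
    }

The switch from k-block 2 to k-block 4 with pipelined: true in the YAML
explains the shape of the regenerated tests: the generator evidently treats
a pipelined kernel as having an adjusted k-block of 2*4 = 8, so k_eq_4 and
k_eq_8 hit the block sizes directly, k_lt_8 covers the remainder path,
k_gt_8 covers one full pass plus a remainder, and k_div_4 covers whole
multiples.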
diff --git a/test/f32-gemm.cc b/test/f32-gemm.cc
index 0ddb3c1..5f027a7 100644
--- a/test/f32-gemm.cc
+++ b/test/f32-gemm.cc
@@ -1526,7 +1526,7 @@
 
 
 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
-  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2) {
+  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -1535,7 +1535,7 @@
       .sr(1)
       .m(4)
       .n(8)
-      .k(2)
+      .k(4)
       .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
   }
 
@@ -1548,12 +1548,12 @@
       .sr(1)
       .m(4)
       .n(8)
-      .k(2)
+      .k(4)
       .cn_stride(11)
       .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
   }
 
-  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2_strided_a) {
+  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -1562,12 +1562,12 @@
       .sr(1)
       .m(4)
       .n(8)
-      .k(2)
-      .a_stride(5)
+      .k(4)
+      .a_stride(7)
       .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
   }
 
-  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2_subtile) {
+  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -1578,14 +1578,14 @@
           .sr(1)
           .m(m)
           .n(n)
-          .k(2)
+          .k(4)
           .iterations(1)
           .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
       }
     }
   }
 
-  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2_subtile_m) {
+  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -1595,13 +1595,13 @@
         .sr(1)
         .m(m)
         .n(8)
-        .k(2)
+        .k(4)
         .iterations(1)
         .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
     }
   }
 
-  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2_subtile_n) {
+  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1611,15 +1611,60 @@
         .sr(1)
         .m(4)
         .n(n)
-        .k(2)
+        .k(4)
         .iterations(1)
         .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
     }
   }
 
-  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_2) {
+  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 1; k < 2; k++) {
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
+  }
+
+  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
+  }
+
+  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
+      }
+    }
+  }
+
+  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
@@ -1632,9 +1677,9 @@
     }
   }
 
-  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_2_strided_a) {
+  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 1; k < 2; k++) {
+    for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
@@ -1643,14 +1688,14 @@
         .m(4)
         .n(8)
         .k(k)
-        .a_stride(5)
+        .a_stride(11)
         .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
     }
   }
 
-  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_2_subtile) {
+  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 1; k < 2; k++) {
+    for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
@@ -1668,9 +1713,9 @@
     }
   }
 
-  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_2) {
+  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 3; k < 4; k++) {
+    for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
@@ -1683,9 +1728,9 @@
     }
   }
 
-  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_2_strided_a) {
+  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 3; k < 4; k++) {
+    for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
@@ -1694,14 +1739,14 @@
         .m(4)
         .n(8)
         .k(k)
-        .a_stride(7)
+        .a_stride(19)
         .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
     }
   }
 
-  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_2_subtile) {
+  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 3; k < 4; k++) {
+    for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
@@ -1719,9 +1764,9 @@
     }
   }
 
-  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_2) {
+  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 4; k <= 20; k += 2) {
+    for (size_t k = 12; k <= 40; k += 4) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
@@ -1734,9 +1779,9 @@
     }
   }
 
-  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_2_strided_a) {
+  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 4; k <= 20; k += 2) {
+    for (size_t k = 12; k <= 40; k += 4) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
@@ -1745,14 +1790,14 @@
         .m(4)
         .n(8)
         .k(k)
-        .a_stride(23)
+        .a_stride(43)
         .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
     }
   }
 
-  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_2_subtile) {
+  TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 4; k <= 20; k += 2) {
+    for (size_t k = 12; k <= 40; k += 4) {
       for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
@@ -1773,7 +1818,7 @@
   TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 10; k += 3) {
+      for (size_t k = 1; k <= 20; k += 5) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
@@ -1790,7 +1835,7 @@
   TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 10; k += 3) {
+      for (size_t k = 1; k <= 20; k += 5) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
@@ -1808,7 +1853,7 @@
   TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 10; k += 3) {
+      for (size_t k = 1; k <= 20; k += 5) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
@@ -1817,7 +1862,7 @@
           .m(4)
           .n(n)
           .k(k)
-          .a_stride(13)
+          .a_stride(23)
           .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
       }
     }
@@ -1826,7 +1871,7 @@
   TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 10; k += 3) {
+      for (size_t k = 1; k <= 20; k += 5) {
         for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
             .mr(4)
@@ -1846,7 +1891,7 @@
   TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 10; k += 3) {
+      for (size_t k = 1; k <= 20; k += 5) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
@@ -1863,7 +1908,7 @@
   TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 10; k += 3) {
+      for (size_t k = 1; k <= 20; k += 5) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
@@ -1881,7 +1926,7 @@
   TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 10; k += 3) {
+      for (size_t k = 1; k <= 20; k += 5) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
@@ -1890,7 +1935,7 @@
           .m(4)
           .n(n)
           .k(k)
-          .a_stride(13)
+          .a_stride(23)
           .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
       }
     }
@@ -1899,7 +1944,7 @@
   TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 10; k += 3) {
+      for (size_t k = 1; k <= 20; k += 5) {
         for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
             .mr(4)
@@ -1918,7 +1963,7 @@
 
   TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 1; k <= 10; k += 3) {
+    for (size_t k = 1; k <= 20; k += 5) {
       for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
@@ -1946,7 +1991,7 @@
       .sr(1)
       .m(4)
       .n(8)
-      .k(2)
+      .k(4)
       .qmin(128)
       .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
   }
@@ -1960,7 +2005,7 @@
       .sr(1)
       .m(4)
       .n(8)
-      .k(2)
+      .k(4)
       .qmax(128)
       .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
   }
@@ -1974,7 +2019,7 @@
       .sr(1)
       .m(4)
       .n(8)
-      .k(2)
+      .k(4)
       .cm_stride(11)
       .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
   }
diff --git a/test/f32-gemm.yaml b/test/f32-gemm.yaml
index f62d47f..5a98c5e 100644
--- a/test/f32-gemm.yaml
+++ b/test/f32-gemm.yaml
@@ -15,7 +15,8 @@
   pipelined: true
   assembly: true
 - name: xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53
-  k-block: 2
+  k-block: 4
+  pipelined: true
   assembly: true
 - name: xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57
   k-block: 8
diff --git a/test/f32-gemminc.cc b/test/f32-gemminc.cc
index 544dbdf..d65f52c 100644
--- a/test/f32-gemminc.cc
+++ b/test/f32-gemminc.cc
@@ -1526,7 +1526,7 @@
 
 
 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
-  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2) {
+  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -1535,7 +1535,7 @@
       .sr(1)
       .m(4)
       .n(8)
-      .k(2)
+      .k(4)
       .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
   }
 
@@ -1548,12 +1548,12 @@
       .sr(1)
       .m(4)
       .n(8)
-      .k(2)
+      .k(4)
       .cn_stride(11)
       .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
   }
 
-  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2_strided_a) {
+  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     GemmMicrokernelTester()
       .mr(4)
@@ -1562,12 +1562,12 @@
       .sr(1)
       .m(4)
       .n(8)
-      .k(2)
-      .a_stride(5)
+      .k(4)
+      .a_stride(7)
       .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
   }
 
-  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2_subtile) {
+  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 8; n++) {
@@ -1578,14 +1578,14 @@
           .sr(1)
           .m(m)
           .n(n)
-          .k(2)
+          .k(4)
           .iterations(1)
           .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
       }
     }
   }
 
-  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2_subtile_m) {
+  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -1595,13 +1595,13 @@
         .sr(1)
         .m(m)
         .n(8)
-        .k(2)
+        .k(4)
         .iterations(1)
         .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_2_subtile_n) {
+  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 1; n <= 8; n++) {
       GemmMicrokernelTester()
@@ -1611,15 +1611,60 @@
         .sr(1)
         .m(4)
         .n(n)
-        .k(2)
+        .k(4)
         .iterations(1)
         .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_2) {
+  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 1; k < 2; k++) {
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
+  }
+
+  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
+  }
+
+  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
+      }
+    }
+  }
+
+  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON_FMA;
+    for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
@@ -1632,9 +1677,9 @@
     }
   }
 
-  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_2_strided_a) {
+  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 1; k < 2; k++) {
+    for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
@@ -1643,14 +1688,14 @@
         .m(4)
         .n(8)
         .k(k)
-        .a_stride(5)
+        .a_stride(11)
         .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_2_subtile) {
+  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 1; k < 2; k++) {
+    for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
@@ -1668,9 +1713,9 @@
     }
   }
 
-  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_2) {
+  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 3; k < 4; k++) {
+    for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
@@ -1683,9 +1728,9 @@
     }
   }
 
-  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_2_strided_a) {
+  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 3; k < 4; k++) {
+    for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
@@ -1694,14 +1739,14 @@
         .m(4)
         .n(8)
         .k(k)
-        .a_stride(7)
+        .a_stride(19)
         .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_2_subtile) {
+  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 3; k < 4; k++) {
+    for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
@@ -1719,9 +1764,9 @@
     }
   }
 
-  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_2) {
+  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 4; k <= 20; k += 2) {
+    for (size_t k = 12; k <= 40; k += 4) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
@@ -1734,9 +1779,9 @@
     }
   }
 
-  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_2_strided_a) {
+  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 4; k <= 20; k += 2) {
+    for (size_t k = 12; k <= 40; k += 4) {
       GemmMicrokernelTester()
         .mr(4)
         .nr(8)
@@ -1745,14 +1790,14 @@
         .m(4)
         .n(8)
         .k(k)
-        .a_stride(23)
+        .a_stride(43)
         .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
     }
   }
 
-  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_2_subtile) {
+  TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 4; k <= 20; k += 2) {
+    for (size_t k = 12; k <= 40; k += 4) {
       for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
@@ -1773,7 +1818,7 @@
   TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 10; k += 3) {
+      for (size_t k = 1; k <= 20; k += 5) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
@@ -1790,7 +1835,7 @@
   TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 10; k += 3) {
+      for (size_t k = 1; k <= 20; k += 5) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
@@ -1808,7 +1853,7 @@
   TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 10; k += 3) {
+      for (size_t k = 1; k <= 20; k += 5) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
@@ -1817,7 +1862,7 @@
           .m(4)
           .n(n)
           .k(k)
-          .a_stride(13)
+          .a_stride(23)
           .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
       }
     }
@@ -1826,7 +1871,7 @@
   TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 9; n < 16; n++) {
-      for (size_t k = 1; k <= 10; k += 3) {
+      for (size_t k = 1; k <= 20; k += 5) {
         for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
             .mr(4)
@@ -1846,7 +1891,7 @@
   TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 10; k += 3) {
+      for (size_t k = 1; k <= 20; k += 5) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
@@ -1863,7 +1908,7 @@
   TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 10; k += 3) {
+      for (size_t k = 1; k <= 20; k += 5) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
@@ -1881,7 +1926,7 @@
   TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_a) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 10; k += 3) {
+      for (size_t k = 1; k <= 20; k += 5) {
         GemmMicrokernelTester()
           .mr(4)
           .nr(8)
@@ -1890,7 +1935,7 @@
           .m(4)
           .n(n)
           .k(k)
-          .a_stride(13)
+          .a_stride(23)
           .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
       }
     }
@@ -1899,7 +1944,7 @@
   TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
     for (uint32_t n = 16; n <= 24; n += 8) {
-      for (size_t k = 1; k <= 10; k += 3) {
+      for (size_t k = 1; k <= 20; k += 5) {
         for (uint32_t m = 1; m <= 4; m++) {
           GemmMicrokernelTester()
             .mr(4)
@@ -1918,7 +1963,7 @@
 
   TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON_FMA;
-    for (size_t k = 1; k <= 10; k += 3) {
+    for (size_t k = 1; k <= 20; k += 5) {
       for (uint32_t m = 1; m <= 4; m++) {
         for (uint32_t n = 1; n <= 8; n++) {
           GemmMicrokernelTester()
@@ -1946,7 +1991,7 @@
       .sr(1)
       .m(4)
       .n(8)
-      .k(2)
+      .k(4)
       .qmin(128)
       .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
   }
@@ -1960,7 +2005,7 @@
       .sr(1)
       .m(4)
       .n(8)
-      .k(2)
+      .k(4)
       .qmax(128)
       .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
   }
@@ -1974,7 +2019,7 @@
       .sr(1)
       .m(4)
       .n(8)
-      .k(2)
+      .k(4)
       .cm_stride(11)
       .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
   }
diff --git a/test/f32-gemminc.yaml b/test/f32-gemminc.yaml
index 91b4f5b..4e52c3d 100644
--- a/test/f32-gemminc.yaml
+++ b/test/f32-gemminc.yaml
@@ -15,7 +15,8 @@
   pipelined: true
   assembly: true
 - name: xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53
-  k-block: 2
+  k-block: 4
+  pipelined: true
   assembly: true
 - name: xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57
   k-block: 8