4x8 GEMM for Cortex A53

36.6% faster than the previous 4x8 Cortex A53 kernel.

Based on the 6x8 GEMM kernel, trimmed down to 4 rows.

PiperOrigin-RevId: 280465479
diff --git a/test/f32-gemminc.yaml b/test/f32-gemminc.yaml
index 91b4f5b..4e52c3d 100644
--- a/test/f32-gemminc.yaml
+++ b/test/f32-gemminc.yaml
@@ -15,7 +15,8 @@
   pipelined: true
   assembly: true
 - name: xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53
-  k-block: 2
+  k-block: 4
+  pipelined: true
   assembly: true
 - name: xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57
   k-block: 8