A57: branch a version of the A53 kernel

The A53 kernel in its current form performs better than
the derivative of the A75 kernel with prefetch removed.  This new
kernel is LD64-based, unrolled, and prefetches every 64 bytes.

PiperOrigin-RevId: 277799014
diff --git a/test/f32-gemminc.yaml b/test/f32-gemminc.yaml
index 214aa33..726705e 100644
--- a/test/f32-gemminc.yaml
+++ b/test/f32-gemminc.yaml
@@ -16,7 +16,6 @@
   assembly: true
 - name: xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53
   k-block: 2
-  pipelined: false
   assembly: true
 - name: xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57
   k-block: 8
@@ -32,11 +31,9 @@
   assembly: true
 - name: xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53
   k-block: 4
-  pipelined: false
   assembly: true
 - name: xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57
-  k-block: 8
-  pipelined: true
+  k-block: 4
   assembly: true
 - name: xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73
   k-block: 8