1x8 LD64 F32 GEMM

Simplified 1x8 float GEMM LD64 microkernel.
Also clean up prefetches in the A75 kernel.

PiperOrigin-RevId: 305903285
diff --git a/test/f32-gemminc-minmax.yaml b/test/f32-gemminc-minmax.yaml
index b6f63de..20cb43a 100644
--- a/test/f32-gemminc-minmax.yaml
+++ b/test/f32-gemminc-minmax.yaml
@@ -65,6 +65,9 @@
   k-block: 4
   pipelined: true
   assembly: true
+- name: xnn_f32_gemminc_minmax_ukernel_1x8__aarch64_neonfma_ld64
+  k-block: 2
+  assembly: true
 - name: xnn_f32_gemminc_minmax_ukernel_4x8__aarch64_neonfma_ld64
   k-block: 2
   assembly: true