Replace each LDP of B with 2 LDR loads in the f16 GEMM 6x8 ld64 kernel, and move SUBS ahead of the first FMLA group.
4.35% faster on Cortex A76 (Pixel 4)
1.43% faster on Cortex A55
0.17% faster on Cortex A75 (Pixel 3)
2.22% faster on Exynos M4  (Samsung S10)

PiperOrigin-RevId: 308347621
diff --git a/src/f16-gemm/6x8-aarch64-neonfp16arith-ld64.S.in b/src/f16-gemm/6x8-aarch64-neonfp16arith-ld64.S.in
index 46ebfe8..3a9be54 100644
--- a/src/f16-gemm/6x8-aarch64-neonfp16arith-ld64.S.in
+++ b/src/f16-gemm/6x8-aarch64-neonfp16arith-ld64.S.in
@@ -125,23 +125,25 @@
         B.LO 5f
 
         # Main loop - 4 halffloats of A (8 bytes)
-        # 24 FMA + 6 ld64 A + 2 LDP B
+        # 24 FMA + 6 ld64 A + 4 LDR B
 1:
         LDR   d0,  [x3], 8
-        LDP  q16,  q17, [x5], 32
+        LDR  q16, [x5], 16
+        LDR  q17, [x5], 16
         LDR   d1,  [x9], 8
         LDR   d2, [x10], 8
         LDR   d3, [x11], 8
         LDR   d4, [x12], 8
         LDR   d5,  [x4], 8
-
+        SUBS x0, x0, 8
         FMLA v20.8h, v16.8h,  v0.h[0]
         FMLA v22.8h, v16.8h,  v1.h[0]
         FMLA v24.8h, v16.8h,  v2.h[0]
         FMLA v26.8h, v16.8h,  v3.h[0]
         FMLA v28.8h, v16.8h,  v4.h[0]
         FMLA v30.8h, v16.8h,  v5.h[0]
-        LDP  q18,  q19, [x5], 32
+        LDR  q18, [x5], 16
+        LDR  q19, [x5], 16
 
         FMLA v20.8h, v17.8h,  v0.h[1]
         FMLA v22.8h, v17.8h,  v1.h[1]
@@ -156,7 +158,6 @@
         FMLA v26.8h, v18.8h,  v3.h[2]
         FMLA v28.8h, v18.8h,  v4.h[2]
         FMLA v30.8h, v18.8h,  v5.h[2]
-        SUBS x0, x0, 8
 
         FMLA v20.8h, v19.8h,  v0.h[3]
         FMLA v22.8h, v19.8h,  v1.h[3]
@@ -232,7 +233,8 @@
 6:
         # Remainder- 2 halffloats of A (4 bytes)
         LDR   s0,  [x3], 4
-        LDP  q16,  q17, [x5], 32
+        LDR  q16, [x5], 16
+        LDR  q17, [x5], 16
         LDR   s1,  [x9], 4
         LDR   s2, [x10], 4
         LDR   s3, [x11], 4