Assembly GEMM kernel NC loop: use a single SUBS instead of CMP + SUBS

PiperOrigin-RevId: 283908353
diff --git a/src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S b/src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S
index 9e460ea..0229b7e 100644
--- a/src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S
+++ b/src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S
@@ -421,7 +421,7 @@
         FMAX v31.4s, v31.4s, v5.4s
 
         # Store full 4 x 8
-        CMP x1, 8
+        SUBS x1, x1, 8
         B.LO 8f
 
         STP q30, q31,  [x7]
@@ -436,7 +436,6 @@
         SUB x4, x4, x3  // a -= ks
 
         # nc loop
-        SUBS x1, x1, 8
         B.HI 0b
 
         # Restore d8-d15 from stack