Move CMP 2 instructions earlier in A/C pointer clamping.
This is slightly faster and allows x0 to be freed up.

Was (before this change):
sgemm_6x8__aarch64_neonfma_cortex_a53                    37309136         21
sgemm_4x8__aarch64_neonfma_cortex_a53                    43076234         21
sgemm_6x8__aarch64_neonfma_cortex_a57                    43374253         21
sgemm_4x12__aarch64_neonfma_cortex_a53                   47712927         21
sgemm_6x8__aarch64_neonfma_ld64                          53956872         21
sgemm_6x8__aarch64_neonfma_ld128                         55084005         21
sgemm_4x8__aarch64_neonfma_cortex_a75                    57447581         21
sgemm_4x8__aarch64_neonfma_ld64                          60568062         21
sgemm_4x8__aarch64_neonfma_ld128                         61570317         21
sgemm_4x8__aarch64_neonfma_cortex_a57                    63953086         21
sgemm_6x8__aarch64_neonfma_cortex_a73                    66992046         21
sgemm_6x8__aarch64_neonfma_cortex_a75                    67444127         21

Now (after this change):
sgemm_6x8__aarch64_neonfma_cortex_a53                    37199845         21
sgemm_4x8__aarch64_neonfma_cortex_a53                    43227426         21
sgemm_6x8__aarch64_neonfma_cortex_a57                    43374897         21
sgemm_4x12__aarch64_neonfma_cortex_a53                   47529428         21
sgemm_6x8__aarch64_neonfma_ld64                          53905617         21
sgemm_6x8__aarch64_neonfma_ld128                         54819725         21
sgemm_4x8__aarch64_neonfma_cortex_a75                    57878139         21
sgemm_4x8__aarch64_neonfma_ld64                          60977771         21
sgemm_4x8__aarch64_neonfma_ld128                         61288543         21
sgemm_4x8__aarch64_neonfma_cortex_a57                    63972383         21
sgemm_6x8__aarch64_neonfma_cortex_a75                    66466537         21
sgemm_6x8__aarch64_neonfma_cortex_a73                    68347399         21

PiperOrigin-RevId: 280861028
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S b/src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S
index e84252c..bd81473 100644
--- a/src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S
+++ b/src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S
@@ -61,9 +61,9 @@
 
         # Clamp A and C pointers / Save d8-d15 on stack
         STP  d8,  d9, [sp, -64]!
+        CMP x0, 2                // if mr < 2
         ADD x9, x3, x4           // a1 = a0 + a_stride
         ADD x16, x6, x7          // c1 = c0 + cm_stride
-        CMP x0, 2                // if mr < 2
         CSEL x9, x3, x9, LO      //   a1 = a0
         CSEL x16, x6, x16, LO    //   c1 = c0
 
@@ -75,9 +75,9 @@
         CSEL x17, x16, x17, LS   //   c2 = c1
 
         STP d12, d13, [sp, 32]
+        CMP x0, 4                // if mr < 4
         ADD x11, x10, x4         // a3 = a2 + a_stride
         ADD x18, x17, x7         // c3 = c2 + cm_stride
-        CMP x0, 4                // if mr < 4
         CSEL x11, x10, x11, LO   //   a3 = a2
         CSEL x18, x17, x18, LO   //   c3 = c2
 
@@ -91,9 +91,9 @@
         # Load params pointer
         LDR x8, [sp, 72]
 
+        CMP x0, 6                // if mr < 6
         ADD x4, x12, x4          // a5 = a4 + a_stride
         ADD x7, x13, x7          // c5 = c4 + cm_stride
-        CMP x0, 6                // if mr < 6
         CSEL x4, x12, x4, LO     //   a5 = a4
         CSEL x7, x13, x7, LO     //   c5 = c4