Issue CMP 2 instructions ahead of the dependent CSEL in A/C pointer clamping.
This is slightly faster and allows x0 to be freed up.

Was (benchmark results before this change):
sgemm_6x8__aarch64_neonfma_cortex_a53                    37309136         21
sgemm_4x8__aarch64_neonfma_cortex_a53                    43076234         21
sgemm_6x8__aarch64_neonfma_cortex_a57                    43374253         21
sgemm_4x12__aarch64_neonfma_cortex_a53                   47712927         21
sgemm_6x8__aarch64_neonfma_ld64                          53956872         21
sgemm_6x8__aarch64_neonfma_ld128                         55084005         21
sgemm_4x8__aarch64_neonfma_cortex_a75                    57447581         21
sgemm_4x8__aarch64_neonfma_ld64                          60568062         21
sgemm_4x8__aarch64_neonfma_ld128                         61570317         21
sgemm_4x8__aarch64_neonfma_cortex_a57                    63953086         21
sgemm_6x8__aarch64_neonfma_cortex_a73                    66992046         21
sgemm_6x8__aarch64_neonfma_cortex_a75                    67444127         21

Now (benchmark results after this change):
sgemm_6x8__aarch64_neonfma_cortex_a53                    37199845         21
sgemm_4x8__aarch64_neonfma_cortex_a53                    43227426         21
sgemm_6x8__aarch64_neonfma_cortex_a57                    43374897         21
sgemm_4x12__aarch64_neonfma_cortex_a53                   47529428         21
sgemm_6x8__aarch64_neonfma_ld64                          53905617         21
sgemm_6x8__aarch64_neonfma_ld128                         54819725         21
sgemm_4x8__aarch64_neonfma_cortex_a75                    57878139         21
sgemm_4x8__aarch64_neonfma_ld64                          60977771         21
sgemm_4x8__aarch64_neonfma_ld128                         61288543         21
sgemm_4x8__aarch64_neonfma_cortex_a57                    63972383         21
sgemm_6x8__aarch64_neonfma_cortex_a75                    66466537         21
sgemm_6x8__aarch64_neonfma_cortex_a73                    68347399         21

PiperOrigin-RevId: 280861028
diff --git a/src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S b/src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S
index 8260ee8..9e460ea 100644
--- a/src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S
+++ b/src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S
@@ -70,16 +70,16 @@
         STP d14, d15, [sp, 64]
 
         # Clamp C pointers
-        ADD x16, x6, x7          // c1 = c0 + cm_stride
         CMP x0, 2                // if mr < 2
+        ADD x16, x6, x7          // c1 = c0 + cm_stride
         CSEL x16, x6, x16, LO    //   c1 = c0
 
         ADD x17, x16, x7         // c2 = c1 + cm_stride
                                  // if mr <= 2
         CSEL x17, x16, x17, LS   //   c2 = c1
 
-        ADD x7, x17, x7          // c3 = c2 + cm_stride
         CMP x0, 4                // if mr < 4
+        ADD x7, x17, x7          // c3 = c2 + cm_stride
         CSEL x7, x17, x7, LO     //   c3 = c2
 
 0: