Issue CMP 2 instructions earlier in the A/C pointer clamping sequence.
Slightly faster, and allows x0 to be freed up.
Was
sgemm_6x8__aarch64_neonfma_cortex_a53 37309136 21
sgemm_4x8__aarch64_neonfma_cortex_a53 43076234 21
sgemm_6x8__aarch64_neonfma_cortex_a57 43374253 21
sgemm_4x12__aarch64_neonfma_cortex_a53 47712927 21
sgemm_6x8__aarch64_neonfma_ld64 53956872 21
sgemm_6x8__aarch64_neonfma_ld128 55084005 21
sgemm_4x8__aarch64_neonfma_cortex_a75 57447581 21
sgemm_4x8__aarch64_neonfma_ld64 60568062 21
sgemm_4x8__aarch64_neonfma_ld128 61570317 21
sgemm_4x8__aarch64_neonfma_cortex_a57 63953086 21
sgemm_6x8__aarch64_neonfma_cortex_a73 66992046 21
sgemm_6x8__aarch64_neonfma_cortex_a75 67444127 21
Now
sgemm_6x8__aarch64_neonfma_cortex_a53 37199845 21
sgemm_4x8__aarch64_neonfma_cortex_a53 43227426 21
sgemm_6x8__aarch64_neonfma_cortex_a57 43374897 21
sgemm_4x12__aarch64_neonfma_cortex_a53 47529428 21
sgemm_6x8__aarch64_neonfma_ld64 53905617 21
sgemm_6x8__aarch64_neonfma_ld128 54819725 21
sgemm_4x8__aarch64_neonfma_cortex_a75 57878139 21
sgemm_4x8__aarch64_neonfma_ld64 60977771 21
sgemm_4x8__aarch64_neonfma_ld128 61288543 21
sgemm_4x8__aarch64_neonfma_cortex_a57 63972383 21
sgemm_6x8__aarch64_neonfma_cortex_a75 66466537 21
sgemm_6x8__aarch64_neonfma_cortex_a73 68347399 21
PiperOrigin-RevId: 280861028
diff --git a/src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S b/src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S
index 53e6ea1..abd5f8f 100644
--- a/src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S
+++ b/src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S
@@ -63,9 +63,9 @@
# Clamp A and C pointers / Save d8-d15 on stack
STP d8, d9, [sp, -48]!
+ CMP x0, 2 // if mr < 2
ADD x9, x3, x4 // a1 = a0 + a_stride
ADD x16, x6, x7 // c1 = c0 + cm_stride
- CMP x0, 2 // if mr < 2
CSEL x9, x3, x9, LO // a1 = a0
CSEL x16, x6, x16, LO // c1 = c0
@@ -77,9 +77,9 @@
CSEL x17, x16, x17, LS // c2 = c1
STP d14, d15, [sp, 32]
+ CMP x0, 4 // if mr < 4
ADD x11, x10, x4 // a3 = a2 + a_stride
ADD x13, x17, x7 // c3 = c2 + cm_stride
- CMP x0, 4 // if mr < 4
CSEL x11, x10, x11, LO // a3 = a2
CSEL x13, x17, x13, LO // c3 = c2