Move the CMP instruction 2 instructions earlier in the A/C pointer clamping sequences.
Slightly faster, and allows x0 (mr) to be freed up sooner.
Was
sgemm_6x8__aarch64_neonfma_cortex_a53 37309136 21
sgemm_4x8__aarch64_neonfma_cortex_a53 43076234 21
sgemm_6x8__aarch64_neonfma_cortex_a57 43374253 21
sgemm_4x12__aarch64_neonfma_cortex_a53 47712927 21
sgemm_6x8__aarch64_neonfma_ld64 53956872 21
sgemm_6x8__aarch64_neonfma_ld128 55084005 21
sgemm_4x8__aarch64_neonfma_cortex_a75 57447581 21
sgemm_4x8__aarch64_neonfma_ld64 60568062 21
sgemm_4x8__aarch64_neonfma_ld128 61570317 21
sgemm_4x8__aarch64_neonfma_cortex_a57 63953086 21
sgemm_6x8__aarch64_neonfma_cortex_a73 66992046 21
sgemm_6x8__aarch64_neonfma_cortex_a75 67444127 21
Now
sgemm_6x8__aarch64_neonfma_cortex_a53 37199845 21
sgemm_4x8__aarch64_neonfma_cortex_a53 43227426 21
sgemm_6x8__aarch64_neonfma_cortex_a57 43374897 21
sgemm_4x12__aarch64_neonfma_cortex_a53 47529428 21
sgemm_6x8__aarch64_neonfma_ld64 53905617 21
sgemm_6x8__aarch64_neonfma_ld128 54819725 21
sgemm_4x8__aarch64_neonfma_cortex_a75 57878139 21
sgemm_4x8__aarch64_neonfma_ld64 60977771 21
sgemm_4x8__aarch64_neonfma_ld128 61288543 21
sgemm_4x8__aarch64_neonfma_cortex_a57 63972383 21
sgemm_6x8__aarch64_neonfma_cortex_a75 66466537 21
sgemm_6x8__aarch64_neonfma_cortex_a73 68347399 21
PiperOrigin-RevId: 280861028
diff --git a/src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in b/src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in
index 5205a13..eb0ecc2 100644
--- a/src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in
+++ b/src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in
@@ -61,9 +61,9 @@
BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld64
# Clamp A and C pointers
+ CMP x0, 2 // if mr < 2
ADD x9, x3, x4 // a1 = a0 + a_stride
ADD x16, x6, x7 // c1 = c0 + cm_stride
- CMP x0, 2 // if mr < 2
CSEL x9, x3, x9, LO // a1 = a0
CSEL x16, x6, x16, LO // c1 = c0
@@ -73,9 +73,9 @@
CSEL x10, x9, x10, LS // a2 = a1
CSEL x17, x16, x17, LS // c2 = c1
+ CMP x0, 4 // if mr < 4
ADD x11, x10, x4 // a3 = a2 + a_stride
ADD x18, x17, x7 // c3 = c2 + cm_stride
- CMP x0, 4 // if mr < 4
CSEL x11, x10, x11, LO // a3 = a2
CSEL x18, x17, x18, LO // c3 = c2
@@ -92,9 +92,9 @@
# Load params pointer
LDR x8, [sp, 8]
+ CMP x0, 6 // if mr < 6
ADD x4, x12, x4 // a5 = a4 + a_stride
ADD x7, x13, x7 // c5 = c4 + cm_stride
- CMP x0, 6 // if mr < 6
CSEL x4, x12, x4, LO // a5 = a4
CSEL x7, x13, x7, LO // c5 = c4