Direct branch to source remainder handler for GEMM/IGEMM. All ld64, ld128 and A53 microkernels move the source remainder code to a forward branch target (label 5) and branch directly to that handler when there are fewer channels than the main loop consumes. For 2-channel main loops, if there is a remainder it must be 1 channel, so an additional check is not required. For 4-channel main loops, if there is a remainder and it does not include 2 channels, it must be 1 channel, so an additional check is not required. Standardize on label 4 for the clamp code and label 5 for the remainder. This should be a small performance improvement for the 1-channel case, which now branches directly to the remainder code, and for the case where there is no remainder, where the branch is not taken. PiperOrigin-RevId: 294549181

commit: 8155854bfecacedaf1879eba2a0f1a2223b42a75 [log] [tgz]
author: Frank Barchard <fbarchard@google.com> Tue Feb 11 16:35:26 2020 -0800
committer: XNNPACK Team <xnnpack-github-robot@google.com> Tue Feb 11 16:35:56 2020 -0800
tree: 40c0aa29c07052e3bd04575631c1bdaeb2017e72
parent: 79ade18026f844af50574866c4f6aaf24a0bd974 [diff] [blame]
diff --git a/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld128.S b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld128.S
index 198884e..87d7ddb 100644
--- a/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld128.S
+++ b/src/f32-gemm/gen-inc/4x8-aarch64-neonfma-ld128.S

@@ -75,7 +75,7 @@
 
         # Is there at least 4 floats (16 bytes)?
         SUBS x0, x2, 16  // k = kc - 16
-        B.LO 2f
+        B.LO 5f
 
         # Main loop - 4 floats of A (16 bytes)
 1:
@@ -122,52 +122,10 @@
         FMLA v31.4s, v27.4s, v3.s[3]
         B.HS 1b
 
-        # Remainder- 2 floats of A (8 bytes)
-2:
-        TBZ x0, 3, 3f
+        TST x0, 15
+        B.NE 5f
 
-        LDR d0,  [x3], 8
-        LDP q20, q21, [x5], 32
-        LDR d1, [x11], 8
-        LDR d2, [x12], 8
-        LDR d3,  [x4], 8
-        FMLA v16.4s, v20.4s, v0.s[0]
-        FMLA v17.4s, v21.4s, v0.s[0]
-        FMLA v18.4s, v20.4s, v1.s[0]
-        FMLA v19.4s, v21.4s, v1.s[0]
-        LDP q22, q23, [x5], 32
-        FMLA v28.4s, v20.4s, v2.s[0]
-        FMLA v29.4s, v21.4s, v2.s[0]
-        FMLA v30.4s, v20.4s, v3.s[0]
-        FMLA v31.4s, v21.4s, v3.s[0]
-        FMLA v16.4s, v22.4s, v0.s[1]
-        FMLA v17.4s, v23.4s, v0.s[1]
-        FMLA v18.4s, v22.4s, v1.s[1]
-        FMLA v19.4s, v23.4s, v1.s[1]
-        FMLA v28.4s, v22.4s, v2.s[1]
-        FMLA v29.4s, v23.4s, v2.s[1]
-        FMLA v30.4s, v22.4s, v3.s[1]
-        FMLA v31.4s, v23.4s, v3.s[1]
-
-        # Remainder- 1 float of A (4 bytes)
-3:
-        TBZ x0, 2, 6f
-
-        LDR s0,  [x3], 4
-        LDP q20, q21, [x5], 32
-        LDR s1, [x11], 4
-        LDR s2, [x12], 4
-        LDR s3,  [x4], 4
-        FMLA v16.4s, v20.4s, v0.s[0]
-        FMLA v17.4s, v21.4s, v0.s[0]
-        FMLA v18.4s, v20.4s, v1.s[0]
-        FMLA v19.4s, v21.4s, v1.s[0]
-        FMLA v28.4s, v20.4s, v2.s[0]
-        FMLA v29.4s, v21.4s, v2.s[0]
-        FMLA v30.4s, v20.4s, v3.s[0]
-        FMLA v31.4s, v21.4s, v3.s[0]
-
-6:
+4:
         # Clamp
         FMIN v16.4s, v16.4s, v4.4s
         SUBS x1, x1, 8
@@ -200,9 +158,58 @@
         SUB  x4,  x4, x2 // a3 -= kc
 
         B.HI 0b
-
         RET
 
+        # Remainder- 2 floats of A (8 bytes)
+5:
+        # Is there a remainder?- 2 floats of A (8 bytes)
+        TBZ x0, 3, 6f
+
+        # Remainder- 2 floats of A (8 bytes)
+        LDR d0,  [x3], 8
+        LDP q20, q21, [x5], 32
+        LDR d1, [x11], 8
+        LDR d2, [x12], 8
+        LDR d3,  [x4], 8
+        FMLA v16.4s, v20.4s, v0.s[0]
+        FMLA v17.4s, v21.4s, v0.s[0]
+        FMLA v18.4s, v20.4s, v1.s[0]
+        FMLA v19.4s, v21.4s, v1.s[0]
+        LDP q22, q23, [x5], 32
+        FMLA v28.4s, v20.4s, v2.s[0]
+        FMLA v29.4s, v21.4s, v2.s[0]
+        FMLA v30.4s, v20.4s, v3.s[0]
+        FMLA v31.4s, v21.4s, v3.s[0]
+        FMLA v16.4s, v22.4s, v0.s[1]
+        FMLA v17.4s, v23.4s, v0.s[1]
+        FMLA v18.4s, v22.4s, v1.s[1]
+        FMLA v19.4s, v23.4s, v1.s[1]
+        FMLA v28.4s, v22.4s, v2.s[1]
+        FMLA v29.4s, v23.4s, v2.s[1]
+        FMLA v30.4s, v22.4s, v3.s[1]
+        FMLA v31.4s, v23.4s, v3.s[1]
+
+        # Is there a remainder?- 1 floats of A (4 bytes)
+        TBZ x0, 2, 4b
+
+        # Remainder- 1 float of A (4 bytes)
+6:
+        LDR s0,  [x3], 4
+        LDP q20, q21, [x5], 32
+        LDR s1, [x11], 4
+        LDR s2, [x12], 4
+        LDR s3,  [x4], 4
+        FMLA v16.4s, v20.4s, v0.s[0]
+        FMLA v17.4s, v21.4s, v0.s[0]
+        FMLA v18.4s, v20.4s, v1.s[0]
+        FMLA v19.4s, v21.4s, v1.s[0]
+        FMLA v28.4s, v20.4s, v2.s[0]
+        FMLA v29.4s, v21.4s, v2.s[0]
+        FMLA v30.4s, v20.4s, v3.s[0]
+        FMLA v31.4s, v21.4s, v3.s[0]
+        B 4b
+
+
         # Store odd width
 7:
         TBZ x1, 2, 8f
commit	8155854bfecacedaf1879eba2a0f1a2223b42a75	[log] [tgz]
author	Frank Barchard <fbarchard@google.com>	Tue Feb 11 16:35:26 2020 -0800
committer	XNNPACK Team <xnnpack-github-robot@google.com>	Tue Feb 11 16:35:56 2020 -0800
tree	40c0aa29c07052e3bd04575631c1bdaeb2017e72
parent	79ade18026f844af50574866c4f6aaf24a0bd974 [diff] [blame]