QS8 Neon microkernels switch from x9 to x11 for params
PiperOrigin-RevId: 382451240
diff --git a/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in b/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
index 50258fd..ee36c1c 100644
--- a/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
+++ b/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
@@ -15,7 +15,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, (x7)
# size_t cn_stride, [sp] -> x10
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -29,7 +29,7 @@
BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53
- LDP x10, x9, [sp] // cn_stride, params
+ LDP x10, x11, [sp] // cn_stride, params
ADD x2, x2, 7 // kc = (kc + 7) & ~7
BIC x2, x2, 7
@@ -192,10 +192,10 @@
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
- LD1R {v4.4s}, [x9], 4
+ LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
- LD1R {v7.4s}, [x9], 4
+ LD1R {v7.4s}, [x11], 4
ADDP v0.4s, v16.4s, v20.4s
ADDP v1.4s, v24.4s, v28.4s
@@ -203,7 +203,7 @@
SQRDMULH v0.4s, v0.4s, v4.4s
SQRDMULH v1.4s, v1.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
- LD1R {v5.8h}, [x9], 2
+ LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
SSRA v0.4s, v6.4s, 31
@@ -215,10 +215,10 @@
SUBS x1, x1, 8
SQADD v0.8h, v0.8h, v5.8h
SQXTN v0.8b, v0.8h
- LD1R {v1.16b}, [x9], 1
- LD1R {v17.16b}, [x9]
+ LD1R {v1.16b}, [x11], 1
+ LD1R {v17.16b}, [x11]
SMAX v0.8b, v0.8b, v1.8b
- SUB x9, x9, 11 // rewind params pointer
+ SUB x11, x11, 11 // rewind params pointer
SMIN v0.8b, v0.8b, v17.8b
B.LO 5f
diff --git a/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S.in b/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S.in
index 0a39bf8..f5423d3 100644
--- a/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S.in
+++ b/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S.in
@@ -15,7 +15,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, (x7)
# size_t cn_stride, [sp] -> x10
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -28,7 +28,7 @@
BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}
- LDP x10, x9, [sp] // cn_stride, params
+ LDP x10, x11, [sp] // cn_stride, params
ADD x2, x2, 7 // kc = (kc + 7) & ~7
BIC x2, x2, 7
@@ -167,10 +167,10 @@
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
- LD1R {v4.4s}, [x9], 4
+ LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
- LD1R {v7.4s}, [x9], 4
+ LD1R {v7.4s}, [x11], 4
ADDP v0.4s, v16.4s, v20.4s
ADDP v1.4s, v24.4s, v28.4s
@@ -178,7 +178,7 @@
SQRDMULH v0.4s, v0.4s, v4.4s
SQRDMULH v1.4s, v1.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
- LD1R {v5.8h}, [x9], 2
+ LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
SSRA v0.4s, v6.4s, 31
@@ -190,10 +190,10 @@
SUBS x1, x1, 8
SQADD v0.8h, v0.8h, v5.8h
SQXTN v0.8b, v0.8h
- LD1R {v1.16b}, [x9], 1
- LD1R {v17.16b}, [x9]
+ LD1R {v1.16b}, [x11], 1
+ LD1R {v17.16b}, [x11]
SMAX v0.8b, v0.8b, v1.8b
- SUB x9, x9, 11 // rewind params pointer
+ SUB x11, x11, 11 // rewind params pointer
SMIN v0.8b, v0.8b, v17.8b
B.LO 5f
diff --git a/src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in b/src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in
index b0e0701..d021f8b 100644
--- a/src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in
+++ b/src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in
@@ -15,7 +15,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x10
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -58,7 +58,7 @@
MOV v27.16b, v26.16b
LDP s28, s30, [x5], 8
MOV v29.16b, v28.16b
- LDP x10, x9, [sp, 48] // cn_stride, params
+ LDP x10, x11, [sp, 48] // cn_stride, params
MOV v31.16b, v30.16b
# Main loop - 16 bytes of A
@@ -126,10 +126,10 @@
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
- LD1R {v4.4s}, [x9], 4
+ LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
- LD1R {v7.4s}, [x9], 4
+ LD1R {v7.4s}, [x11], 4
ADDP v17.4s, v17.4s, v19.4s
ADDP v21.4s, v21.4s, v23.4s
ADDP v25.4s, v25.4s, v27.4s
@@ -145,7 +145,7 @@
SQRDMULH v2.4s, v2.4s, v4.4s
SQRDMULH v3.4s, v3.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
- LD1R {v5.8h}, [x9], 2
+ LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
BIC v17.16b, v2.16b, v4.16b
@@ -167,8 +167,8 @@
SQADD v1.8h, v2.8h, v5.8h
SQXTN v0.8b, v0.8h
SQXTN2 v0.16b, v1.8h
- LD1R {v1.16b}, [x9], 1
- LD1R {v2.16b}, [x9]
+ LD1R {v1.16b}, [x11], 1
+ LD1R {v2.16b}, [x11]
SMAX v0.16b, v0.16b, v1.16b
SMIN v0.16b, v0.16b, v2.16b
B.LO 2f
diff --git a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
index cb0a54f..a511be2 100644
--- a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
+++ b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
@@ -15,7 +15,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x10
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -61,7 +61,7 @@
MOV v27.16b, v26.16b
LDP s28, s30, [x5], 8
MOV v29.16b, v28.16b
- LDP x10, x9, [sp, 80] // cn_stride, params
+ LDP x10, x11, [sp, 80] // cn_stride, params
MOV v31.16b, v30.16b
# Is there at least 16 bytes for epilogue?
B.LO 4f
@@ -269,10 +269,10 @@
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
- LD1R {v4.4s}, [x9], 4
+ LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
- LD1R {v7.4s}, [x9], 4
+ LD1R {v7.4s}, [x11], 4
ADDP v17.4s, v17.4s, v19.4s
ADDP v21.4s, v21.4s, v23.4s
ADDP v25.4s, v25.4s, v27.4s
@@ -288,7 +288,7 @@
SQRDMULH v2.4s, v2.4s, v4.4s
SQRDMULH v3.4s, v3.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
- LD1R {v5.8h}, [x9], 2
+ LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
BIC v17.16b, v2.16b, v4.16b
@@ -310,8 +310,8 @@
SQADD v1.8h, v2.8h, v5.8h
SQXTN v0.8b, v0.8h
SQXTN2 v0.16b, v1.8h
- LD1R {v1.16b}, [x9], 1
- LD1R {v2.16b}, [x9]
+ LD1R {v1.16b}, [x11], 1
+ LD1R {v2.16b}, [x11]
SMAX v0.16b, v0.16b, v1.16b
SMIN v0.16b, v0.16b, v2.16b
B.LO 5f
diff --git a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in
index 398602d..fc64438 100644
--- a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in
+++ b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in
@@ -15,7 +15,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x10
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -59,7 +59,7 @@
MOV v27.16b, v26.16b
LDP s28, s30, [x5], 8
MOV v29.16b, v28.16b
- LDP x10, x9, [sp, 64] // cn_stride, params
+ LDP x10, x11, [sp, 64] // cn_stride, params
MOV v31.16b, v30.16b
# Is there at least 16 bytes for epilogue?
B.LO 4f
@@ -223,10 +223,10 @@
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
- LD1R {v4.4s}, [x9], 4
+ LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
- LD1R {v7.4s}, [x9], 4
+ LD1R {v7.4s}, [x11], 4
ADDP v17.4s, v17.4s, v19.4s
ADDP v21.4s, v21.4s, v23.4s
ADDP v25.4s, v25.4s, v27.4s
@@ -242,7 +242,7 @@
SQRDMULH v2.4s, v2.4s, v4.4s
SQRDMULH v3.4s, v3.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
- LD1R {v5.8h}, [x9], 2
+ LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
BIC v17.16b, v2.16b, v4.16b
@@ -264,8 +264,8 @@
SQADD v1.8h, v2.8h, v5.8h
SQXTN v0.8b, v0.8h
SQXTN2 v0.16b, v1.8h
- LD1R {v1.16b}, [x9], 1
- LD1R {v2.16b}, [x9]
+ LD1R {v1.16b}, [x11], 1
+ LD1R {v2.16b}, [x11]
SMAX v0.16b, v0.16b, v1.16b
SMIN v0.16b, v0.16b, v2.16b
B.LO 5f
diff --git a/src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S.in b/src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S.in
index b9284db..fb81b10 100644
--- a/src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S.in
+++ b/src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S.in
@@ -15,7 +15,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x10
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -58,7 +58,7 @@
MOV v27.16b, v26.16b
LDP s28, s30, [x5], 8
MOV v29.16b, v28.16b
- LDP x10, x9, [sp, 48] // cn_stride, params
+ LDP x10, x11, [sp, 48] // cn_stride, params
MOV v31.16b, v30.16b
# Main loop - 8 bytes of A
@@ -109,10 +109,10 @@
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
- LD1R {v4.4s}, [x9], 4
+ LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
- LD1R {v7.4s}, [x9], 4
+ LD1R {v7.4s}, [x11], 4
ADDP v17.4s, v17.4s, v19.4s
ADDP v21.4s, v21.4s, v23.4s
ADDP v25.4s, v25.4s, v27.4s
@@ -128,7 +128,7 @@
SQRDMULH v2.4s, v2.4s, v4.4s
SQRDMULH v3.4s, v3.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
- LD1R {v5.8h}, [x9], 2
+ LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
BIC v17.16b, v2.16b, v4.16b
@@ -150,8 +150,8 @@
SQADD v1.8h, v2.8h, v5.8h
SQXTN v0.8b, v0.8h
SQXTN2 v0.16b, v1.8h
- LD1R {v1.16b}, [x9], 1
- LD1R {v2.16b}, [x9]
+ LD1R {v1.16b}, [x11], 1
+ LD1R {v2.16b}, [x11]
SMAX v0.16b, v0.16b, v1.16b
SMIN v0.16b, v0.16b, v2.16b
B.LO 2f
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
index 9d71257..822106a 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
+++ b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
@@ -19,7 +19,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, (x7)
# size_t cn_stride, [sp] -> x10
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -33,7 +33,7 @@
BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
- LDP x10, x9, [sp] // cn_stride, params
+ LDP x10, x11, [sp] // cn_stride, params
ADD x2, x2, 7 // kc = (kc + 7) & ~7
BIC x2, x2, 7
@@ -190,10 +190,10 @@
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
- LD1R {v4.4s}, [x9], 4
+ LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
- LD1R {v7.4s}, [x9], 4
+ LD1R {v7.4s}, [x11], 4
ADDP v0.4s, v16.4s, v20.4s
ADDP v1.4s, v24.4s, v28.4s
@@ -201,7 +201,7 @@
SQRDMULH v0.4s, v0.4s, v4.4s
SQRDMULH v1.4s, v1.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
- LD1R {v5.8h}, [x9], 2
+ LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
SSRA v0.4s, v6.4s, 31
@@ -213,10 +213,10 @@
SUBS x1, x1, 8
SQADD v0.8h, v0.8h, v5.8h
SQXTN v0.8b, v0.8h
- LD1R {v1.16b}, [x9], 1
- LD1R {v17.16b}, [x9]
+ LD1R {v1.16b}, [x11], 1
+ LD1R {v17.16b}, [x11]
SMAX v0.8b, v0.8b, v1.8b
- SUB x9, x9, 11 // rewind params pointer
+ SUB x11, x11, 11 // rewind params pointer
SMIN v0.8b, v0.8b, v17.8b
B.LO 5f
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
index 4f09b75..ff83326 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
+++ b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
@@ -19,7 +19,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, (x7)
# size_t cn_stride, [sp] -> x10
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -33,7 +33,7 @@
BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
- LDP x10, x9, [sp] // cn_stride, params
+ LDP x10, x11, [sp] // cn_stride, params
ADD x2, x2, 7 // kc = (kc + 7) & ~7
BIC x2, x2, 7
@@ -193,10 +193,10 @@
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
- LD1R {v4.4s}, [x9], 4
+ LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
- LD1R {v7.4s}, [x9], 4
+ LD1R {v7.4s}, [x11], 4
ADDP v0.4s, v16.4s, v20.4s
ADDP v1.4s, v24.4s, v28.4s
@@ -204,7 +204,7 @@
SQRDMULH v0.4s, v0.4s, v4.4s
SQRDMULH v1.4s, v1.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
- LD1R {v5.8h}, [x9], 2
+ LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
SSRA v0.4s, v6.4s, 31
@@ -216,10 +216,10 @@
SUBS x1, x1, 8
SQADD v0.8h, v0.8h, v5.8h
SQXTN v0.8b, v0.8h
- LD1R {v1.16b}, [x9], 1
- LD1R {v17.16b}, [x9]
+ LD1R {v1.16b}, [x11], 1
+ LD1R {v17.16b}, [x11]
SMAX v0.8b, v0.8b, v1.8b
- SUB x9, x9, 11 // rewind params pointer
+ SUB x11, x11, 11 // rewind params pointer
SMIN v0.8b, v0.8b, v17.8b
B.LO 5f
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
index bfb0659..32f8527 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
+++ b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
@@ -19,7 +19,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, (x7)
# size_t cn_stride, [sp] -> x10
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -32,7 +32,7 @@
BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
- LDP x10, x9, [sp] // cn_stride, params
+ LDP x10, x11, [sp] // cn_stride, params
ADD x2, x2, 7 // kc = (kc + 7) & ~7
BIC x2, x2, 7
@@ -165,10 +165,10 @@
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
- LD1R {v4.4s}, [x9], 4
+ LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
- LD1R {v7.4s}, [x9], 4
+ LD1R {v7.4s}, [x11], 4
ADDP v0.4s, v16.4s, v20.4s
ADDP v1.4s, v24.4s, v28.4s
@@ -176,7 +176,7 @@
SQRDMULH v0.4s, v0.4s, v4.4s
SQRDMULH v1.4s, v1.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
- LD1R {v5.8h}, [x9], 2
+ LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
SSRA v0.4s, v6.4s, 31
@@ -188,10 +188,10 @@
SUBS x1, x1, 8
SQADD v0.8h, v0.8h, v5.8h
SQXTN v0.8b, v0.8h
- LD1R {v1.16b}, [x9], 1
- LD1R {v17.16b}, [x9]
+ LD1R {v1.16b}, [x11], 1
+ LD1R {v17.16b}, [x11]
SMAX v0.8b, v0.8b, v1.8b
- SUB x9, x9, 11 // rewind params pointer
+ SUB x11, x11, 11 // rewind params pointer
SMIN v0.8b, v0.8b, v17.8b
B.LO 5f
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
index 7c20d89..1976677 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+++ b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
@@ -19,7 +19,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, (x7)
# size_t cn_stride, [sp] -> x10
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -32,7 +32,7 @@
BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal
- LDP x10, x9, [sp] // cn_stride, params
+ LDP x10, x11, [sp] // cn_stride, params
ADD x2, x2, 7 // kc = (kc + 7) & ~7
BIC x2, x2, 7
@@ -159,10 +159,10 @@
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
- LD1R {v4.4s}, [x9], 4
+ LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
- LD1R {v7.4s}, [x9], 4
+ LD1R {v7.4s}, [x11], 4
ADDP v0.4s, v16.4s, v20.4s
ADDP v1.4s, v24.4s, v28.4s
@@ -170,7 +170,7 @@
SQRDMULH v0.4s, v0.4s, v4.4s
SQRDMULH v1.4s, v1.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
- LD1R {v5.8h}, [x9], 2
+ LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
SSRA v0.4s, v6.4s, 31
@@ -182,10 +182,10 @@
SUBS x1, x1, 8
SQADD v0.8h, v0.8h, v5.8h
SQXTN v0.8b, v0.8h
- LD1R {v1.16b}, [x9], 1
- LD1R {v17.16b}, [x9]
+ LD1R {v1.16b}, [x11], 1
+ LD1R {v17.16b}, [x11]
SMAX v0.8b, v0.8b, v1.8b
- SUB x9, x9, 11 // rewind params pointer
+ SUB x11, x11, 11 // rewind params pointer
SMIN v0.8b, v0.8b, v17.8b
B.LO 5f
diff --git a/src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S b/src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
index e81d5d9..670bc2f 100644
--- a/src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+++ b/src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
@@ -19,7 +19,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x10
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -62,7 +62,7 @@
MOV v27.16b, v26.16b
LDP s28, s30, [x5], 8
MOV v29.16b, v28.16b
- LDP x10, x9, [sp, 48] // cn_stride, params
+ LDP x10, x11, [sp, 48] // cn_stride, params
MOV v31.16b, v30.16b
# Main loop - 16 bytes of A
@@ -130,10 +130,10 @@
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
- LD1R {v4.4s}, [x9], 4
+ LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
- LD1R {v7.4s}, [x9], 4
+ LD1R {v7.4s}, [x11], 4
ADDP v17.4s, v17.4s, v19.4s
ADDP v21.4s, v21.4s, v23.4s
ADDP v25.4s, v25.4s, v27.4s
@@ -149,7 +149,7 @@
SQRDMULH v2.4s, v2.4s, v4.4s
SQRDMULH v3.4s, v3.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
- LD1R {v5.8h}, [x9], 2
+ LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
BIC v17.16b, v2.16b, v4.16b
@@ -171,8 +171,8 @@
SQADD v1.8h, v2.8h, v5.8h
SQXTN v0.8b, v0.8h
SQXTN2 v0.16b, v1.8h
- LD1R {v1.16b}, [x9], 1
- LD1R {v2.16b}, [x9]
+ LD1R {v1.16b}, [x11], 1
+ LD1R {v2.16b}, [x11]
SMAX v0.16b, v0.16b, v1.16b
SMIN v0.16b, v0.16b, v2.16b
B.LO 2f
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
index 0d6f604..89156f0 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
+++ b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
@@ -19,7 +19,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x10
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -65,7 +65,7 @@
MOV v27.16b, v26.16b
LDP s28, s30, [x5], 8
MOV v29.16b, v28.16b
- LDP x10, x9, [sp, 80] // cn_stride, params
+ LDP x10, x11, [sp, 80] // cn_stride, params
MOV v31.16b, v30.16b
# Is there at least 16 bytes for epilogue?
B.LO 4f
@@ -265,10 +265,10 @@
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
- LD1R {v4.4s}, [x9], 4
+ LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
- LD1R {v7.4s}, [x9], 4
+ LD1R {v7.4s}, [x11], 4
ADDP v17.4s, v17.4s, v19.4s
ADDP v21.4s, v21.4s, v23.4s
ADDP v25.4s, v25.4s, v27.4s
@@ -284,7 +284,7 @@
SQRDMULH v2.4s, v2.4s, v4.4s
SQRDMULH v3.4s, v3.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
- LD1R {v5.8h}, [x9], 2
+ LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
BIC v17.16b, v2.16b, v4.16b
@@ -306,8 +306,8 @@
SQADD v1.8h, v2.8h, v5.8h
SQXTN v0.8b, v0.8h
SQXTN2 v0.16b, v1.8h
- LD1R {v1.16b}, [x9], 1
- LD1R {v2.16b}, [x9]
+ LD1R {v1.16b}, [x11], 1
+ LD1R {v2.16b}, [x11]
SMAX v0.16b, v0.16b, v1.16b
SMIN v0.16b, v0.16b, v2.16b
B.LO 5f
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
index c41b6ab..32f43f6 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
+++ b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
@@ -19,7 +19,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x10
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -65,7 +65,7 @@
MOV v27.16b, v26.16b
LDP s28, s30, [x5], 8
MOV v29.16b, v28.16b
- LDP x10, x9, [sp, 80] // cn_stride, params
+ LDP x10, x11, [sp, 80] // cn_stride, params
MOV v31.16b, v30.16b
# Is there at least 16 bytes for epilogue?
B.LO 4f
@@ -269,10 +269,10 @@
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
- LD1R {v4.4s}, [x9], 4
+ LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
- LD1R {v7.4s}, [x9], 4
+ LD1R {v7.4s}, [x11], 4
ADDP v17.4s, v17.4s, v19.4s
ADDP v21.4s, v21.4s, v23.4s
ADDP v25.4s, v25.4s, v27.4s
@@ -288,7 +288,7 @@
SQRDMULH v2.4s, v2.4s, v4.4s
SQRDMULH v3.4s, v3.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
- LD1R {v5.8h}, [x9], 2
+ LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
BIC v17.16b, v2.16b, v4.16b
@@ -310,8 +310,8 @@
SQADD v1.8h, v2.8h, v5.8h
SQXTN v0.8b, v0.8h
SQXTN2 v0.16b, v1.8h
- LD1R {v1.16b}, [x9], 1
- LD1R {v2.16b}, [x9]
+ LD1R {v1.16b}, [x11], 1
+ LD1R {v2.16b}, [x11]
SMAX v0.16b, v0.16b, v1.16b
SMIN v0.16b, v0.16b, v2.16b
B.LO 5f
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
index 9a01aa4..baa87ee 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
+++ b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
@@ -19,7 +19,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x10
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -63,7 +63,7 @@
MOV v27.16b, v26.16b
LDP s28, s30, [x5], 8
MOV v29.16b, v28.16b
- LDP x10, x9, [sp, 64] // cn_stride, params
+ LDP x10, x11, [sp, 64] // cn_stride, params
MOV v31.16b, v30.16b
# Is there at least 16 bytes for epilogue?
B.LO 4f
@@ -223,10 +223,10 @@
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
- LD1R {v4.4s}, [x9], 4
+ LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
- LD1R {v7.4s}, [x9], 4
+ LD1R {v7.4s}, [x11], 4
ADDP v17.4s, v17.4s, v19.4s
ADDP v21.4s, v21.4s, v23.4s
ADDP v25.4s, v25.4s, v27.4s
@@ -242,7 +242,7 @@
SQRDMULH v2.4s, v2.4s, v4.4s
SQRDMULH v3.4s, v3.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
- LD1R {v5.8h}, [x9], 2
+ LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
BIC v17.16b, v2.16b, v4.16b
@@ -264,8 +264,8 @@
SQADD v1.8h, v2.8h, v5.8h
SQXTN v0.8b, v0.8h
SQXTN2 v0.16b, v1.8h
- LD1R {v1.16b}, [x9], 1
- LD1R {v2.16b}, [x9]
+ LD1R {v1.16b}, [x11], 1
+ LD1R {v2.16b}, [x11]
SMAX v0.16b, v0.16b, v1.16b
SMIN v0.16b, v0.16b, v2.16b
B.LO 5f
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
index e26b04f..a678efd 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+++ b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
@@ -19,7 +19,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x10
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -63,7 +63,7 @@
MOV v27.16b, v26.16b
LDP s28, s30, [x5], 8
MOV v29.16b, v28.16b
- LDP x10, x9, [sp, 64] // cn_stride, params
+ LDP x10, x11, [sp, 64] // cn_stride, params
MOV v31.16b, v30.16b
# Is there at least 16 bytes for epilogue?
B.LO 4f
@@ -219,10 +219,10 @@
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
- LD1R {v4.4s}, [x9], 4
+ LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
- LD1R {v7.4s}, [x9], 4
+ LD1R {v7.4s}, [x11], 4
ADDP v17.4s, v17.4s, v19.4s
ADDP v21.4s, v21.4s, v23.4s
ADDP v25.4s, v25.4s, v27.4s
@@ -238,7 +238,7 @@
SQRDMULH v2.4s, v2.4s, v4.4s
SQRDMULH v3.4s, v3.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
- LD1R {v5.8h}, [x9], 2
+ LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
BIC v17.16b, v2.16b, v4.16b
@@ -260,8 +260,8 @@
SQADD v1.8h, v2.8h, v5.8h
SQXTN v0.8b, v0.8h
SQXTN2 v0.16b, v1.8h
- LD1R {v1.16b}, [x9], 1
- LD1R {v2.16b}, [x9]
+ LD1R {v1.16b}, [x11], 1
+ LD1R {v2.16b}, [x11]
SMAX v0.16b, v0.16b, v1.16b
SMIN v0.16b, v0.16b, v2.16b
B.LO 5f
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mull-padal.S b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mull-padal.S
index a4ea595..8a7e67e 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mull-padal.S
+++ b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mull-padal.S
@@ -19,7 +19,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x10
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x9
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -62,7 +62,7 @@
MOV v27.16b, v26.16b
LDP s28, s30, [x5], 8
MOV v29.16b, v28.16b
- LDP x10, x9, [sp, 48] // cn_stride, params
+ LDP x10, x11, [sp, 48] // cn_stride, params
MOV v31.16b, v30.16b
# Main loop - 8 bytes of A
@@ -113,10 +113,10 @@
# Add columns
ADDP v16.4s, v16.4s, v18.4s
ADDP v20.4s, v20.4s, v22.4s
- LD1R {v4.4s}, [x9], 4
+ LD1R {v4.4s}, [x11], 4
ADDP v24.4s, v24.4s, v26.4s
ADDP v28.4s, v28.4s, v30.4s
- LD1R {v7.4s}, [x9], 4
+ LD1R {v7.4s}, [x11], 4
ADDP v17.4s, v17.4s, v19.4s
ADDP v21.4s, v21.4s, v23.4s
ADDP v25.4s, v25.4s, v27.4s
@@ -132,7 +132,7 @@
SQRDMULH v2.4s, v2.4s, v4.4s
SQRDMULH v3.4s, v3.4s, v4.4s
CMEQ v4.4s, v7.4s, 0
- LD1R {v5.8h}, [x9], 2
+ LD1R {v5.8h}, [x11], 2
BIC v6.16b, v0.16b, v4.16b
BIC v16.16b, v1.16b, v4.16b
BIC v17.16b, v2.16b, v4.16b
@@ -154,8 +154,8 @@
SQADD v1.8h, v2.8h, v5.8h
SQXTN v0.8b, v0.8h
SQXTN2 v0.16b, v1.8h
- LD1R {v1.16b}, [x9], 1
- LD1R {v2.16b}, [x9]
+ LD1R {v1.16b}, [x11], 1
+ LD1R {v2.16b}, [x11]
SMAX v0.16b, v0.16b, v1.16b
SMIN v0.16b, v0.16b, v2.16b
B.LO 2f