QS8 GEMM AArch64 Neon microkernels: hold the params pointer in x11 instead of x9

PiperOrigin-RevId: 382451240
diff --git a/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in b/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
index 50258fd..ee36c1c 100644
--- a/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
+++ b/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
@@ -15,7 +15,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          (x7)
 #     size_t cn_stride,          [sp] -> x10
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x9
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
 
 # d2-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -29,7 +29,7 @@
 
 BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}_cortex_a53
 
-        LDP     x10, x9, [sp]           // cn_stride, params
+        LDP     x10, x11, [sp]           // cn_stride, params
 
         ADD     x2, x2, 7               // kc = (kc + 7) & ~7
         BIC     x2, x2, 7
@@ -192,10 +192,10 @@
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
-        LD1R    {v4.4s}, [x9], 4
+        LD1R    {v4.4s}, [x11], 4
         ADDP    v24.4s, v24.4s, v26.4s
         ADDP    v28.4s, v28.4s, v30.4s
-        LD1R    {v7.4s}, [x9], 4
+        LD1R    {v7.4s}, [x11], 4
         ADDP    v0.4s, v16.4s, v20.4s
         ADDP    v1.4s, v24.4s, v28.4s
 
@@ -203,7 +203,7 @@
         SQRDMULH v0.4s, v0.4s, v4.4s
         SQRDMULH v1.4s, v1.4s, v4.4s
         CMEQ    v4.4s, v7.4s, 0
-        LD1R    {v5.8h}, [x9], 2
+        LD1R    {v5.8h}, [x11], 2
         BIC     v6.16b, v0.16b, v4.16b
         BIC     v16.16b, v1.16b, v4.16b
         SSRA    v0.4s, v6.4s, 31
@@ -215,10 +215,10 @@
         SUBS    x1, x1, 8
         SQADD   v0.8h, v0.8h, v5.8h
         SQXTN   v0.8b, v0.8h
-        LD1R    {v1.16b}, [x9], 1
-        LD1R    {v17.16b}, [x9]
+        LD1R    {v1.16b}, [x11], 1
+        LD1R    {v17.16b}, [x11]
         SMAX    v0.8b, v0.8b, v1.8b
-        SUB     x9, x9, 11              // rewind params pointer
+        SUB     x11, x11, 11              // rewind params pointer
         SMIN    v0.8b, v0.8b, v17.8b
         B.LO    5f
 
diff --git a/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S.in b/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S.in
index 0a39bf8..f5423d3 100644
--- a/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S.in
+++ b/src/qs8-gemm/1x8c8-aarch64-neon-mlal-padal.S.in
@@ -15,7 +15,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          (x7)
 #     size_t cn_stride,          [sp] -> x10
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x9
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
 
 # d2-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -28,7 +28,7 @@
 
 BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal${"_prfm" if PREFETCH else ""}
 
-        LDP     x10, x9, [sp]           // cn_stride, params
+        LDP     x10, x11, [sp]           // cn_stride, params
 
         ADD     x2, x2, 7               // kc = (kc + 7) & ~7
         BIC     x2, x2, 7
@@ -167,10 +167,10 @@
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
-        LD1R    {v4.4s}, [x9], 4
+        LD1R    {v4.4s}, [x11], 4
         ADDP    v24.4s, v24.4s, v26.4s
         ADDP    v28.4s, v28.4s, v30.4s
-        LD1R    {v7.4s}, [x9], 4
+        LD1R    {v7.4s}, [x11], 4
         ADDP    v0.4s, v16.4s, v20.4s
         ADDP    v1.4s, v24.4s, v28.4s
 
@@ -178,7 +178,7 @@
         SQRDMULH v0.4s, v0.4s, v4.4s
         SQRDMULH v1.4s, v1.4s, v4.4s
         CMEQ    v4.4s, v7.4s, 0
-        LD1R    {v5.8h}, [x9], 2
+        LD1R    {v5.8h}, [x11], 2
         BIC     v6.16b, v0.16b, v4.16b
         BIC     v16.16b, v1.16b, v4.16b
         SSRA    v0.4s, v6.4s, 31
@@ -190,10 +190,10 @@
         SUBS    x1, x1, 8
         SQADD   v0.8h, v0.8h, v5.8h
         SQXTN   v0.8b, v0.8h
-        LD1R    {v1.16b}, [x9], 1
-        LD1R    {v17.16b}, [x9]
+        LD1R    {v1.16b}, [x11], 1
+        LD1R    {v17.16b}, [x11]
         SMAX    v0.8b, v0.8b, v1.8b
-        SUB     x9, x9, 11              // rewind params pointer
+        SUB     x11, x11, 11              // rewind params pointer
         SMIN    v0.8b, v0.8b, v17.8b
         B.LO    5f
 
diff --git a/src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in b/src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in
index b0e0701..d021f8b 100644
--- a/src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in
+++ b/src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in
@@ -15,7 +15,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,          [sp] -> x10
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x9
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -58,7 +58,7 @@
         MOV     v27.16b, v26.16b
         LDP     s28, s30, [x5], 8
         MOV     v29.16b, v28.16b
-        LDP     x10, x9, [sp, 48]       // cn_stride, params
+        LDP     x10, x11, [sp, 48]       // cn_stride, params
         MOV     v31.16b, v30.16b
 
         # Main loop - 16 bytes of A
@@ -126,10 +126,10 @@
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
-        LD1R    {v4.4s}, [x9], 4
+        LD1R    {v4.4s}, [x11], 4
         ADDP    v24.4s, v24.4s, v26.4s
         ADDP    v28.4s, v28.4s, v30.4s
-        LD1R    {v7.4s}, [x9], 4
+        LD1R    {v7.4s}, [x11], 4
         ADDP    v17.4s, v17.4s, v19.4s
         ADDP    v21.4s, v21.4s, v23.4s
         ADDP    v25.4s, v25.4s, v27.4s
@@ -145,7 +145,7 @@
         SQRDMULH v2.4s, v2.4s, v4.4s
         SQRDMULH v3.4s, v3.4s, v4.4s
         CMEQ    v4.4s, v7.4s, 0
-        LD1R    {v5.8h}, [x9], 2
+        LD1R    {v5.8h}, [x11], 2
         BIC     v6.16b, v0.16b, v4.16b
         BIC     v16.16b, v1.16b, v4.16b
         BIC     v17.16b, v2.16b, v4.16b
@@ -167,8 +167,8 @@
         SQADD   v1.8h, v2.8h, v5.8h
         SQXTN   v0.8b, v0.8h
         SQXTN2  v0.16b, v1.8h
-        LD1R    {v1.16b}, [x9], 1
-        LD1R    {v2.16b}, [x9]
+        LD1R    {v1.16b}, [x11], 1
+        LD1R    {v2.16b}, [x11]
         SMAX    v0.16b, v0.16b, v1.16b
         SMIN    v0.16b, v0.16b, v2.16b
         B.LO    2f
diff --git a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
index cb0a54f..a511be2 100644
--- a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
+++ b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal-cortex-a53.S.in
@@ -15,7 +15,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,          [sp] -> x10
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x9
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -61,7 +61,7 @@
         MOV     v27.16b, v26.16b
         LDP     s28, s30, [x5], 8
         MOV     v29.16b, v28.16b
-        LDP     x10, x9, [sp, 80]       // cn_stride, params
+        LDP     x10, x11, [sp, 80]       // cn_stride, params
         MOV     v31.16b, v30.16b
         # Is there at least 16 bytes for epilogue?
         B.LO    4f
@@ -269,10 +269,10 @@
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
-        LD1R    {v4.4s}, [x9], 4
+        LD1R    {v4.4s}, [x11], 4
         ADDP    v24.4s, v24.4s, v26.4s
         ADDP    v28.4s, v28.4s, v30.4s
-        LD1R    {v7.4s}, [x9], 4
+        LD1R    {v7.4s}, [x11], 4
         ADDP    v17.4s, v17.4s, v19.4s
         ADDP    v21.4s, v21.4s, v23.4s
         ADDP    v25.4s, v25.4s, v27.4s
@@ -288,7 +288,7 @@
         SQRDMULH v2.4s, v2.4s, v4.4s
         SQRDMULH v3.4s, v3.4s, v4.4s
         CMEQ    v4.4s, v7.4s, 0
-        LD1R    {v5.8h}, [x9], 2
+        LD1R    {v5.8h}, [x11], 2
         BIC     v6.16b, v0.16b, v4.16b
         BIC     v16.16b, v1.16b, v4.16b
         BIC     v17.16b, v2.16b, v4.16b
@@ -310,8 +310,8 @@
         SQADD   v1.8h, v2.8h, v5.8h
         SQXTN   v0.8b, v0.8h
         SQXTN2  v0.16b, v1.8h
-        LD1R    {v1.16b}, [x9], 1
-        LD1R    {v2.16b}, [x9]
+        LD1R    {v1.16b}, [x11], 1
+        LD1R    {v2.16b}, [x11]
         SMAX    v0.16b, v0.16b, v1.16b
         SMIN    v0.16b, v0.16b, v2.16b
         B.LO    5f
diff --git a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in
index 398602d..fc64438 100644
--- a/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in
+++ b/src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in
@@ -15,7 +15,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,          [sp] -> x10
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x9
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -59,7 +59,7 @@
         MOV     v27.16b, v26.16b
         LDP     s28, s30, [x5], 8
         MOV     v29.16b, v28.16b
-        LDP     x10, x9, [sp, 64]       // cn_stride, params
+        LDP     x10, x11, [sp, 64]       // cn_stride, params
         MOV     v31.16b, v30.16b
         # Is there at least 16 bytes for epilogue?
         B.LO    4f
@@ -223,10 +223,10 @@
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
-        LD1R    {v4.4s}, [x9], 4
+        LD1R    {v4.4s}, [x11], 4
         ADDP    v24.4s, v24.4s, v26.4s
         ADDP    v28.4s, v28.4s, v30.4s
-        LD1R    {v7.4s}, [x9], 4
+        LD1R    {v7.4s}, [x11], 4
         ADDP    v17.4s, v17.4s, v19.4s
         ADDP    v21.4s, v21.4s, v23.4s
         ADDP    v25.4s, v25.4s, v27.4s
@@ -242,7 +242,7 @@
         SQRDMULH v2.4s, v2.4s, v4.4s
         SQRDMULH v3.4s, v3.4s, v4.4s
         CMEQ    v4.4s, v7.4s, 0
-        LD1R    {v5.8h}, [x9], 2
+        LD1R    {v5.8h}, [x11], 2
         BIC     v6.16b, v0.16b, v4.16b
         BIC     v16.16b, v1.16b, v4.16b
         BIC     v17.16b, v2.16b, v4.16b
@@ -264,8 +264,8 @@
         SQADD   v1.8h, v2.8h, v5.8h
         SQXTN   v0.8b, v0.8h
         SQXTN2  v0.16b, v1.8h
-        LD1R    {v1.16b}, [x9], 1
-        LD1R    {v2.16b}, [x9]
+        LD1R    {v1.16b}, [x11], 1
+        LD1R    {v2.16b}, [x11]
         SMAX    v0.16b, v0.16b, v1.16b
         SMIN    v0.16b, v0.16b, v2.16b
         B.LO    5f
diff --git a/src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S.in b/src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S.in
index b9284db..fb81b10 100644
--- a/src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S.in
+++ b/src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S.in
@@ -15,7 +15,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,          [sp] -> x10
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x9
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -58,7 +58,7 @@
         MOV     v27.16b, v26.16b
         LDP     s28, s30, [x5], 8
         MOV     v29.16b, v28.16b
-        LDP     x10, x9, [sp, 48]       // cn_stride, params
+        LDP     x10, x11, [sp, 48]       // cn_stride, params
         MOV     v31.16b, v30.16b
 
         # Main loop - 8 bytes of A
@@ -109,10 +109,10 @@
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
-        LD1R    {v4.4s}, [x9], 4
+        LD1R    {v4.4s}, [x11], 4
         ADDP    v24.4s, v24.4s, v26.4s
         ADDP    v28.4s, v28.4s, v30.4s
-        LD1R    {v7.4s}, [x9], 4
+        LD1R    {v7.4s}, [x11], 4
         ADDP    v17.4s, v17.4s, v19.4s
         ADDP    v21.4s, v21.4s, v23.4s
         ADDP    v25.4s, v25.4s, v27.4s
@@ -128,7 +128,7 @@
         SQRDMULH v2.4s, v2.4s, v4.4s
         SQRDMULH v3.4s, v3.4s, v4.4s
         CMEQ    v4.4s, v7.4s, 0
-        LD1R    {v5.8h}, [x9], 2
+        LD1R    {v5.8h}, [x11], 2
         BIC     v6.16b, v0.16b, v4.16b
         BIC     v16.16b, v1.16b, v4.16b
         BIC     v17.16b, v2.16b, v4.16b
@@ -150,8 +150,8 @@
         SQADD   v1.8h, v2.8h, v5.8h
         SQXTN   v0.8b, v0.8h
         SQXTN2  v0.16b, v1.8h
-        LD1R    {v1.16b}, [x9], 1
-        LD1R    {v2.16b}, [x9]
+        LD1R    {v1.16b}, [x11], 1
+        LD1R    {v2.16b}, [x11]
         SMAX    v0.16b, v0.16b, v1.16b
         SMIN    v0.16b, v0.16b, v2.16b
         B.LO    2f
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
index 9d71257..822106a 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
+++ b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
@@ -19,7 +19,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          (x7)
 #     size_t cn_stride,          [sp] -> x10
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x9
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
 
 # d2-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -33,7 +33,7 @@
 
 BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_cortex_a53
 
-        LDP     x10, x9, [sp]           // cn_stride, params
+        LDP     x10, x11, [sp]           // cn_stride, params
 
         ADD     x2, x2, 7               // kc = (kc + 7) & ~7
         BIC     x2, x2, 7
@@ -190,10 +190,10 @@
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
-        LD1R    {v4.4s}, [x9], 4
+        LD1R    {v4.4s}, [x11], 4
         ADDP    v24.4s, v24.4s, v26.4s
         ADDP    v28.4s, v28.4s, v30.4s
-        LD1R    {v7.4s}, [x9], 4
+        LD1R    {v7.4s}, [x11], 4
         ADDP    v0.4s, v16.4s, v20.4s
         ADDP    v1.4s, v24.4s, v28.4s
 
@@ -201,7 +201,7 @@
         SQRDMULH v0.4s, v0.4s, v4.4s
         SQRDMULH v1.4s, v1.4s, v4.4s
         CMEQ    v4.4s, v7.4s, 0
-        LD1R    {v5.8h}, [x9], 2
+        LD1R    {v5.8h}, [x11], 2
         BIC     v6.16b, v0.16b, v4.16b
         BIC     v16.16b, v1.16b, v4.16b
         SSRA    v0.4s, v6.4s, 31
@@ -213,10 +213,10 @@
         SUBS    x1, x1, 8
         SQADD   v0.8h, v0.8h, v5.8h
         SQXTN   v0.8b, v0.8h
-        LD1R    {v1.16b}, [x9], 1
-        LD1R    {v17.16b}, [x9]
+        LD1R    {v1.16b}, [x11], 1
+        LD1R    {v17.16b}, [x11]
         SMAX    v0.8b, v0.8b, v1.8b
-        SUB     x9, x9, 11              // rewind params pointer
+        SUB     x11, x11, 11              // rewind params pointer
         SMIN    v0.8b, v0.8b, v17.8b
         B.LO    5f
 
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
index 4f09b75..ff83326 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
+++ b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
@@ -19,7 +19,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          (x7)
 #     size_t cn_stride,          [sp] -> x10
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x9
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
 
 # d2-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -33,7 +33,7 @@
 
 BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm_cortex_a53
 
-        LDP     x10, x9, [sp]           // cn_stride, params
+        LDP     x10, x11, [sp]           // cn_stride, params
 
         ADD     x2, x2, 7               // kc = (kc + 7) & ~7
         BIC     x2, x2, 7
@@ -193,10 +193,10 @@
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
-        LD1R    {v4.4s}, [x9], 4
+        LD1R    {v4.4s}, [x11], 4
         ADDP    v24.4s, v24.4s, v26.4s
         ADDP    v28.4s, v28.4s, v30.4s
-        LD1R    {v7.4s}, [x9], 4
+        LD1R    {v7.4s}, [x11], 4
         ADDP    v0.4s, v16.4s, v20.4s
         ADDP    v1.4s, v24.4s, v28.4s
 
@@ -204,7 +204,7 @@
         SQRDMULH v0.4s, v0.4s, v4.4s
         SQRDMULH v1.4s, v1.4s, v4.4s
         CMEQ    v4.4s, v7.4s, 0
-        LD1R    {v5.8h}, [x9], 2
+        LD1R    {v5.8h}, [x11], 2
         BIC     v6.16b, v0.16b, v4.16b
         BIC     v16.16b, v1.16b, v4.16b
         SSRA    v0.4s, v6.4s, 31
@@ -216,10 +216,10 @@
         SUBS    x1, x1, 8
         SQADD   v0.8h, v0.8h, v5.8h
         SQXTN   v0.8b, v0.8h
-        LD1R    {v1.16b}, [x9], 1
-        LD1R    {v17.16b}, [x9]
+        LD1R    {v1.16b}, [x11], 1
+        LD1R    {v17.16b}, [x11]
         SMAX    v0.8b, v0.8b, v1.8b
-        SUB     x9, x9, 11              // rewind params pointer
+        SUB     x11, x11, 11              // rewind params pointer
         SMIN    v0.8b, v0.8b, v17.8b
         B.LO    5f
 
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
index bfb0659..32f8527 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
+++ b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
@@ -19,7 +19,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          (x7)
 #     size_t cn_stride,          [sp] -> x10
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x9
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
 
 # d2-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -32,7 +32,7 @@
 
 BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal_prfm
 
-        LDP     x10, x9, [sp]           // cn_stride, params
+        LDP     x10, x11, [sp]           // cn_stride, params
 
         ADD     x2, x2, 7               // kc = (kc + 7) & ~7
         BIC     x2, x2, 7
@@ -165,10 +165,10 @@
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
-        LD1R    {v4.4s}, [x9], 4
+        LD1R    {v4.4s}, [x11], 4
         ADDP    v24.4s, v24.4s, v26.4s
         ADDP    v28.4s, v28.4s, v30.4s
-        LD1R    {v7.4s}, [x9], 4
+        LD1R    {v7.4s}, [x11], 4
         ADDP    v0.4s, v16.4s, v20.4s
         ADDP    v1.4s, v24.4s, v28.4s
 
@@ -176,7 +176,7 @@
         SQRDMULH v0.4s, v0.4s, v4.4s
         SQRDMULH v1.4s, v1.4s, v4.4s
         CMEQ    v4.4s, v7.4s, 0
-        LD1R    {v5.8h}, [x9], 2
+        LD1R    {v5.8h}, [x11], 2
         BIC     v6.16b, v0.16b, v4.16b
         BIC     v16.16b, v1.16b, v4.16b
         SSRA    v0.4s, v6.4s, 31
@@ -188,10 +188,10 @@
         SUBS    x1, x1, 8
         SQADD   v0.8h, v0.8h, v5.8h
         SQXTN   v0.8b, v0.8h
-        LD1R    {v1.16b}, [x9], 1
-        LD1R    {v17.16b}, [x9]
+        LD1R    {v1.16b}, [x11], 1
+        LD1R    {v17.16b}, [x11]
         SMAX    v0.8b, v0.8b, v1.8b
-        SUB     x9, x9, 11              // rewind params pointer
+        SUB     x11, x11, 11              // rewind params pointer
         SMIN    v0.8b, v0.8b, v17.8b
         B.LO    5f
 
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
index 7c20d89..1976677 100644
--- a/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+++ b/src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
@@ -19,7 +19,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          (x7)
 #     size_t cn_stride,          [sp] -> x10
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x9
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
 
 # d2-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -32,7 +32,7 @@
 
 BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__aarch64_neon_mlal_padal
 
-        LDP     x10, x9, [sp]           // cn_stride, params
+        LDP     x10, x11, [sp]           // cn_stride, params
 
         ADD     x2, x2, 7               // kc = (kc + 7) & ~7
         BIC     x2, x2, 7
@@ -159,10 +159,10 @@
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
-        LD1R    {v4.4s}, [x9], 4
+        LD1R    {v4.4s}, [x11], 4
         ADDP    v24.4s, v24.4s, v26.4s
         ADDP    v28.4s, v28.4s, v30.4s
-        LD1R    {v7.4s}, [x9], 4
+        LD1R    {v7.4s}, [x11], 4
         ADDP    v0.4s, v16.4s, v20.4s
         ADDP    v1.4s, v24.4s, v28.4s
 
@@ -170,7 +170,7 @@
         SQRDMULH v0.4s, v0.4s, v4.4s
         SQRDMULH v1.4s, v1.4s, v4.4s
         CMEQ    v4.4s, v7.4s, 0
-        LD1R    {v5.8h}, [x9], 2
+        LD1R    {v5.8h}, [x11], 2
         BIC     v6.16b, v0.16b, v4.16b
         BIC     v16.16b, v1.16b, v4.16b
         SSRA    v0.4s, v6.4s, 31
@@ -182,10 +182,10 @@
         SUBS    x1, x1, 8
         SQADD   v0.8h, v0.8h, v5.8h
         SQXTN   v0.8b, v0.8h
-        LD1R    {v1.16b}, [x9], 1
-        LD1R    {v17.16b}, [x9]
+        LD1R    {v1.16b}, [x11], 1
+        LD1R    {v17.16b}, [x11]
         SMAX    v0.8b, v0.8b, v1.8b
-        SUB     x9, x9, 11              // rewind params pointer
+        SUB     x11, x11, 11              // rewind params pointer
         SMIN    v0.8b, v0.8b, v17.8b
         B.LO    5f
 
diff --git a/src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S b/src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
index e81d5d9..670bc2f 100644
--- a/src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+++ b/src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
@@ -19,7 +19,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,          [sp] -> x10
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x9
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -62,7 +62,7 @@
         MOV     v27.16b, v26.16b
         LDP     s28, s30, [x5], 8
         MOV     v29.16b, v28.16b
-        LDP     x10, x9, [sp, 48]       // cn_stride, params
+        LDP     x10, x11, [sp, 48]       // cn_stride, params
         MOV     v31.16b, v30.16b
 
         # Main loop - 16 bytes of A
@@ -130,10 +130,10 @@
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
-        LD1R    {v4.4s}, [x9], 4
+        LD1R    {v4.4s}, [x11], 4
         ADDP    v24.4s, v24.4s, v26.4s
         ADDP    v28.4s, v28.4s, v30.4s
-        LD1R    {v7.4s}, [x9], 4
+        LD1R    {v7.4s}, [x11], 4
         ADDP    v17.4s, v17.4s, v19.4s
         ADDP    v21.4s, v21.4s, v23.4s
         ADDP    v25.4s, v25.4s, v27.4s
@@ -149,7 +149,7 @@
         SQRDMULH v2.4s, v2.4s, v4.4s
         SQRDMULH v3.4s, v3.4s, v4.4s
         CMEQ    v4.4s, v7.4s, 0
-        LD1R    {v5.8h}, [x9], 2
+        LD1R    {v5.8h}, [x11], 2
         BIC     v6.16b, v0.16b, v4.16b
         BIC     v16.16b, v1.16b, v4.16b
         BIC     v17.16b, v2.16b, v4.16b
@@ -171,8 +171,8 @@
         SQADD   v1.8h, v2.8h, v5.8h
         SQXTN   v0.8b, v0.8h
         SQXTN2  v0.16b, v1.8h
-        LD1R    {v1.16b}, [x9], 1
-        LD1R    {v2.16b}, [x9]
+        LD1R    {v1.16b}, [x11], 1
+        LD1R    {v2.16b}, [x11]
         SMAX    v0.16b, v0.16b, v1.16b
         SMIN    v0.16b, v0.16b, v2.16b
         B.LO    2f
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
index 0d6f604..89156f0 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
+++ b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
@@ -19,7 +19,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,          [sp] -> x10
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x9
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -65,7 +65,7 @@
         MOV     v27.16b, v26.16b
         LDP     s28, s30, [x5], 8
         MOV     v29.16b, v28.16b
-        LDP     x10, x9, [sp, 80]       // cn_stride, params
+        LDP     x10, x11, [sp, 80]       // cn_stride, params
         MOV     v31.16b, v30.16b
         # Is there at least 16 bytes for epilogue?
         B.LO    4f
@@ -265,10 +265,10 @@
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
-        LD1R    {v4.4s}, [x9], 4
+        LD1R    {v4.4s}, [x11], 4
         ADDP    v24.4s, v24.4s, v26.4s
         ADDP    v28.4s, v28.4s, v30.4s
-        LD1R    {v7.4s}, [x9], 4
+        LD1R    {v7.4s}, [x11], 4
         ADDP    v17.4s, v17.4s, v19.4s
         ADDP    v21.4s, v21.4s, v23.4s
         ADDP    v25.4s, v25.4s, v27.4s
@@ -284,7 +284,7 @@
         SQRDMULH v2.4s, v2.4s, v4.4s
         SQRDMULH v3.4s, v3.4s, v4.4s
         CMEQ    v4.4s, v7.4s, 0
-        LD1R    {v5.8h}, [x9], 2
+        LD1R    {v5.8h}, [x11], 2
         BIC     v6.16b, v0.16b, v4.16b
         BIC     v16.16b, v1.16b, v4.16b
         BIC     v17.16b, v2.16b, v4.16b
@@ -306,8 +306,8 @@
         SQADD   v1.8h, v2.8h, v5.8h
         SQXTN   v0.8b, v0.8h
         SQXTN2  v0.16b, v1.8h
-        LD1R    {v1.16b}, [x9], 1
-        LD1R    {v2.16b}, [x9]
+        LD1R    {v1.16b}, [x11], 1
+        LD1R    {v2.16b}, [x11]
         SMAX    v0.16b, v0.16b, v1.16b
         SMIN    v0.16b, v0.16b, v2.16b
         B.LO    5f
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
index c41b6ab..32f43f6 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
+++ b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
@@ -19,7 +19,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,          [sp] -> x10
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x9
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -65,7 +65,7 @@
         MOV     v27.16b, v26.16b
         LDP     s28, s30, [x5], 8
         MOV     v29.16b, v28.16b
-        LDP     x10, x9, [sp, 80]       // cn_stride, params
+        LDP     x10, x11, [sp, 80]       // cn_stride, params
         MOV     v31.16b, v30.16b
         # Is there at least 16 bytes for epilogue?
         B.LO    4f
@@ -269,10 +269,10 @@
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
-        LD1R    {v4.4s}, [x9], 4
+        LD1R    {v4.4s}, [x11], 4
         ADDP    v24.4s, v24.4s, v26.4s
         ADDP    v28.4s, v28.4s, v30.4s
-        LD1R    {v7.4s}, [x9], 4
+        LD1R    {v7.4s}, [x11], 4
         ADDP    v17.4s, v17.4s, v19.4s
         ADDP    v21.4s, v21.4s, v23.4s
         ADDP    v25.4s, v25.4s, v27.4s
@@ -288,7 +288,7 @@
         SQRDMULH v2.4s, v2.4s, v4.4s
         SQRDMULH v3.4s, v3.4s, v4.4s
         CMEQ    v4.4s, v7.4s, 0
-        LD1R    {v5.8h}, [x9], 2
+        LD1R    {v5.8h}, [x11], 2
         BIC     v6.16b, v0.16b, v4.16b
         BIC     v16.16b, v1.16b, v4.16b
         BIC     v17.16b, v2.16b, v4.16b
@@ -310,8 +310,8 @@
         SQADD   v1.8h, v2.8h, v5.8h
         SQXTN   v0.8b, v0.8h
         SQXTN2  v0.16b, v1.8h
-        LD1R    {v1.16b}, [x9], 1
-        LD1R    {v2.16b}, [x9]
+        LD1R    {v1.16b}, [x11], 1
+        LD1R    {v2.16b}, [x11]
         SMAX    v0.16b, v0.16b, v1.16b
         SMIN    v0.16b, v0.16b, v2.16b
         B.LO    5f
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
index 9a01aa4..baa87ee 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
+++ b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
@@ -19,7 +19,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,          [sp] -> x10
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x9
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -63,7 +63,7 @@
         MOV     v27.16b, v26.16b
         LDP     s28, s30, [x5], 8
         MOV     v29.16b, v28.16b
-        LDP     x10, x9, [sp, 64]       // cn_stride, params
+        LDP     x10, x11, [sp, 64]       // cn_stride, params
         MOV     v31.16b, v30.16b
         # Is there at least 16 bytes for epilogue?
         B.LO    4f
@@ -223,10 +223,10 @@
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
-        LD1R    {v4.4s}, [x9], 4
+        LD1R    {v4.4s}, [x11], 4
         ADDP    v24.4s, v24.4s, v26.4s
         ADDP    v28.4s, v28.4s, v30.4s
-        LD1R    {v7.4s}, [x9], 4
+        LD1R    {v7.4s}, [x11], 4
         ADDP    v17.4s, v17.4s, v19.4s
         ADDP    v21.4s, v21.4s, v23.4s
         ADDP    v25.4s, v25.4s, v27.4s
@@ -242,7 +242,7 @@
         SQRDMULH v2.4s, v2.4s, v4.4s
         SQRDMULH v3.4s, v3.4s, v4.4s
         CMEQ    v4.4s, v7.4s, 0
-        LD1R    {v5.8h}, [x9], 2
+        LD1R    {v5.8h}, [x11], 2
         BIC     v6.16b, v0.16b, v4.16b
         BIC     v16.16b, v1.16b, v4.16b
         BIC     v17.16b, v2.16b, v4.16b
@@ -264,8 +264,8 @@
         SQADD   v1.8h, v2.8h, v5.8h
         SQXTN   v0.8b, v0.8h
         SQXTN2  v0.16b, v1.8h
-        LD1R    {v1.16b}, [x9], 1
-        LD1R    {v2.16b}, [x9]
+        LD1R    {v1.16b}, [x11], 1
+        LD1R    {v2.16b}, [x11]
         SMAX    v0.16b, v0.16b, v1.16b
         SMIN    v0.16b, v0.16b, v2.16b
         B.LO    5f
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
index e26b04f..a678efd 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+++ b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
@@ -19,7 +19,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,          [sp] -> x10
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x9
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -63,7 +63,7 @@
         MOV     v27.16b, v26.16b
         LDP     s28, s30, [x5], 8
         MOV     v29.16b, v28.16b
-        LDP     x10, x9, [sp, 64]       // cn_stride, params
+        LDP     x10, x11, [sp, 64]       // cn_stride, params
         MOV     v31.16b, v30.16b
         # Is there at least 16 bytes for epilogue?
         B.LO    4f
@@ -219,10 +219,10 @@
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
-        LD1R    {v4.4s}, [x9], 4
+        LD1R    {v4.4s}, [x11], 4
         ADDP    v24.4s, v24.4s, v26.4s
         ADDP    v28.4s, v28.4s, v30.4s
-        LD1R    {v7.4s}, [x9], 4
+        LD1R    {v7.4s}, [x11], 4
         ADDP    v17.4s, v17.4s, v19.4s
         ADDP    v21.4s, v21.4s, v23.4s
         ADDP    v25.4s, v25.4s, v27.4s
@@ -238,7 +238,7 @@
         SQRDMULH v2.4s, v2.4s, v4.4s
         SQRDMULH v3.4s, v3.4s, v4.4s
         CMEQ    v4.4s, v7.4s, 0
-        LD1R    {v5.8h}, [x9], 2
+        LD1R    {v5.8h}, [x11], 2
         BIC     v6.16b, v0.16b, v4.16b
         BIC     v16.16b, v1.16b, v4.16b
         BIC     v17.16b, v2.16b, v4.16b
@@ -260,8 +260,8 @@
         SQADD   v1.8h, v2.8h, v5.8h
         SQXTN   v0.8b, v0.8h
         SQXTN2  v0.16b, v1.8h
-        LD1R    {v1.16b}, [x9], 1
-        LD1R    {v2.16b}, [x9]
+        LD1R    {v1.16b}, [x11], 1
+        LD1R    {v2.16b}, [x11]
         SMAX    v0.16b, v0.16b, v1.16b
         SMIN    v0.16b, v0.16b, v2.16b
         B.LO    5f
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mull-padal.S b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mull-padal.S
index a4ea595..8a7e67e 100644
--- a/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mull-padal.S
+++ b/src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mull-padal.S
@@ -19,7 +19,7 @@
 #     int8_t* restrict c,        x6
 #     size_t cm_stride,          x7
 #     size_t cn_stride,          [sp] -> x10
-#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x9
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
 
 # d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
 
@@ -62,7 +62,7 @@
         MOV     v27.16b, v26.16b
         LDP     s28, s30, [x5], 8
         MOV     v29.16b, v28.16b
-        LDP     x10, x9, [sp, 48]       // cn_stride, params
+        LDP     x10, x11, [sp, 48]       // cn_stride, params
         MOV     v31.16b, v30.16b
 
         # Main loop - 8 bytes of A
@@ -113,10 +113,10 @@
         # Add columns
         ADDP    v16.4s, v16.4s, v18.4s
         ADDP    v20.4s, v20.4s, v22.4s
-        LD1R    {v4.4s}, [x9], 4
+        LD1R    {v4.4s}, [x11], 4
         ADDP    v24.4s, v24.4s, v26.4s
         ADDP    v28.4s, v28.4s, v30.4s
-        LD1R    {v7.4s}, [x9], 4
+        LD1R    {v7.4s}, [x11], 4
         ADDP    v17.4s, v17.4s, v19.4s
         ADDP    v21.4s, v21.4s, v23.4s
         ADDP    v25.4s, v25.4s, v27.4s
@@ -132,7 +132,7 @@
         SQRDMULH v2.4s, v2.4s, v4.4s
         SQRDMULH v3.4s, v3.4s, v4.4s
         CMEQ    v4.4s, v7.4s, 0
-        LD1R    {v5.8h}, [x9], 2
+        LD1R    {v5.8h}, [x11], 2
         BIC     v6.16b, v0.16b, v4.16b
         BIC     v16.16b, v1.16b, v4.16b
         BIC     v17.16b, v2.16b, v4.16b
@@ -154,8 +154,8 @@
         SQADD   v1.8h, v2.8h, v5.8h
         SQXTN   v0.8b, v0.8h
         SQXTN2  v0.16b, v1.8h
-        LD1R    {v1.16b}, [x9], 1
-        LD1R    {v2.16b}, [x9]
+        LD1R    {v1.16b}, [x11], 1
+        LD1R    {v2.16b}, [x11]
         SMAX    v0.16b, v0.16b, v1.16b
         SMIN    v0.16b, v0.16b, v2.16b
         B.LO    2f