PUSH lr (the architectural alias of r14) instead of r14 in AArch32 assembly microkernels

PiperOrigin-RevId: 417660820
diff --git a/src/f32-igemm/4x8-aarch32-neon-cortex-a75.S.in b/src/f32-igemm/4x8-aarch32-neon-cortex-a75.S.in
index 0fc9452..2565774 100644
--- a/src/f32-igemm/4x8-aarch32-neon-cortex-a75.S.in
+++ b/src/f32-igemm/4x8-aarch32-neon-cortex-a75.S.in
@@ -46,7 +46,7 @@
 #endif
         # Push 112 bytes
         # r2 will be reloaded in outer loop.  r3 is ks
-        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14}  // +44
+        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}   // +44
         SUB     sp, sp, 4                                        // 4
         VPUSH   {d8-d15}                                         // +64 = 112
 
diff --git a/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a53.S b/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a53.S
index 61cf5fe..8126484 100644
--- a/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a53.S
+++ b/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a53.S
@@ -47,7 +47,7 @@
 #endif
         # Push 112 bytes
         # r2 will be reloaded in outer loop.  r3 is ks
-        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14}  // +44
+        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}   // +44
         SUB     sp, sp, 4                                        // 4
         VPUSH   {d8-d15}                                         // +64 = 112
 
diff --git a/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a55.S b/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a55.S
index 787595b..9489097 100644
--- a/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a55.S
+++ b/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a55.S
@@ -45,7 +45,7 @@
         .fpu    neon
 #endif
         # Push 104 bytes
-        PUSH    {r3, r4, r5, r6, r7, r8, r9, r10, r11, r14}  // +40
+        PUSH    {r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}   // +40
         VPUSH   {d8-d15}                                     // +64 = 104
 
         LDR     r11, [sp, 112]          // c
diff --git a/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a7.S.in b/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a7.S.in
index e950662..9cb0b78 100644
--- a/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a7.S.in
+++ b/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a7.S.in
@@ -46,7 +46,7 @@
 #endif
         # Push 112 bytes
         # r2 will be reloaded in outer loop.  r3 is ks
-        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14}  // +44
+        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}   // +44
         SUB     sp, sp, 4                                        // 4
         VPUSH   {d8-d15}                                         // +64 = 112
 
diff --git a/src/f32-igemm/4x8-minmax-aarch32-neon-ld64.S.in b/src/f32-igemm/4x8-minmax-aarch32-neon-ld64.S.in
index 52b89ae..0468216 100644
--- a/src/f32-igemm/4x8-minmax-aarch32-neon-ld64.S.in
+++ b/src/f32-igemm/4x8-minmax-aarch32-neon-ld64.S.in
@@ -46,7 +46,7 @@
 #endif
         # Push 112 bytes
         # r2 will be reloaded in outer loop.  r3 is ks
-        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14}  // +44
+        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}   // +44
         SUB     sp, sp, 4                                        // 4
         VPUSH   {d8-d15}                                         // +64 = 112
 
diff --git a/src/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a7.S b/src/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a7.S
index 07117f1..7685869 100644
--- a/src/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a7.S
+++ b/src/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a7.S
@@ -50,7 +50,7 @@
 #endif
         # Push 112 bytes
         # r2 will be reloaded in outer loop.  r3 is ks
-        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14}  // +44
+        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}   // +44
         SUB     sp, sp, 4                                        // 4
         VPUSH   {d8-d15}                                         // +64 = 112
 
diff --git a/src/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S b/src/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S
index e09750c..193f8be 100644
--- a/src/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S
+++ b/src/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S
@@ -50,7 +50,7 @@
 #endif
         # Push 112 bytes
         # r2 will be reloaded in outer loop.  r3 is ks
-        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14}  // +44
+        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}   // +44
         SUB     sp, sp, 4                                        // 4
         VPUSH   {d8-d15}                                         // +64 = 112
 
diff --git a/src/f32-igemm/gen/4x8-minmax-aarch32-neon-ld64.S b/src/f32-igemm/gen/4x8-minmax-aarch32-neon-ld64.S
index 014f1f5..0edb3f2 100644
--- a/src/f32-igemm/gen/4x8-minmax-aarch32-neon-ld64.S
+++ b/src/f32-igemm/gen/4x8-minmax-aarch32-neon-ld64.S
@@ -50,7 +50,7 @@
 #endif
         # Push 112 bytes
         # r2 will be reloaded in outer loop.  r3 is ks
-        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14}  // +44
+        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}   // +44
         SUB     sp, sp, 4                                        // 4
         VPUSH   {d8-d15}                                         // +64 = 112
 
diff --git a/src/f32-igemm/gen/4x8-minmax-aarch32-neon-pld-cortex-a75.S b/src/f32-igemm/gen/4x8-minmax-aarch32-neon-pld-cortex-a75.S
index 4fb692f..391ea2a 100644
--- a/src/f32-igemm/gen/4x8-minmax-aarch32-neon-pld-cortex-a75.S
+++ b/src/f32-igemm/gen/4x8-minmax-aarch32-neon-pld-cortex-a75.S
@@ -50,7 +50,7 @@
 #endif
         # Push 112 bytes
         # r2 will be reloaded in outer loop.  r3 is ks
-        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14}  // +44
+        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}   // +44
         SUB     sp, sp, 4                                        // 4
         VPUSH   {d8-d15}                                         // +64 = 112
 
diff --git a/src/qs8-igemm/4x8c4-aarch32-neondot-ld64.S.in b/src/qs8-igemm/4x8c4-aarch32-neondot-ld64.S.in
index 4975781..c52232a 100644
--- a/src/qs8-igemm/4x8c4-aarch32-neondot-ld64.S.in
+++ b/src/qs8-igemm/4x8c4-aarch32-neondot-ld64.S.in
@@ -53,7 +53,7 @@
 
         # Push 96 bytes
         # r2 will be reloaded in outer loop.  r3 is ks
-        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14}  // +44
+        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}   // +44
         SUB     sp, sp, 4                                        // 4
         VPUSH   {d8-d13}                                         // +48 = 96
 
diff --git a/src/qs8-igemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-ld64.S b/src/qs8-igemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-ld64.S
index fb729a0..8455905 100644
--- a/src/qs8-igemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-ld64.S
+++ b/src/qs8-igemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-ld64.S
@@ -57,7 +57,7 @@
 
         # Push 96 bytes
         # r2 will be reloaded in outer loop.  r3 is ks
-        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14}  // +44
+        PUSH    {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}   // +44
         SUB     sp, sp, 4                                        // 4
         VPUSH   {d8-d13}                                         // +48 = 96