Use the lr alias instead of r14 in PUSH register lists of AArch32 assembly microkernels
PiperOrigin-RevId: 417660820
diff --git a/src/f32-igemm/4x8-aarch32-neon-cortex-a75.S.in b/src/f32-igemm/4x8-aarch32-neon-cortex-a75.S.in
index 0fc9452..2565774 100644
--- a/src/f32-igemm/4x8-aarch32-neon-cortex-a75.S.in
+++ b/src/f32-igemm/4x8-aarch32-neon-cortex-a75.S.in
@@ -46,7 +46,7 @@
#endif
# Push 112 bytes
# r2 will be reloaded in outer loop. r3 is ks
- PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14} // +44
+ PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr} // +44
SUB sp, sp, 4 // 4
VPUSH {d8-d15} // +64 = 112
diff --git a/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a53.S b/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a53.S
index 61cf5fe..8126484 100644
--- a/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a53.S
+++ b/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a53.S
@@ -47,7 +47,7 @@
#endif
# Push 112 bytes
# r2 will be reloaded in outer loop. r3 is ks
- PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14} // +44
+ PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr} // +44
SUB sp, sp, 4 // 4
VPUSH {d8-d15} // +64 = 112
diff --git a/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a55.S b/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a55.S
index 787595b..9489097 100644
--- a/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a55.S
+++ b/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a55.S
@@ -45,7 +45,7 @@
.fpu neon
#endif
# Push 104 bytes
- PUSH {r3, r4, r5, r6, r7, r8, r9, r10, r11, r14} // +40
+ PUSH {r3, r4, r5, r6, r7, r8, r9, r10, r11, lr} // +40
VPUSH {d8-d15} // +64 = 104
LDR r11, [sp, 112] // c
diff --git a/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a7.S.in b/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a7.S.in
index e950662..9cb0b78 100644
--- a/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a7.S.in
+++ b/src/f32-igemm/4x8-minmax-aarch32-neon-cortex-a7.S.in
@@ -46,7 +46,7 @@
#endif
# Push 112 bytes
# r2 will be reloaded in outer loop. r3 is ks
- PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14} // +44
+ PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr} // +44
SUB sp, sp, 4 // 4
VPUSH {d8-d15} // +64 = 112
diff --git a/src/f32-igemm/4x8-minmax-aarch32-neon-ld64.S.in b/src/f32-igemm/4x8-minmax-aarch32-neon-ld64.S.in
index 52b89ae..0468216 100644
--- a/src/f32-igemm/4x8-minmax-aarch32-neon-ld64.S.in
+++ b/src/f32-igemm/4x8-minmax-aarch32-neon-ld64.S.in
@@ -46,7 +46,7 @@
#endif
# Push 112 bytes
# r2 will be reloaded in outer loop. r3 is ks
- PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14} // +44
+ PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr} // +44
SUB sp, sp, 4 // 4
VPUSH {d8-d15} // +64 = 112
diff --git a/src/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a7.S b/src/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a7.S
index 07117f1..7685869 100644
--- a/src/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a7.S
+++ b/src/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a7.S
@@ -50,7 +50,7 @@
#endif
# Push 112 bytes
# r2 will be reloaded in outer loop. r3 is ks
- PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14} // +44
+ PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr} // +44
SUB sp, sp, 4 // 4
VPUSH {d8-d15} // +64 = 112
diff --git a/src/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S b/src/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S
index e09750c..193f8be 100644
--- a/src/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S
+++ b/src/f32-igemm/gen/4x8-minmax-aarch32-neon-cortex-a75.S
@@ -50,7 +50,7 @@
#endif
# Push 112 bytes
# r2 will be reloaded in outer loop. r3 is ks
- PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14} // +44
+ PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr} // +44
SUB sp, sp, 4 // 4
VPUSH {d8-d15} // +64 = 112
diff --git a/src/f32-igemm/gen/4x8-minmax-aarch32-neon-ld64.S b/src/f32-igemm/gen/4x8-minmax-aarch32-neon-ld64.S
index 014f1f5..0edb3f2 100644
--- a/src/f32-igemm/gen/4x8-minmax-aarch32-neon-ld64.S
+++ b/src/f32-igemm/gen/4x8-minmax-aarch32-neon-ld64.S
@@ -50,7 +50,7 @@
#endif
# Push 112 bytes
# r2 will be reloaded in outer loop. r3 is ks
- PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14} // +44
+ PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr} // +44
SUB sp, sp, 4 // 4
VPUSH {d8-d15} // +64 = 112
diff --git a/src/f32-igemm/gen/4x8-minmax-aarch32-neon-pld-cortex-a75.S b/src/f32-igemm/gen/4x8-minmax-aarch32-neon-pld-cortex-a75.S
index 4fb692f..391ea2a 100644
--- a/src/f32-igemm/gen/4x8-minmax-aarch32-neon-pld-cortex-a75.S
+++ b/src/f32-igemm/gen/4x8-minmax-aarch32-neon-pld-cortex-a75.S
@@ -50,7 +50,7 @@
#endif
# Push 112 bytes
# r2 will be reloaded in outer loop. r3 is ks
- PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14} // +44
+ PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr} // +44
SUB sp, sp, 4 // 4
VPUSH {d8-d15} // +64 = 112
diff --git a/src/qs8-igemm/4x8c4-aarch32-neondot-ld64.S.in b/src/qs8-igemm/4x8c4-aarch32-neondot-ld64.S.in
index 4975781..c52232a 100644
--- a/src/qs8-igemm/4x8c4-aarch32-neondot-ld64.S.in
+++ b/src/qs8-igemm/4x8c4-aarch32-neondot-ld64.S.in
@@ -53,7 +53,7 @@
# Push 96 bytes
# r2 will be reloaded in outer loop. r3 is ks
- PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14} // +44
+ PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr} // +44
SUB sp, sp, 4 // 4
VPUSH {d8-d13} // +48 = 96
diff --git a/src/qs8-igemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-ld64.S b/src/qs8-igemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-ld64.S
index fb729a0..8455905 100644
--- a/src/qs8-igemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-ld64.S
+++ b/src/qs8-igemm/gen/4x8c4-minmax-rndnu-aarch32-neondot-ld64.S
@@ -57,7 +57,7 @@
# Push 96 bytes
# r2 will be reloaded in outer loop. r3 is ks
- PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14} // +44
+ PUSH {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr} // +44
SUB sp, sp, 4 // 4
VPUSH {d8-d13} // +48 = 96