// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

.syntax unified

// void xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53(
//     size_t mr,                            r0
//     size_t nc,                            r1
//     size_t kc,                            r2 -> r5 -> sp + 0
//     const uint8_t*restrict a,             r3
//     size_t a_stride,          sp + 100 -> (r7)
//     const void*restrict w,    sp + 104 -> r9
//     uint8_t*restrict c,       sp + 108 -> r11
//     size_t cm_stride,         sp + 112 -> (r6)
//     size_t cn_stride,         sp + 116 -> (r0)
//     const union xnn_f32_minmax_params params[restrict static 1])  sp + 120 -> (r5)
//
// Computes up to 4 rows x 8 columns of C per outer-loop pass:
//   - the 8 accumulators per row start from the 8 bias floats read from w,
//   - each k step multiplies one float of every A row by 8 floats of B (read
//     from w right after the bias) and accumulates with VMLA.F32,
//   - the result is clamped with VMAX against the first params value and VMIN
//     against the second one, then stored.
// kc is counted in bytes (16 bytes = 4 floats of A per main-loop pass).
// Rows beyond mr alias the previous row's A and C pointers, so all four rows
// are always computed and stored.
// Stack offsets above are relative to sp after the 100-byte push below.

// inner loop registers
// r0, r2   scratch temporaries for loads
// r14 (lr) unused

// A0   r3  d0
// A1  r12  d1
// A2  r10  d2
// A3   r7  d3

// B    r9  d8,  d9, d10, d11
// B       d12, d13, d14, d15

// C0  r11 d16-d17  q8  d18-d19  q9
// C1   r4 d20-d21 q10  d22-d23 q11
// C2   r8 d24-d25 q12  d26-d27 q13
// C3   r6 d28-d29 q14  d30-d31 q15

// Clamp (r5) d4 d5 d6 d7

BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53
        .arm
#ifndef __APPLE__
        .arch   armv7-a
        .fpu    neon
#endif
        // Push 100 bytes
        // r2 (kc) is saved at sp + 0 and will be reloaded in outer loop
        VPUSH   {d8-d15}                                // 64
        PUSH    {r2, r4, r5, r6, r7, r8, r9, r10, r11}  // +36 = 100

        LDR     r7, [sp, 100]           // a_stride
        LDR     r11, [sp, 108]          // c
        LDR     r6, [sp, 112]           // cm_stride
        LDR     r9, [sp, 104]           // w

        // Clamp A and C pointers
        // Flags from this CMP drive both the MOVLO (mr < 2) and MOVLS (mr <= 2)
        // pairs below; ADD without S does not disturb them.
        CMP     r0, 2                   // if mr >= 2
        ADD     r12, r3, r7             //   a1 = a0 + a_stride
        ADD     r4, r11, r6             //   c1 = c0 + cm_stride
        MOVLO   r12, r3                 // a1
        MOVLO   r4, r11                 // c1
                                        // if mr > 2
        ADD     r10, r12, r7            //   a2 = a1 + a_stride
        ADD     r8, r4, r6              //   c2 = c1 + cm_stride
        MOVLS   r10, r12                // a2
        MOVLS   r8, r4                  // c2

        CMP     r0, 4                   // if mr >=4
        ADD     r7, r10, r7             //   a3 = a2 + a_stride
        ADD     r6, r8, r6              //   c3 = c2 + cm_stride
        MOVLO   r7, r10                 // a3
        MOVLO   r6, r8                  // c3

        // Outer loop: one pass per 8 output columns
        .p2align 3
1:
        // Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}          // Bias

        // Flags set here are consumed by BLO 5f below; PLD and VMOV leave
        // them untouched.
        SUBS    r5, r2, 16              // kc - 16
        PLD     [r3, 0]                 // Prefetch A
        PLD     [r3, 64]
        VMOV    q10, q8
        PLD     [r12, 0]
        PLD     [r12, 64]
        VMOV    q11, q9
        PLD     [r10, 0]
        PLD     [r10, 64]
        VMOV    q12, q8
        PLD     [r7, 0]
        PLD     [r7, 64]
        VMOV    q13, q9
        PLD     [r9, 0]                 // Prefetch B
        PLD     [r9, 64]
        VMOV    q14, q8
        PLD     [r9, 128]
        PLD     [r9, 192]
        VMOV    q15, q9
        PLD     [r9, 256]
        PLD     [r9, 320]
        BLO     5f                      // less than 4 channels?

        // Prologue
        // Loads the first 2 floats of each A row and the first part of B so
        // that the main loop / epilogue can start with data in flight.
        VLD1.32 {d0}, [r3]!             // A0
        VLD1.32 {d1}, [r12]!            // A1
        VLD1.32 {d2}, [r10]!            // A2
        VLD1.32 {d3}, [r7]!             // A3
        SUBS    r5, r5, 16
        VLDM    r9, {d8-d11}            // B0
        LDR     r0, [r9, 56]            // B1 low   VMOV is in BLOCK 0
        LDR     r2, [r9, 60]            // B1 high
        VLDR    d13, [r9, 40]           // B1

        BLO     3f                      // less than 8 channels?  skip main loop

        // Main loop - 4 floats of A (16 bytes)
        // 32 FMA + 8 LD64 A + 8 LDR B
        .p2align 3
2:
        // First group of 16 FMA, Second group loads
        // BLOCK 0
        VLD1.32 {d4}, [r3]!             // A0
        VMOV    d15, r0, r2             // b1 VMOV b from second group
        VMLA.F32 q8, q4, d0[0]
        LDR     r0, [r12]               // A1 low
        VMLA.F32 q10, q4, d1[0]
        LDR     r2, [r12, 4]            // A1 high
        VMLA.F32 q12, q4, d2[0]
        PLD     [r3, 128]               // Prefetch A0

        // BLOCK 1
        VLDR    d12, [r9, 32]           // B1
        VMOV    d5, r0, r2              // a1 VMOV
        VMLA.F32 q14, q4, d3[0]
        LDR     r0, [r9, 72]            // B0 low
        VMLA.F32 q9, q5, d0[0]
        LDR     r2, [r9, 76]            // B0 high
        VMLA.F32 q11, q5, d1[0]
        PLD     [r12, 128]              // Prefetch A1

        // BLOCK 2
        VLD1.32 {d6}, [r10]!            // A2
        VMOV    d9, r0, r2              // b0 VMOV
        VMLA.F32 q13, q5, d2[0]
        LDR     r0, [r7]                // A3 low
        VMLA.F32 q15, q5, d3[0]
        LDR     r2, [r7, 4]             // A3 high
        VMLA.F32 q8, q6, d0[1]
        PLD     [r10, 128]              // Prefetch A2

        // BLOCK 3
        VLDR    d14, [r9, 48]           // B1
        VMOV    d7, r0, r2              // a3 VMOV
        VMLA.F32 q10, q6, d1[1]
        LDR     r0, [r9, 88]            // B0 low
        VMLA.F32 q12, q6, d2[1]
        LDR     r2, [r9, 92]            // B0 high
        VMLA.F32 q14, q6, d3[1]
        PLD     [r7, 128]               // Prefetch A3

        // BLOCK 4
        VLDR    d8, [r9, 64]            // B0
        VMOV    d11, r0, r2             // B0 VMOV
        VMLA.F32 q9, q7, d0[1]
        LDR     r0, [r9, 104]           // B1 low   VMOV is in BLOCK 5
        VMLA.F32 q11, q7, d1[1]
        LDR     r2, [r9, 108]           // B1 high
        VMLA.F32 q13, q7, d2[1]
        PLD     [r9, 384]               // Prefetch B

        // BLOCK 5
        VLDR    d10, [r9, 80]           // B0
        VMOV    d13, r0, r2             // b1 VMOV b from second group
        VMLA.F32 q15, q7, d3[1]
        LDR     r0, [r9, 120]           // B1 low   VMOV is in BLOCK 0
        NOP
        LDR     r2, [r9, 124]           // B1 high
        NOP
        PLD     [r9, 448]               // Prefetch B

        // Second group of 16 FMA, First group of loads
        // BLOCK 0
        VLD1.32 {d0}, [r3]!             // A0
        VMOV    d15, r0, r2             // b1 VMOV b from second group
        VMLA.F32 q8, q4, d4[0]
        LDR     r0, [r12, 8]            // A1 low
        VMLA.F32 q10, q4, d5[0]
        LDR     r2, [r12, 12]           // A1 high
        VMLA.F32 q12, q4, d6[0]
        // NOP

        // BLOCK 1
        VLDR    d12, [r9, 96]           // B1
        VMOV    d1, r0, r2              // a1 VMOV
        VMLA.F32 q14, q4, d7[0]
        LDR     r0, [r9, 136]           // B0 low
        VMLA.F32 q9, q5, d4[0]
        LDR     r2, [r9, 140]           // B0 high
        VMLA.F32 q11, q5, d5[0]
        // NOP

        // BLOCK 2
        VLD1.32 {d2}, [r10]!            // A2
        VMOV    d9, r0, r2              // b0 VMOV
        VMLA.F32 q13, q5, d6[0]
        LDR     r0, [r7, 8]             // A3 low
        VMLA.F32 q15, q5, d7[0]
        LDR     r2, [r7, 12]            // A3 high
        VMLA.F32 q8, q6, d4[1]
        // NOP

        // BLOCK 3
        VLDR    d14, [r9, 112]          // B1
        VMOV    d3, r0, r2              // a3 VMOV
        VMLA.F32 q10, q6, d5[1]
        LDR     r0, [r9, 152]           // B0 low
        VMLA.F32 q12, q6, d6[1]
        LDR     r2, [r9, 156]           // B0 high
        VMLA.F32 q14, q6, d7[1]
        ADD     r12, r12, 16            // A1++

        // BLOCK 4
        VLDR    d8, [r9, 128]           // B0
        VMOV    d11, r0, r2             // B0 VMOV
        VMLA.F32 q9, q7, d4[1]
        LDR     r0, [r9, 168]           // B1 low
        VMLA.F32 q11, q7, d5[1]
        LDR     r2, [r9, 172]           // B1 high
        VMLA.F32 q13, q7, d6[1]
        ADD     r7, r7, 16              // A3++

        // BLOCK 5
        VLDR    d10, [r9, 144]          // B0
        VMOV    d13, r0, r2             // b1 VMOV b
        VMLA.F32 q15, q7, d7[1]
        LDR     r0, [r9, 184]           // B1 low   VMOV is in BLOCK 0
        SUBS    r5, r5, 16
        LDR     r2, [r9, 188]           // B1 high
        ADD     r9, r9, 128             // B++
        BHS     2b

        // Epilogue - 4 floats of A (16 bytes)
        // Same FMA sequence as one main-loop pass, but without loading A/B
        // for a following pass.
3:
        // First group of 16 FMA, Second group loads
        // BLOCK 0
        VLD1.32 {d4}, [r3]!             // A0
        VMOV    d15, r0, r2             // b1 VMOV b from second group
        VMLA.F32 q8, q4, d0[0]
        LDR     r0, [r12]               // A1 low
        VMLA.F32 q10, q4, d1[0]
        LDR     r2, [r12, 4]            // A1 high
        VMLA.F32 q12, q4, d2[0]
        // NOP

        // BLOCK 1
        VLDR    d12, [r9, 32]           // B1
        VMOV    d5, r0, r2              // a1 VMOV
        VMLA.F32 q14, q4, d3[0]
        LDR     r0, [r9, 72]            // B0 low
        VMLA.F32 q9, q5, d0[0]
        LDR     r2, [r9, 76]            // B0 high
        VMLA.F32 q11, q5, d1[0]
        // NOP

        // BLOCK 2
        VLD1.32 {d6}, [r10]!            // A2
        VMOV    d9, r0, r2              // b0 VMOV
        VMLA.F32 q13, q5, d2[0]
        LDR     r0, [r7]                // A3 low
        VMLA.F32 q15, q5, d3[0]
        LDR     r2, [r7, 4]             // A3 high
        VMLA.F32 q8, q6, d0[1]
        // NOP

        // BLOCK 3
        VLDR    d14, [r9, 48]           // B1
        VMOV    d7, r0, r2              // a3 VMOV
        VMLA.F32 q10, q6, d1[1]
        LDR     r0, [r9, 88]            // B0 low
        VMLA.F32 q12, q6, d2[1]
        LDR     r2, [r9, 92]            // B0 high
        VMLA.F32 q14, q6, d3[1]
        // NOP

        // BLOCK 4
        VLDR    d8, [r9, 64]            // B0
        VMOV    d11, r0, r2             // B0 VMOV
        VMLA.F32 q9, q7, d0[1]
        LDR     r0, [r9, 104]           // B1 low
        VMLA.F32 q11, q7, d1[1]
        LDR     r2, [r9, 108]           // B1 high
        VMLA.F32 q13, q7, d2[1]
        // NOP

        // BLOCK 5
        VLDR    d10, [r9, 80]           // B0
        VMOV    d13, r0, r2             // b1 VMOV b
        VMLA.F32 q15, q7, d3[1]
        LDR     r0, [r9, 120]           // B1 low   VMOV is in BLOCK 0
        NOP
        LDR     r2, [r9, 124]           // B1 high
        NOP
        NOP

        // Second group of 16 FMA, First group of loads
        // BLOCK 0
        VLDR    d12, [r9, 96]           // B1
        VMOV    d15, r0, r2             // b1 VMOV b from second group
        VMLA.F32 q8, q4, d4[0]
        VMLA.F32 q10, q4, d5[0]
        VMLA.F32 q12, q4, d6[0]

        // BLOCK 1
        VLDR    d14, [r9, 112]          // B1
        VMLA.F32 q14, q4, d7[0]
        VMLA.F32 q9, q5, d4[0]
        VMLA.F32 q11, q5, d5[0]
        ADD     r12, r12, 8             // A1++

        // BLOCK 2
        ADD     r7, r7, 8               // A3++  VLDR B1 lands here
        ADD     r9, r9, 128             // B++
        VMLA.F32 q13, q5, d6[0]
        VMLA.F32 q15, q5, d7[0]
        VMLA.F32 q8, q6, d4[1]

        // BLOCK 3
        VMLA.F32 q10, q6, d5[1]
        VMLA.F32 q12, q6, d6[1]
        VMLA.F32 q14, q6, d7[1]
        TST     r5, 15                  // flags consumed by BNE 5f below

        // BLOCK 4
        VMLA.F32 q9, q7, d4[1]
        VMLA.F32 q11, q7, d5[1]
        VMLA.F32 q13, q7, d6[1]

        // BLOCK 5
        VMLA.F32 q15, q7, d7[1]

        // Is there a remainder?- 1 to 3 floats of A (4, 8 or 12 bytes)
        BNE     5f

        .p2align 3
4:
        // Load cn_stride and params pointer, reload kc (r2 was used as scratch)
        LDR     r0, [sp, 116]           // cn_stride
        LDR     r5, [sp, 120]           // params
        LDR     r2, [sp, 0]             // kc
        // Flags from this SUBS are consumed by BLO 10f and BHI 1b below; the
        // NEON instructions and SUB (without S) in between do not change them.
        SUBS    r1, r1, 8

        // Load min/max values (each broadcast to all 4 lanes)
        VLD1.32 {d4[],d5[]}, [r5]!
        VLD1.32 {d6[],d7[]}, [r5]

        // Clamp
        VMAX.F32 q8, q8, q2
        VMAX.F32 q9, q9, q2
        VMAX.F32 q10, q10, q2
        VMAX.F32 q11, q11, q2
        VMAX.F32 q12, q12, q2
        VMAX.F32 q13, q13, q2
        VMAX.F32 q14, q14, q2
        VMAX.F32 q15, q15, q2
        VMIN.F32 q8, q8, q3
        VMIN.F32 q9, q9, q3
        VMIN.F32 q10, q10, q3
        VMIN.F32 q11, q11, q3
        VMIN.F32 q12, q12, q3
        VMIN.F32 q13, q13, q3
        VMIN.F32 q14, q14, q3
        VMIN.F32 q15, q15, q3

        // Store full 4 x 8
        // Each C pointer advances by cn_stride; each A pointer is rewound by
        // kc so the next pass re-reads the same rows of A.
        BLO     10f
        VST1.32 {d16-d19}, [r11], r0
        SUB     r7, r7, r2
        VST1.32 {d20-d23}, [r4], r0
        SUB     r10, r10, r2
        VST1.32 {d24-d27}, [r8], r0
        SUB     r12, r12, r2
        VST1.32 {d28-d31}, [r6], r0
        SUB     r3, r3, r2
        BHI     1b

        ADD     sp, sp, 4               // discard saved kc (r2)
        POP     {r4, r5, r6, r7, r8, r9, r10, r11}
        VPOP    {d8-d15}
        BX      lr

        .p2align 3
5:
        // Is there a remainder?- 2 floats of A (8 bytes)
        TST     r5, 8
        BEQ     6f

        // Remainder - 2 floats of A (8 bytes)
        VLD1.32 {d0}, [r3]!             // A0
        VLDM    r9!, {d8-d11}           // B0
        VLD1.32 {d1}, [r12]!            // A1
        VLD1.32 {d2}, [r10]!            // A2
        VLD1.32 {d3}, [r7]!             // A3

        VMLA.F32 q8, q4, d0[0]
        VMLA.F32 q9, q5, d0[0]
        VMLA.F32 q10, q4, d1[0]
        VMLA.F32 q11, q5, d1[0]
        VLDM    r9!, {d12-d15}          // B1
        VMLA.F32 q12, q4, d2[0]
        VMLA.F32 q13, q5, d2[0]
        VMLA.F32 q14, q4, d3[0]
        VMLA.F32 q15, q5, d3[0]
        VMLA.F32 q8, q6, d0[1]
        VMLA.F32 q9, q7, d0[1]
        VMLA.F32 q10, q6, d1[1]
        VMLA.F32 q11, q7, d1[1]
        VMLA.F32 q12, q6, d2[1]
        VMLA.F32 q13, q7, d2[1]
        VMLA.F32 q14, q6, d3[1]
        VMLA.F32 q15, q7, d3[1]

        // Is there a remainder?- 1 floats of A (4 bytes)
        TST     r5, 4
        BEQ     4b

6:
        // Remainder- 1 floats of A (4 bytes)
        VLDM    r3!, {s0}               // A0
        VLDM    r9!, {d8-d11}           // B0
        VLDM    r12!, {s2}              // A1
        VLDM    r10!, {s4}              // A2
        VLDM    r7!, {s6}               // A3
        VMLA.F32 q8, q4, d0[0]
        VMLA.F32 q9, q5, d0[0]
        VMLA.F32 q10, q4, d1[0]
        VMLA.F32 q11, q5, d1[0]
        VMLA.F32 q12, q4, d2[0]
        VMLA.F32 q13, q5, d2[0]
        VMLA.F32 q14, q4, d3[0]
        VMLA.F32 q15, q5, d3[0]
        B       4b

        // Store odd width
        // r1 holds nc - 8 here; its low 3 bits equal those of the remaining
        // column count, so bits 2, 1 and 0 select 4-, 2- and 1-column stores.
10:
        TST     r1, 4
        BEQ     11f
        VST1.32 {d16-d17}, [r11]!
        VMOV    q8, q9
        VST1.32 {d20-d21}, [r4]!
        VMOV    q10, q11
        VST1.32 {d24-d25}, [r8]!
        VMOV    q12, q13
        VST1.32 {d28-d29}, [r6]!
        VMOV    q14, q15

11:
        TST     r1, 2
        BEQ     12f
        VST1.32 {d16}, [r11]!
        VMOV    d16, d17
        VST1.32 {d20}, [r4]!
        VMOV    d20, d21
        VST1.32 {d24}, [r8]!
        VMOV    d24, d25
        VST1.32 {d28}, [r6]!
        VMOV    d28, d29

12:
        TST     r1, 1
        BEQ     13f
        VST1.32 {d16[0]}, [r11]
        VST1.32 {d20[0]}, [r4]
        VST1.32 {d24[0]}, [r8]
        VST1.32 {d28[0]}, [r6]

13:
        ADD     sp, sp, 4               // discard saved kc (r2)
        POP     {r4, r5, r6, r7, r8, r9, r10, r11}
        VPOP    {d8-d15}
        BX      lr

END_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53

// Mark the stack as non-executable on ELF targets.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif