// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
#include <xnnpack/assembly.h>

// Use the ARM unified assembler syntax for the rest of the file.
.syntax unified
9
// void xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_${"pld_" if PREFETCH else ""}ld64(
//     size_t mr,                            r0
//     size_t nc,                            r1
//     size_t kc,                            r2 -> sp + 68 -> (r5)
//     size_t ks,                            r3 -> sp + 72 -> r14
//     const float**restrict a,  sp + 112 -> r2
//     const void*restrict w,    sp + 116 -> r9
//     uint8_t*restrict c,       sp + 120 -> r11
//     size_t cm_stride,         sp + 124 -> (r6)
//     size_t cn_stride,         sp + 128 -> (r7)
//     size_t a_offset,          sp + 132 -> (r5)
//     const float* zero,        sp + 136 -> (r7)
//     minmax_params*params,     sp + 140 -> (r5)
//
// Computes up to a 4 x 8 tile of float outputs per pass over nc:
//   - the 8 accumulators q8-q15 are seeded from 8 floats read at w (bias),
//   - for every group of 4 row pointers read from a, products of A and the
//     packed B values that follow in w are accumulated with VMLA.F32,
//   - the result is clamped with VMAX.F32 / VMIN.F32 and stored to c.
// Registers shown in parentheses are temporaries that are reused for other
// values later in the function.
//
// Units as used by the code below: kc is counted down in bytes of A per row
// (8 per main-loop iteration), ks is counted down in bytes of the pointer
// array (16 per group of 4 pointers), nc is counted down in output columns.
// NOTE(review): the kc handling assumes kc is a non-zero multiple of 4 bytes;
// this is not checked here - confirm against the caller.
//
// Stack frame after the prologue (112 bytes below the incoming sp):
//   sp +   0 .. 63   saved d8-d15
//   sp +  64         padding
//   sp +  68         saved r2 (kc)
//   sp +  72         saved r3 (ks)
//   sp +  76 .. 107  saved r4-r11
//   sp + 108         saved r14 (return address)
//   sp + 112 ..      stack-passed arguments listed above

// inner loop registers

// A0   r3  d0
// A1  r12  d1
// A2  r10  d2
// A3   r0  d3

// B    r9  d8,  d9, d10, d11
// B       d12, d13, d14, d15

// C0  r11 d16-d17  q8  d18-d19  q9
// C1   r4 d20-d21 q10  d22-d23 q11
// C2   r8 d24-d25 q12  d26-d27 q13
// C3   r6 d28-d29 q14  d30-d31 q15

// Clamp (r5) d4 d5 d6 d7

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_${"pld_" if PREFETCH else ""}ld64
        .arm
#ifndef __APPLE__
        .arch       armv7-a
        .fpu        neon
#endif
        # Push 112 bytes
        # r2 (kc) and r3 (ks) are spilled so they can be reloaded from the stack:
        # kc once per group of A pointers, ks once per 8 columns of output.
        PUSH        {r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r14}  // +44
        SUB         sp, sp, 4                                        // 4
        VPUSH       {d8-d15}                                         // +64 = 112

        LDR         r11, [sp, 120]          // c
        LDR         r6, [sp, 124]           // cm_stride
        LDR         r2, [sp, 112]           // a
        LDR         r9, [sp, 116]           // w
        LDR         r5, [sp, 140]           // params
        MOV         r14, r3                 // p = ks

        # Clamp C pointers
        // Rows at index >= mr alias the previous row pointer, so the stores
        // below never address memory past row mr - 1.
        CMP         r0, 2                   // if mr >= 2
        ADD         r4, r11, r6             //   c1 = c0 + cm_stride
        MOVLO       r4, r11                 // else c1 = c0
                                            // if mr > 2 (flags still from CMP r0, 2)
        ADD         r8, r4, r6              //   c2 = c1 + cm_stride
        MOVLS       r8, r4                  // else c2 = c1
        CMP         r0, 4                   // if mr >= 4
        ADD         r6, r8, r6              //   c3 = c2 + cm_stride
        MOVLO       r6, r8                  // else c3 = c2

        # Load min/max values
        // Each load broadcasts one float to all four lanes of a q register.
        // q2 is the operand of VMAX (lower bound), q3 of VMIN (upper bound).
        VLD1.32     {d4[], d5[]}, [r5]!     // q2 = first float at params
        VLD1.32     {d6[], d7[]}, [r5]      // q3 = second float at params

0:
        # Load initial bias from w into accumulators
        VLDM        r9!, {d16-d19}          // Bias -> row 0 (q8, q9)
        VMOV        q10, q8                 // rows 1-3 start from the same bias
        VMOV        q11, q9
        VMOV        q12, q8
        VMOV        q13, q9
        VMOV        q14, q8
        VMOV        q15, q9

        $if PREFETCH:
          PLD         [r9,   0]             // Prefetch B
          PLD         [r9,  64]
          PLD         [r9, 128]
          PLD         [r9, 192]
          PLD         [r9, 256]
          PLD         [r9, 320]
          PLD         [r9, 384]
          PLD         [r9, 448]
1:
        # Load next 4 A pointers
        LDR         r3, [r2, 0]
        LDR         r12, [r2, 4]
        LDR         r10, [r2, 8]
        LDR         r0, [r2, 12]
        ADD         r2, r2, 16

        # Add a_offset
        // A pointer equal to zero is used as is; every other pointer is
        // advanced by a_offset.  The sum is computed unconditionally and
        // replaced by zero afterwards when the compare matched.
        LDR         r5, [sp, 132]           // a_offset
        LDR         r7, [sp, 136]           // zero
        CMP         r3, r7                  // if a0 == zero
        ADD         r3, r3, r5              // a0 += a_offset
        MOVEQ       r3, r7                  //   a0 = zero, else a0 + a_offset
        CMP         r12, r7                 // if a1 == zero
        ADD         r12, r12, r5            // a1 += a_offset
        MOVEQ       r12, r7                 //   a1 = zero, else a1 + a_offset
        CMP         r10, r7                 // if a2 == zero
        ADD         r10, r10, r5            // a2 += a_offset
        MOVEQ       r10, r7                 //   a2 = zero, else a2 + a_offset
        CMP         r0, r7                  // if a3 == zero
        ADD         r0, r0, r5              // a3 += a_offset
        LDR         r5, [sp, 68]            // kc (reload; LDR leaves the flags alone)
        MOVEQ       r0, r7                  //   a3 = zero, else a3 + a_offset

        $if PREFETCH:
          PLD         [r3,  0]              // Prefetch A
          PLD         [r3, 64]
          PLD         [r12,  0]
          PLD         [r12, 64]
          PLD         [r10,  0]
          PLD         [r10, 64]
          PLD         [r0,  0]
          PLD         [r0, 64]

        SUBS        r5, r5, 8               // kc - 8
        BLO         4f                      // less than 2 channels?

        # Main loop - 2 floats of A (8 bytes)
        // Per iteration: 2 floats from each of the 4 A rows and 16 floats of
        // B (q4-q7) produce two rank-1 updates of the 4 x 8 accumulator tile.
2:
        VLD1.32     {d0}, [r3]!             // A0
        VLDM        r9!, {d8-d11}           // B0
        VLD1.32     {d1}, [r12]!            // A1
        VLD1.32     {d2}, [r10]!            // A2
        VLD1.32     {d3}, [ r0]!            // A3
        VLDM        r9!, {d12-d15}          // B1

        VMLA.F32    q8, q4, d0[0]
        VMLA.F32    q9, q5, d0[0]
        $if PREFETCH:
          PLD         [r3, 128]             // Prefetch A0
        VMLA.F32    q10, q4, d1[0]
        VMLA.F32    q11, q5, d1[0]
        $if PREFETCH:
          PLD         [r12, 128]            // Prefetch A1
        VMLA.F32    q12, q4, d2[0]
        VMLA.F32    q13, q5, d2[0]
        $if PREFETCH:
          PLD         [r10, 128]            // Prefetch A2
        VMLA.F32    q14, q4, d3[0]
        VMLA.F32    q15, q5, d3[0]
        $if PREFETCH:
          PLD         [r0, 128]             // Prefetch A3
        VMLA.F32    q8, q6, d0[1]
        VMLA.F32    q9, q7, d0[1]
        $if PREFETCH:
          PLD         [r9, 448]             // Prefetch B
        VMLA.F32    q10, q6, d1[1]
        VMLA.F32    q11, q7, d1[1]
        SUBS        r5, r5, 8               // flags consumed by BHS below
        VMLA.F32    q12, q6, d2[1]
        VMLA.F32    q13, q7, d2[1]
        VMLA.F32    q14, q6, d3[1]
        VMLA.F32    q15, q7, d3[1]
        BHS         2b

        # Is there a remainder? - 1 float of A (4 bytes)
        // r5 is now (bytes left - 8); bit 2 is set exactly when 4 bytes remain.
        TST         r5, 4
        BNE         4f

3:
        # ks loop
        SUBS        r14, r14, 16            // ks -= MR * sizeof(void*)
        BHI         1b

        LDR         r7, [sp, 128]           // cn_stride
        LDR         r14, [sp, 72]           // p = ks

        # Clamp
        VMAX.F32    q8, q8, q2
        SUBS        r1, r1, 8               // nc -= 8; flags consumed by BLO / BHI below
        VMAX.F32    q9, q9, q2
        VMAX.F32    q10, q10, q2
        VMAX.F32    q11, q11, q2
        VMAX.F32    q12, q12, q2
        VMAX.F32    q13, q13, q2
        VMAX.F32    q14, q14, q2
        VMAX.F32    q15, q15, q2
        VMIN.F32    q8, q8, q3
        VMIN.F32    q9, q9, q3
        VMIN.F32    q10, q10, q3
        VMIN.F32    q11, q11, q3
        VMIN.F32    q12, q12, q3
        VMIN.F32    q13, q13, q3
        VMIN.F32    q14, q14, q3
        VMIN.F32    q15, q15, q3

        # Store full 4 x 8
        // Fewer than 8 columns left -> partial store at 5.  Otherwise store
        // one 8-float row per C pointer and advance each by cn_stride.
        BLO         5f
        VST1.32     {d28-d31}, [r6], r7
        VST1.32     {d24-d27}, [r8], r7
        VST1.32     {d20-d23}, [r4], r7
        VST1.32     {d16-d19}, [r11], r7
        SUB         r2, r2, r14             // a -= ks
        BHI         0b                      // more columns remain (flags from SUBS r1)

        VPOP        {d8-d15}
        ADD         sp, sp, 12              // skip pad, r2, r3
        POP         {r4, r5, r6, r7, r8, r9, r10, r11, pc}

4:
        # Remainder - 1 float of A (4 bytes)
        // s0, s2, s4, s6 are lane 0 of d0, d1, d2, d3 respectively.
        VLDM        r3!, {s0}               // A0
        VLDM        r9!, {d8-d11}           // B0
        VLDM        r12!, {s2}              // A1
        VLDM        r10!, {s4}              // A2
        VLDM        r0!, {s6}               // A3
        VMLA.F32    q8, q4, d0[0]
        VMLA.F32    q9, q5, d0[0]
        VMLA.F32    q10, q4, d1[0]
        VMLA.F32    q11, q5, d1[0]
        VMLA.F32    q12, q4, d2[0]
        VMLA.F32    q13, q5, d2[0]
        VMLA.F32    q14, q4, d3[0]
        VMLA.F32    q15, q5, d3[0]
        B           3b

        # Store odd width
        // r1 holds (nc - 8) here; its low three bits equal those of nc, so
        // bits 2, 1 and 0 select a 4-, 2- and 1-float store in turn.  After
        // each store the remaining values are moved down into the low lanes.
5:
        TST         r1, 4
        BEQ         6f
        VST1.32     {d28-d29}, [r6]!
        VMOV        q14, q15
        VST1.32     {d24-d25}, [r8]!
        VMOV        q12, q13
        VST1.32     {d20-d21}, [r4]!
        VMOV        q10, q11
        VST1.32     {d16-d17}, [r11]!
        VMOV        q8, q9

6:
        TST         r1, 2
        BEQ         7f
        VST1.32     {d28}, [r6]!
        VMOV        d28, d29
        VST1.32     {d24}, [r8]!
        VMOV        d24, d25
        VST1.32     {d20}, [r4]!
        VMOV        d20, d21
        VST1.32     {d16}, [r11]!
        VMOV        d16, d17

7:
        TST         r1, 1
        BEQ         8f
        VST1.32     {d28[0]}, [r6]!
        VST1.32     {d24[0]}, [r8]!
        VST1.32     {d20[0]}, [r4]!
        VST1.32     {d16[0]}, [r11]!

8:
        VPOP        {d8-d15}
        ADD         sp, sp, 12              // skip pad, r2, r3
        POP         {r4, r5, r6, r7, r8, r9, r10, r11, pc}

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_${"pld_" if PREFETCH else ""}ld64
Frank Barcharddc38f072020-02-10 13:21:42 -0800270
// On ELF targets, emit an empty .note.GNU-stack section so the object is
// marked as not requiring an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif