// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch32-neon-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

.syntax unified

// void xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75(
//     size_t mr,                            r0
//     size_t nc,                            r1
//     size_t kc,                            r2 -> r5
//     const uint8_t*restrict a,             r3
//     size_t a_stride,          sp + 96  -> (r7)
//     const void*restrict w,    sp + 100 -> r9
//     uint8_t*restrict c,       sp + 104 -> r11
//     size_t cm_stride,         sp + 108 -> (r6)
//     size_t cn_stride,         sp + 112 -> r7
//     const union xnn_f32_minmax_params params[restrict static 1])  sp + 116 -> (r5)
//
// Stack offsets are relative to sp after the 96-byte register save below.
// Registers in parentheses hold the argument only temporarily.
//
// Overview (as implemented below):
//   * Up to 4 rows of A are multiplied against packed weights w, 8 output
//     floats per row per pass. Rows >= mr reuse the previous row's A and C
//     pointers, so the kernel always works on 4 rows.
//   * For every group of 8 columns the accumulators are seeded from 8 floats
//     read from w (the bias), then updated with VMLA over kc bytes of A.
//     kc is consumed in bytes: 16 bytes (4 floats) per main-loop step, with
//     8-byte and 4-byte remainders.
//   * Results are clamped with VMAX against the first float of params and
//     VMIN against the second, then stored. A final group narrower than 8
//     columns is stored as 4 / 2 / 1 floats according to the bits of nc.

// inner loop registers

// A0   r3  d0
// A1  r12  d1
// A2  r10  d2
// A3   r0  d3

// B    r9  d8,  d9, d10, d11
// B       d12, d13, d14, d15

// C0  r11 d16-d17  q8  d18-d19  q9
// C1   r4 d20-d21 q10  d22-d23 q11
// C2   r8 d24-d25 q12  d26-d27 q13
// C3   r6 d28-d29 q14  d30-d31 q15

// Clamp (r5) d4 d5 d6 d7

BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75
        .arm
#ifndef __APPLE__
        .arch   armv7-a
        .fpu    neon
#endif
        // Push 96 bytes. r4-r11 and d8-d15 are all used below and restored
        // before every return.
        PUSH    {r4, r5, r6, r7, r8, r9, r10, r11}     // 32
        VPUSH   {d8-d15}                               // +64 = 96

        LDR     r7, [sp, 96]            // a_stride
        LDR     r6, [sp, 108]           // cm_stride
        LDR     r11, [sp, 104]          // c
        LDR     r9, [sp, 100]           // w

        // Clamp A and C pointers.
        // The ADDs do not update flags, so each MOVxx pair still sees the
        // result of the preceding CMP.
        CMP     r0, 2                   // if mr >= 2
        ADD     r12, r3, r7             //   a1 = a0 + a_stride
        ADD     r4, r11, r6             //   c1 = c0 + cm_stride
        MOVLO   r12, r3                 // else a1 = a0
        MOVLO   r4, r11                 // else c1 = c0
                                        // if mr > 2 (flags from CMP r0, 2)
        ADD     r10, r12, r7            //   a2 = a1 + a_stride
        ADD     r8, r4, r6              //   c2 = c1 + cm_stride
        MOVLS   r10, r12                // else a2 = a1
        MOVLS   r8, r4                  // else c2 = c1

        CMP     r0, 4                   // if mr >= 4
        ADD     r0, r10, r7             //   a3 = a2 + a_stride (mr is dead from here)
        ADD     r6, r8, r6              //   c3 = c2 + cm_stride (cm_stride is dead from here)
        MOVLO   r0, r10                 // else a3 = a2
        MOVLO   r6, r8                  // else c3 = c2

        LDR     r7, [sp, 112]           // cn_stride (replaces a_stride in r7)

        .p2align 3
1:
        // Outer loop: one pass per group of 8 output columns.
        // Load initial bias from w into accumulators
        VLDM    r9!, {d16-d19}          // Bias -> row 0 (q8, q9)
        SUBS    r5, r2, 16              // r5 = kc - 16; flags consumed by BLO below
        VMOV    q10, q8                 // replicate bias into rows 1-3
        VMOV    q11, q9
        VMOV    q12, q8
        VMOV    q13, q9
        VMOV    q14, q8
        VMOV    q15, q9

        BLO     5f                      // kc < 16 bytes (fewer than 4 floats)? remainder only

        // Prologue: preload the first 2 floats of each A row and the first B0
        VLD1.32 {d0}, [r3]!             // A0
        VLDM    r9!, {d8-d11}           // B0
        VLD1.32 {d1}, [r12]!            // A1
        VLD1.32 {d2}, [r10]!            // A2
        VLD1.32 {d3}, [r0]!             // A3

        SUBS    r5, r5, 16
        BLO     3f                      // only one 4-float block left? skip main loop

        .p2align 3

        // Main loop - 4 floats of A (16 bytes)
        // First half consumes d0-d3 while loading d4-d7; second half consumes
        // d4-d7 while reloading d0-d3 and B0 for the next iteration/epilogue.
2:
        VMLA.F32 q8, q4, d0[0]
        VLDM    r9!, {d12-d15}          // B1
        VMLA.F32 q10, q4, d1[0]
        VMLA.F32 q12, q4, d2[0]
        VLD1.32 {d4}, [r3]!             // A0
        VMLA.F32 q14, q4, d3[0]
        VMLA.F32 q9, q5, d0[0]
        VLD1.32 {d5}, [r12]!            // A1
        VMLA.F32 q11, q5, d1[0]
        VMLA.F32 q13, q5, d2[0]
        VMLA.F32 q15, q5, d3[0]
        VLD1.32 {d6}, [r10]!            // A2
        VMLA.F32 q8, q6, d0[1]
        VMLA.F32 q10, q6, d1[1]
        VLD1.32 {d7}, [r0]!             // A3
        VMLA.F32 q12, q6, d2[1]
        VMLA.F32 q14, q6, d3[1]
        VLDM    r9!, {d8-d11}           // B0
        VMLA.F32 q9, q7, d0[1]
        VMLA.F32 q11, q7, d1[1]
        VMLA.F32 q13, q7, d2[1]
        VMLA.F32 q15, q7, d3[1]

        VMLA.F32 q8, q4, d4[0]
        VLDM    r9!, {d12-d15}          // B1
        VMLA.F32 q10, q4, d5[0]
        VMLA.F32 q12, q4, d6[0]
        VLD1.32 {d0}, [r3]!             // A0
        VMLA.F32 q14, q4, d7[0]
        VMLA.F32 q9, q5, d4[0]
        VLD1.32 {d1}, [r12]!            // A1
        VMLA.F32 q11, q5, d5[0]
        VMLA.F32 q13, q5, d6[0]
        VLD1.32 {d2}, [r10]!            // A2
        VMLA.F32 q15, q5, d7[0]
        VMLA.F32 q8, q6, d4[1]
        VLD1.32 {d3}, [r0]!             // A3
        VMLA.F32 q10, q6, d5[1]
        VMLA.F32 q12, q6, d6[1]
        VMLA.F32 q14, q6, d7[1]
        VLDM    r9!, {d8-d11}           // B0
        VMLA.F32 q9, q7, d4[1]
        VMLA.F32 q11, q7, d5[1]
        SUBS    r5, r5, 16              // flags consumed by BHS below
        VMLA.F32 q13, q7, d6[1]
        VMLA.F32 q15, q7, d7[1]
        BHS     2b

        // Epilogue: last full 4-float block. Same arithmetic as the main
        // loop, but the second half does not preload A or B0 again.
3:
        VMLA.F32 q8, q4, d0[0]
        VLDM    r9!, {d12-d15}          // B1
        VMLA.F32 q10, q4, d1[0]
        VMLA.F32 q12, q4, d2[0]
        VLD1.32 {d4}, [r3]!             // A0
        VMLA.F32 q14, q4, d3[0]
        VMLA.F32 q9, q5, d0[0]
        VLD1.32 {d5}, [r12]!            // A1
        VMLA.F32 q11, q5, d1[0]
        VMLA.F32 q13, q5, d2[0]
        VMLA.F32 q15, q5, d3[0]
        VLD1.32 {d6}, [r10]!            // A2
        VMLA.F32 q8, q6, d0[1]
        VMLA.F32 q10, q6, d1[1]
        VLD1.32 {d7}, [r0]!             // A3
        VMLA.F32 q12, q6, d2[1]
        VMLA.F32 q14, q6, d3[1]
        VLDM    r9!, {d8-d11}           // B0
        VMLA.F32 q9, q7, d0[1]
        VMLA.F32 q11, q7, d1[1]
        VMLA.F32 q13, q7, d2[1]
        VMLA.F32 q15, q7, d3[1]

        VMLA.F32 q8, q4, d4[0]
        VLDM    r9!, {d12-d15}          // B1
        VMLA.F32 q10, q4, d5[0]
        VMLA.F32 q12, q4, d6[0]
        VMLA.F32 q14, q4, d7[0]
        VMLA.F32 q9, q5, d4[0]
        VMLA.F32 q11, q5, d5[0]
        VMLA.F32 q13, q5, d6[0]
        VMLA.F32 q15, q5, d7[0]
        VMLA.F32 q8, q6, d4[1]
        VMLA.F32 q10, q6, d5[1]
        VMLA.F32 q12, q6, d6[1]
        VMLA.F32 q14, q6, d7[1]
        VMLA.F32 q9, q7, d4[1]
        VMLA.F32 q11, q7, d5[1]
        // Only multiples of 16 were subtracted from r5, so its low 4 bits
        // still equal kc & 15.
        TST     r5, 15
        VMLA.F32 q13, q7, d6[1]
        VMLA.F32 q15, q7, d7[1]

        // Is there a remainder?- 1 to 3 floats of A (4, 8 or 12 bytes)
        BNE     5f

        .p2align 3
4:
        // Load params pointer
        LDR     r5, [sp, 116]           // params

        // Load min/max values: first float replicated into q2, second into q3.
        // SUBS sets the flags used by BLO 10f and BHI 1b further down; none of
        // the instructions in between write the condition flags.
        VLD1.32 {d4[],d5[]}, [r5]!
        SUBS    r1, r1, 8               // nc -= 8
        VLD1.32 {d6[],d7[]}, [r5]

        // Clamp
        VMAX.F32 q8, q8, q2
        VMAX.F32 q9, q9, q2
        VMAX.F32 q10, q10, q2
        VMAX.F32 q11, q11, q2
        VMAX.F32 q12, q12, q2
        VMAX.F32 q13, q13, q2
        VMAX.F32 q14, q14, q2
        VMAX.F32 q15, q15, q2
        VMIN.F32 q8, q8, q3
        VMIN.F32 q9, q9, q3
        VMIN.F32 q10, q10, q3
        VMIN.F32 q11, q11, q3
        VMIN.F32 q12, q12, q3
        VMIN.F32 q13, q13, q3
        VMIN.F32 q14, q14, q3
        VMIN.F32 q15, q15, q3

        // Store full 4 x 8. Each C pointer advances by cn_stride and each A
        // pointer is rewound by kc so the next column group rereads A.
        BLO     10f                     // fewer than 8 columns were left
        VST1.32 {d16-d19}, [r11], r7
        SUB     r0, r0, r2
        VST1.32 {d20-d23}, [r4], r7
        SUB     r10, r10, r2
        VST1.32 {d24-d27}, [r8], r7
        SUB     r12, r12, r2
        VST1.32 {d28-d31}, [r6], r7
        SUB     r3, r3, r2
        BHI     1b                      // more columns remain

        VPOP    {d8-d15}
        POP     {r4, r5, r6, r7, r8, r9, r10, r11}
        BX      lr

        .p2align 3
5:
        // Is there a remainder?- 2 floats of A (8 bytes)
        TST     r5, 8
        BEQ     6f

        // Remainder - 2 floats of A (8 bytes)
        VLD1.32 {d0}, [r3]!             // A0
        VLDM    r9!, {d8-d11}           // B0
        VLD1.32 {d1}, [r12]!            // A1
        VLD1.32 {d2}, [r10]!            // A2
        VLD1.32 {d3}, [r0]!             // A3

        VMLA.F32 q8, q4, d0[0]
        VMLA.F32 q9, q5, d0[0]
        VMLA.F32 q10, q4, d1[0]
        VMLA.F32 q11, q5, d1[0]
        VLDM    r9!, {d12-d15}          // B1
        VMLA.F32 q12, q4, d2[0]
        VMLA.F32 q13, q5, d2[0]
        VMLA.F32 q14, q4, d3[0]
        VMLA.F32 q15, q5, d3[0]
        VMLA.F32 q8, q6, d0[1]
        VMLA.F32 q9, q7, d0[1]
        VMLA.F32 q10, q6, d1[1]
        VMLA.F32 q11, q7, d1[1]
        VMLA.F32 q12, q6, d2[1]
        VMLA.F32 q13, q7, d2[1]
        VMLA.F32 q14, q6, d3[1]
        VMLA.F32 q15, q7, d3[1]

        // Is there a remainder?- 1 floats of A (4 bytes)
        TST     r5, 4
        BEQ     4b

6:
        // Remainder- 1 floats of A (4 bytes)
        // NOTE(review): when reached through BEQ 6f this path runs without
        // testing bit 2 of kc, so it appears to rely on kc being a non-zero
        // multiple of 4 bytes - confirm against callers.
        VLDM    r3!, {s0}               // A0 -> d0[0]
        VLDM    r9!, {d8-d11}           // B0
        VLDM    r12!, {s2}              // A1 -> d1[0]
        VLDM    r10!, {s4}              // A2 -> d2[0]
        VLDM    r0!, {s6}               // A3 -> d3[0]
        VMLA.F32 q8, q4, d0[0]
        VMLA.F32 q9, q5, d0[0]
        VMLA.F32 q10, q4, d1[0]
        VMLA.F32 q11, q5, d1[0]
        VMLA.F32 q12, q4, d2[0]
        VMLA.F32 q13, q5, d2[0]
        VMLA.F32 q14, q4, d3[0]
        VMLA.F32 q15, q5, d3[0]
        B       4b

        // Store odd width.
        // r1 holds nc - 8 here; bits 2, 1 and 0 select a 4-, 2- and 1-float
        // store per row. After each partial store the remaining values are
        // moved down into the low registers for the next step.
10:
        TST     r1, 4
        BEQ     11f
        VST1.32 {d16-d17}, [r11]!
        VMOV    q8, q9
        VST1.32 {d20-d21}, [r4]!
        VMOV    q10, q11
        VST1.32 {d24-d25}, [r8]!
        VMOV    q12, q13
        VST1.32 {d28-d29}, [r6]!
        VMOV    q14, q15

11:
        TST     r1, 2
        BEQ     12f
        VST1.32 {d16}, [r11]!
        VMOV    d16, d17
        VST1.32 {d20}, [r4]!
        VMOV    d20, d21
        VST1.32 {d24}, [r8]!
        VMOV    d24, d25
        VST1.32 {d28}, [r6]!
        VMOV    d28, d29

12:
        TST     r1, 1
        BEQ     13f
        VST1.32 {d16[0]}, [r11]
        VST1.32 {d20[0]}, [r4]
        VST1.32 {d24[0]}, [r8]
        VST1.32 {d28[0]}, [r6]

13:
        VPOP    {d8-d15}
        POP     {r4, r5, r6, r7, r8, r9, r10, r11}
        BX      lr

END_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif