// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert MR % 4 == 0
$assert NR in [1, 2, 4]
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/spmm.h>


Marat Dukhan | 355ab43 | 2020-04-09 19:01:52 -0700 | [diff] [blame] | 16 | void xnn_f32_spmm_minmax_ukernel_${MR}x${NR}__${"neonfma" if FMA else "neon"}( |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 17 | size_t mc, |
| 18 | size_t nc, |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 19 | const float*restrict input, |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 20 | const float*restrict weights, |
| 21 | const int32_t*restrict widx_dmap, |
| 22 | const uint32_t*restrict nidx_nnzmap, |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 23 | float*restrict output, |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 24 | size_t output_stride, |
Marat Dukhan | f196d01 | 2020-04-15 11:50:03 -0700 | [diff] [blame] | 25 | const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 26 | { |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 27 | assert(mc != 0); |
| 28 | assert(mc % sizeof(float) == 0); |
| 29 | assert(nc != 0); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 30 | |
Frank Barchard | fcfdc0e | 2019-10-21 15:58:42 -0700 | [diff] [blame] | 31 | const float32x4_t vmin = vld1q_dup_f32(¶ms->scalar.min); |
| 32 | const float32x4_t vmax = vld1q_dup_f32(¶ms->scalar.max); |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 33 | size_t output_decrement = output_stride * nc - ${MR} * sizeof(float); |
| 34 | while XNN_LIKELY(mc >= ${MR} * sizeof(float)) { |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 35 | const float*restrict w = weights; |
| 36 | const int32_t* dmap = widx_dmap; |
| 37 | const uint32_t* nnzmap = nidx_nnzmap; |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 38 | size_t n = nc; |
| 39 | while (n >= ${NR}) { |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 40 | uint32_t nnz = *nnzmap++; |
| 41 | $for N in range(0, NR, 1): |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 42 | float32x4_t vacc${ABC[0:4]}n${N} = vld1q_dup_f32(w); w += 1; |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 43 | $for M in range(4, MR, 4): |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 44 | float32x4_t vacc${ABC[M:M+4]}n${N} = vacc${ABC[0:4]}n${N}; |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 45 | if XNN_LIKELY(nnz != 0) { |
| 46 | do { |
| 47 | const intptr_t diff = *dmap++; |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 48 | const float32x4_t vi${ABC[0:4]} = vld1q_f32(input); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 49 | $for M in range(4, MR, 4): |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 50 | const float32x4_t vi${ABC[M:M+4]} = vld1q_f32(input + ${M}); |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 51 | input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff); |
Frank Barchard | 2da0de8 | 2020-11-09 22:42:11 -0800 | [diff] [blame] | 52 | $for M in range(0, MR, 16): |
| 53 | __builtin_prefetch(input + ${M+16}); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 54 | $if NR == 1: |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 55 | const float32x4_t vw = vld1q_dup_f32(w); w += 1; |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 56 | $elif NR == 2: |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 57 | const float32x2_t vw = vld1_f32(w); w += 2; |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 58 | $elif NR == 4: |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 59 | const float32x4_t vw = vld1q_f32(w); w += 4; |
Frank Barchard | 2da0de8 | 2020-11-09 22:42:11 -0800 | [diff] [blame] | 60 | __builtin_prefetch(w + 32); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 61 | $if NR == 1: |
| 62 | $for M in range(0, MR, 4): |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 63 | vacc${ABC[M:M+4]}c0 = vfmaq_f32(vacc${ABC[M:M+4]}c0, vi${ABC[M:M+4]}, vw); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 64 | $else: |
| 65 | $for N in range(NR): |
| 66 | $for M in range(0, MR, 4): |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 67 | vacc${ABC[M:M+4]}n${N} = vfmaq_lane${"q" if NR == 4 else ""}_f32(vacc${ABC[M:M+4]}n${N}, vi${ABC[M:M+4]}, vw, ${N}); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 68 | } while (--nnz != 0); |
| 69 | } |
| 70 | $for N in range(0, NR, 1): |
| 71 | $for M in range(0, MR, 4): |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 72 | float32x4_t vout${ABC[M:M+4]}n${N} = vminq_f32(vacc${ABC[M:M+4]}n${N}, vmax); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 73 | |
| 74 | $for N in range(0, NR, 1): |
| 75 | $for M in range(0, MR, 4): |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 76 | vout${ABC[M:M+4]}n${N} = vmaxq_f32(vout${ABC[M:M+4]}n${N}, vmin); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 77 | |
| 78 | $for N in range(0, NR, 1): |
| 79 | $for M in range(0, MR, 4): |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 80 | vst1q_f32(output + ${M}, vout${ABC[M:M+4]}n${N}); |
| 81 | output = (float*restrict) ((uintptr_t) output + output_stride); |
| 82 | n -= ${NR}; |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 83 | } |
| 84 | |
| 85 | // clean up loop, fall back to nr=1 |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 86 | if XNN_UNLIKELY(n != 0) { |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 87 | do { |
| 88 | uint32_t nnz = *nnzmap++; |
| 89 | float32x4_t vacc${ABC[0:4]} = vld1q_dup_f32(w); w += 1; |
| 90 | $for M in range(4, MR, 4): |
| 91 | float32x4_t vacc${ABC[M:M+4]} = vacc${ABC[0:4]}; |
| 92 | if XNN_LIKELY(nnz != 0) { |
| 93 | do { |
| 94 | const intptr_t diff = *dmap++; |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 95 | const float32x4_t vi${ABC[0:4]} = vld1q_f32(input); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 96 | $for M in range(4, MR, 4): |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 97 | const float32x4_t vi${ABC[M:M+4]} = vld1q_f32(input + ${M}); |
| 98 | input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff); |
Frank Barchard | 2da0de8 | 2020-11-09 22:42:11 -0800 | [diff] [blame] | 99 | $for M in range(0, MR, 16): |
| 100 | __builtin_prefetch(input + ${M+16}); |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 101 | const float32x4_t vw = vld1q_dup_f32(w); w += 1; |
Frank Barchard | 2da0de8 | 2020-11-09 22:42:11 -0800 | [diff] [blame] | 102 | __builtin_prefetch(w + 32); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 103 | $for M in range(0, MR, 4): |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 104 | vacc${ABC[M:M+4]} = vfmaq_f32(vacc${ABC[M:M+4]}, vi${ABC[M:M+4]}, vw); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 105 | } while (--nnz != 0); |
| 106 | } |
| 107 | $for M in range(0, MR, 4): |
Frank Barchard | fcfdc0e | 2019-10-21 15:58:42 -0700 | [diff] [blame] | 108 | float32x4_t vout${ABC[M:M+4]} = vminq_f32(vacc${ABC[M:M+4]}, vmax); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 109 | |
| 110 | $for M in range(0, MR, 4): |
Frank Barchard | fcfdc0e | 2019-10-21 15:58:42 -0700 | [diff] [blame] | 111 | vout${ABC[M:M+4]} = vmaxq_f32(vout${ABC[M:M+4]}, vmin); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 112 | |
| 113 | $for M in range(0, MR, 4): |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 114 | vst1q_f32(output + ${M}, vout${ABC[M:M+4]}); |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 115 | output = (float*restrict) ((uintptr_t) output + output_stride); |
| 116 | n -= 1; |
| 117 | } while (n != 0); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 118 | } |
Marat Dukhan | e278a55 | 2020-11-14 16:14:58 -0800 | [diff] [blame] | 119 | output = (float*restrict) ((uintptr_t) output - output_decrement); |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 120 | input += ${MR}; |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 121 | mc -= ${MR} * sizeof(float); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 122 | } |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 123 | if XNN_UNLIKELY(mc != 0) { |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 124 | $for LOG2M in reversed(range((MR - 1).bit_length())): |
| 125 | $SUBMR = 1 << LOG2M |
Marat Dukhan | e278a55 | 2020-11-14 16:14:58 -0800 | [diff] [blame] | 126 | $if SUBMR * 2 >= MR: |
| 127 | output_decrement += ${MR - SUBMR} * sizeof(float); |
| 128 | $else: |
| 129 | output_decrement += ${SUBMR} * sizeof(float); |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 130 | if (mc & (${SUBMR} * sizeof(float))) { |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 131 | const float*restrict w = weights; |
| 132 | const int32_t* dmap = widx_dmap; |
| 133 | const uint32_t* nnzmap = nidx_nnzmap; |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 134 | size_t n = nc; |
| 135 | while (n >= ${NR}) { |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 136 | uint32_t nnz = *nnzmap++; |
| 137 | $for N in range(0, NR, 1): |
| 138 | $if SUBMR < 4: |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 139 | float32x2_t vacc${ABC[0:SUBMR]}n${N} = vld1_dup_f32(w); w += 1; |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 140 | $else: |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 141 | float32x4_t vacc${ABC[0:4]}n${N} = vld1q_dup_f32(w); w += 1; |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 142 | $for M in range(4, SUBMR, 4): |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 143 | float32x4_t vacc${ABC[M:M+4]}n${N} = vacc${ABC[0:4]}n${N}; |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 144 | if XNN_LIKELY(nnz != 0) { |
| 145 | do { |
| 146 | const intptr_t diff = *dmap++; |
| 147 | $if SUBMR == 1: |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 148 | const float32x2_t vi${ABC[0]} = vld1_dup_f32(input); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 149 | $elif SUBMR == 2: |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 150 | const float32x2_t vi${ABC[0:2]} = vld1_f32(input); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 151 | $else: |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 152 | const float32x4_t vi${ABC[0:4]} = vld1q_f32(input); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 153 | $for M in range(4, SUBMR, 4): |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 154 | const float32x4_t vi${ABC[M:M+4]} = vld1q_f32(input + ${M}); |
| 155 | input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 156 | $if NR == 1: |
| 157 | $if SUBMR < 4: |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 158 | const float32x2_t vw = vld1_dup_f32(w); w += 1; |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 159 | $else: |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 160 | const float32x4_t vw = vld1q_dup_f32(w); w += 1; |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 161 | $elif NR == 2: |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 162 | const float32x2_t vw = vld1_f32(w); w += 2; |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 163 | $elif NR == 4: |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 164 | const float32x4_t vw = vld1q_f32(w); w += 4; |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 165 | |
| 166 | $if NR == 1: |
| 167 | $if SUBMR < 4: |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 168 | vacc${ABC[0:SUBMR]}c0 = vfmaq_f32(vacc${ABC[0:SUBMR]}c0, vi${ABC[0:SUBMR]}, vw); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 169 | $else: |
| 170 | $for M in range(0, SUBMR, 4): |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 171 | vacc${ABC[M:M+4]}c0 = vfmaq_f32(vacc${ABC[M:M+4]}c0, vi${ABC[M:M+4]}, vw); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 172 | $else: |
| 173 | $for N in range(NR): |
| 174 | $if SUBMR < 4: |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 175 | vacc${ABC[0:SUBMR]}n${N} = vfma_lane${"q" if NR == 4 else ""}_f32(vacc${ABC[0:SUBMR]}n${N}, vi${ABC[0:SUBMR]}, vw, ${N}); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 176 | $else: |
| 177 | $for M in range(0, SUBMR, 4): |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 178 | vacc${ABC[M:M+4]}n${N} = vfmaq_lane${"q" if NR == 4 else ""}_f32(vacc${ABC[M:M+4]}n${N}, vi${ABC[M:M+4]}, vw, ${N}); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 179 | } while (--nnz != 0); |
| 180 | } |
| 181 | $for N in range(0, NR, 1): |
| 182 | $if SUBMR < 4: |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 183 | float32x2_t vout${ABC[0:SUBMR]}n${N} = vmin_f32(vacc${ABC[0:SUBMR]}n${N}, vget_low_f32(vmax)); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 184 | $else: |
| 185 | $for M in range(0, SUBMR, 4): |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 186 | float32x4_t vout${ABC[M:M+4]}n${N} = vminq_f32(vacc${ABC[M:M+4]}n${N}, vmax); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 187 | |
| 188 | $for N in range(0, NR, 1): |
| 189 | $if SUBMR < 4: |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 190 | vout${ABC[0:SUBMR]}n${N} = vmax_f32(vout${ABC[0:SUBMR]}n${N}, vget_low_f32(vmin)); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 191 | $else: |
| 192 | $for M in range(0, SUBMR, 4): |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 193 | vout${ABC[M:M+4]}n${N} = vmaxq_f32(vout${ABC[M:M+4]}n${N}, vmin); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 194 | |
Marat Dukhan | e278a55 | 2020-11-14 16:14:58 -0800 | [diff] [blame] | 195 | $for N in range(NR): |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 196 | $if SUBMR == 1: |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 197 | vst1_lane_f32(output + ${M}, vout${ABC[0:SUBMR]}n${N}, 0); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 198 | $elif SUBMR == 2: |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 199 | vst1_f32(output + ${M}, vout${ABC[0:SUBMR]}n${N}); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 200 | $else: |
| 201 | $for M in range(0, SUBMR, 4): |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 202 | vst1q_f32(output + ${M}, vout${ABC[M:M+4]}n${N}); |
| 203 | output = (float*restrict) ((uintptr_t) output + output_stride); |
| 204 | n -= ${NR}; |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 205 | } |
| 206 | |
| 207 | // clean up loop, fall back to nr=1 |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 208 | if XNN_UNLIKELY(n != 0) { |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 209 | do { |
| 210 | uint32_t nnz = *nnzmap++; |
| 211 | $if SUBMR < 4: |
| 212 | float32x2_t vacc${ABC[0:SUBMR]} = vld1_dup_f32(w); w += 1; |
| 213 | $else: |
| 214 | float32x4_t vacc${ABC[0:4]} = vld1q_dup_f32(w); w += 1; |
| 215 | $for M in range(4, SUBMR, 4): |
| 216 | float32x4_t vacc${ABC[M:M+4]} = vacc${ABC[0:4]}; |
| 217 | if XNN_LIKELY(nnz != 0) { |
| 218 | do { |
| 219 | const intptr_t diff = *dmap++; |
| 220 | $if SUBMR == 1: |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 221 | const float32x2_t vi${ABC[0:1]} = vld1_dup_f32(input); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 222 | $elif SUBMR == 2: |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 223 | const float32x2_t vi${ABC[0:2]} = vld1_f32(input); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 224 | $else: |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 225 | const float32x4_t vi${ABC[0:4]} = vld1q_f32(input); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 226 | $for M in range(4, SUBMR, 4): |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 227 | const float32x4_t vi${ABC[M:M+4]} = vld1q_f32(input + ${M}); |
| 228 | input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 229 | $if SUBMR < 4: |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 230 | const float32x2_t vw = vld1_dup_f32(w); w += 1; |
| 231 | vacc${ABC[0:SUBMR]} = vfma_f32(vacc${ABC[0:SUBMR]}, vi${ABC[0:SUBMR]}, vw); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 232 | $else: |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 233 | const float32x4_t vw = vld1q_dup_f32(w); w += 1; |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 234 | $for M in range(0, SUBMR, 4): |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 235 | vacc${ABC[M:M+4]} = vfmaq_f32(vacc${ABC[M:M+4]}, vi${ABC[M:M+4]}, vw); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 236 | } while (--nnz != 0); |
| 237 | } |
| 238 | $if SUBMR < 4: |
Frank Barchard | fcfdc0e | 2019-10-21 15:58:42 -0700 | [diff] [blame] | 239 | float32x2_t vout${ABC[0:SUBMR]} = vmin_f32(vacc${ABC[0:SUBMR]}, vget_low_f32(vmax)); |
| 240 | vout${ABC[0:SUBMR]} = vmax_f32(vout${ABC[0:SUBMR]}, vget_low_f32(vmin)); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 241 | $else: |
| 242 | $for M in range(0, SUBMR, 4): |
Frank Barchard | fcfdc0e | 2019-10-21 15:58:42 -0700 | [diff] [blame] | 243 | float32x4_t vout${ABC[M:M+4]} = vminq_f32(vacc${ABC[M:M+4]}, vmax); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 244 | |
| 245 | $for M in range(0, SUBMR, 4): |
Frank Barchard | fcfdc0e | 2019-10-21 15:58:42 -0700 | [diff] [blame] | 246 | vout${ABC[M:M+4]} = vmaxq_f32(vout${ABC[M:M+4]}, vmin); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 247 | |
| 248 | $if SUBMR == 1: |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 249 | vst1_lane_f32(output, vout${ABC[0:1]}, 1); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 250 | $elif SUBMR == 2: |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 251 | vst1_f32(output, vout${ABC[0:2]}); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 252 | $else: |
| 253 | $for M in range(0, SUBMR, 4): |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 254 | vst1q_f32(output + ${M}, vout${ABC[M:M+4]}); |
Marat Dukhan | e8bfcc8 | 2020-11-16 12:28:13 -0800 | [diff] [blame] | 255 | output = (float*restrict) ((uintptr_t) output + output_stride); |
| 256 | n -= 1; |
| 257 | } while (n != 0); |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 258 | } |
Marat Dukhan | e278a55 | 2020-11-14 16:14:58 -0800 | [diff] [blame] | 259 | output = (float*restrict) ((uintptr_t) output - output_decrement); |
Marat Dukhan | 1530116 | 2020-07-23 21:30:50 -0700 | [diff] [blame] | 260 | input += ${SUBMR}; |
XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 261 | } |
| 262 | } |
| 263 | } |