blob: b6d53ed24d92307f06bfd18f1b18d1902d108b19 [file] [log] [blame]
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
6$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
7$assert NR % 8 == 0
8$assert 8 <= NR <= 16
Frank Barchard22fbe772021-07-20 15:56:32 -07009$assert REQUANTIZATION in ["FP32", "GEMMLOWP", "RNDNU"]
Marat Dukhan69c8a292021-07-14 19:34:56 -070010$assert DATATYPE in ["QC8", "QS8", "QU8"]
Marat Dukhanf6f62092021-07-14 16:41:39 -070011$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"
Marat Dukhan40bbafe2020-08-04 02:04:22 -070012#include <assert.h>
13
14#include <arm_neon.h>
15
16#include <xnnpack/common.h>
17#include <xnnpack/igemm.h>
Marat Dukhane76478b2021-06-28 16:35:40 -070018$if REQUANTIZATION == "FP32" and ARMV8:
19 #include <xnnpack/intrinsics-polyfill.h>
Marat Dukhan40bbafe2020-08-04 02:04:22 -070020
21
Marat Dukhan69c8a292021-07-14 19:34:56 -070022$PARAMS_UNION = "xnn_qs8_minmax_params" if DATATYPE == "QC8" else "xnn_%s_conv_minmax_params" % DATATYPE.lower()
Marat Dukhanf6f62092021-07-14 16:41:39 -070023$PARAMS_STRUCT = ("" if DATATYPE == "QC8" else REQUANTIZATION.lower() + "_") + ("neonv8" if ARMV8 and DATATYPE != "QC8" else "neon")
24$if REQUANTIZATION == "FP32" and DATATYPE == "QC8" and not ARMV8:
Marat Dukhane76478b2021-06-28 16:35:40 -070025 $PARAMS_STRUCT = "neon_fp32"
Marat Dukhan69c8a292021-07-14 19:34:56 -070026$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
27$XINT8X8_T = "uint8x8_t" if DATATYPE == "QU8" else "int8x8_t"
28$XINT8X16_T = "uint8x16_t" if DATATYPE == "QU8" else "int8x16_t"
29$VGET_LOW_X8 = "vget_low_u8" if DATATYPE == "QU8" else "vget_low_s8"
30$VGET_HIGH_X8 = "vget_high_u8" if DATATYPE == "QU8" else "vget_high_s8"
31$VCOMBINE_X8 = "vcombine_u8" if DATATYPE == "QU8" else "vcombine_s8"
32$VREINTERPRET_U32_X8 = "vreinterpret_u32_u8" if DATATYPE == "QU8" else "vreinterpret_u32_s8"
33$VREINTERPRETQ_U32_X8 = "vreinterpretq_u32_u8" if DATATYPE == "QU8" else "vreinterpretq_u32_s8"
34$VREINTERPRET_U16_X8 = "vreinterpret_u16_u8" if DATATYPE == "QU8" else "vreinterpret_u16_s8"
35$VREINTERPRETQ_U16_X8 = "vreinterpretq_u16_u8" if DATATYPE == "QU8" else "vreinterpretq_u16_s8"
36$VREINTERPRETQ_X8_S16 = "vreinterpretq_u8_s16" if DATATYPE == "QU8" else "vreinterpretq_s8_s16"
37$VLD1_X8 = "vld1_u8" if DATATYPE == "QU8" else "vld1_s8"
38$VLD1_DUP_X8 = "vld1_dup_u8" if DATATYPE == "QU8" else "vld1_dup_s8"
39$VLD1Q_DUP_X8 = "vld1q_dup_u8" if DATATYPE == "QU8" else "vld1q_dup_s8"
40$VST1_X8 = "vst1_u8" if DATATYPE == "QU8" else "vst1_s8"
41$VST1Q_X8 = "vst1q_u8" if DATATYPE == "QU8" else "vst1q_s8"
42$VST1_LANE_X8 = "vst1_lane_u8" if DATATYPE == "QU8" else "vst1_lane_s8"
43$VST1Q_LANE_X8 = "vst1q_lane_u8" if DATATYPE == "QU8" else "vst1q_lane_s8"
44$VMIN_X8 = "vmin_u8" if DATATYPE == "QU8" else "vmin_s8"
45$VMAX_X8 = "vmax_u8" if DATATYPE == "QU8" else "vmax_s8"
46$VMINQ_X8 = "vminq_u8" if DATATYPE == "QU8" else "vminq_s8"
47$VMAXQ_X8 = "vmaxq_u8" if DATATYPE == "QU8" else "vmaxq_s8"
48$VEXT_X8 = "vext_u8" if DATATYPE == "QU8" else "vext_s8"
49$VEXTQ_X8 = "vextq_u8" if DATATYPE == "QU8" else "vextq_s8"
50$VQMOVXN_S16 = "vqmovun_s16" if DATATYPE == "QU8" else "vqmovn_s16"
51$VQMOVXN_HIGH_S16 = "vqmovun_high_s16" if DATATYPE == "QU8" else "vqmovn_high_s16"
52$VMOVN_X16 = "vmovn_u16" if DATATYPE == "QU8" else "vmovn_s16"
53$VUZP1Q_X8 = "vuzp1q_u8" if DATATYPE == "QU8" else "vuzp1q_s8"
Marat Dukhancf055852021-06-26 09:05:09 -070054$ISA = "neonv8" if ARMV8 else "neon"
Marat Dukhanf6f62092021-07-14 16:41:39 -070055void xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x${NR}__${ISA}_mlal_lane${"_prfm" if PREFETCH else ""}(
Marat Dukhan40bbafe2020-08-04 02:04:22 -070056 size_t mr,
57 size_t nc,
58 size_t kc,
59 size_t ks,
Marat Dukhan69c8a292021-07-14 19:34:56 -070060 const ${XINT8_T}** restrict a,
Marat Dukhan40bbafe2020-08-04 02:04:22 -070061 const void* restrict w,
Marat Dukhan69c8a292021-07-14 19:34:56 -070062 ${XINT8_T}* restrict c,
Marat Dukhan40bbafe2020-08-04 02:04:22 -070063 size_t cm_stride,
64 size_t cn_stride,
65 size_t a_offset,
Marat Dukhan69c8a292021-07-14 19:34:56 -070066 const ${XINT8_T}* zero,
Marat Dukhan9ca23332021-07-01 14:30:46 -070067 const union ${PARAMS_UNION} params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
Marat Dukhan40bbafe2020-08-04 02:04:22 -070068{
69 assert(mr != 0);
70 assert(mr <= ${MR});
71 assert(nc != 0);
72 assert(kc != 0);
73 assert(ks != 0);
74 assert(ks % (${MR} * sizeof(void*)) == 0);
Marat Dukhan69c8a292021-07-14 19:34:56 -070075 assert(a_offset % sizeof(${XINT8_T}) == 0);
Marat Dukhan40bbafe2020-08-04 02:04:22 -070076 assert(a != NULL);
77 assert(w != NULL);
78 assert(c != NULL);
79
Marat Dukhan69c8a292021-07-14 19:34:56 -070080 ${XINT8_T}* c0 = c;
Marat Dukhan40bbafe2020-08-04 02:04:22 -070081 $for M in range(1, MR):
Marat Dukhan69c8a292021-07-14 19:34:56 -070082 ${XINT8_T}* c${M} = (${XINT8_T}*) ((uintptr_t) c${M-1} + cm_stride);
Marat Dukhan40bbafe2020-08-04 02:04:22 -070083 $if M % 2 == 0:
84 if XNN_UNPREDICTABLE(mr <= ${M}) {
85 c${M} = c${M-1};
86 }
87 $elif M + 1 == MR:
88 if XNN_UNPREDICTABLE(mr != ${M+1}) {
89 c${M} = c${M-1};
90 }
91 $else:
92 if XNN_UNPREDICTABLE(mr < ${M+1}) {
93 c${M} = c${M-1};
94 }
95
Marat Dukhan69c8a292021-07-14 19:34:56 -070096 $if DATATYPE == "QU8":
Frank Barchard86a16182021-08-02 13:33:43 -070097 const uint8x8_t vb_zero_point = vld1_dup_u8(&params->${PARAMS_STRUCT}.kernel_zero_point[0]);
Marat Dukhan40bbafe2020-08-04 02:04:22 -070098 do {
99 $for N in range(0, NR, 4):
Marat Dukhan69c8a292021-07-14 19:34:56 -0700100 int32x4_t vacc0x${ABC[N:N+4]} = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700101 $for M in range(1, MR):
102 $for N in range(0, NR, 4):
103 int32x4_t vacc${M}x${ABC[N:N+4]} = vacc0x${ABC[N:N+4]};
104
105 size_t p = ks;
106 do {
107 $for M in range(MR):
Marat Dukhan69c8a292021-07-14 19:34:56 -0700108 const ${XINT8_T}* restrict a${M} = a[${M}];
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700109 if XNN_UNPREDICTABLE(a${M} != zero) {
Marat Dukhan69c8a292021-07-14 19:34:56 -0700110 a${M} = (const ${XINT8_T}*) ((uintptr_t) a${M} + a_offset);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700111 }
112 a += ${MR};
113
114 size_t k = kc;
Marat Dukhan69c8a292021-07-14 19:34:56 -0700115 while (k >= 8 * sizeof(${XINT8_T})) {
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700116 $for M in range(MR):
Marat Dukhan69c8a292021-07-14 19:34:56 -0700117 const ${XINT8X8_T} va${M} = ${VLD1_X8}(a${M}); a${M} += 8;
118 $if DATATYPE == "QU8":
119 const int16x8_t vxa${M} = vreinterpretq_s16_u16(vmovl_u8(va${M}));
120 $else:
121 const int16x8_t vxa${M} = vmovl_s8(va${M});
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700122
123 $for K in range(4):
124 $for N in range(0, NR, 8):
Marat Dukhan69c8a292021-07-14 19:34:56 -0700125 const ${XINT8X8_T} vb${ABC[N:N+8]}c${K} = ${VLD1_X8}(w); w = (const void*) ((const ${XINT8_T}*) w + 8);
126 $if DATATYPE == "QU8":
127 const int16x8_t vxb${ABC[N:N+8]}c${K} = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[N:N+8]}c${K}, vb_zero_point));
128 $else:
129 const int16x8_t vxb${ABC[N:N+8]}c${K} = vmovl_s8(vb${ABC[N:N+8]}c${K});
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700130
131 $for M in range(MR):
132 vacc${M}x${ABC[N:N+4]} = vmlal_lane_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vxb${ABC[N:N+8]}c${K}), vget_low_s16(vxa${M}), ${K});
133 vacc${M}x${ABC[N+4:N+8]} = vmlal_lane_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vxb${ABC[N:N+8]}c${K}), vget_low_s16(vxa${M}), ${K});
134
Frank Barchard1f51d382021-04-20 14:07:14 -0700135 $if PREFETCH:
136 $for N in range(0, NR, 8):
Marat Dukhan69c8a292021-07-14 19:34:56 -0700137 __builtin_prefetch((const ${XINT8_T}*) w + ${N * 8 + 480});
Frank Barchard1f51d382021-04-20 14:07:14 -0700138
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700139 $for K in range(4, 8):
140 $for N in range(0, NR, 8):
Marat Dukhan69c8a292021-07-14 19:34:56 -0700141 const ${XINT8X8_T} vb${ABC[N:N+8]}c${K} = ${VLD1_X8}(w); w = (const void*) ((const ${XINT8_T}*) w + 8);
142 $if DATATYPE == "QU8":
143 const int16x8_t vxb${ABC[N:N+8]}c${K} = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[N:N+8]}c${K}, vb_zero_point));
144 $else:
145 const int16x8_t vxb${ABC[N:N+8]}c${K} = vmovl_s8(vb${ABC[N:N+8]}c${K});
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700146
147 $for M in range(MR):
148 vacc${M}x${ABC[N:N+4]} = vmlal_lane_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vxb${ABC[N:N+8]}c${K}), vget_high_s16(vxa${M}), ${K-4});
149 vacc${M}x${ABC[N+4:N+8]} = vmlal_lane_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vxb${ABC[N:N+8]}c${K}), vget_high_s16(vxa${M}), ${K-4});
150
Marat Dukhan69c8a292021-07-14 19:34:56 -0700151 k -= 8 * sizeof(${XINT8_T});
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700152 }
153 if XNN_UNLIKELY(k != 0) {
154 $for M in range(MR):
Marat Dukhan69c8a292021-07-14 19:34:56 -0700155 const ${XINT8X8_T} va${M} = ${VLD1_X8}(a${M}); a${M} = (const ${XINT8_T}*) ((uintptr_t) a${M} + k);
156 $if DATATYPE == "QU8":
157 const int16x8_t vxa${M} = vreinterpretq_s16_u16(vmovl_u8(va${M}));
158 $else:
159 const int16x8_t vxa${M} = vmovl_s8(va${M});
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700160
161 $for N in range(0, NR, 8):
Marat Dukhan69c8a292021-07-14 19:34:56 -0700162 const ${XINT8X8_T} vb${ABC[N:N+8]}c0 = ${VLD1_X8}(w); w = (const void*) ((const ${XINT8_T}*) w + 8);
163 $if DATATYPE == "QU8":
164 const int16x8_t vxb${ABC[N:N+8]}c0 = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[N:N+8]}c0, vb_zero_point));
165 $else:
166 const int16x8_t vxb${ABC[N:N+8]}c0 = vmovl_s8(vb${ABC[N:N+8]}c0);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700167
168 $for M in range(MR):
169 $for N in range(0, NR, 8):
170 vacc${M}x${ABC[N:N+4]} = vmlal_lane_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vxb${ABC[N:N+8]}c0), vget_low_s16(vxa${M}), 0);
171 vacc${M}x${ABC[N+4:N+8]} = vmlal_lane_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vxb${ABC[N:N+8]}c0), vget_low_s16(vxa${M}), 0);
172
Marat Dukhan69c8a292021-07-14 19:34:56 -0700173 if (k >= 2 * sizeof(${XINT8_T})) {
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700174 $for N in range(0, NR, 8):
Marat Dukhan69c8a292021-07-14 19:34:56 -0700175 const ${XINT8X8_T} vb${ABC[N:N+8]}c1 = ${VLD1_X8}(w); w = (const void*) ((const ${XINT8_T}*) w + 8);
176 $if DATATYPE == "QU8":
177 const int16x8_t vxb${ABC[N:N+8]}c1 = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[N:N+8]}c1, vb_zero_point));
178 $else:
179 const int16x8_t vxb${ABC[N:N+8]}c1 = vmovl_s8(vb${ABC[N:N+8]}c1);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700180
181 $for M in range(MR):
182 $for N in range(0, NR, 8):
183 vacc${M}x${ABC[N:N+4]} = vmlal_lane_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vxb${ABC[N:N+8]}c1), vget_low_s16(vxa${M}), 1);
184 vacc${M}x${ABC[N+4:N+8]} = vmlal_lane_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vxb${ABC[N:N+8]}c1), vget_low_s16(vxa${M}), 1);
185
Marat Dukhan69c8a292021-07-14 19:34:56 -0700186 if (k > 2 * sizeof(${XINT8_T})) {
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700187 $for N in range(0, NR, 8):
Marat Dukhan69c8a292021-07-14 19:34:56 -0700188 const ${XINT8X8_T} vb${ABC[N:N+8]}c2 = ${VLD1_X8}(w); w = (const void*) ((const ${XINT8_T}*) w + 8);
189 $if DATATYPE == "QU8":
190 const int16x8_t vxb${ABC[N:N+8]}c2 = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[N:N+8]}c2, vb_zero_point));
191 $else:
192 const int16x8_t vxb${ABC[N:N+8]}c2 = vmovl_s8(vb${ABC[N:N+8]}c2);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700193
194 $for M in range(MR):
195 $for N in range(0, NR, 8):
196 vacc${M}x${ABC[N:N+4]} = vmlal_lane_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vxb${ABC[N:N+8]}c2), vget_low_s16(vxa${M}), 2);
197 vacc${M}x${ABC[N+4:N+8]} = vmlal_lane_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vxb${ABC[N:N+8]}c2), vget_low_s16(vxa${M}), 2);
198
Marat Dukhan69c8a292021-07-14 19:34:56 -0700199 if (k >= 4 * sizeof(${XINT8_T})) {
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700200 $for N in range(0, NR, 8):
Marat Dukhan69c8a292021-07-14 19:34:56 -0700201 const ${XINT8X8_T} vb${ABC[N:N+8]}c3 = ${VLD1_X8}(w); w = (const void*) ((const ${XINT8_T}*) w + 8);
202 $if DATATYPE == "QU8":
203 const int16x8_t vxb${ABC[N:N+8]}c3 = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[N:N+8]}c3, vb_zero_point));
204 $else:
205 const int16x8_t vxb${ABC[N:N+8]}c3 = vmovl_s8(vb${ABC[N:N+8]}c3);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700206
207 $for M in range(MR):
208 $for N in range(0, NR, 8):
209 vacc${M}x${ABC[N:N+4]} = vmlal_lane_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vxb${ABC[N:N+8]}c3), vget_low_s16(vxa${M}), 3);
210 vacc${M}x${ABC[N+4:N+8]} = vmlal_lane_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vxb${ABC[N:N+8]}c3), vget_low_s16(vxa${M}), 3);
211
Marat Dukhan69c8a292021-07-14 19:34:56 -0700212 if (k > 4 * sizeof(${XINT8_T})) {
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700213 $for N in range(0, NR, 8):
Marat Dukhan69c8a292021-07-14 19:34:56 -0700214 const ${XINT8X8_T} vb${ABC[N:N+8]}c4 = ${VLD1_X8}(w); w = (const void*) ((const ${XINT8_T}*) w + 8);
215 $if DATATYPE == "QU8":
216 const int16x8_t vxb${ABC[N:N+8]}c4 = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[N:N+8]}c4, vb_zero_point));
217 $else:
218 const int16x8_t vxb${ABC[N:N+8]}c4 = vmovl_s8(vb${ABC[N:N+8]}c4);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700219
220 $for M in range(MR):
221 $for N in range(0, NR, 8):
222 vacc${M}x${ABC[N:N+4]} = vmlal_lane_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vxb${ABC[N:N+8]}c4), vget_high_s16(vxa${M}), 0);
223 vacc${M}x${ABC[N+4:N+8]} = vmlal_lane_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vxb${ABC[N:N+8]}c4), vget_high_s16(vxa${M}), 0);
224
Marat Dukhan69c8a292021-07-14 19:34:56 -0700225 if (k >= 6 * sizeof(${XINT8_T})) {
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700226 $for N in range(0, NR, 8):
Marat Dukhan69c8a292021-07-14 19:34:56 -0700227 const ${XINT8X8_T} vb${ABC[N:N+8]}c5 = ${VLD1_X8}(w); w = (const void*) ((const ${XINT8_T}*) w + 8);
228 $if DATATYPE == "QU8":
229 const int16x8_t vxb${ABC[N:N+8]}c5 = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[N:N+8]}c5, vb_zero_point));
230 $else:
231 const int16x8_t vxb${ABC[N:N+8]}c5 = vmovl_s8(vb${ABC[N:N+8]}c5);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700232
233 $for M in range(MR):
234 $for N in range(0, NR, 8):
235 vacc${M}x${ABC[N:N+4]} = vmlal_lane_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vxb${ABC[N:N+8]}c5), vget_high_s16(vxa${M}), 1);
236 vacc${M}x${ABC[N+4:N+8]} = vmlal_lane_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vxb${ABC[N:N+8]}c5), vget_high_s16(vxa${M}), 1);
237
Marat Dukhan69c8a292021-07-14 19:34:56 -0700238 if (k > 6 * sizeof(${XINT8_T})) {
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700239 $for N in range(0, NR, 8):
Marat Dukhan69c8a292021-07-14 19:34:56 -0700240 const ${XINT8X8_T} vb${ABC[N:N+8]}c6 = ${VLD1_X8}(w); w = (const void*) ((const ${XINT8_T}*) w + 8);
241 $if DATATYPE == "QU8":
242 const int16x8_t vxb${ABC[N:N+8]}c6 = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[N:N+8]}c6, vb_zero_point));
243 $else:
244 const int16x8_t vxb${ABC[N:N+8]}c6 = vmovl_s8(vb${ABC[N:N+8]}c6);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700245
246 $for M in range(MR):
247 $for N in range(0, NR, 8):
248 vacc${M}x${ABC[N:N+4]} = vmlal_lane_s16(vacc${M}x${ABC[N:N+4]}, vget_low_s16(vxb${ABC[N:N+8]}c6), vget_high_s16(vxa${M}), 2);
249 vacc${M}x${ABC[N+4:N+8]} = vmlal_lane_s16(vacc${M}x${ABC[N+4:N+8]}, vget_high_s16(vxb${ABC[N:N+8]}c6), vget_high_s16(vxa${M}), 2);
250 }
251 }
252 }
253 }
254 }
255 }
256 }
257 p -= ${MR} * sizeof(void*);
258 } while (p != 0);
259
Frank Barchard22fbe772021-07-20 15:56:32 -0700260 // Post-accumulation work
Marat Dukhancf055852021-06-26 09:05:09 -0700261 $if REQUANTIZATION == "GEMMLOWP":
262 const int32x4_t vmultiplier = vld1q_dup_s32(&params->${PARAMS_STRUCT}.multiplier);
Frank Barchard26e83782021-07-26 15:14:14 -0700263 $for M in range(MR):
264 $for N in range(0, NR, 4):
265 vacc${M}x${ABC[N:N+4]} = vqrdmulhq_s32(vacc${M}x${ABC[N:N+4]}, vmultiplier);
266
Marat Dukhancf055852021-06-26 09:05:09 -0700267 const int32x4_t vright_shift = vld1q_dup_s32(&params->${PARAMS_STRUCT}.right_shift);
268 const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
269 $for M in range(MR):
270 $for N in range(0, NR, 4):
Frank Barchard26e83782021-07-26 15:14:14 -0700271 vacc${M}x${ABC[N:N+4]} = vsraq_n_s32(vacc${M}x${ABC[N:N+4]}, vbicq_s32(vacc${M}x${ABC[N:N+4]}, vzero_shift_mask), 31);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700272
Marat Dukhancf055852021-06-26 09:05:09 -0700273 $for M in range(MR):
274 $for N in range(0, NR, 4):
275 vacc${M}x${ABC[N:N+4]} = vrshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_shift);
Frank Barchard22fbe772021-07-20 15:56:32 -0700276 $elif REQUANTIZATION == "RNDNU":
277 const int32x4_t vright_pre_shift = vld1q_dup_s32(&params->${PARAMS_STRUCT}.right_pre_shift);
278 const int32x4_t vmultiplier = vld1q_dup_s32(&params->${PARAMS_STRUCT}.multiplier);
279 const int32x4_t vright_post_shift = vld1q_dup_s32(&params->${PARAMS_STRUCT}.right_post_shift);
280
281 $for M in range(MR):
282 $for N in range(0, NR, 4):
283 vacc${M}x${ABC[N:N+4]} = vshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_pre_shift);
284
285 $for M in range(MR):
286 $for N in range(0, NR, 4):
287 vacc${M}x${ABC[N:N+4]} = vqdmulhq_s32(vacc${M}x${ABC[N:N+4]}, vmultiplier);
288
289 $for M in range(MR):
290 $for N in range(0, NR, 4):
291 vacc${M}x${ABC[N:N+4]} = vrshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_post_shift);
Marat Dukhancf055852021-06-26 09:05:09 -0700292 $elif REQUANTIZATION == "FP32":
293 $for M in range(MR):
294 $for N in range(0, NR, 4):
295 float32x4_t vfpacc${M}x${ABC[N:N+4]} = vcvtq_f32_s32(vacc${M}x${ABC[N:N+4]});
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700296
Marat Dukhanf6f62092021-07-14 16:41:39 -0700297 $if DATATYPE == "QC8":
Marat Dukhancf055852021-06-26 09:05:09 -0700298 $for N in range(0, NR, 4):
Marat Dukhane76478b2021-06-28 16:35:40 -0700299 const float32x4_t vscale${ABC[N:N+4]} = vld1q_f32((const float*) w); w = (const void*) ((const float*) w + 4);
300 $for M in range(MR):
301 vfpacc${M}x${ABC[N:N+4]} = vmulq_f32(vfpacc${M}x${ABC[N:N+4]}, vscale${ABC[N:N+4]});
302 $else:
303 const float32x4_t vscale = vld1q_dup_f32(&params->${PARAMS_STRUCT}.scale);
304 $for M in range(MR):
305 $for N in range(0, NR, 4):
306 vfpacc${M}x${ABC[N:N+4]} = vmulq_f32(vfpacc${M}x${ABC[N:N+4]}, vscale);
Marat Dukhancf055852021-06-26 09:05:09 -0700307
308 $if ARMV8:
309 $for M in range(MR):
310 $for N in range(0, NR, 4):
311 vacc${M}x${ABC[N:N+4]} = vcvtnq_s32_f32(vfpacc${M}x${ABC[N:N+4]});
312 $else:
313 const float32x4_t voutput_min_less_zero_point = vld1q_dup_f32(&params->${PARAMS_STRUCT}.output_min_less_zero_point);
314 $for M in range(MR):
315 $for N in range(0, NR, 4):
316 vfpacc${M}x${ABC[N:N+4]} = vmaxq_f32(vfpacc${M}x${ABC[N:N+4]}, voutput_min_less_zero_point);
317
318 const float32x4_t voutput_max_less_zero_point = vld1q_dup_f32(&params->${PARAMS_STRUCT}.output_max_less_zero_point);
319 $for M in range(MR):
320 $for N in range(0, NR, 4):
321 vfpacc${M}x${ABC[N:N+4]} = vminq_f32(vfpacc${M}x${ABC[N:N+4]}, voutput_max_less_zero_point);
322
323 const float32x4_t vmagic_bias = vld1q_dup_f32(&params->${PARAMS_STRUCT}.magic_bias);
324 $for M in range(MR):
325 $for N in range(0, NR, 4):
326 vacc${M}x${ABC[N:N+4]} = vreinterpretq_s32_f32(vaddq_f32(vfpacc${M}x${ABC[N:N+4]}, vmagic_bias));
327
328 const int32x4_t vmagic_bias_less_zero_point = vld1q_dup_s32(&params->${PARAMS_STRUCT}.magic_bias_less_zero_point);
329 $for M in range(MR):
330 $for N in range(0, NR, 4):
331 vacc${M}x${ABC[N:N+4]} = vsubq_s32(vacc${M}x${ABC[N:N+4]}, vmagic_bias_less_zero_point);
332
333 $if REQUANTIZATION != "FP32" or ARMV8:
334 const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->${PARAMS_STRUCT}.output_zero_point);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700335#if XNN_ARCH_ARM64
Marat Dukhancf055852021-06-26 09:05:09 -0700336 $if REQUANTIZATION == "FP32" and not ARMV8:
337 $for M in range(MR):
338 $for N in range(0, NR, 8):
Marat Dukhanaef90912021-06-28 09:40:17 -0700339 const int16x8_t vacc${M}x${ABC[N:N+8]} = vuzp1q_s16(vreinterpretq_s16_s32(vacc${M}x${ABC[N:N+4]}), vreinterpretq_s16_s32(vacc${M}x${ABC[N+4:N+8]}));
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700340
Marat Dukhancf055852021-06-26 09:05:09 -0700341 $for M in range(MR):
342 $for N in range(0, NR, 16):
343 $if N + 8 < NR:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700344 ${XINT8X16_T} vout${M}x${ABC[N:N+16]} = ${VUZP1Q_X8}(${VREINTERPRETQ_X8_S16}(vacc${M}x${ABC[N:N+8]}), ${VREINTERPRETQ_X8_S16}(vacc${M}x${ABC[N+8:N+16]}));
Marat Dukhancf055852021-06-26 09:05:09 -0700345 $elif M % 2 == 1:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700346 ${XINT8X16_T} vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = ${VUZP1Q_X8}(${VREINTERPRETQ_X8_S16}(vacc${M-1}x${ABC[N:N+8]}), ${VREINTERPRETQ_X8_S16}(vacc${M}x${ABC[N:N+8]}));
Marat Dukhancf055852021-06-26 09:05:09 -0700347 $elif M + 1 == MR:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700348 ${XINT8X8_T} vout${M}x${ABC[N:N+8]} = ${VMOVN_X16}(vacc${M}x${ABC[N:N+8]});
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700349 $else:
Marat Dukhancf055852021-06-26 09:05:09 -0700350 $for M in range(MR):
351 $for N in range(0, NR, 8):
352 const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vacc${M}x${ABC[N+4:N+8]}), voutput_zero_point);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700353
Marat Dukhancf055852021-06-26 09:05:09 -0700354 $for M in range(MR):
355 $for N in range(0, NR, 16):
356 $if N + 8 < NR:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700357 ${XINT8X16_T} vout${M}x${ABC[N:N+16]} = ${VQMOVXN_HIGH_S16}(${VQMOVXN_S16}(vacc${M}x${ABC[N:N+8]}), vacc${M}x${ABC[N+8:N+16]});
Marat Dukhancf055852021-06-26 09:05:09 -0700358 $elif M % 2 == 1:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700359 ${XINT8X16_T} vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = ${VQMOVXN_HIGH_S16}(${VQMOVXN_S16}(vacc${M-1}x${ABC[N:N+8]}), vacc${M}x${ABC[N:N+8]});
Marat Dukhancf055852021-06-26 09:05:09 -0700360 $elif M + 1 == MR:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700361 ${XINT8X8_T} vout${M}x${ABC[N:N+8]} = ${VQMOVXN_S16}(vacc${M}x${ABC[N:N+8]});
Marat Dukhancf055852021-06-26 09:05:09 -0700362#else
363 $if REQUANTIZATION == "FP32" and not ARMV8:
364 $for M in range(MR):
365 $for N in range(0, NR, 8):
366 const int16x8_t vacc${M}x${ABC[N:N+8]} = vcombine_s16(vmovn_s32(vacc${M}x${ABC[N:N+4]}), vmovn_s32(vacc${M}x${ABC[N+4:N+8]}));
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700367
Marat Dukhan69c8a292021-07-14 19:34:56 -0700368 $if DATATYPE == "QU8":
369 $for M in range(MR):
370 $for N in range(0, NR, 16):
371 $if N + 8 < NR:
372 uint8x16_t vout${M}x${ABC[N:N+16]} = vreinterpretq_u8_s8(vcombine_s8(vmovn_s16(vacc${M}x${ABC[N:N+8]}), vmovn_s16(vacc${M}x${ABC[N+8:N+16]})));
373 $elif M % 2 == 1:
374 uint8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vreinterpretq_u8_s8(vcombine_s8(vmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vmovn_s16(vacc${M}x${ABC[N:N+8]})));
375 $elif M + 1 == MR:
376 uint8x8_t vout${M}x${ABC[N:N+8]} = vreinterpret_u8_s8(vmovn_s16(vacc${M}x${ABC[N:N+8]}));
377 $else:
378 $for M in range(MR):
379 $for N in range(0, NR, 16):
380 $if N + 8 < NR:
381 int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vmovn_s16(vacc${M}x${ABC[N:N+8]}), vmovn_s16(vacc${M}x${ABC[N+8:N+16]}));
382 $elif M % 2 == 1:
383 int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vmovn_s16(vacc${M}x${ABC[N:N+8]}));
384 $elif M + 1 == MR:
385 int8x8_t vout${M}x${ABC[N:N+8]} = vmovn_s16(vacc${M}x${ABC[N:N+8]});
Marat Dukhancf055852021-06-26 09:05:09 -0700386 $else:
387 $for M in range(MR):
388 $for N in range(0, NR, 8):
389 const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vqmovn_s32(vacc${M}x${ABC[N+4:N+8]})), voutput_zero_point);
390
391 $for M in range(MR):
392 $for N in range(0, NR, 16):
393 $if N + 8 < NR:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700394 ${XINT8X16_T} vout${M}x${ABC[N:N+16]} = ${VCOMBINE_X8}(${VQMOVXN_S16}(vacc${M}x${ABC[N:N+8]}), ${VQMOVXN_S16}(vacc${M}x${ABC[N+8:N+16]}));
Marat Dukhancf055852021-06-26 09:05:09 -0700395 $elif M % 2 == 1:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700396 ${XINT8X16_T} vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = ${VCOMBINE_X8}(${VQMOVXN_S16}(vacc${M-1}x${ABC[N:N+8]}), ${VQMOVXN_S16}(vacc${M}x${ABC[N:N+8]}));
Marat Dukhancf055852021-06-26 09:05:09 -0700397 $elif M + 1 == MR:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700398 ${XINT8X8_T} vout${M}x${ABC[N:N+8]} = ${VQMOVXN_S16}(vacc${M}x${ABC[N:N+8]});
Marat Dukhancf055852021-06-26 09:05:09 -0700399#endif
400 $if REQUANTIZATION != "FP32" or ARMV8:
401 $if NR == 8 and MR == 1:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700402 const ${XINT8X8_T} voutput_min = ${VLD1_DUP_X8}(&params->${PARAMS_STRUCT}.output_min);
403 const ${XINT8X8_T} voutput_max = ${VLD1_DUP_X8}(&params->${PARAMS_STRUCT}.output_max);
Marat Dukhancf055852021-06-26 09:05:09 -0700404 $else:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700405 const ${XINT8X16_T} voutput_min = ${VLD1Q_DUP_X8}(&params->${PARAMS_STRUCT}.output_min);
406 const ${XINT8X16_T} voutput_max = ${VLD1Q_DUP_X8}(&params->${PARAMS_STRUCT}.output_max);
Marat Dukhancf055852021-06-26 09:05:09 -0700407
Marat Dukhan69c8a292021-07-14 19:34:56 -0700408 $for M in range(MR):
Marat Dukhancf055852021-06-26 09:05:09 -0700409 $for N in range(0, NR, 16):
410 $if N + 8 < NR:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700411 vout${M}x${ABC[N:N+16]} = ${VMAXQ_X8}(vout${M}x${ABC[N:N+16]}, voutput_min);
Marat Dukhancf055852021-06-26 09:05:09 -0700412 $elif M % 2 == 1:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700413 vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = ${VMAXQ_X8}(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_min);
Marat Dukhancf055852021-06-26 09:05:09 -0700414 $elif M + 1 == MR:
415 $if NR == 8 and MR == 1:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700416 vout${M}x${ABC[N:N+8]} = ${VMAX_X8}(vout${M}x${ABC[N:N+8]}, voutput_min);
Marat Dukhancf055852021-06-26 09:05:09 -0700417 $else:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700418 vout${M}x${ABC[N:N+8]} = ${VMAX_X8}(vout${M}x${ABC[N:N+8]}, ${VGET_LOW_X8}(voutput_min));
Marat Dukhancf055852021-06-26 09:05:09 -0700419
Marat Dukhan69c8a292021-07-14 19:34:56 -0700420 $for M in range(MR):
Marat Dukhancf055852021-06-26 09:05:09 -0700421 $for N in range(0, NR, 16):
422 $if N + 8 < NR:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700423 vout${M}x${ABC[N:N+16]} = ${VMINQ_X8}(vout${M}x${ABC[N:N+16]}, voutput_max);
Marat Dukhancf055852021-06-26 09:05:09 -0700424 $elif M % 2 == 1:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700425 vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = ${VMINQ_X8}(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_max);
Marat Dukhancf055852021-06-26 09:05:09 -0700426 $elif M + 1 == MR:
427 $if NR == 8 and MR == 1:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700428 vout${M}x${ABC[N:N+8]} = ${VMIN_X8}(vout${M}x${ABC[N:N+8]}, voutput_max);
Marat Dukhancf055852021-06-26 09:05:09 -0700429 $else:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700430 vout${M}x${ABC[N:N+8]} = ${VMIN_X8}(vout${M}x${ABC[N:N+8]}, ${VGET_LOW_X8}(voutput_max));
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700431
432 if (nc >= ${NR}) {
433 $for M in reversed(range(MR)):
434 $for N in range(0, NR, 16):
435 $if N + 8 < NR:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700436 ${VST1Q_X8}(c${M} + ${N}, vout${M}x${ABC[N:N+16]});
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700437 $elif M % 2 == 1:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700438 ${VST1_X8}(c${M} + ${N}, ${VGET_HIGH_X8}(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
439 ${VST1_X8}(c${M-1} + ${N}, ${VGET_LOW_X8}(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700440 $elif M + 1 == MR:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700441 ${VST1_X8}(c${M} + ${N}, vout${M}x${ABC[N:N+8]});
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700442
443 $for M in reversed(range(MR)):
Marat Dukhan69c8a292021-07-14 19:34:56 -0700444 c${M} = (${XINT8_T}*) ((uintptr_t) c${M} + cn_stride);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700445
Marat Dukhan69c8a292021-07-14 19:34:56 -0700446 a = (const ${XINT8_T}**restrict) ((uintptr_t) a - ks);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700447
448 nc -= ${NR};
449 } else {
450 $if NR == 16:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700451 $for M in reversed(range(MR)):
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700452 $if M % 2 == 1:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700453 ${XINT8X16_T} vout${M-1}x01234567_${M}x01234567 = ${VCOMBINE_X8}(${VGET_LOW_X8}(vout${M-1}x0123456789ABCDEF), ${VGET_LOW_X8}(vout${M}x0123456789ABCDEF));
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700454 $elif M + 1 == MR:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700455 ${XINT8X8_T} vout${M}x01234567 = ${VGET_LOW_X8}(vout${M}x0123456789ABCDEF);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700456 if (nc & 8) {
457 $for M in reversed(range(MR)):
458 $if M % 2 == 1:
Frank Barchard031ff4b2021-09-07 19:08:32 -0700459 ${VST1_X8}(c${M}, ${VGET_HIGH_X8}(vout${M-1}x01234567_${M}x01234567)); c${M} += 8;
460 ${VST1_X8}(c${M-1}, ${VGET_LOW_X8}(vout${M-1}x01234567_${M}x01234567)); c${M-1} += 8;
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700461 $elif M + 1 == MR:
Frank Barchard031ff4b2021-09-07 19:08:32 -0700462 ${VST1_X8}(c${M}, vout${M}x01234567); c${M} += 8;
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700463 $for M in reversed(range(MR)):
464 $if M % 2 == 1:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700465 vout${M-1}x01234567_${M}x01234567 = ${VCOMBINE_X8}(${VGET_HIGH_X8}(vout${M-1}x0123456789ABCDEF), ${VGET_HIGH_X8}(vout${M}x0123456789ABCDEF));
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700466 $elif M + 1 == MR:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700467 vout${M}x01234567 = ${VGET_HIGH_X8}(vout${M}x0123456789ABCDEF);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700468 }
469 if (nc & 4) {
470 $for M in reversed(range(MR)):
471 $if M % 2 == 1:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700472 vst1q_lane_u32(__builtin_assume_aligned(c${M}, 1), ${VREINTERPRETQ_U32_X8}(vout${M-1}x01234567_${M}x01234567), 2); c${M} += 4;
473 vst1q_lane_u32(__builtin_assume_aligned(c${M-1}, 1), ${VREINTERPRETQ_U32_X8}(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 4;
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700474 $elif M + 1 == MR:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700475 vst1_lane_u32(__builtin_assume_aligned(c${M}, 1), ${VREINTERPRET_U32_X8}(vout${M}x01234567), 0); c${M} += 4;
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700476 $for M in reversed(range(MR)):
477 $if M % 2 == 1:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700478 vout${M-1}x01234567_${M}x01234567 = ${VEXTQ_X8}(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 4);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700479 $elif M + 1 == MR:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700480 vout${M}x01234567 = ${VEXT_X8}(vout${M}x01234567, vout${M}x01234567, 4);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700481 }
482 if (nc & 2) {
483 $for M in reversed(range(MR)):
484 $if M % 2 == 1:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700485 vst1q_lane_u16(__builtin_assume_aligned(c${M}, 1), ${VREINTERPRETQ_U16_X8}(vout${M-1}x01234567_${M}x01234567), 4); c${M} += 2;
486 vst1q_lane_u16(__builtin_assume_aligned(c${M-1}, 1), ${VREINTERPRETQ_U16_X8}(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 2;
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700487 $elif M + 1 == MR:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700488 vst1_lane_u16(__builtin_assume_aligned(c${M}, 1), ${VREINTERPRET_U16_X8}(vout${M}x01234567), 0); c${M} += 2;
Frank Barchardd460d0b2021-09-08 11:35:37 -0700489 $for M in reversed(range(MR)):
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700490 $if M % 2 == 1:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700491 vout${M-1}x01234567_${M}x01234567 = ${VEXTQ_X8}(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 2);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700492 $elif M + 1 == MR:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700493 vout${M}x01234567 = ${VEXT_X8}(vout${M}x01234567, vout${M}x01234567, 2);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700494 }
495 if (nc & 1) {
496 $for M in reversed(range(MR)):
497 $if M % 2 == 1:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700498 ${VST1Q_LANE_X8}(c${M}, vout${M-1}x01234567_${M}x01234567, 8);
499 ${VST1Q_LANE_X8}(c${M-1}, vout${M-1}x01234567_${M}x01234567, 0);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700500 $elif M + 1 == MR:
Marat Dukhan69c8a292021-07-14 19:34:56 -0700501 ${VST1_LANE_X8}(c${M}, vout${M}x01234567, 0);
Marat Dukhan40bbafe2020-08-04 02:04:22 -0700502 }
503
504 nc = 0;
505 }
506 } while (nc != 0);
507}