// Auto-generated file. Do not edit!
// Template: src/qs8-dwconv/unipass-wasmsimd-mul16.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <wasm_simd128.h>

#include <xnnpack/dwconv.h>


Marat Dukhanb07c26a2021-05-24 19:44:51 -070017void xnn_qs8_dwconv_minmax_gemmlowp_ukernel_up8x9__wasmsimd_mul16(
Marat Dukhancc8f34c2020-08-05 16:36:38 -070018 size_t channels,
19 size_t output_width,
20 const int8_t** input,
21 const void* weights,
22 int8_t* output,
23 size_t input_stride,
24 size_t output_increment,
25 size_t input_offset,
26 const int8_t* zero,
Marat Dukhane3d17bf2021-05-24 22:22:43 -070027 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
Marat Dukhancc8f34c2020-08-05 16:36:38 -070028{
29 assert(channels != 0);
30 assert(output_width != 0);
31
32 do {
33 const int8_t* i0 = input[0];
34 assert(i0 != NULL);
35 if XNN_UNPREDICTABLE(i0 != zero) {
36 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
37 }
38 const int8_t* i1 = input[1];
39 assert(i1 != NULL);
40 if XNN_UNPREDICTABLE(i1 != zero) {
41 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
42 }
43 const int8_t* i2 = input[2];
44 assert(i2 != NULL);
45 if XNN_UNPREDICTABLE(i2 != zero) {
46 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
47 }
48 const int8_t* i3 = input[3];
49 assert(i3 != NULL);
50 if XNN_UNPREDICTABLE(i3 != zero) {
51 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
52 }
53 const int8_t* i4 = input[4];
54 assert(i4 != NULL);
55 if XNN_UNPREDICTABLE(i4 != zero) {
56 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
57 }
58 const int8_t* i5 = input[5];
59 assert(i5 != NULL);
60 if XNN_UNPREDICTABLE(i5 != zero) {
61 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
62 }
63 const int8_t* i6 = input[6];
64 assert(i6 != NULL);
65 if XNN_UNPREDICTABLE(i6 != zero) {
66 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
67 }
68 const int8_t* i7 = input[7];
69 assert(i7 != NULL);
70 if XNN_UNPREDICTABLE(i7 != zero) {
71 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
72 }
73 const int8_t* i8 = input[8];
74 assert(i8 != NULL);
75 if XNN_UNPREDICTABLE(i8 != zero) {
76 i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
77 }
78 input = (const int8_t**) ((uintptr_t) input + input_stride);
79
80 size_t c = channels;
Marat Dukhan023bcf92020-08-10 12:40:50 -070081 const void* w = weights;
Marat Dukhancc8f34c2020-08-05 16:36:38 -070082 for (; c >= 8; c -= 8) {
83 v128_t vacc0123 = wasm_v128_load(w);
84 v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
85
86
Marat Dukhanee029b22021-06-30 12:47:02 -070087 const v128_t vi0x01234567 = wasm_i16x8_load8x8(i0);
88 const v128_t vk0x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
Marat Dukhancc8f34c2020-08-05 16:36:38 -070089 i0 += 8;
90
91 const v128_t vprod0x01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567);
92
Marat Dukhanee029b22021-06-30 12:47:02 -070093 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod0x01234567));
94 vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod0x01234567));
Marat Dukhancc8f34c2020-08-05 16:36:38 -070095
Marat Dukhanee029b22021-06-30 12:47:02 -070096 const v128_t vi1x01234567 = wasm_i16x8_load8x8(i1);
97 const v128_t vk1x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
Marat Dukhancc8f34c2020-08-05 16:36:38 -070098 i1 += 8;
99
100 const v128_t vprod1x01234567 = wasm_i16x8_mul(vi1x01234567, vk1x01234567);
101
Marat Dukhanee029b22021-06-30 12:47:02 -0700102 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod1x01234567));
103 vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod1x01234567));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700104
Marat Dukhanee029b22021-06-30 12:47:02 -0700105 const v128_t vi2x01234567 = wasm_i16x8_load8x8(i2);
106 const v128_t vk2x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700107 i2 += 8;
108
109 const v128_t vprod2x01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567);
110
Marat Dukhanee029b22021-06-30 12:47:02 -0700111 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod2x01234567));
112 vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod2x01234567));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700113
Marat Dukhanee029b22021-06-30 12:47:02 -0700114 const v128_t vi3x01234567 = wasm_i16x8_load8x8(i3);
115 const v128_t vk3x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700116 i3 += 8;
117
118 const v128_t vprod3x01234567 = wasm_i16x8_mul(vi3x01234567, vk3x01234567);
119
Marat Dukhanee029b22021-06-30 12:47:02 -0700120 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod3x01234567));
121 vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod3x01234567));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700122
Marat Dukhanee029b22021-06-30 12:47:02 -0700123 const v128_t vi4x01234567 = wasm_i16x8_load8x8(i4);
124 const v128_t vk4x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700125 i4 += 8;
126
127 const v128_t vprod4x01234567 = wasm_i16x8_mul(vi4x01234567, vk4x01234567);
128
Marat Dukhanee029b22021-06-30 12:47:02 -0700129 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod4x01234567));
130 vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod4x01234567));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700131
Marat Dukhanee029b22021-06-30 12:47:02 -0700132 const v128_t vi5x01234567 = wasm_i16x8_load8x8(i5);
133 const v128_t vk5x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700134 i5 += 8;
135
136 const v128_t vprod5x01234567 = wasm_i16x8_mul(vi5x01234567, vk5x01234567);
137
Marat Dukhanee029b22021-06-30 12:47:02 -0700138 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod5x01234567));
139 vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod5x01234567));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700140
Marat Dukhanee029b22021-06-30 12:47:02 -0700141 const v128_t vi6x01234567 = wasm_i16x8_load8x8(i6);
142 const v128_t vk6x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700143 i6 += 8;
144
145 const v128_t vprod6x01234567 = wasm_i16x8_mul(vi6x01234567, vk6x01234567);
146
Marat Dukhanee029b22021-06-30 12:47:02 -0700147 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod6x01234567));
148 vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod6x01234567));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700149
Marat Dukhanee029b22021-06-30 12:47:02 -0700150 const v128_t vi7x01234567 = wasm_i16x8_load8x8(i7);
151 const v128_t vk7x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700152 i7 += 8;
153
154 const v128_t vprod7x01234567 = wasm_i16x8_mul(vi7x01234567, vk7x01234567);
155
Marat Dukhanee029b22021-06-30 12:47:02 -0700156 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod7x01234567));
157 vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod7x01234567));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700158
Marat Dukhanee029b22021-06-30 12:47:02 -0700159 const v128_t vi8x01234567 = wasm_i16x8_load8x8(i8);
160 const v128_t vk8x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700161 i8 += 8;
162
163 const v128_t vprod8x01234567 = wasm_i16x8_mul(vi8x01234567, vk8x01234567);
164
Marat Dukhanee029b22021-06-30 12:47:02 -0700165 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod8x01234567));
166 vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod8x01234567));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700167
168 w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 72 * sizeof(int8_t));
169
170 const v128_t vsign0123 = wasm_i32x4_shr(vacc0123, 31);
171 const v128_t vsign4567 = wasm_i32x4_shr(vacc4567, 31);
172
173 const v128_t vacc01 = wasm_v32x4_shuffle(vacc0123, vsign0123, 0, 4, 1, 5);
174 const v128_t vacc23 = wasm_v32x4_shuffle(vacc0123, vsign0123, 2, 6, 3, 7);
175 const v128_t vacc45 = wasm_v32x4_shuffle(vacc4567, vsign4567, 0, 4, 1, 5);
176 const v128_t vacc67 = wasm_v32x4_shuffle(vacc4567, vsign4567, 2, 6, 3, 7);
177
Marat Dukhan9b474cf2021-05-25 16:37:48 -0700178 const v128_t vmultiplier = wasm_v128_load(params->gemmlowp_wasmsimd.multiplier);
179 const v128_t vrounding = wasm_v128_load(params->gemmlowp_wasmsimd.rounding);
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700180 const v128_t vprod01 = wasm_i64x2_add(wasm_i64x2_mul(vacc01, vmultiplier), vrounding);
181 const v128_t vprod23 = wasm_i64x2_add(wasm_i64x2_mul(vacc23, vmultiplier), vrounding);
182 const v128_t vprod45 = wasm_i64x2_add(wasm_i64x2_mul(vacc45, vmultiplier), vrounding);
183 const v128_t vprod67 = wasm_i64x2_add(wasm_i64x2_mul(vacc67, vmultiplier), vrounding);
184
185 const v128_t vq31prod0123 = wasm_v32x4_shuffle(vprod01, vprod23, 1, 3, 5, 7);
186 const v128_t vq31prod4567 = wasm_v32x4_shuffle(vprod45, vprod67, 1, 3, 5, 7);
187
Marat Dukhan9b474cf2021-05-25 16:37:48 -0700188 const v128_t vremainder_mask = wasm_v128_load(params->gemmlowp_wasmsimd.remainder_mask);
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700189 const v128_t vrem0123 = wasm_i32x4_add(wasm_v128_and(vq31prod0123, vremainder_mask), wasm_i32x4_shr(vq31prod0123, 31));
190 const v128_t vrem4567 = wasm_i32x4_add(wasm_v128_and(vq31prod4567, vremainder_mask), wasm_i32x4_shr(vq31prod4567, 31));
191
Marat Dukhan9b474cf2021-05-25 16:37:48 -0700192 const v128_t vthreshold = wasm_v128_load(params->gemmlowp_wasmsimd.remainder_threshold);
193 const int32_t vshift = params->gemmlowp_wasmsimd.shift;
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700194 vacc0123 = wasm_i32x4_sub(wasm_i32x4_shr(vq31prod0123, vshift), wasm_i32x4_gt(vrem0123, vthreshold));
195 vacc4567 = wasm_i32x4_sub(wasm_i32x4_shr(vq31prod4567, vshift), wasm_i32x4_gt(vrem4567, vthreshold));
196
Marat Dukhan9b474cf2021-05-25 16:37:48 -0700197 const v128_t voutput_zero_point = wasm_v128_load(params->gemmlowp_wasmsimd.output_zero_point);
Marat Dukhanee029b22021-06-30 12:47:02 -0700198 v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700199
Marat Dukhan9b474cf2021-05-25 16:37:48 -0700200 const v128_t voutput_min = wasm_v128_load(params->gemmlowp_wasmsimd.output_min);
201 const v128_t voutput_max = wasm_v128_load(params->gemmlowp_wasmsimd.output_max);
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700202 v128_t vout0123456701234567 = wasm_i8x16_min(wasm_i8x16_max(wasm_i8x16_narrow_i16x8(vout01234567, vout01234567), voutput_min), voutput_max);
203
204 *((double*) output) = wasm_f64x2_extract_lane(vout0123456701234567, 0);
205 output += 8;
206 }
207 if XNN_UNLIKELY(c != 0) {
208 {
209 v128_t vacc0123 = wasm_v128_load(w);
210 v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
211
212
Marat Dukhanee029b22021-06-30 12:47:02 -0700213 const v128_t vi0x01234567 = wasm_i16x8_load8x8(i0);
214 const v128_t vk0x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 0 * sizeof(int8_t)));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700215
216 const v128_t vprod0x01234567 = wasm_i16x8_mul(vi0x01234567, vk0x01234567);
217
Marat Dukhanee029b22021-06-30 12:47:02 -0700218 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod0x01234567));
219 vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod0x01234567));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700220
Marat Dukhanee029b22021-06-30 12:47:02 -0700221 const v128_t vi1x01234567 = wasm_i16x8_load8x8(i1);
222 const v128_t vk1x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 8 * sizeof(int8_t)));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700223
224 const v128_t vprod1x01234567 = wasm_i16x8_mul(vi1x01234567, vk1x01234567);
225
Marat Dukhanee029b22021-06-30 12:47:02 -0700226 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod1x01234567));
227 vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod1x01234567));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700228
Marat Dukhanee029b22021-06-30 12:47:02 -0700229 const v128_t vi2x01234567 = wasm_i16x8_load8x8(i2);
230 const v128_t vk2x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 16 * sizeof(int8_t)));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700231
232 const v128_t vprod2x01234567 = wasm_i16x8_mul(vi2x01234567, vk2x01234567);
233
Marat Dukhanee029b22021-06-30 12:47:02 -0700234 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod2x01234567));
235 vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod2x01234567));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700236
Marat Dukhanee029b22021-06-30 12:47:02 -0700237 const v128_t vi3x01234567 = wasm_i16x8_load8x8(i3);
238 const v128_t vk3x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 24 * sizeof(int8_t)));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700239
240 const v128_t vprod3x01234567 = wasm_i16x8_mul(vi3x01234567, vk3x01234567);
241
Marat Dukhanee029b22021-06-30 12:47:02 -0700242 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod3x01234567));
243 vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod3x01234567));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700244
Marat Dukhanee029b22021-06-30 12:47:02 -0700245 const v128_t vi4x01234567 = wasm_i16x8_load8x8(i4);
246 const v128_t vk4x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 32 * sizeof(int8_t)));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700247
248 const v128_t vprod4x01234567 = wasm_i16x8_mul(vi4x01234567, vk4x01234567);
249
Marat Dukhanee029b22021-06-30 12:47:02 -0700250 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod4x01234567));
251 vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod4x01234567));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700252
Marat Dukhanee029b22021-06-30 12:47:02 -0700253 const v128_t vi5x01234567 = wasm_i16x8_load8x8(i5);
254 const v128_t vk5x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 40 * sizeof(int8_t)));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700255
256 const v128_t vprod5x01234567 = wasm_i16x8_mul(vi5x01234567, vk5x01234567);
257
Marat Dukhanee029b22021-06-30 12:47:02 -0700258 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod5x01234567));
259 vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod5x01234567));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700260
Marat Dukhanee029b22021-06-30 12:47:02 -0700261 const v128_t vi6x01234567 = wasm_i16x8_load8x8(i6);
262 const v128_t vk6x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 48 * sizeof(int8_t)));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700263
264 const v128_t vprod6x01234567 = wasm_i16x8_mul(vi6x01234567, vk6x01234567);
265
Marat Dukhanee029b22021-06-30 12:47:02 -0700266 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod6x01234567));
267 vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod6x01234567));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700268
Marat Dukhanee029b22021-06-30 12:47:02 -0700269 const v128_t vi7x01234567 = wasm_i16x8_load8x8(i7);
270 const v128_t vk7x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 56 * sizeof(int8_t)));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700271
272 const v128_t vprod7x01234567 = wasm_i16x8_mul(vi7x01234567, vk7x01234567);
273
Marat Dukhanee029b22021-06-30 12:47:02 -0700274 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod7x01234567));
275 vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod7x01234567));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700276
Marat Dukhanee029b22021-06-30 12:47:02 -0700277 const v128_t vi8x01234567 = wasm_i16x8_load8x8(i8);
278 const v128_t vk8x01234567 = wasm_i16x8_load8x8((const void*) ((uintptr_t) w + 8 * sizeof(int32_t) + 64 * sizeof(int8_t)));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700279
280 const v128_t vprod8x01234567 = wasm_i16x8_mul(vi8x01234567, vk8x01234567);
281
Marat Dukhanee029b22021-06-30 12:47:02 -0700282 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_extend_low_i16x8(vprod8x01234567));
283 vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_extend_high_i16x8(vprod8x01234567));
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700284
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700285
286 const v128_t vsign0123 = wasm_i32x4_shr(vacc0123, 31);
287 const v128_t vsign4567 = wasm_i32x4_shr(vacc4567, 31);
288
289 const v128_t vacc01 = wasm_v32x4_shuffle(vacc0123, vsign0123, 0, 4, 1, 5);
290 const v128_t vacc23 = wasm_v32x4_shuffle(vacc0123, vsign0123, 2, 6, 3, 7);
291 const v128_t vacc45 = wasm_v32x4_shuffle(vacc4567, vsign4567, 0, 4, 1, 5);
292 const v128_t vacc67 = wasm_v32x4_shuffle(vacc4567, vsign4567, 2, 6, 3, 7);
293
Marat Dukhan9b474cf2021-05-25 16:37:48 -0700294 const v128_t vmultiplier = wasm_v128_load(params->gemmlowp_wasmsimd.multiplier);
295 const v128_t vrounding = wasm_v128_load(params->gemmlowp_wasmsimd.rounding);
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700296 const v128_t vprod01 = wasm_i64x2_add(wasm_i64x2_mul(vacc01, vmultiplier), vrounding);
297 const v128_t vprod23 = wasm_i64x2_add(wasm_i64x2_mul(vacc23, vmultiplier), vrounding);
298 const v128_t vprod45 = wasm_i64x2_add(wasm_i64x2_mul(vacc45, vmultiplier), vrounding);
299 const v128_t vprod67 = wasm_i64x2_add(wasm_i64x2_mul(vacc67, vmultiplier), vrounding);
300
301 const v128_t vq31prod0123 = wasm_v32x4_shuffle(vprod01, vprod23, 1, 3, 5, 7);
302 const v128_t vq31prod4567 = wasm_v32x4_shuffle(vprod45, vprod67, 1, 3, 5, 7);
303
Marat Dukhan9b474cf2021-05-25 16:37:48 -0700304 const v128_t vremainder_mask = wasm_v128_load(params->gemmlowp_wasmsimd.remainder_mask);
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700305 const v128_t vrem0123 = wasm_i32x4_add(wasm_v128_and(vq31prod0123, vremainder_mask), wasm_i32x4_shr(vq31prod0123, 31));
306 const v128_t vrem4567 = wasm_i32x4_add(wasm_v128_and(vq31prod4567, vremainder_mask), wasm_i32x4_shr(vq31prod4567, 31));
307
Marat Dukhan9b474cf2021-05-25 16:37:48 -0700308 const v128_t vthreshold = wasm_v128_load(params->gemmlowp_wasmsimd.remainder_threshold);
309 const int32_t vshift = params->gemmlowp_wasmsimd.shift;
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700310 vacc0123 = wasm_i32x4_sub(wasm_i32x4_shr(vq31prod0123, vshift), wasm_i32x4_gt(vrem0123, vthreshold));
311 vacc4567 = wasm_i32x4_sub(wasm_i32x4_shr(vq31prod4567, vshift), wasm_i32x4_gt(vrem4567, vthreshold));
312
Marat Dukhan9b474cf2021-05-25 16:37:48 -0700313 const v128_t voutput_zero_point = wasm_v128_load(params->gemmlowp_wasmsimd.output_zero_point);
Marat Dukhanee029b22021-06-30 12:47:02 -0700314 v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700315
Marat Dukhan9b474cf2021-05-25 16:37:48 -0700316 const v128_t voutput_min = wasm_v128_load(params->gemmlowp_wasmsimd.output_min);
317 const v128_t voutput_max = wasm_v128_load(params->gemmlowp_wasmsimd.output_max);
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700318 v128_t vout0123456701234567 = wasm_i8x16_min(wasm_i8x16_max(wasm_i8x16_narrow_i16x8(vout01234567, vout01234567), voutput_min), voutput_max);
319
Marat Dukhan313eef72021-06-30 16:11:31 -0700320
Marat Dukhancc8f34c2020-08-05 16:36:38 -0700321 if (c & 4) {
322 *((float*) output) = wasm_f32x4_extract_lane(vout0123456701234567, 0);
323 vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
324 output += 4;
325 }
326 if (c & 2) {
327 *((uint16_t*) output) = (uint16_t) wasm_i16x8_extract_lane(vout0123456701234567, 0);
328 vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16);
329 output += 2;
330 }
331 if (c & 1) {
332 *output = (int8_t) wasm_i8x16_extract_lane(vout0123456701234567, 0);
333 output += 1;
334 }
335 }
336 }
337
338 output = (int8_t*) ((uintptr_t) output + output_increment);
339 } while (--output_width != 0);
340}