// Auto-generated file. Do not edit!
//   Template: src/f32-spmm/wasmsimd.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <wasm_simd128.h>

#include <xnnpack/spmm.h>

Frank Barchardbeca6522020-10-30 22:34:35 -070017void xnn_f32_spmm_minmax_ukernel_16x1__wasmsimd_x86_x2(
Frank Barchardc451e8a2020-10-21 17:13:12 -070018 uint32_t batch_size,
19 uint32_t output_channels,
20 const float*restrict input,
21 const float*restrict weights,
22 const int32_t*restrict widx_dmap,
23 const uint32_t*restrict nidx_nnzmap,
24 float*restrict output,
25 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26{
27 assert(batch_size != 0);
28
29 const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
30 const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
Frank Barchardfea26802020-10-22 10:35:40 -070031 const v128_t vzero = wasm_f32x4_splat(0.0f);
Frank Barchardc451e8a2020-10-21 17:13:12 -070032 size_t n = batch_size;
33 while XNN_LIKELY(n >= 16) {
34 const float*restrict w = weights;
35 const int32_t* dmap = widx_dmap;
36 const uint32_t* nnzmap = nidx_nnzmap;
37 size_t c = output_channels;
38 do {
39 uint32_t nnz = *nnzmap++;
40 v128_t vacc0123x0 = wasm_v32x4_load_splat(w);
41 w += 1;
Frank Barchardfea26802020-10-22 10:35:40 -070042 v128_t vacc0123x1 = vzero;
Frank Barchardc451e8a2020-10-21 17:13:12 -070043 v128_t vacc4567x0 = vacc0123x0;
Frank Barchardfea26802020-10-22 10:35:40 -070044 v128_t vacc4567x1 = vzero;
Frank Barchardc451e8a2020-10-21 17:13:12 -070045 v128_t vacc89ABx0 = vacc0123x0;
Frank Barchardfea26802020-10-22 10:35:40 -070046 v128_t vacc89ABx1 = vzero;
Frank Barchardc451e8a2020-10-21 17:13:12 -070047 v128_t vaccCDEFx0 = vacc0123x0;
Frank Barchardfea26802020-10-22 10:35:40 -070048 v128_t vaccCDEFx1 = vzero;
Frank Barchardc451e8a2020-10-21 17:13:12 -070049 for (; nnz >= 2; nnz -= 2) {
50 const intptr_t diff0 = dmap[0];
51 const intptr_t diff1 = dmap[1];
52 dmap += 2;
53 const v128_t vi0123x0 = wasm_v128_load(input);
54 const v128_t vi4567x0 = wasm_v128_load(input + 4);
55 const v128_t vi89ABx0 = wasm_v128_load(input + 8);
56 const v128_t viCDEFx0 = wasm_v128_load(input + 12);
57 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff0);
58 const v128_t vw0 = wasm_v32x4_load_splat(w);
59 w += 1;
60 vacc0123x0 = wasm_f32x4_add(vacc0123x0, wasm_f32x4_mul(vi0123x0, vw0));
61 vacc4567x0 = wasm_f32x4_add(vacc4567x0, wasm_f32x4_mul(vi4567x0, vw0));
62 vacc89ABx0 = wasm_f32x4_add(vacc89ABx0, wasm_f32x4_mul(vi89ABx0, vw0));
63 vaccCDEFx0 = wasm_f32x4_add(vaccCDEFx0, wasm_f32x4_mul(viCDEFx0, vw0));
64 const v128_t vi0123x1 = wasm_v128_load(input);
65 const v128_t vi4567x1 = wasm_v128_load(input + 4);
66 const v128_t vi89ABx1 = wasm_v128_load(input + 8);
67 const v128_t viCDEFx1 = wasm_v128_load(input + 12);
68 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff1);
69 const v128_t vw1 = wasm_v32x4_load_splat(w);
70 w += 1;
71 vacc0123x1 = wasm_f32x4_add(vacc0123x1, wasm_f32x4_mul(vi0123x1, vw1));
72 vacc4567x1 = wasm_f32x4_add(vacc4567x1, wasm_f32x4_mul(vi4567x1, vw1));
73 vacc89ABx1 = wasm_f32x4_add(vacc89ABx1, wasm_f32x4_mul(vi89ABx1, vw1));
74 vaccCDEFx1 = wasm_f32x4_add(vaccCDEFx1, wasm_f32x4_mul(viCDEFx1, vw1));
75 }
76 v128_t vacc0123 = vacc0123x0;
77 v128_t vacc4567 = vacc4567x0;
78 v128_t vacc89AB = vacc89ABx0;
79 v128_t vaccCDEF = vaccCDEFx0;
80 vacc0123 = wasm_f32x4_add(vacc0123, vacc0123x1);
81 vacc4567 = wasm_f32x4_add(vacc4567, vacc4567x1);
82 vacc89AB = wasm_f32x4_add(vacc89AB, vacc89ABx1);
83 vaccCDEF = wasm_f32x4_add(vaccCDEF, vaccCDEFx1);
84 if XNN_LIKELY(nnz != 0) {
85 do {
86 const intptr_t diff = *dmap++;
87 const v128_t vi0123 = wasm_v128_load(input);
88 const v128_t vi4567 = wasm_v128_load(input + 4);
89 const v128_t vi89AB = wasm_v128_load(input + 8);
90 const v128_t viCDEF = wasm_v128_load(input + 12);
91 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
92 const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
93 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
94 vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
95 vacc89AB = wasm_f32x4_add(vacc89AB, wasm_f32x4_mul(vi89AB, vw));
96 vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
97 } while (--nnz != 0);
98 }
99 v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
100 v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
101 v128_t vout89AB = wasm_v128_bitselect(vacc89AB, vmax, wasm_f32x4_le(vacc89AB, vmax));
102 v128_t voutCDEF = wasm_v128_bitselect(vaccCDEF, vmax, wasm_f32x4_le(vaccCDEF, vmax));
103 vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
104 vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
105 vout89AB = wasm_v128_bitselect(vmin, vout89AB, wasm_f32x4_lt(vout89AB, vmin));
106 voutCDEF = wasm_v128_bitselect(vmin, voutCDEF, wasm_f32x4_lt(voutCDEF, vmin));
107 wasm_v128_store(output, vout0123);
108 wasm_v128_store(output + 4, vout4567);
109 wasm_v128_store(output + 8, vout89AB);
110 wasm_v128_store(output + 12, voutCDEF);
111 output += 1 * batch_size;
112 } while (--c != 0);
113 output -= batch_size * output_channels;
114 output += 16;
115 input += 16;
116 n -= 16;
117 }
118 if XNN_UNLIKELY(n != 0) {
119 if (n & 8) {
120 const float*restrict w = weights;
121 const int32_t* dmap = widx_dmap;
122 const uint32_t* nnzmap = nidx_nnzmap;
123 size_t c = output_channels;
124 do {
125 uint32_t nnz = *nnzmap++;
126 v128_t vacc0123 = wasm_v32x4_load_splat(w); w += 1;
127 v128_t vacc4567 = vacc0123;
128 if XNN_LIKELY(nnz != 0) {
129 do {
130 const intptr_t diff = *dmap++;
131 const v128_t vi0123 = wasm_v128_load(input);
132 const v128_t vi4567 = wasm_v128_load(input + 4);
133 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
134 const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
135 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
136 vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
137 } while (--nnz != 0);
138 }
139 v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
140 v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
141 vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
142 vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
143 wasm_v128_store(output, vout0123);
144
145 wasm_v128_store(output + 4, vout4567);
146 output += 1 * batch_size;
147 } while (--c != 0);
148 output -= batch_size * output_channels;
149 output += 8;
150 input += 8;
151 }
152 if (n & 4) {
153 const float*restrict w = weights;
154 const int32_t* dmap = widx_dmap;
155 const uint32_t* nnzmap = nidx_nnzmap;
156 size_t c = output_channels;
157 do {
158 uint32_t nnz = *nnzmap++;
159 v128_t vacc0123 = wasm_v32x4_load_splat(w); w += 1;
160 if XNN_LIKELY(nnz != 0) {
161 do {
162 const intptr_t diff = *dmap++;
163 const v128_t vi0123 = wasm_v128_load(input);
164 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
165 const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
166 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
167 } while (--nnz != 0);
168 }
169 v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
170 vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
171 wasm_v128_store(output, vout0123);
172
173 output += 1 * batch_size;
174 } while (--c != 0);
175 output -= batch_size * output_channels;
176 output += 4;
177 input += 4;
178 }
179 if (n & 2) {
180 const float*restrict w = weights;
181 const int32_t* dmap = widx_dmap;
182 const uint32_t* nnzmap = nidx_nnzmap;
183 size_t c = output_channels;
184 do {
185 uint32_t nnz = *nnzmap++;
186 v128_t vacc01 = wasm_v32x4_load_splat(w); w += 1;
187 if XNN_LIKELY(nnz != 0) {
188 do {
189 const intptr_t diff = *dmap++;
190 const v128_t vi01 = wasm_v64x2_load_splat(input);
191 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
192 const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
193 vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
194 } while (--nnz != 0);
195 }
196 v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
197 vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
198 *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
199
200 output += 1 * batch_size;
201 } while (--c != 0);
202 output -= batch_size * output_channels;
203 output += 2;
204 input += 2;
205 }
206 if (n & 1) {
207 const float*restrict w = weights;
208 const int32_t* dmap = widx_dmap;
209 const uint32_t* nnzmap = nidx_nnzmap;
210 size_t c = output_channels;
211 do {
212 uint32_t nnz = *nnzmap++;
213 v128_t vacc0 = wasm_v32x4_load_splat(w); w += 1;
214 if XNN_LIKELY(nnz != 0) {
215 do {
216 const intptr_t diff = *dmap++;
217 const v128_t vi0 = wasm_v32x4_load_splat(input);
218 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
219 const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
220 vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
221 } while (--nnz != 0);
222 }
223 v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
224 vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
225 *output = wasm_f32x4_extract_lane(vout0, 0);
226
227 output += 1 * batch_size;
228 } while (--c != 0);
229 output -= batch_size * output_channels;
230 output += 1;
231 input += 1;
232 }
233 }
234}