blob: f6b0d27b9cdcc908a01edccb98945e73f998c8ef [file] [log] [blame]
// Auto-generated file. Do not edit!
//   Template: src/f32-spmm/wasmsimd.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9
10#include <assert.h>
11
12#include <wasm_simd128.h>
13
14#include <xnnpack/spmm.h>
15
16
17void xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm(
18 uint32_t batch_size,
19 uint32_t output_channels,
20 const float*restrict input,
21 const float*restrict weights,
22 const int32_t*restrict widx_dmap,
23 const uint32_t*restrict nidx_nnzmap,
24 float*restrict output,
25 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26{
27 assert(batch_size != 0);
28
Frank Barchardf673b2c2020-11-07 02:18:01 -080029 const uintptr_t output_stride = 1 * batch_size * sizeof(float);
Frank Barchard846c0c62020-10-26 15:01:39 -070030 const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
31 const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
32 size_t n = batch_size;
33 while XNN_LIKELY(n >= 32) {
34 const float*restrict w = weights;
35 const int32_t* dmap = widx_dmap;
36 const uint32_t* nnzmap = nidx_nnzmap;
37 size_t c = output_channels;
38 do {
39 uint32_t nnz = *nnzmap++;
40 v128_t vacc0123 = wasm_v32x4_load_splat(w); w += 1;
41 v128_t vacc4567 = vacc0123;
42 v128_t vacc89AB = vacc0123;
43 v128_t vaccCDEF = vacc0123;
44 v128_t vaccGHIJ = vacc0123;
45 v128_t vaccKLMN = vacc0123;
46 v128_t vaccOPQR = vacc0123;
47 v128_t vaccSTUV = vacc0123;
48 if XNN_LIKELY(nnz != 0) {
49 do {
50 const intptr_t diff = *dmap++;
51 const v128_t vi0123 = wasm_v128_load(input);
52 const v128_t vi4567 = wasm_v128_load(input + 4);
53 const v128_t vi89AB = wasm_v128_load(input + 8);
54 const v128_t viCDEF = wasm_v128_load(input + 12);
55 const v128_t viGHIJ = wasm_v128_load(input + 16);
56 const v128_t viKLMN = wasm_v128_load(input + 20);
57 const v128_t viOPQR = wasm_v128_load(input + 24);
58 const v128_t viSTUV = wasm_v128_load(input + 28);
59 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
60 const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
61 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
62 vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
63 vacc89AB = wasm_f32x4_add(vacc89AB, wasm_f32x4_mul(vi89AB, vw));
64 vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
65 vaccGHIJ = wasm_f32x4_add(vaccGHIJ, wasm_f32x4_mul(viGHIJ, vw));
66 vaccKLMN = wasm_f32x4_add(vaccKLMN, wasm_f32x4_mul(viKLMN, vw));
67 vaccOPQR = wasm_f32x4_add(vaccOPQR, wasm_f32x4_mul(viOPQR, vw));
68 vaccSTUV = wasm_f32x4_add(vaccSTUV, wasm_f32x4_mul(viSTUV, vw));
69 } while (--nnz != 0);
70 }
71 v128_t vout0123 = wasm_f32x4_min(vacc0123, vmax);
72 v128_t vout4567 = wasm_f32x4_min(vacc4567, vmax);
73 v128_t vout89AB = wasm_f32x4_min(vacc89AB, vmax);
74 v128_t voutCDEF = wasm_f32x4_min(vaccCDEF, vmax);
75 v128_t voutGHIJ = wasm_f32x4_min(vaccGHIJ, vmax);
76 v128_t voutKLMN = wasm_f32x4_min(vaccKLMN, vmax);
77 v128_t voutOPQR = wasm_f32x4_min(vaccOPQR, vmax);
78 v128_t voutSTUV = wasm_f32x4_min(vaccSTUV, vmax);
79 vout0123 = wasm_f32x4_max(vout0123, vmin);
80 vout4567 = wasm_f32x4_max(vout4567, vmin);
81 vout89AB = wasm_f32x4_max(vout89AB, vmin);
82 voutCDEF = wasm_f32x4_max(voutCDEF, vmin);
83 voutGHIJ = wasm_f32x4_max(voutGHIJ, vmin);
84 voutKLMN = wasm_f32x4_max(voutKLMN, vmin);
85 voutOPQR = wasm_f32x4_max(voutOPQR, vmin);
86 voutSTUV = wasm_f32x4_max(voutSTUV, vmin);
87 wasm_v128_store(output, vout0123);
88 wasm_v128_store(output + 4, vout4567);
89 wasm_v128_store(output + 8, vout89AB);
90 wasm_v128_store(output + 12, voutCDEF);
91 wasm_v128_store(output + 16, voutGHIJ);
92 wasm_v128_store(output + 20, voutKLMN);
93 wasm_v128_store(output + 24, voutOPQR);
94 wasm_v128_store(output + 28, voutSTUV);
Frank Barchardf673b2c2020-11-07 02:18:01 -080095 output = (float*restrict) ((uintptr_t) output + output_stride);
Frank Barchard846c0c62020-10-26 15:01:39 -070096 } while (--c != 0);
97 output -= batch_size * output_channels;
98 output += 32;
99 input += 32;
100 n -= 32;
101 }
102 if XNN_UNLIKELY(n != 0) {
103 if (n & 16) {
104 const float*restrict w = weights;
105 const int32_t* dmap = widx_dmap;
106 const uint32_t* nnzmap = nidx_nnzmap;
107 size_t c = output_channels;
108 do {
109 uint32_t nnz = *nnzmap++;
110 v128_t vacc0123 = wasm_v32x4_load_splat(w); w += 1;
111 v128_t vacc4567 = vacc0123;
112 v128_t vacc89AB = vacc0123;
113 v128_t vaccCDEF = vacc0123;
114 if XNN_LIKELY(nnz != 0) {
115 do {
116 const intptr_t diff = *dmap++;
117 const v128_t vi0123 = wasm_v128_load(input);
118 const v128_t vi4567 = wasm_v128_load(input + 4);
119 const v128_t vi89AB = wasm_v128_load(input + 8);
120 const v128_t viCDEF = wasm_v128_load(input + 12);
121 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
122 const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
123 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
124 vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
125 vacc89AB = wasm_f32x4_add(vacc89AB, wasm_f32x4_mul(vi89AB, vw));
126 vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
127 } while (--nnz != 0);
128 }
129 v128_t vout0123 = wasm_f32x4_min(vacc0123, vmax);
130 v128_t vout4567 = wasm_f32x4_min(vacc4567, vmax);
131 v128_t vout89AB = wasm_f32x4_min(vacc89AB, vmax);
132 v128_t voutCDEF = wasm_f32x4_min(vaccCDEF, vmax);
133 vout0123 = wasm_f32x4_max(vout0123, vmin);
134 vout4567 = wasm_f32x4_max(vout4567, vmin);
135 vout89AB = wasm_f32x4_max(vout89AB, vmin);
136 voutCDEF = wasm_f32x4_max(voutCDEF, vmin);
137 wasm_v128_store(output, vout0123);
138
139 wasm_v128_store(output + 4, vout4567);
140 wasm_v128_store(output + 8, vout89AB);
141 wasm_v128_store(output + 12, voutCDEF);
Frank Barchardf673b2c2020-11-07 02:18:01 -0800142 output = (float*restrict) ((uintptr_t) output + output_stride);
Frank Barchard846c0c62020-10-26 15:01:39 -0700143 } while (--c != 0);
144 output -= batch_size * output_channels;
145 output += 16;
146 input += 16;
147 }
148 if (n & 8) {
149 const float*restrict w = weights;
150 const int32_t* dmap = widx_dmap;
151 const uint32_t* nnzmap = nidx_nnzmap;
152 size_t c = output_channels;
153 do {
154 uint32_t nnz = *nnzmap++;
155 v128_t vacc0123 = wasm_v32x4_load_splat(w); w += 1;
156 v128_t vacc4567 = vacc0123;
157 if XNN_LIKELY(nnz != 0) {
158 do {
159 const intptr_t diff = *dmap++;
160 const v128_t vi0123 = wasm_v128_load(input);
161 const v128_t vi4567 = wasm_v128_load(input + 4);
162 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
163 const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
164 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
165 vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
166 } while (--nnz != 0);
167 }
168 v128_t vout0123 = wasm_f32x4_min(vacc0123, vmax);
169 v128_t vout4567 = wasm_f32x4_min(vacc4567, vmax);
170 vout0123 = wasm_f32x4_max(vout0123, vmin);
171 vout4567 = wasm_f32x4_max(vout4567, vmin);
172 wasm_v128_store(output, vout0123);
173
174 wasm_v128_store(output + 4, vout4567);
Frank Barchardf673b2c2020-11-07 02:18:01 -0800175 output = (float*restrict) ((uintptr_t) output + output_stride);
Frank Barchard846c0c62020-10-26 15:01:39 -0700176 } while (--c != 0);
177 output -= batch_size * output_channels;
178 output += 8;
179 input += 8;
180 }
181 if (n & 4) {
182 const float*restrict w = weights;
183 const int32_t* dmap = widx_dmap;
184 const uint32_t* nnzmap = nidx_nnzmap;
185 size_t c = output_channels;
186 do {
187 uint32_t nnz = *nnzmap++;
188 v128_t vacc0123 = wasm_v32x4_load_splat(w); w += 1;
189 if XNN_LIKELY(nnz != 0) {
190 do {
191 const intptr_t diff = *dmap++;
192 const v128_t vi0123 = wasm_v128_load(input);
193 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
194 const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
195 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
196 } while (--nnz != 0);
197 }
198 v128_t vout0123 = wasm_f32x4_min(vacc0123, vmax);
199 vout0123 = wasm_f32x4_max(vout0123, vmin);
200 wasm_v128_store(output, vout0123);
201
Frank Barchardf673b2c2020-11-07 02:18:01 -0800202 output = (float*restrict) ((uintptr_t) output + output_stride);
Frank Barchard846c0c62020-10-26 15:01:39 -0700203 } while (--c != 0);
204 output -= batch_size * output_channels;
205 output += 4;
206 input += 4;
207 }
208 if (n & 2) {
209 const float*restrict w = weights;
210 const int32_t* dmap = widx_dmap;
211 const uint32_t* nnzmap = nidx_nnzmap;
212 size_t c = output_channels;
213 do {
214 uint32_t nnz = *nnzmap++;
215 v128_t vacc01 = wasm_v32x4_load_splat(w); w += 1;
216 if XNN_LIKELY(nnz != 0) {
217 do {
218 const intptr_t diff = *dmap++;
219 const v128_t vi01 = wasm_v64x2_load_splat(input);
220 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
221 const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
222 vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
223 } while (--nnz != 0);
224 }
225 v128_t vout01 = wasm_f32x4_min(vacc01, vmax);
226 vout01 = wasm_f32x4_max(vout01, vmin);
227 *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
228
Frank Barchardf673b2c2020-11-07 02:18:01 -0800229 output = (float*restrict) ((uintptr_t) output + output_stride);
Frank Barchard846c0c62020-10-26 15:01:39 -0700230 } while (--c != 0);
231 output -= batch_size * output_channels;
232 output += 2;
233 input += 2;
234 }
235 if (n & 1) {
236 const float*restrict w = weights;
237 const int32_t* dmap = widx_dmap;
238 const uint32_t* nnzmap = nidx_nnzmap;
239 size_t c = output_channels;
240 do {
241 uint32_t nnz = *nnzmap++;
242 v128_t vacc0 = wasm_v32x4_load_splat(w); w += 1;
243 if XNN_LIKELY(nnz != 0) {
244 do {
245 const intptr_t diff = *dmap++;
246 const v128_t vi0 = wasm_v32x4_load_splat(input);
247 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
248 const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
249 vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
250 } while (--nnz != 0);
251 }
252 v128_t vout0 = wasm_f32x4_min(vacc0, vmax);
253 vout0 = wasm_f32x4_max(vout0, vmin);
254 *output = wasm_f32x4_extract_lane(vout0, 0);
255
Frank Barchardf673b2c2020-11-07 02:18:01 -0800256 output = (float*restrict) ((uintptr_t) output + output_stride);
Frank Barchard846c0c62020-10-26 15:01:39 -0700257 } while (--c != 0);
258 output -= batch_size * output_channels;
259 output += 1;
260 input += 1;
261 }
262 }
263}