blob: 2e411f621936a3e479f5f5a85fb4237a9c382a20 [file] [log] [blame]
// Auto-generated file. Do not edit!
// Template: src/f32-spmm/wasmsimd.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9
10#include <assert.h>
11
12#include <wasm_simd128.h>
13
14#include <xnnpack/spmm.h>
15
16
Frank Barchardbeca6522020-10-30 22:34:35 -070017void xnn_f32_spmm_minmax_ukernel_16x1__wasmsimd_x86_x2(
Marat Dukhane8bfcc82020-11-16 12:28:13 -080018 size_t mc,
19 size_t nc,
Frank Barchardc451e8a2020-10-21 17:13:12 -070020 const float*restrict input,
21 const float*restrict weights,
22 const int32_t*restrict widx_dmap,
23 const uint32_t*restrict nidx_nnzmap,
24 float*restrict output,
Marat Dukhane8bfcc82020-11-16 12:28:13 -080025 size_t output_stride,
Frank Barchardc451e8a2020-10-21 17:13:12 -070026 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
27{
Marat Dukhane8bfcc82020-11-16 12:28:13 -080028 assert(mc != 0);
29 assert(mc % sizeof(float) == 0);
30 assert(nc != 0);
Frank Barchardc451e8a2020-10-21 17:13:12 -070031
Marat Dukhanc83ef3b2021-12-30 09:47:07 -080032 const v128_t vmin = wasm_v128_load64_splat(params->wasmsimd.min);
33 const v128_t vmax = wasm_v128_load64_splat(params->wasmsimd.max);
Marat Dukhane8bfcc82020-11-16 12:28:13 -080034 size_t output_decrement = output_stride * nc - 16 * sizeof(float);
35 while XNN_LIKELY(mc >= 16 * sizeof(float)) {
Frank Barchardc451e8a2020-10-21 17:13:12 -070036 const float*restrict w = weights;
37 const int32_t* dmap = widx_dmap;
38 const uint32_t* nnzmap = nidx_nnzmap;
Marat Dukhane8bfcc82020-11-16 12:28:13 -080039 size_t n = nc;
Frank Barchardc451e8a2020-10-21 17:13:12 -070040 do {
41 uint32_t nnz = *nnzmap++;
Marat Dukhanee029b22021-06-30 12:47:02 -070042 v128_t vacc0123x0 = wasm_v128_load32_splat(w);
Frank Barchardc451e8a2020-10-21 17:13:12 -070043 w += 1;
Marat Dukhan48109052021-08-31 17:31:57 -070044 v128_t vacc0123x1 = wasm_f32x4_const_splat(0.0f);
Frank Barchardc451e8a2020-10-21 17:13:12 -070045 v128_t vacc4567x0 = vacc0123x0;
Marat Dukhan48109052021-08-31 17:31:57 -070046 v128_t vacc4567x1 = wasm_f32x4_const_splat(0.0f);
Frank Barchardc451e8a2020-10-21 17:13:12 -070047 v128_t vacc89ABx0 = vacc0123x0;
Marat Dukhan48109052021-08-31 17:31:57 -070048 v128_t vacc89ABx1 = wasm_f32x4_const_splat(0.0f);
Frank Barchardc451e8a2020-10-21 17:13:12 -070049 v128_t vaccCDEFx0 = vacc0123x0;
Marat Dukhan48109052021-08-31 17:31:57 -070050 v128_t vaccCDEFx1 = wasm_f32x4_const_splat(0.0f);
Frank Barchardc451e8a2020-10-21 17:13:12 -070051 for (; nnz >= 2; nnz -= 2) {
52 const intptr_t diff0 = dmap[0];
53 const intptr_t diff1 = dmap[1];
54 dmap += 2;
55 const v128_t vi0123x0 = wasm_v128_load(input);
56 const v128_t vi4567x0 = wasm_v128_load(input + 4);
57 const v128_t vi89ABx0 = wasm_v128_load(input + 8);
58 const v128_t viCDEFx0 = wasm_v128_load(input + 12);
59 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff0);
Marat Dukhanee029b22021-06-30 12:47:02 -070060 const v128_t vw0 = wasm_v128_load32_splat(w);
Frank Barchardc451e8a2020-10-21 17:13:12 -070061 w += 1;
62 vacc0123x0 = wasm_f32x4_add(vacc0123x0, wasm_f32x4_mul(vi0123x0, vw0));
63 vacc4567x0 = wasm_f32x4_add(vacc4567x0, wasm_f32x4_mul(vi4567x0, vw0));
64 vacc89ABx0 = wasm_f32x4_add(vacc89ABx0, wasm_f32x4_mul(vi89ABx0, vw0));
65 vaccCDEFx0 = wasm_f32x4_add(vaccCDEFx0, wasm_f32x4_mul(viCDEFx0, vw0));
66 const v128_t vi0123x1 = wasm_v128_load(input);
67 const v128_t vi4567x1 = wasm_v128_load(input + 4);
68 const v128_t vi89ABx1 = wasm_v128_load(input + 8);
69 const v128_t viCDEFx1 = wasm_v128_load(input + 12);
70 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff1);
Marat Dukhanee029b22021-06-30 12:47:02 -070071 const v128_t vw1 = wasm_v128_load32_splat(w);
Frank Barchardc451e8a2020-10-21 17:13:12 -070072 w += 1;
73 vacc0123x1 = wasm_f32x4_add(vacc0123x1, wasm_f32x4_mul(vi0123x1, vw1));
74 vacc4567x1 = wasm_f32x4_add(vacc4567x1, wasm_f32x4_mul(vi4567x1, vw1));
75 vacc89ABx1 = wasm_f32x4_add(vacc89ABx1, wasm_f32x4_mul(vi89ABx1, vw1));
76 vaccCDEFx1 = wasm_f32x4_add(vaccCDEFx1, wasm_f32x4_mul(viCDEFx1, vw1));
77 }
78 v128_t vacc0123 = vacc0123x0;
79 v128_t vacc4567 = vacc4567x0;
80 v128_t vacc89AB = vacc89ABx0;
81 v128_t vaccCDEF = vaccCDEFx0;
82 vacc0123 = wasm_f32x4_add(vacc0123, vacc0123x1);
83 vacc4567 = wasm_f32x4_add(vacc4567, vacc4567x1);
84 vacc89AB = wasm_f32x4_add(vacc89AB, vacc89ABx1);
85 vaccCDEF = wasm_f32x4_add(vaccCDEF, vaccCDEFx1);
86 if XNN_LIKELY(nnz != 0) {
87 do {
88 const intptr_t diff = *dmap++;
89 const v128_t vi0123 = wasm_v128_load(input);
90 const v128_t vi4567 = wasm_v128_load(input + 4);
91 const v128_t vi89AB = wasm_v128_load(input + 8);
92 const v128_t viCDEF = wasm_v128_load(input + 12);
93 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
Marat Dukhanee029b22021-06-30 12:47:02 -070094 const v128_t vw = wasm_v128_load32_splat(w); w += 1;
Frank Barchardc451e8a2020-10-21 17:13:12 -070095 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
96 vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
97 vacc89AB = wasm_f32x4_add(vacc89AB, wasm_f32x4_mul(vi89AB, vw));
98 vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
99 } while (--nnz != 0);
100 }
Marat Dukhan0bf8afa2021-09-20 10:02:18 -0700101 v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
102 v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
103 v128_t vout89AB = wasm_f32x4_pmin(vmax, vacc89AB);
104 v128_t voutCDEF = wasm_f32x4_pmin(vmax, vaccCDEF);
105 vout0123 = wasm_f32x4_pmax(vmin, vout0123);
106 vout4567 = wasm_f32x4_pmax(vmin, vout4567);
107 vout89AB = wasm_f32x4_pmax(vmin, vout89AB);
108 voutCDEF = wasm_f32x4_pmax(vmin, voutCDEF);
Frank Barchardc451e8a2020-10-21 17:13:12 -0700109 wasm_v128_store(output, vout0123);
110 wasm_v128_store(output + 4, vout4567);
111 wasm_v128_store(output + 8, vout89AB);
112 wasm_v128_store(output + 12, voutCDEF);
Marat Dukhane8bfcc82020-11-16 12:28:13 -0800113 output = (float*restrict) ((uintptr_t) output + output_stride);
114 } while (--n != 0);
Marat Dukhane278a552020-11-14 16:14:58 -0800115 output = (float*restrict) ((uintptr_t) output - output_decrement);
Frank Barchardc451e8a2020-10-21 17:13:12 -0700116 input += 16;
Marat Dukhane8bfcc82020-11-16 12:28:13 -0800117 mc -= 16 * sizeof(float);
Frank Barchardc451e8a2020-10-21 17:13:12 -0700118 }
Marat Dukhane8bfcc82020-11-16 12:28:13 -0800119 if XNN_UNLIKELY(mc != 0) {
Marat Dukhane278a552020-11-14 16:14:58 -0800120 output_decrement += 8 * sizeof(float);
Marat Dukhane8bfcc82020-11-16 12:28:13 -0800121 if (mc & (8 * sizeof(float))) {
Frank Barchardc451e8a2020-10-21 17:13:12 -0700122 const float*restrict w = weights;
123 const int32_t* dmap = widx_dmap;
124 const uint32_t* nnzmap = nidx_nnzmap;
Marat Dukhane8bfcc82020-11-16 12:28:13 -0800125 size_t n = nc;
Frank Barchardc451e8a2020-10-21 17:13:12 -0700126 do {
127 uint32_t nnz = *nnzmap++;
Marat Dukhanee029b22021-06-30 12:47:02 -0700128 v128_t vacc0123 = wasm_v128_load32_splat(w); w += 1;
Frank Barchardc451e8a2020-10-21 17:13:12 -0700129 v128_t vacc4567 = vacc0123;
130 if XNN_LIKELY(nnz != 0) {
131 do {
132 const intptr_t diff = *dmap++;
133 const v128_t vi0123 = wasm_v128_load(input);
134 const v128_t vi4567 = wasm_v128_load(input + 4);
135 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
Marat Dukhanee029b22021-06-30 12:47:02 -0700136 const v128_t vw = wasm_v128_load32_splat(w); w += 1;
Frank Barchardc451e8a2020-10-21 17:13:12 -0700137 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
138 vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
139 } while (--nnz != 0);
140 }
Marat Dukhan0bf8afa2021-09-20 10:02:18 -0700141 v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
142 v128_t vout4567 = wasm_f32x4_pmin(vmax, vacc4567);
143 vout0123 = wasm_f32x4_pmax(vmin, vout0123);
144 vout4567 = wasm_f32x4_pmax(vmin, vout4567);
Frank Barchardc451e8a2020-10-21 17:13:12 -0700145 wasm_v128_store(output, vout0123);
146
147 wasm_v128_store(output + 4, vout4567);
Marat Dukhane8bfcc82020-11-16 12:28:13 -0800148 output = (float*restrict) ((uintptr_t) output + output_stride);
149 } while (--n != 0);
Marat Dukhane278a552020-11-14 16:14:58 -0800150 output = (float*restrict) ((uintptr_t) output - output_decrement);
Frank Barchardc451e8a2020-10-21 17:13:12 -0700151 input += 8;
152 }
Marat Dukhane278a552020-11-14 16:14:58 -0800153 output_decrement += 4 * sizeof(float);
Marat Dukhane8bfcc82020-11-16 12:28:13 -0800154 if (mc & (4 * sizeof(float))) {
Frank Barchardc451e8a2020-10-21 17:13:12 -0700155 const float*restrict w = weights;
156 const int32_t* dmap = widx_dmap;
157 const uint32_t* nnzmap = nidx_nnzmap;
Marat Dukhane8bfcc82020-11-16 12:28:13 -0800158 size_t n = nc;
Frank Barchardc451e8a2020-10-21 17:13:12 -0700159 do {
160 uint32_t nnz = *nnzmap++;
Marat Dukhanee029b22021-06-30 12:47:02 -0700161 v128_t vacc0123 = wasm_v128_load32_splat(w); w += 1;
Frank Barchardc451e8a2020-10-21 17:13:12 -0700162 if XNN_LIKELY(nnz != 0) {
163 do {
164 const intptr_t diff = *dmap++;
165 const v128_t vi0123 = wasm_v128_load(input);
166 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
Marat Dukhanee029b22021-06-30 12:47:02 -0700167 const v128_t vw = wasm_v128_load32_splat(w); w += 1;
Frank Barchardc451e8a2020-10-21 17:13:12 -0700168 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
169 } while (--nnz != 0);
170 }
Marat Dukhan0bf8afa2021-09-20 10:02:18 -0700171 v128_t vout0123 = wasm_f32x4_pmin(vmax, vacc0123);
172 vout0123 = wasm_f32x4_pmax(vmin, vout0123);
Frank Barchardc451e8a2020-10-21 17:13:12 -0700173 wasm_v128_store(output, vout0123);
174
Marat Dukhane8bfcc82020-11-16 12:28:13 -0800175 output = (float*restrict) ((uintptr_t) output + output_stride);
176 } while (--n != 0);
Marat Dukhane278a552020-11-14 16:14:58 -0800177 output = (float*restrict) ((uintptr_t) output - output_decrement);
Frank Barchardc451e8a2020-10-21 17:13:12 -0700178 input += 4;
179 }
Marat Dukhane278a552020-11-14 16:14:58 -0800180 output_decrement += 2 * sizeof(float);
Marat Dukhane8bfcc82020-11-16 12:28:13 -0800181 if (mc & (2 * sizeof(float))) {
Frank Barchardc451e8a2020-10-21 17:13:12 -0700182 const float*restrict w = weights;
183 const int32_t* dmap = widx_dmap;
184 const uint32_t* nnzmap = nidx_nnzmap;
Marat Dukhane8bfcc82020-11-16 12:28:13 -0800185 size_t n = nc;
Frank Barchardc451e8a2020-10-21 17:13:12 -0700186 do {
187 uint32_t nnz = *nnzmap++;
Marat Dukhanee029b22021-06-30 12:47:02 -0700188 v128_t vacc01 = wasm_v128_load32_splat(w); w += 1;
Frank Barchardc451e8a2020-10-21 17:13:12 -0700189 if XNN_LIKELY(nnz != 0) {
190 do {
191 const intptr_t diff = *dmap++;
Marat Dukhanee029b22021-06-30 12:47:02 -0700192 const v128_t vi01 = wasm_v128_load64_splat(input);
Frank Barchardc451e8a2020-10-21 17:13:12 -0700193 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
Marat Dukhanee029b22021-06-30 12:47:02 -0700194 const v128_t vw = wasm_v128_load32_splat(w); w += 1;
Frank Barchardc451e8a2020-10-21 17:13:12 -0700195 vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
196 } while (--nnz != 0);
197 }
Marat Dukhan0bf8afa2021-09-20 10:02:18 -0700198 v128_t vout01 = wasm_f32x4_pmin(vmax, vacc01);
199 vout01 = wasm_f32x4_pmax(vmin, vout01);
Frank Barchardc451e8a2020-10-21 17:13:12 -0700200 *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
201
Marat Dukhane8bfcc82020-11-16 12:28:13 -0800202 output = (float*restrict) ((uintptr_t) output + output_stride);
203 } while (--n != 0);
Marat Dukhane278a552020-11-14 16:14:58 -0800204 output = (float*restrict) ((uintptr_t) output - output_decrement);
Frank Barchardc451e8a2020-10-21 17:13:12 -0700205 input += 2;
206 }
Marat Dukhane278a552020-11-14 16:14:58 -0800207 output_decrement += 1 * sizeof(float);
Marat Dukhane8bfcc82020-11-16 12:28:13 -0800208 if (mc & (1 * sizeof(float))) {
Frank Barchardc451e8a2020-10-21 17:13:12 -0700209 const float*restrict w = weights;
210 const int32_t* dmap = widx_dmap;
211 const uint32_t* nnzmap = nidx_nnzmap;
Marat Dukhane8bfcc82020-11-16 12:28:13 -0800212 size_t n = nc;
Frank Barchardc451e8a2020-10-21 17:13:12 -0700213 do {
214 uint32_t nnz = *nnzmap++;
Marat Dukhanee029b22021-06-30 12:47:02 -0700215 v128_t vacc0 = wasm_v128_load32_splat(w); w += 1;
Frank Barchardc451e8a2020-10-21 17:13:12 -0700216 if XNN_LIKELY(nnz != 0) {
217 do {
218 const intptr_t diff = *dmap++;
Marat Dukhanee029b22021-06-30 12:47:02 -0700219 const v128_t vi0 = wasm_v128_load32_splat(input);
Frank Barchardc451e8a2020-10-21 17:13:12 -0700220 input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
Marat Dukhanee029b22021-06-30 12:47:02 -0700221 const v128_t vw = wasm_v128_load32_splat(w); w += 1;
Frank Barchardc451e8a2020-10-21 17:13:12 -0700222 vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
223 } while (--nnz != 0);
224 }
Marat Dukhan0bf8afa2021-09-20 10:02:18 -0700225 v128_t vout0 = wasm_f32x4_pmin(vmax, vacc0);
226 vout0 = wasm_f32x4_pmax(vmin, vout0);
Frank Barchardc451e8a2020-10-21 17:13:12 -0700227 *output = wasm_f32x4_extract_lane(vout0, 0);
228
Marat Dukhane8bfcc82020-11-16 12:28:13 -0800229 output = (float*restrict) ((uintptr_t) output + output_stride);
230 } while (--n != 0);
Marat Dukhane278a552020-11-14 16:14:58 -0800231 output = (float*restrict) ((uintptr_t) output - output_decrement);
Frank Barchardc451e8a2020-10-21 17:13:12 -0700232 input += 1;
233 }
234 }
235}