// Auto-generated file. Do not edit!
//   Template: src/f32-spmm/wasmsimd.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <wasm_simd128.h>

#include <xnnpack/spmm.h>

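// SpMM microkernel: multiplies a sparse weight matrix by a dense input matrix
// and clamps the result to [min, max]. nidx_nnzmap gives the number of nonzero
// weights per output channel, and widx_dmap gives the byte offset from each
// input element used to the next one. "16x1" processes 16 batch elements per
// output channel at a time, "_x2" unrolls the nonzero-weight loop by 2, and
// "_x86" selects bitselect-based clamping, which tends to lower better than
// f32x4.min/max on x86 WebAssembly engines.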
void xnn_f32_spmm_minmax_ukernel_16x1__wasmsimd_x86_x2(
    uint32_t batch_size,
    uint32_t output_channels,
    const float*restrict input,
    const float*restrict weights,
    const int32_t*restrict widx_dmap,
    const uint32_t*restrict nidx_nnzmap,
    float*restrict output,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(batch_size != 0);

  const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
  const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
  const v128_t vzero = wasm_f32x4_splat(0.0f);
  size_t n = batch_size;
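  // Process the batch in blocks of 16 elements.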
  while XNN_LIKELY(n >= 16) {
    const float*restrict w = weights;
    const int32_t* dmap = widx_dmap;
    const uint32_t* nnzmap = nidx_nnzmap;
    size_t c = output_channels;
    do {
      uint32_t nnz = *nnzmap++;
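      // The first packed weight for each output channel is the bias: splat it
      // into the x0 accumulators and start the x1 accumulators at zero.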
      v128_t vacc0123x0 = wasm_v32x4_load_splat(w);
      w += 1;
      v128_t vacc0123x1 = vzero;
      v128_t vacc4567x0 = vacc0123x0;
      v128_t vacc4567x1 = vzero;
      v128_t vacc89ABx0 = vacc0123x0;
      v128_t vacc89ABx1 = vzero;
      v128_t vaccCDEFx0 = vacc0123x0;
      v128_t vaccCDEFx1 = vzero;
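      // Loop over nonzero weights, unrolled by 2. Each dmap entry is the byte
      // offset from the current input position to the input element used by
      // the next nonzero weight.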
      for (; nnz >= 2; nnz -= 2) {
        const intptr_t diff0 = dmap[0];
        const intptr_t diff1 = dmap[1];
        dmap += 2;
        const v128_t vi0123x0 = wasm_v128_load(input);
        const v128_t vi4567x0 = wasm_v128_load(input + 4);
        const v128_t vi89ABx0 = wasm_v128_load(input + 8);
        const v128_t viCDEFx0 = wasm_v128_load(input + 12);
        input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff0);
        const v128_t vw0 = wasm_v32x4_load_splat(w);
        w += 1;
        vacc0123x0 = wasm_f32x4_add(vacc0123x0, wasm_f32x4_mul(vi0123x0, vw0));
        vacc4567x0 = wasm_f32x4_add(vacc4567x0, wasm_f32x4_mul(vi4567x0, vw0));
        vacc89ABx0 = wasm_f32x4_add(vacc89ABx0, wasm_f32x4_mul(vi89ABx0, vw0));
        vaccCDEFx0 = wasm_f32x4_add(vaccCDEFx0, wasm_f32x4_mul(viCDEFx0, vw0));
        const v128_t vi0123x1 = wasm_v128_load(input);
        const v128_t vi4567x1 = wasm_v128_load(input + 4);
        const v128_t vi89ABx1 = wasm_v128_load(input + 8);
        const v128_t viCDEFx1 = wasm_v128_load(input + 12);
        input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff1);
        const v128_t vw1 = wasm_v32x4_load_splat(w);
        w += 1;
        vacc0123x1 = wasm_f32x4_add(vacc0123x1, wasm_f32x4_mul(vi0123x1, vw1));
        vacc4567x1 = wasm_f32x4_add(vacc4567x1, wasm_f32x4_mul(vi4567x1, vw1));
        vacc89ABx1 = wasm_f32x4_add(vacc89ABx1, wasm_f32x4_mul(vi89ABx1, vw1));
        vaccCDEFx1 = wasm_f32x4_add(vaccCDEFx1, wasm_f32x4_mul(viCDEFx1, vw1));
      }
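      // Combine the two sets of unrolled accumulators.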
      v128_t vacc0123 = vacc0123x0;
      v128_t vacc4567 = vacc4567x0;
      v128_t vacc89AB = vacc89ABx0;
      v128_t vaccCDEF = vaccCDEFx0;
      vacc0123 = wasm_f32x4_add(vacc0123, vacc0123x1);
      vacc4567 = wasm_f32x4_add(vacc4567, vacc4567x1);
      vacc89AB = wasm_f32x4_add(vacc89AB, vacc89ABx1);
      vaccCDEF = wasm_f32x4_add(vaccCDEF, vaccCDEFx1);
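      // Process the last nonzero weight if nnz was odd.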
      if XNN_LIKELY(nnz != 0) {
        do {
          const intptr_t diff = *dmap++;
          const v128_t vi0123 = wasm_v128_load(input);
          const v128_t vi4567 = wasm_v128_load(input + 4);
          const v128_t vi89AB = wasm_v128_load(input + 8);
          const v128_t viCDEF = wasm_v128_load(input + 12);
          input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
          const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
          vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
          vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
          vacc89AB = wasm_f32x4_add(vacc89AB, wasm_f32x4_mul(vi89AB, vw));
          vaccCDEF = wasm_f32x4_add(vaccCDEF, wasm_f32x4_mul(viCDEF, vw));
        } while (--nnz != 0);
      }
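      // Clamp via bitselect: vout = (vacc <= vmax) ? vacc : vmax, then
      // vout = (vout < vmin) ? vmin : vout.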
      v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
      v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
      v128_t vout89AB = wasm_v128_bitselect(vacc89AB, vmax, wasm_f32x4_le(vacc89AB, vmax));
      v128_t voutCDEF = wasm_v128_bitselect(vaccCDEF, vmax, wasm_f32x4_le(vaccCDEF, vmax));
      vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
      vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
      vout89AB = wasm_v128_bitselect(vmin, vout89AB, wasm_f32x4_lt(vout89AB, vmin));
      voutCDEF = wasm_v128_bitselect(vmin, voutCDEF, wasm_f32x4_lt(voutCDEF, vmin));
      wasm_v128_store(output, vout0123);
      wasm_v128_store(output + 4, vout4567);
      wasm_v128_store(output + 8, vout89AB);
      wasm_v128_store(output + 12, voutCDEF);
      output += 1 * batch_size;
    } while (--c != 0);
    output -= batch_size * output_channels;
    output += 16;
    input += 16;
    n -= 16;
  }
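  // Tail: handle the remaining 1-15 batch elements in blocks of 8, 4, 2, and 1.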
  if XNN_UNLIKELY(n != 0) {
    if (n & 8) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t c = output_channels;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc0123 = wasm_v32x4_load_splat(w); w += 1;
        v128_t vacc4567 = vacc0123;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi0123 = wasm_v128_load(input);
            const v128_t vi4567 = wasm_v128_load(input + 4);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
            vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
            vacc4567 = wasm_f32x4_add(vacc4567, wasm_f32x4_mul(vi4567, vw));
          } while (--nnz != 0);
        }
        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
        v128_t vout4567 = wasm_v128_bitselect(vacc4567, vmax, wasm_f32x4_le(vacc4567, vmax));
        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
        vout4567 = wasm_v128_bitselect(vmin, vout4567, wasm_f32x4_lt(vout4567, vmin));
        wasm_v128_store(output, vout0123);
        wasm_v128_store(output + 4, vout4567);
        output += 1 * batch_size;
      } while (--c != 0);
      output -= batch_size * output_channels;
      output += 8;
      input += 8;
    }
    if (n & 4) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t c = output_channels;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc0123 = wasm_v32x4_load_splat(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi0123 = wasm_v128_load(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
            vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
          } while (--nnz != 0);
        }
        v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
        vout0123 = wasm_v128_bitselect(vmin, vout0123, wasm_f32x4_lt(vout0123, vmin));
        wasm_v128_store(output, vout0123);
        output += 1 * batch_size;
      } while (--c != 0);
      output -= batch_size * output_channels;
      output += 4;
      input += 4;
    }
    if (n & 2) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t c = output_channels;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc01 = wasm_v32x4_load_splat(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi01 = wasm_v64x2_load_splat(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
            vacc01 = wasm_f32x4_add(vacc01, wasm_f32x4_mul(vi01, vw));
          } while (--nnz != 0);
        }
        v128_t vout01 = wasm_v128_bitselect(vacc01, vmax, wasm_f32x4_le(vacc01, vmax));
        vout01 = wasm_v128_bitselect(vmin, vout01, wasm_f32x4_lt(vout01, vmin));
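        // Write the two result floats with a single 64-bit store.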
        *((double*) output) = wasm_f64x2_extract_lane(vout01, 0);
        output += 1 * batch_size;
      } while (--c != 0);
      output -= batch_size * output_channels;
      output += 2;
      input += 2;
    }
    if (n & 1) {
      const float*restrict w = weights;
      const int32_t* dmap = widx_dmap;
      const uint32_t* nnzmap = nidx_nnzmap;
      size_t c = output_channels;
      do {
        uint32_t nnz = *nnzmap++;
        v128_t vacc0 = wasm_v32x4_load_splat(w); w += 1;
        if XNN_LIKELY(nnz != 0) {
          do {
            const intptr_t diff = *dmap++;
            const v128_t vi0 = wasm_v32x4_load_splat(input);
            input = (const float*restrict) ((uintptr_t) input + (uintptr_t) diff);
            const v128_t vw = wasm_v32x4_load_splat(w); w += 1;
            vacc0 = wasm_f32x4_add(vacc0, wasm_f32x4_mul(vi0, vw));
          } while (--nnz != 0);
        }
        v128_t vout0 = wasm_v128_bitselect(vacc0, vmax, wasm_f32x4_le(vacc0, vmax));
        vout0 = wasm_v128_bitselect(vmin, vout0, wasm_f32x4_lt(vout0, vmin));
        *output = wasm_f32x4_extract_lane(vout0, 0);
        output += 1 * batch_size;
      } while (--c != 0);
      output -= batch_size * output_channels;
      output += 1;
      input += 1;
    }
  }
}