Blame - src/f32-gemm/wasmsimd-s4.c.in - platform/external/XNNPACK

blob: f92657799a1079b8afbe4aef7f39a3aeea71d157 [file] [log] [blame]

Marat Dukhan	1bbf96b	2020-06-15 23:01:20 -0700	[diff] [blame^]	1	// Copyright 2020 Google LLC
				2	//
				3	// This source code is licensed under the BSD-style license found in the
				4	// LICENSE file in the root directory of this source tree.
				5
				6	$assert NR % 4 == 0
				7	$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
				8	#include <assert.h>
				9
				10	#include <wasm_simd128.h>
				11
				12	#include <xnnpack/gemm.h>
				13
				14
				15	void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_${MR}x${NR}s4__wasmsimd_${"x86" if X86 else "arm"}(
				16	size_t mr,
				17	size_t nc,
				18	size_t kc,
				19	const float*restrict a,
				20	size_t a_stride,
				21	const float*restrict w,
				22	float*restrict c,
				23	size_t cm_stride,
				24	size_t cn_stride,
				25	$if INC:
				26	const float*restrict acc,
				27	const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
				28	{
				29	assert(mr != 0);
				30	assert(mr <= ${MR});
				31	assert(nc != 0);
				32	assert(kc != 0);
				33	assert(kc % sizeof(float) == 0);
				34	assert(a != NULL);
				35	assert(w != NULL);
				36	assert(c != NULL);
				37	$if INC:
				38	assert(acc != NULL);
				39
				40	const float* a0 = a;
				41	float* c0 = c;
				42	$for M in range(1, MR):
				43	const float* a${M} = (const float*) ((uintptr_t) a${M-1} + a_stride);
				44	float* c${M} = (float*) ((uintptr_t) c${M-1} + cm_stride);
				45	$if M % 2 == 0:
				46	if XNN_UNPREDICTABLE(mr <= ${M}) {
				47	a${M} = a${M-1};
				48	c${M} = c${M-1};
				49	}
				50	$elif M + 1 == MR:
				51	if XNN_UNPREDICTABLE(mr != ${M+1}) {
				52	a${M} = a${M-1};
				53	c${M} = c${M-1};
				54	}
				55	$else:
				56	if XNN_UNPREDICTABLE(mr < ${M+1}) {
				57	a${M} = a${M-1};
				58	c${M} = c${M-1};
				59	}
				60
				61	do {
				62	$if INC:
				63	$for M in range(MR):
				64	$for N in range(0, NR, 4):
				65	v128_t vacc${M}x${ABC[N:N+4]} = wasm_v128_load(acc + ${M*NR+N});
				66	acc += ${MR*NR};
				67	$else:
				68	$for N in range(0, NR, 4):
				69	v128_t vacc0x${ABC[N:N+4]} = wasm_v128_load(w + ${N});
				70	$for M in range(1, MR):
				71	$for N in range(0, NR, 4):
				72	v128_t vacc${M}x${ABC[N:N+4]} = vacc0x${ABC[N:N+4]};
				73	w += ${NR};
				74
				75	size_t k = kc;
				76	while (k >= 4 * sizeof(float)) {
				77	$for M in range(MR):
				78	v128_t va${M} = wasm_v128_load(a${M});
				79	a${M} += 4;
				80
				81	$for L in range(4):
				82
				83	$for N in range(0, NR, 4):
				84	const v128_t vb${ABC[N:N+4]}c${L} = wasm_v128_load(w + ${L * NR + N});
				85
				86	$for N in range(0, NR, 4):
				87	$for M in range(MR):
				88	vacc${M}x${ABC[N:N+4]} = wasm_f32x4_add(vacc${M}x${ABC[N:N+4]}, wasm_f32x4_mul(va${M}, vb${ABC[N:N+4]}c${L}));
				89
				90	$if L + 1 != 4:
				91	$for M in range(MR):
				92	va${M} = wasm_v32x4_shuffle(va${M}, va${M}, 1, 2, 3, 0);
				93
				94	w += ${4 * NR};
				95	k -= 4 * sizeof(float);
				96	}
				97	if XNN_UNLIKELY(k != 0) {
				98	do {
				99	$for M in range(MR):
				100	const v128_t va${M} = wasm_v32x4_load_splat(a${M});
				101	a${M} += 1;
				102
				103	const v128_t vb${ABC[0:4]} = wasm_v128_load(w);
				104	$for N in range(4, NR, 4):
				105	const v128_t vb${ABC[N:N+4]} = wasm_v128_load(w + ${N});
				106	w += ${NR};
				107
				108	$for N in range(0, NR, 4):
				109	$for M in range(MR):
				110	vacc${M}x${ABC[N:N+4]} = wasm_f32x4_add(vacc${M}x${ABC[N:N+4]}, wasm_f32x4_mul(va${M}, vb${ABC[N:N+4]}));
				111
				112	k -= sizeof(float);
				113	} while (k != 0);
				114	}
				115
				116	const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
				117	$for N in range(0, NR, 4):
				118	$for M in range(MR):
				119	$if X86:
				120	vacc${M}x${ABC[N:N+4]} = wasm_v128_bitselect(vacc${M}x${ABC[N:N+4]}, vmax, wasm_f32x4_le(vacc${M}x${ABC[N:N+4]}, vmax));
				121	$else:
				122	vacc${M}x${ABC[N:N+4]} = wasm_f32x4_min(vacc${M}x${ABC[N:N+4]}, vmax);
				123
				124	const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
				125	$for N in range(0, NR, 4):
				126	$for M in range(MR):
				127	$if X86:
				128	vacc${M}x${ABC[N:N+4]} = wasm_v128_bitselect(vmin, vacc${M}x${ABC[N:N+4]}, wasm_f32x4_lt(vacc${M}x${ABC[N:N+4]}, vmin));
				129	$else:
				130	vacc${M}x${ABC[N:N+4]} = wasm_f32x4_max(vacc${M}x${ABC[N:N+4]}, vmin);
				131
				132	if XNN_LIKELY(nc >= ${NR}) {
				133	$for M in reversed(range(MR)):
				134	wasm_v128_store(c${M}, vacc${M}x${ABC[0:4]});
				135	$for N in range(4, NR, 4):
				136	wasm_v128_store(c${M} + ${N}, vacc${M}x${ABC[N:N+4]});
				137	c${M} = (float*) ((uintptr_t) c${M} + cn_stride);
				138
				139	$for M in reversed(range(MR)):
				140	a${M} = (const float*) ((uintptr_t) a${M} - kc);
				141
				142	nc -= ${NR};
				143	} else {
				144	$for LOG2N in reversed(range(NR.bit_length())):
				145	$if NR != 1 << LOG2N:
				146	if (nc & ${1 << LOG2N}) {
				147	$if LOG2N >= 2:
				148	$for M in reversed(range(MR)):
				149	wasm_v128_store(c${M}, vacc${M}x${ABC[0:4]});
				150	$for N in range(4, 1 << LOG2N, 4):
				151	wasm_v128_store(c${M} + ${N}, vacc${M}x${ABC[N:N+4]});
				152
				153	$for M in reversed(range(MR)):
				154	$for N in range(0, 1 << (LOG2N - 1), 4):
				155	vacc${M}x${ABC[N:N+4]} = vacc${M}x${ABC[N + (1 << LOG2N):N + (1 << LOG2N)+4]};
				156
				157	$for M in reversed(range(MR)):
				158	c${M} += ${1 << LOG2N};
				159	$elif LOG2N == 1:
				160	$for M in reversed(range(MR)):
				161	((double) c${M}) = wasm_f64x2_extract_lane(vacc${M}x${ABC[0:4]}, 0);
				162
				163	$for M in reversed(range(MR)):
				164	vacc${M}x${ABC[0:4]} = wasm_v32x4_shuffle(vacc${M}x${ABC[0:4]}, vacc${M}x${ABC[0:4]}, 2, 3, 2, 3);
				165
				166	$for M in reversed(range(MR)):
				167	c${M} += 2;
				168	$elif LOG2N == 0:
				169	$for M in reversed(range(MR)):
				170	*c${M} = wasm_f32x4_extract_lane(vacc${M}x${ABC[0:4]}, 0);
				171	}
				172
				173	nc = 0;
				174	}
				175	} while (nc != 0);
				176	}