Blame - src/f32-velu/gen/velu-wasmsimd-arm-rr2-lut16-p3-x8.c - platform/external/XNNPACK

blob: 1902b8f456ba233411375d9493dbe35e137cb720 [file] [log] [blame]

Marat Dukhan	ed6baaf	2020-12-01 15:07:08 -0800	[diff] [blame]	1	// Auto-generated file. Do not edit!
				2	// Template: src/f32-velu/wasmsimd-rr2-lut16-p3.c.in
				3	// Generator: tools/xngen
				4	//
				5	// Copyright 2020 Google LLC
				6	//
				7	// This source code is licensed under the BSD-style license found in the
				8	// LICENSE file in the root directory of this source tree.
				9
				10	#include <assert.h>
				11
				12	#include <wasm_simd128.h>
				13
				14	#include <xnnpack/vunary.h>
				15	#include <xnnpack/common.h>
				16
				17
				18	extern XNN_INTERNAL const float xnn_table_exp2minus_k_over_16[16];
				19
				20	void xnn_f32_velu_ukernel__wasmsimd_arm_rr2_lut16_p3_x8(
				21	size_t n,
				22	const float* x,
				23	float* y,
				24	const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
				25	{
				26	assert(n != 0);
				27	assert(n % sizeof(float) == 0);
				28	assert(x != NULL);
				29	assert(y != NULL);
				30
				31	const v128_t vprescale = wasm_v32x4_load_splat(&params->scalar.prescale);
				32	const v128_t valpha = wasm_v32x4_load_splat(&params->scalar.alpha);
				33	const v128_t vbeta = wasm_v32x4_load_splat(&params->scalar.beta);
				34
				35	const v128_t vsat_cutoff = wasm_f32x4_splat(-0x1.154246p+4f);
				36	const v128_t vmagic_bias = wasm_f32x4_splat(0x1.800000p19f);
				37	const v128_t vlog2e = wasm_f32x4_splat(0x1.715476p+0f);
				38	const v128_t vindex_mask = wasm_i32x4_splat(0xF);
				39	const v128_t vminus_ln2_hi = wasm_f32x4_splat(-0x1.62E400p-1f);
				40	const v128_t vminus_ln2_lo = wasm_f32x4_splat(-0x1.7F7D1Cp-20f);
				41	const v128_t vc3 = wasm_f32x4_splat(0x1.55561Cp-3f);
				42	const v128_t vc2 = wasm_f32x4_splat(0x1.0001ECp-1f);
				43	const v128_t vone = wasm_f32x4_splat(1.0f);
				44
				45	for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
				46	v128_t vx0123 = wasm_v128_load(x);
				47	v128_t vx4567 = wasm_v128_load(x + 4);
				48	x += 8;
				49
				50	const v128_t vz0123 = wasm_f32x4_max(wasm_f32x4_mul(vx0123, vprescale), vsat_cutoff);
				51	const v128_t vz4567 = wasm_f32x4_max(wasm_f32x4_mul(vx4567, vprescale), vsat_cutoff);
				52
				53	v128_t vn0123 = wasm_f32x4_add(wasm_f32x4_mul(vz0123, vlog2e), vmagic_bias);
				54	v128_t vn4567 = wasm_f32x4_add(wasm_f32x4_mul(vz4567, vlog2e), vmagic_bias);
				55
				56	const v128_t vidx0123 = wasm_i32x4_shl(wasm_v128_and(vn0123, vindex_mask), 2);
				57	const v128_t ven0123 = wasm_i32x4_shl(vn0123, 19);
				58	const v128_t vidx4567 = wasm_i32x4_shl(wasm_v128_and(vn4567, vindex_mask), 2);
				59	const v128_t ven4567 = wasm_i32x4_shl(vn4567, 19);
				60
				61	const uint64_t vidx01 = wasm_i64x2_extract_lane(vidx0123, 0);
				62	const uint64_t vidx23 = wasm_i64x2_extract_lane(vidx0123, 1);
				63	const float vl0 = ((const float) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx01));
				64	const float vl1 = ((const float) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx01 >> 32)));
				65	const float vl2 = ((const float) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx23));
				66	const float vl3 = ((const float) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx23 >> 32)));
				67	const v128_t vl0123 = wasm_f32x4_make(vl0, vl1, vl2, vl3);
				68	const uint64_t vidx45 = wasm_i64x2_extract_lane(vidx4567, 0);
				69	const uint64_t vidx67 = wasm_i64x2_extract_lane(vidx4567, 1);
				70	const float vl4 = ((const float) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx45));
				71	const float vl5 = ((const float) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx45 >> 32)));
				72	const float vl6 = ((const float) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx67));
				73	const float vl7 = ((const float) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx67 >> 32)));
				74	const v128_t vl4567 = wasm_f32x4_make(vl4, vl5, vl6, vl7);
				75
				76	vn0123 = wasm_f32x4_sub(vn0123, vmagic_bias);
				77	v128_t vs0123 = wasm_i32x4_add(vl0123, ven0123);
				78	vn4567 = wasm_f32x4_sub(vn4567, vmagic_bias);
				79	v128_t vs4567 = wasm_i32x4_add(vl4567, ven4567);
				80
Marat Dukhan	ed6baaf	2020-12-01 15:07:08 -0800	[diff] [blame]	81	v128_t vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_hi), vz0123);
				82	v128_t vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_hi), vz4567);
				83
				84	vt0123 = wasm_f32x4_add(wasm_f32x4_mul(vn0123, vminus_ln2_lo), vt0123);
				85	vt4567 = wasm_f32x4_add(wasm_f32x4_mul(vn4567, vminus_ln2_lo), vt4567);
				86
				87	v128_t vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vc3, vt0123), vc2);
				88	v128_t vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vc3, vt4567), vc2);
				89
				90	vp0123 = wasm_f32x4_mul(vp0123, vt0123);
				91	vp4567 = wasm_f32x4_mul(vp4567, vt4567);
				92
				93	vt0123 = wasm_f32x4_mul(vt0123, vs0123);
				94	vs0123 = wasm_f32x4_sub(vs0123, vone);
				95	vt4567 = wasm_f32x4_mul(vt4567, vs4567);
				96	vs4567 = wasm_f32x4_sub(vs4567, vone);
				97
				98	vp0123 = wasm_f32x4_add(wasm_f32x4_mul(vp0123, vt0123), vt0123);
				99	vp4567 = wasm_f32x4_add(wasm_f32x4_mul(vp4567, vt4567), vt4567);
				100
				101	const v128_t ve0123 = wasm_f32x4_mul(wasm_f32x4_add(vp0123, vs0123), valpha);
				102	const v128_t ve4567 = wasm_f32x4_mul(wasm_f32x4_add(vp4567, vs4567), valpha);
				103
Marat Dukhan	e332dd6	2020-12-14 14:31:54 -0800	[diff] [blame]	104	const v128_t vsignm0123 = wasm_i32x4_shr(vx0123, 31);
Marat Dukhan	ed6baaf	2020-12-01 15:07:08 -0800	[diff] [blame]	105	vx0123 = wasm_f32x4_mul(vx0123, vbeta);
Marat Dukhan	e332dd6	2020-12-14 14:31:54 -0800	[diff] [blame]	106	const v128_t vsignm4567 = wasm_i32x4_shr(vx4567, 31);
Marat Dukhan	ed6baaf	2020-12-01 15:07:08 -0800	[diff] [blame]	107	vx4567 = wasm_f32x4_mul(vx4567, vbeta);
				108
Marat Dukhan	e332dd6	2020-12-14 14:31:54 -0800	[diff] [blame]	109	const v128_t vy0123 = wasm_v128_bitselect(ve0123, vx0123, vsignm0123);
				110	const v128_t vy4567 = wasm_v128_bitselect(ve4567, vx4567, vsignm4567);
Marat Dukhan	ed6baaf	2020-12-01 15:07:08 -0800	[diff] [blame]	111
				112	wasm_v128_store(y, vy0123);
				113	wasm_v128_store(y + 4, vy4567);
				114	y += 8;
				115	}
				116	for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
				117	v128_t vx = wasm_v128_load(x);
				118	x += 4;
				119
				120	const v128_t vz = wasm_f32x4_max(wasm_f32x4_mul(vx, vprescale), vsat_cutoff);
				121
				122	v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vz, vlog2e), vmagic_bias);
				123	const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2);
				124	const v128_t ven = wasm_i32x4_shl(vn, 19);
				125
				126	const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0);
				127	const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1);
				128	const float vl0 = ((const float) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo));
				129	const float vl1 = ((const float) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)));
				130	const float vl2 = ((const float) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_hi));
				131	const float vl3 = ((const float) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_hi >> 32)));
				132	const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3);
				133
				134	v128_t vs = wasm_i32x4_add(vl, ven);
				135	vn = wasm_f32x4_sub(vn, vmagic_bias);
				136
				137	v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vz);
				138	vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt);
				139
				140	v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc3, vt), vc2);
				141	vp = wasm_f32x4_mul(vp, vt);
				142
				143	vt = wasm_f32x4_mul(vt, vs);
				144	vs = wasm_f32x4_sub(vs, vone);
				145	vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
				146	const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
				147
Marat Dukhan	e332dd6	2020-12-14 14:31:54 -0800	[diff] [blame]	148	const v128_t vsignm = wasm_i32x4_shr(vx, 31);
Marat Dukhan	ed6baaf	2020-12-01 15:07:08 -0800	[diff] [blame]	149	vx = wasm_f32x4_mul(vx, vbeta);
Marat Dukhan	e332dd6	2020-12-14 14:31:54 -0800	[diff] [blame]	150	const v128_t vy = wasm_v128_bitselect(ve, vx, vsignm);
Marat Dukhan	ed6baaf	2020-12-01 15:07:08 -0800	[diff] [blame]	151
				152	wasm_v128_store(y, vy);
				153	y += 4;
				154	}
				155	if XNN_UNLIKELY(n != 0) {
				156	v128_t vx = wasm_v128_load(x);
				157
				158	const v128_t vz = wasm_f32x4_max(wasm_f32x4_mul(vx, vprescale), vsat_cutoff);
				159
				160	v128_t vn = wasm_f32x4_add(wasm_f32x4_mul(vz, vlog2e), vmagic_bias);
				161	const v128_t vidx = wasm_i32x4_shl(wasm_v128_and(vn, vindex_mask), 2);
				162	const v128_t ven = wasm_i32x4_shl(vn, 19);
				163
				164	const uint64_t vidx_lo = wasm_i64x2_extract_lane(vidx, 0);
				165	const uint64_t vidx_hi = wasm_i64x2_extract_lane(vidx, 1);
				166	const float vl0 = ((const float) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_lo));
				167	const float vl1 = ((const float) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_lo >> 32)));
				168	const float vl2 = ((const float) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) vidx_hi));
				169	const float vl3 = ((const float) ((uintptr_t) xnn_table_exp2minus_k_over_16 + (uint32_t) (vidx_hi >> 32)));
				170	const v128_t vl = wasm_f32x4_make(vl0, vl1, vl2, vl3);
				171
				172	v128_t vs = wasm_i32x4_add(vl, ven);
				173	vn = wasm_f32x4_sub(vn, vmagic_bias);
				174
				175	v128_t vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_hi), vz);
				176	vt = wasm_f32x4_add(wasm_f32x4_mul(vn, vminus_ln2_lo), vt);
				177
				178	v128_t vp = wasm_f32x4_add(wasm_f32x4_mul(vc3, vt), vc2);
				179	vp = wasm_f32x4_mul(vp, vt);
				180
				181	vt = wasm_f32x4_mul(vt, vs);
				182	vs = wasm_f32x4_sub(vs, vone);
				183	vp = wasm_f32x4_add(wasm_f32x4_mul(vp, vt), vt);
				184	const v128_t ve = wasm_f32x4_mul(wasm_f32x4_add(vp, vs), valpha);
				185
Marat Dukhan	e332dd6	2020-12-14 14:31:54 -0800	[diff] [blame]	186	const v128_t vsignm = wasm_i32x4_shr(vx, 31);
Marat Dukhan	ed6baaf	2020-12-01 15:07:08 -0800	[diff] [blame]	187	vx = wasm_f32x4_mul(vx, vbeta);
Marat Dukhan	e332dd6	2020-12-14 14:31:54 -0800	[diff] [blame]	188	v128_t vy = wasm_v128_bitselect(ve, vx, vsignm);
Marat Dukhan	ed6baaf	2020-12-01 15:07:08 -0800	[diff] [blame]	189
				190	if (n & (2 * sizeof(float))) {
				191	((double) y) = wasm_f64x2_extract_lane(vy, 0);
				192	vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
				193	y += 2;
				194	}
				195	if (n & (1 * sizeof(float))) {
				196	*y = wasm_f32x4_extract_lane(vy, 0);
				197	}
				198	}
				199	}