Blame - src/f32-raddstoreexpminusmax/gen/sse2-p5-x4.c - platform/external/XNNPACK

blob: 84ea2198e2659d72e9d5f7994b43f6dce1de1a59 [file] [log] [blame]

Marat Dukhan	b39689d	2020-01-24 13:32:20 -0800	[diff] [blame]	1	// Auto-generated file. Do not edit!
				2	// Template: src/f32-raddstoreexpminusmax/sse2-p5.c.in
				3	// Generator: tools/xngen
				4	//
				5	// Copyright 2019 Google LLC
				6	//
				7	// This source code is licensed under the BSD-style license found in the
				8	// LICENSE file in the root directory of this source tree.
				9
				10	#include <assert.h>
				11
				12	#include <emmintrin.h>
				13
				14	#include <xnnpack/common.h>
				15	#include <xnnpack/raddstoreexpminusmax.h>
				16
				17
				18	void xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x4(
				19	size_t elements,
				20	const float* input,
				21	float* output,
				22	float* sum,
Marat Dukhan	b2217dd	2020-05-28 17:30:28 -0700	[diff] [blame]	23	float max) XNN_DISABLE_TSAN
Marat Dukhan	b39689d	2020-01-24 13:32:20 -0800	[diff] [blame]	24	{
				25	assert(elements % sizeof(float) == 0);
				26
				27	const __m128 vmagic_bias = _mm_set1_ps(0x1.8000FEp23f);
				28	// The smallest x for which expf(x) is normalized.
				29	const __m128 vdenorm_cutoff = _mm_set1_ps(-0x1.5D589Ep6f);
				30	const __m128 vlog2e = _mm_set1_ps(0x1.715476p+0f);
				31	// Last 7 bits are zeroes
				32	const __m128 vminus_ln2_hi = _mm_set1_ps(-0x1.62E400p-1f);
				33	const __m128 vminus_ln2_lo = _mm_set1_ps(-0x1.7F7D1Cp-20f);
				34
				35	const __m128 vc1 = _mm_set1_ps(0x1.FFFFF6p-1f);
				36	const __m128 vc2 = _mm_set1_ps(0x1.FFFDC6p-2f);
				37	const __m128 vc3 = _mm_set1_ps(0x1.555A80p-3f);
				38	const __m128 vc4 = _mm_set1_ps(0x1.573A1Ap-5f);
				39	const __m128 vc5 = _mm_set1_ps(0x1.0F9F9Cp-7f);
				40
				41	const __m128 vi_max = _mm_set1_ps(max);
				42
				43	__m128 vacc0 = _mm_setzero_ps();
				44	for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
				45	// Load 4 (1x4) inputs at a time.
				46	const __m128 vi0123 = _mm_loadu_ps(input);
				47	input += 4;
				48
				49	// Subtract maximum input x := i - i_max. This implies x <= 0.
				50	const __m128 vx0123 = _mm_sub_ps(vi0123, vi_max);
				51
				52	// Compute reduced argument elements := round(x / log(2)).
				53	__m128 vn0123 = _mm_add_ps(_mm_mul_ps(vx0123, vlog2e), vmagic_bias);
				54
				55	// Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
				56	// -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
				57	const __m128 vs0123 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn0123), 23));
				58
				59	// Subtract the large number back to get final elements := round(x / log(2)).
				60	vn0123 = _mm_sub_ps(vn0123, vmagic_bias);
				61
				62	// Compute reduced argument t := x - elements * log(2).
				63	// Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
				64	__m128 vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_hi), vx0123);
				65
				66	vt0123 = _mm_add_ps(_mm_mul_ps(vn0123, vminus_ln2_lo), vt0123);
				67
Marat Dukhan	102a739	2020-11-20 01:18:10 -0800	[diff] [blame^]	68	// Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
Marat Dukhan	b39689d	2020-01-24 13:32:20 -0800	[diff] [blame]	69	__m128 vp0123 = _mm_add_ps(_mm_mul_ps(vc5, vt0123), vc4);
				70
				71	vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc3);
				72
				73	vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc2);
				74
				75	vp0123 = _mm_add_ps(_mm_mul_ps(vp0123, vt0123), vc1);
				76
				77	// Reconstruct the final f value:
				78	// f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
				79	// = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
				80	// = s + (t * s) * p
				81	vt0123 = _mm_mul_ps(vt0123, vs0123);
				82
				83	__m128 vf0123 = _mm_add_ps(_mm_mul_ps(vt0123, vp0123), vs0123);
				84
				85	// For inputs below zero cutoff, replace output with +0.0f.
				86	// Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
				87	vf0123 = _mm_andnot_ps(_mm_cmplt_ps(vx0123, vdenorm_cutoff), vf0123);
				88
				89	// Store 4 (1x4) outputs at a time.
				90	_mm_storeu_ps(output, vf0123);
				91	output += 4;
				92
				93	// Accumulate computed exponents.
				94	vacc0 = _mm_add_ps(vacc0, vf0123);
				95	}
				96
				97	__m128 vacc = vacc0;
				98	for (; elements >= 4 * sizeof(float); elements -= 4 * sizeof(float)) {
				99	// Load 4 inputs at a time.
				100	const __m128 vi = _mm_loadu_ps(input);
				101	input += 4;
				102
				103	// Subtract maximum input x := i - i_max. This implies x <= 0.
				104	const __m128 vx = _mm_sub_ps(vi, vi_max);
				105
				106	// Compute reduced argument elements := round(x / log(2)).
				107	__m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
				108
				109	// Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
				110	// -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
				111	const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
				112
				113	// Subtract the large number back to get final elements := round(x / log(2)).
				114	vn = _mm_sub_ps(vn, vmagic_bias);
				115
				116	// Compute reduced argument t := x - elements * log(2).
				117	// Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
				118	__m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
				119	vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
				120
Marat Dukhan	102a739	2020-11-20 01:18:10 -0800	[diff] [blame^]	121	// Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
Marat Dukhan	b39689d	2020-01-24 13:32:20 -0800	[diff] [blame]	122	__m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
				123	vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
				124	vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
				125	vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
				126
				127	// Reconstruct the final f value:
				128	// f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
				129	// = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
				130	// = s + (t * s) * p
				131	vt = _mm_mul_ps(vt, vs);
				132	__m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
				133
				134	// For inputs below zero cutoff, replace output with +0.0f.
				135	// Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
				136	vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
				137
				138	// Store 4 outputs at a time.
				139	_mm_storeu_ps(output, vf);
				140	output += 4;
				141
				142	// Accumulate computed exponents.
				143	vacc = _mm_add_ps(vacc, vf);
				144	}
				145	if (elements != 0) {
				146	assert(elements >= 1 * sizeof(float));
				147	assert(elements <= 3 * sizeof(float));
				148	// Load 4 inputs at a time.
Marat Dukhan	b2217dd	2020-05-28 17:30:28 -0700	[diff] [blame]	149	const __m128 vi = _mm_loadu_ps(input);
Marat Dukhan	b39689d	2020-01-24 13:32:20 -0800	[diff] [blame]	150
				151	// Subtract maximum input x := i - i_max. This implies x <= 0.
				152	const __m128 vx = _mm_sub_ps(vi, vi_max);
				153
				154	// Compute reduced argument elements := round(x / log(2)).
				155	__m128 vn = _mm_add_ps(_mm_mul_ps(vx, vlog2e), vmagic_bias);
				156
				157	// Create a floating-point number s (scale) such that s == 2**elements for inputs which don't cause underflow, i.e.
				158	// -87.33642 <= x <= 0.0, and -126 <= elements <= 0 accordingly.
				159	const __m128 vs = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(vn), 23));
				160
				161	// Subtract the large number back to get final elements := round(x / log(2)).
				162	vn = _mm_sub_ps(vn, vmagic_bias);
				163
				164	// Compute reduced argument t := x - elements * log(2).
				165	// Use Cody-Waite range reduction method (note two constants to represent log(2)) to improve accuracy.
				166	__m128 vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_hi), vx);
				167	vt = _mm_add_ps(_mm_mul_ps(vn, vminus_ln2_lo), vt);
				168
Marat Dukhan	102a739	2020-11-20 01:18:10 -0800	[diff] [blame^]	169	// Compute degree-5 polynomial approximation for exp(t) on [-log(2)/2, log(2)/2].
Marat Dukhan	b39689d	2020-01-24 13:32:20 -0800	[diff] [blame]	170	__m128 vp = _mm_add_ps(_mm_mul_ps(vc5, vt), vc4);
				171	vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc3);
				172	vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc2);
				173	vp = _mm_add_ps(_mm_mul_ps(vp, vt), vc1);
				174
				175	// Reconstruct the final f value:
				176	// f = s * (1 + t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))))
				177	// = s + (t * s) * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))))
				178	// = s + (t * s) * p
				179	vt = _mm_mul_ps(vt, vs);
				180	__m128 vf = _mm_add_ps(_mm_mul_ps(vt, vp), vs);
				181
				182	// For inputs below zero cutoff, replace output with +0.0f.
				183	// Note that for NaN inputs, comparison result is false, and outputs are left unchanged.
				184	vf = _mm_andnot_ps(_mm_cmplt_ps(vx, vdenorm_cutoff), vf);
				185
				186	if (elements & (2 * sizeof(float))) {
				187	// Store 2 outputs at a time.
				188	_mm_storel_pi((__m64*) output, vf);
				189	output += 2;
				190
				191	// Accumulate 2 computed exponents.
				192	vacc = _mm_add_ps(vacc, _mm_movelh_ps(vf, _mm_setzero_ps()));
				193
				194	vf = _mm_movehl_ps(vf, vf);
				195	}
				196	if (elements & (1 * sizeof(float))) {
				197	// Store 1 output at a time.
				198	_mm_store_ss(output, vf);
				199
				200	// Accumulate 1 computed exponent.
				201	vacc = _mm_add_ss(vacc, vf);
				202	}
				203	}
				204	// Reduce 4 elements in the SIMD register
				205	vacc = _mm_add_ps(vacc, _mm_movehl_ps(vacc, vacc));
				206	vacc = _mm_add_ss(vacc, _mm_shuffle_ps(vacc, vacc, _MM_SHUFFLE(2, 3, 0, 1)));
				207	_mm_store_ss(sum, vacc);
				208	}