Blame - src/math/sqrt-sse-nr1mac.c - platform/external/XNNPACK

blob: 1253d2757f5ce387efa58bc0347224fdfccd3362 [file] [log] [blame]

Marat Dukhan	8400076	2020-06-29 18:38:43 -0700	[diff] [blame]	1	// Copyright 2020 Google LLC
				2	//
				3	// This source code is licensed under the BSD-style license found in the
				4	// LICENSE file in the root directory of this source tree.
				5
				6	#include <assert.h>
				7	#include <stddef.h>
				8
				9	#include <xmmintrin.h>
				10
				11	#include <xnnpack/math.h>
				12	#include <xnnpack/math-stubs.h>
				13
				14
				15	void xnn_math_f32_sqrt__sse_nr1mac(
				16	size_t n,
				17	const float* input,
				18	float* output)
				19	{
				20	assert(n % (4 * sizeof(float)) == 0);
				21
				22	const __m128 vthree_halfs = _mm_set1_ps(1.5f);
				23	const __m128 vhalf = _mm_set1_ps(0.5f);
				24	for (; n != 0; n -= 4 * sizeof(float)) {
				25	const __m128 vx = _mm_load_ps(input);
				26	input += 4;
				27
				28	// Initial approximation
				29	__m128 vrsqrtx = _mm_rsqrt_ps(vx);
				30	const __m128 vhalfx = _mm_mul_ps(vx, vhalf);
				31
				32	// Netwon-Raphson iteration: rsqrt_x <- rsqrt_x * (3/2 - x/2 * rsqrt_x * rsqrt_x)
				33	// Note: (half_x * rsqrt_x) * rsqrt_x is less accurate than half_x * (rsqrt_x * rsqrt_x)
				34	vrsqrtx = _mm_mul_ps(vrsqrtx, _mm_sub_ps(vthree_halfs, _mm_mul_ps(vhalfx, _mm_mul_ps(vrsqrtx, vrsqrtx))));
				35
				36	// Reconstruct sqrt(x) = rsqrt(x) * x
				37	const __m128 vy = _mm_mul_ps(vrsqrtx, vx);
				38
				39	_mm_store_ps(output, vy);
				40	output += 4;
				41	}
				42	}