Blame - src/math/sqrt-avx512f-nr1fma.c - platform/external/XNNPACK

blob: 32a0d9853f008ec395b5a7fab5ff76e13c94dda7 [file] [log] [blame]

Marat Dukhan	8400076	2020-06-29 18:38:43 -0700	[diff] [blame]	1	// Copyright 2020 Google LLC
				2	//
				3	// This source code is licensed under the BSD-style license found in the
				4	// LICENSE file in the root directory of this source tree.
				5
				6	#include <assert.h>
				7	#include <stddef.h>
				8
				9	#include <immintrin.h>
				10
				11	#include <xnnpack/math.h>
				12	#include <xnnpack/math-stubs.h>
				13
				14
				15	void xnn_math_f32_sqrt__avx512f_nr1fma(
				16	size_t n,
				17	const float* input,
				18	float* output)
				19	{
				20	assert(n % (16 * sizeof(float)) == 0);
				21
				22	const __m512 vhalf = _mm512_set1_ps(0.5f);
				23	for (; n != 0; n -= 16 * sizeof(float)) {
				24	const __m512 vx = _mm512_load_ps(input);
				25	input += 16;
				26
				27	// Initial approximation
				28	const __m512 vrsqrtx = _mm512_rsqrt14_ps(vx);
				29	__m512 vsqrtx = _mm512_mul_ps(vrsqrtx, vx);
				30	const __m512 vhalfrsqrtx = _mm512_mul_ps(vrsqrtx, vhalf);
				31
				32	// Netwon-Raphson iteration:
				33	// residual <- 0.5 - sqrtx * halfrsqrtx
				34	// sqrtx <- sqrtx + sqrtx * residual
				35	const __m512 vresidual = _mm512_fnmadd_ps(vsqrtx, vhalfrsqrtx, vhalf);
				36	vsqrtx = _mm512_fmadd_ps(vsqrtx, vresidual, vsqrtx);
				37
				38	const __m512 vy = vsqrtx;
				39
				40	_mm512_store_ps(output, vy);
				41	output += 16;
				42	}
				43	}