Blame - src/f32-vsqrt/neonfma-nr1rsqrts1fma1adj.c.in - platform/external/XNNPACK

blob: 2daa5f172d6e75a50d29883e1cafa85a06e75b0d [file] [log] [blame]

Marat Dukhan	f4db2f3	2020-06-30 10:55:30 -0700	[diff] [blame]	1	// Copyright 2020 Google LLC
				2	//
				3	// This source code is licensed under the BSD-style license found in the
				4	// LICENSE file in the root directory of this source tree.
				5
				6	$assert BATCH_TILE % 4 == 0
				7	$assert BATCH_TILE >= 4
				8	$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
				9	#include <assert.h>
				10	#include <math.h>
				11
				12	#include <arm_neon.h>
				13
				14	#include <xnnpack/common.h>
				15	#include <xnnpack/vunary.h>
				16
				17
				18	void xnn_f32_vsqrt_ukernel__neonfma_nr1rsqrts1fma1adj_x${BATCH_TILE}(  // Writes the square root of each float read from x to y.
				19	size_t n,  // Input size in bytes; asserted below to be non-zero and a multiple of sizeof(float).
				20	const float* x,  // Input floats, read sequentially.
				21	float* y,  // Output floats, written sequentially, one per input element.
				22	const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not referenced anywhere in this body.
				23	{
				24	assert(n != 0);
				25	assert(n % sizeof(float) == 0);
				26
					// NOTE(review): template indentation is not visible in this view, so the extent of
					// the $if below cannot be determined here. If it encloses vhalf and the vector loop,
					// BATCH_TILE == 4 would generate only the scalar tail at the bottom - confirm
					// against the real file.
				27	$if BATCH_TILE > 4:
				28	const float32x4_t vhalf = vmovq_n_f32(0.5f);
					// Vector loop: each iteration consumes BATCH_TILE floats, processed as
					// BATCH_TILE / 4 four-lane vectors. n is decremented in bytes.
				29	for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
				30	$for N in range(0, BATCH_TILE, 4):
				31	const float32x4_t vx${ABC[N:N+4]} = vld1q_f32(x); x += 4;
				32
					// Initial reciprocal square root estimate of x.
					// NOTE(review): no special handling of zero, infinite, or negative inputs is
					// visible in this vector path - confirm the expected input range.
				33	$for N in range(0, BATCH_TILE, 4):
				34	float32x4_t vrsqrtx${ABC[N:N+4]} = vrsqrteq_f32(vx${ABC[N:N+4]});
				35
					// Refinement of the estimate: square it, feed x and the square to vrsqrtsq_f32
					// (the Arm reciprocal-square-root step intrinsic), and scale the estimate by
					// the returned correction factor.
				36	$for N in range(0, BATCH_TILE, 4):
				37	const float32x4_t vrx${ABC[N:N+4]} = vmulq_f32(vrsqrtx${ABC[N:N+4]}, vrsqrtx${ABC[N:N+4]});
				38
				39	$for N in range(0, BATCH_TILE, 4):
				40	const float32x4_t vcorrection${ABC[N:N+4]} = vrsqrtsq_f32(vx${ABC[N:N+4]}, vrx${ABC[N:N+4]});
				41
				42	$for N in range(0, BATCH_TILE, 4):
				43	vrsqrtx${ABC[N:N+4]} = vmulq_f32(vrsqrtx${ABC[N:N+4]}, vcorrection${ABC[N:N+4]});
				44
					// vsqrtx = vrsqrtx * x is the working square root value;
					// vhalfrsqrtx = vrsqrtx * 0.5 is half of the reciprocal square root value.
				45	$for N in range(0, BATCH_TILE, 4):
				46	float32x4_t vsqrtx${ABC[N:N+4]} = vmulq_f32(vrsqrtx${ABC[N:N+4]}, vx${ABC[N:N+4]});
				47	float32x4_t vhalfrsqrtx${ABC[N:N+4]} = vmulq_f32(vrsqrtx${ABC[N:N+4]}, vhalf);
				48
					// vresidual = 0.5 - vsqrtx * vhalfrsqrtx (fused multiply-subtract).
				49	$for N in range(0, BATCH_TILE, 4):
				50	const float32x4_t vresidual${ABC[N:N+4]} = vfmsq_f32(vhalf, vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[N:N+4]});
				51
					// Apply the residual to both values with fused multiply-adds: v += vresidual * v.
				52	$for N in range(0, BATCH_TILE, 4):
				53	vhalfrsqrtx${ABC[N:N+4]} = vfmaq_f32(vhalfrsqrtx${ABC[N:N+4]}, vresidual${ABC[N:N+4]}, vhalfrsqrtx${ABC[N:N+4]});
				54	vsqrtx${ABC[N:N+4]} = vfmaq_f32(vsqrtx${ABC[N:N+4]}, vresidual${ABC[N:N+4]}, vsqrtx${ABC[N:N+4]});
				55
					// vadjustment = x - vsqrtx * vsqrtx: how far the squared value still is from x.
				56	$for N in range(0, BATCH_TILE, 4):
				57	const float32x4_t vadjustment${ABC[N:N+4]} = vfmsq_f32(vx${ABC[N:N+4]}, vsqrtx${ABC[N:N+4]}, vsqrtx${ABC[N:N+4]});
				58
					// Final value: vy = vsqrtx + vhalfrsqrtx * vadjustment.
				59	$for N in range(0, BATCH_TILE, 4):
				60	const float32x4_t vy${ABC[N:N+4]} = vfmaq_f32(vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[N:N+4]}, vadjustment${ABC[N:N+4]});
				61
					// Store the results, four floats per store.
				62	$for N in range(0, BATCH_TILE, 4):
				63	vst1q_f32(y, vy${ABC[N:N+4]}); y += 4;
				64	}
				65	if XNN_UNLIKELY(n != 0) {  // Elements not consumed by the vector loop are handled one at a time.
				66	do {
				67	const float vx = *x++;
				68	const float vy = sqrtf(vx);  // Scalar tail uses the C library square root.
				69	*y++ = vy;
				70	n -= sizeof(float);
				71	} while (n != 0);
				72	}
				73	}