/* Copyright (c) 2014-2015 Xiph.Org Foundation
   Written by Viswanath Puttagunta */
/**
   @file celt_neon_intr.c
   @brief ARM Neon Intrinsic optimizations for celt
 */

/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#include "../pitch.h"

#if !defined(FIXED_POINT)
/*
 * Function: xcorr_kernel_neon_float
 * ---------------------------------
 * Computes 4 correlation values and stores them in sum[4]
 */
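/* As a scalar reference, this kernel computes:
 *
 *    for (j = 0; j < 4; j++) {
 *       sum[j] = 0;
 *       for (i = 0; i < len; i++)
 *          sum[j] += x[i] * y[i + j];
 *    }
 */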
static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
      float32_t sum[4], int len) {
   float32x4_t YY[3];
   float32x4_t YEXT[3];
   float32x4_t XX[2];
   float32x2_t XX_2;
   float32x4_t SUMM;
   const float32_t *xi = x;
   const float32_t *yi = y;

   celt_assert(len > 0);

   YY[0] = vld1q_f32(yi);
   SUMM = vdupq_n_f32(0);

   /* Each pass consumes 8 elements of x and loads 12 elements of y
    * (y[0] to y[11]), but the multiply-accumulates only ever use
    * y[0] to y[10]. When len == 8 the buffer only guarantees
    * y[0] to y[10], so y[11] must not be loaded: hence the loop
    * condition is len > 8, not len >= 8.
    */
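   /* For example, on the first pass the loads below cover y[0..11]:
    * YY[0] = y[0..3] (loaded above), YY[1] = y[4..7], YY[2] = y[8..11].
    */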
   while (len > 8) {
      yi += 4;
      YY[1] = vld1q_f32(yi);
      yi += 4;
      YY[2] = vld1q_f32(yi);

      XX[0] = vld1q_f32(xi);
      xi += 4;
      XX[1] = vld1q_f32(xi);
      xi += 4;

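      /* Each vmlaq_lane_f32 below accumulates x[k] * y[k + j] into
       * lane j of SUMM: the x element is broadcast from one lane of
       * XX, while vextq_f32 shifts the y window forward by one
       * element per step.
       */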
      SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
      YEXT[0] = vextq_f32(YY[0], YY[1], 1);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
      YEXT[1] = vextq_f32(YY[0], YY[1], 2);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
      YEXT[2] = vextq_f32(YY[0], YY[1], 3);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);

      SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0);
      YEXT[0] = vextq_f32(YY[1], YY[2], 1);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1);
      YEXT[1] = vextq_f32(YY[1], YY[2], 2);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0);
      YEXT[2] = vextq_f32(YY[1], YY[2], 3);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1);

      YY[0] = YY[2];
      len -= 8;
   }

   /* Each pass consumes 4 elements of x and loads 8 elements of y,
    * but the multiply-accumulates only ever use y[0] to y[6]. When
    * len == 4 the buffer only guarantees y[0] to y[6], so y[7] must
    * not be loaded: hence len > 4, not len >= 4.
    */
   if (len > 4) {
      yi += 4;
      YY[1] = vld1q_f32(yi);

      XX[0] = vld1q_f32(xi);
      xi += 4;

      SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
      YEXT[0] = vextq_f32(YY[0], YY[1], 1);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
      YEXT[1] = vextq_f32(YY[0], YY[1], 2);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
      YEXT[2] = vextq_f32(YY[0], YY[1], 3);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);

      YY[0] = YY[1];
      len -= 4;
   }

   /* Handle the remaining elements one x value at a time. The last
    * element is peeled out of the loop so that the trailing load of
    * y is not performed past the end of the buffer.
    */
   while (--len > 0) {
      XX_2 = vld1_dup_f32(xi++);
      SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
      YY[0] = vld1q_f32(++yi);
   }

   XX_2 = vld1_dup_f32(xi);
   SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);

   vst1q_f32(sum, SUMM);
}

/*
 * Function: xcorr_kernel_neon_float_process1
 * ------------------------------------------
 * Computes a single correlation value and stores it in *sum
 */
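/* As a scalar reference, this kernel computes:
 *
 *    *sum = 0;
 *    for (i = 0; i < len; i++)
 *       *sum += x[i] * y[i];
 */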
static void xcorr_kernel_neon_float_process1(const float32_t *x,
      const float32_t *y, float32_t *sum, int len) {
   float32x4_t XX[4];
   float32x4_t YY[4];
   float32x2_t XX_2;
   float32x2_t YY_2;
   float32x4_t SUMM;
   float32x2_t SUMM_2[2];
   const float32_t *xi = x;
   const float32_t *yi = y;

   SUMM = vdupq_n_f32(0);

   /* Work on 16 values per iteration */
   while (len >= 16) {
      XX[0] = vld1q_f32(xi);
      xi += 4;
      XX[1] = vld1q_f32(xi);
      xi += 4;
      XX[2] = vld1q_f32(xi);
      xi += 4;
      XX[3] = vld1q_f32(xi);
      xi += 4;

      YY[0] = vld1q_f32(yi);
      yi += 4;
      YY[1] = vld1q_f32(yi);
      yi += 4;
      YY[2] = vld1q_f32(yi);
      yi += 4;
      YY[3] = vld1q_f32(yi);
      yi += 4;

      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
      SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
      SUMM = vmlaq_f32(SUMM, YY[2], XX[2]);
      SUMM = vmlaq_f32(SUMM, YY[3], XX[3]);
      len -= 16;
   }

   /* Work on 8 values */
   if (len >= 8) {
      XX[0] = vld1q_f32(xi);
      xi += 4;
      XX[1] = vld1q_f32(xi);
      xi += 4;

      YY[0] = vld1q_f32(yi);
      yi += 4;
      YY[1] = vld1q_f32(yi);
      yi += 4;

      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
      SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
      len -= 8;
   }

   /* Work on 4 values */
   if (len >= 4) {
      XX[0] = vld1q_f32(xi);
      xi += 4;
      YY[0] = vld1q_f32(yi);
      yi += 4;
      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
      len -= 4;
   }

   /* Fold the 4-lane accumulator down to a single value */
   SUMM_2[0] = vget_low_f32(SUMM);
   if (len >= 2) {
      /* While at it, consume 2 more values if available */
      XX_2 = vld1_f32(xi);
      xi += 2;
      YY_2 = vld1_f32(yi);
      yi += 2;
      SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2);
      len -= 2;
   }
   SUMM_2[1] = vget_high_f32(SUMM);
   SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]);
   SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]);
   /* The result is now accumulated in lane 0 of SUMM_2[0] */

   if (len > 0) {
      /* At most one value is left */
      XX_2 = vld1_dup_f32(xi);
      YY_2 = vld1_dup_f32(yi);
      SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2);
   }

   vst1_lane_f32(sum, SUMM_2[0], 0);
}

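/*
 * Function: celt_pitch_xcorr_float_neon
 * -------------------------------------
 * Computes xcorr[i] = sum(x[j] * y[j + i], j = 0..len-1) for every
 * lag i in [0, max_pitch), four lags at a time where possible
 */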
void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
      opus_val32 *xcorr, int len, int max_pitch) {
   int i;
   celt_assert(max_pitch > 0);
   /* _x must be 4-byte aligned for the NEON loads */
   celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);

   for (i = 0; i < (max_pitch-3); i += 4) {
      xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i,
            (float32_t *)xcorr+i, len);
   }

   /* In case max_pitch isn't a multiple of 4, compute a single
    * correlation value per iteration
    */
   for (; i < max_pitch; i++) {
      xcorr_kernel_neon_float_process1((const float32_t *)_x,
            (const float32_t *)_y+i, (float32_t *)xcorr+i, len);
   }
}
#endif