/* Copyright (c) 2014-2015 Xiph.Org Foundation
   Written by Viswanath Puttagunta */
/**
   @file celt_neon_intr.c
   @brief ARM Neon Intrinsic optimizations for celt
 */

/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <arm_neon.h>
#include "../pitch.h"

#if !defined(FIXED_POINT)
/*
 * Function: xcorr_kernel_neon_float
 * ---------------------------------
 * Computes 4 correlation values and stores them in sum[4]
 */
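/* As a scalar reference, this kernel computes:
 *
 *    for (j = 0; j < 4; j++) {
 *       sum[j] = 0;
 *       for (i = 0; i < len; i++)
 *          sum[j] += x[i] * y[i + j];
 *    }
 */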
static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
      float32_t sum[4], int len) {
   float32x4_t YY[3];
   float32x4_t YEXT[3];
   float32x4_t XX[2];
   float32x2_t XX_2;
   float32x4_t SUMM;
   const float32_t *xi = x;
   const float32_t *yi = y;

   celt_assert(len > 0);

   YY[0] = vld1q_f32(yi);
   SUMM = vdupq_n_f32(0);

   /* Each pass consumes 8 elements of x and loads 12 elements of y
    * (y[0] to y[11]), but the multiply-accumulates only ever use
    * y[0] to y[10]. When len == 8 the buffer only guarantees
    * y[0] to y[10], so y[11] must not be loaded: hence the loop
    * condition is len > 8, not len >= 8.
    */
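   /* For example, on the first pass the loads below cover y[0..11]:
    * YY[0] = y[0..3] (loaded above), YY[1] = y[4..7], YY[2] = y[8..11].
    */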
   while (len > 8) {
      yi += 4;
      YY[1] = vld1q_f32(yi);
      yi += 4;
      YY[2] = vld1q_f32(yi);

      XX[0] = vld1q_f32(xi);
      xi += 4;
      XX[1] = vld1q_f32(xi);
      xi += 4;

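      /* Each vmlaq_lane_f32 below accumulates x[k] * y[k + j] into
       * lane j of SUMM: the x element is broadcast from one lane of
       * XX, while vextq_f32 shifts the y window forward by one
       * element per step.
       */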
      SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
      YEXT[0] = vextq_f32(YY[0], YY[1], 1);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
      YEXT[1] = vextq_f32(YY[0], YY[1], 2);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
      YEXT[2] = vextq_f32(YY[0], YY[1], 3);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);

      SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0);
      YEXT[0] = vextq_f32(YY[1], YY[2], 1);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1);
      YEXT[1] = vextq_f32(YY[1], YY[2], 2);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0);
      YEXT[2] = vextq_f32(YY[1], YY[2], 3);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1);

      YY[0] = YY[2];
      len -= 8;
   }

   /* Each pass consumes 4 elements of x and loads 8 elements of y,
    * but the multiply-accumulates only ever use y[0] to y[6]. When
    * len == 4 the buffer only guarantees y[0] to y[6], so y[7] must
    * not be loaded: hence len > 4, not len >= 4.
    */
   if (len > 4) {
      yi += 4;
      YY[1] = vld1q_f32(yi);

      XX[0] = vld1q_f32(xi);
      xi += 4;

      SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
      YEXT[0] = vextq_f32(YY[0], YY[1], 1);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
      YEXT[1] = vextq_f32(YY[0], YY[1], 2);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
      YEXT[2] = vextq_f32(YY[0], YY[1], 3);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);

      YY[0] = YY[1];
      len -= 4;
   }

   /* Handle the remaining elements one x value at a time. The last
    * element is peeled out of the loop so that the trailing load of
    * y is not performed past the end of the buffer.
    */
   while (--len > 0) {
      XX_2 = vld1_dup_f32(xi++);
      SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
      YY[0] = vld1q_f32(++yi);
   }

   XX_2 = vld1_dup_f32(xi);
   SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);

   vst1q_f32(sum, SUMM);
}

/*
 * Function: xcorr_kernel_neon_float_process1
 * ------------------------------------------
 * Computes a single correlation value and stores it in *sum
 */
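/* As a scalar reference, this kernel computes:
 *
 *    *sum = 0;
 *    for (i = 0; i < len; i++)
 *       *sum += x[i] * y[i];
 */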
static void xcorr_kernel_neon_float_process1(const float32_t *x,
      const float32_t *y, float32_t *sum, int len) {
   float32x4_t XX[4];
   float32x4_t YY[4];
   float32x2_t XX_2;
   float32x2_t YY_2;
   float32x4_t SUMM;
   float32x2_t SUMM_2[2];
   const float32_t *xi = x;
   const float32_t *yi = y;

   SUMM = vdupq_n_f32(0);

   /* Work on 16 values per iteration */
   while (len >= 16) {
      XX[0] = vld1q_f32(xi);
      xi += 4;
      XX[1] = vld1q_f32(xi);
      xi += 4;
      XX[2] = vld1q_f32(xi);
      xi += 4;
      XX[3] = vld1q_f32(xi);
      xi += 4;

      YY[0] = vld1q_f32(yi);
      yi += 4;
      YY[1] = vld1q_f32(yi);
      yi += 4;
      YY[2] = vld1q_f32(yi);
      yi += 4;
      YY[3] = vld1q_f32(yi);
      yi += 4;

      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
      SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
      SUMM = vmlaq_f32(SUMM, YY[2], XX[2]);
      SUMM = vmlaq_f32(SUMM, YY[3], XX[3]);
      len -= 16;
   }

   /* Work on 8 values */
   if (len >= 8) {
      XX[0] = vld1q_f32(xi);
      xi += 4;
      XX[1] = vld1q_f32(xi);
      xi += 4;

      YY[0] = vld1q_f32(yi);
      yi += 4;
      YY[1] = vld1q_f32(yi);
      yi += 4;

      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
      SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
      len -= 8;
   }

   /* Work on 4 values */
   if (len >= 4) {
      XX[0] = vld1q_f32(xi);
      xi += 4;
      YY[0] = vld1q_f32(yi);
      yi += 4;
      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
      len -= 4;
   }

   /* Fold the 4-lane accumulator down to a single value */
   SUMM_2[0] = vget_low_f32(SUMM);
   if (len >= 2) {
      /* While at it, consume 2 more values if available */
      XX_2 = vld1_f32(xi);
      xi += 2;
      YY_2 = vld1_f32(yi);
      yi += 2;
      SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2);
      len -= 2;
   }
   SUMM_2[1] = vget_high_f32(SUMM);
   SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]);
   SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]);
   /* The result is now accumulated in lane 0 of SUMM_2[0] */

   if (len > 0) {
      /* At most one value is left */
      XX_2 = vld1_dup_f32(xi);
      YY_2 = vld1_dup_f32(yi);
      SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2);
   }

   vst1_lane_f32(sum, SUMM_2[0], 0);
}

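/*
 * Function: celt_pitch_xcorr_float_neon
 * -------------------------------------
 * Computes xcorr[i] = sum(x[j] * y[j + i], j = 0..len-1) for every
 * lag i in [0, max_pitch), four lags at a time where possible
 */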
void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
      opus_val32 *xcorr, int len, int max_pitch) {
   int i;
   celt_assert(max_pitch > 0);
   /* _x must be 4-byte aligned for the NEON loads */
   celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);

   for (i = 0; i < (max_pitch-3); i += 4) {
      xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i,
            (float32_t *)xcorr+i, len);
   }

   /* In case max_pitch isn't a multiple of 4, compute a single
    * correlation value per iteration
    */
   for (; i < max_pitch; i++) {
      xcorr_kernel_neon_float_process1((const float32_t *)_x,
            (const float32_t *)_y+i, (float32_t *)xcorr+i, len);
   }
}
#endif