/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009 Josh Coalson
 * Copyright (C) 2011-2013 Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#if HAVE_CONFIG_H
# include <config.h>
#endif

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE4_1_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <smmintrin.h> /* SSE4.1 */

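/*
 * RESIDUAL_RESULT()/DATA_RESULT() apply the quantization shift to the 64-bit
 * prediction sum sitting in the low half of an XMM register and store the
 * outcome. On 32-bit x86, MSVC and the Intel compiler can read the register
 * directly through the m128i_i64 union member; other compilers first spill
 * the low 64 bits to a temporary with _mm_storel_epi64(). On x86-64,
 * _mm_cvtsi128_si64() extracts the low 64 bits in a single step.
 */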
#ifdef FLAC__CPU_IA32
#if defined _MSC_VER || defined __INTEL_COMPILER
#define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (FLAC__int32)(xmmN.m128i_i64[0] >> lp_quantization);
#define DATA_RESULT(xmmN) data[i] = residual[i] + (FLAC__int32)(xmmN.m128i_i64[0] >> lp_quantization);
#else
#define RESIDUAL_RESULT(xmmN) { \
	FLAC__int64 tmp[2]; \
	_mm_storel_epi64((__m128i *)tmp, xmmN); \
	residual[i] = data[i] - (FLAC__int32)(tmp[0] >> lp_quantization); \
	}
#define DATA_RESULT(xmmN) { \
	FLAC__int64 tmp[2]; \
	_mm_storel_epi64((__m128i *)tmp, xmmN); \
	data[i] = residual[i] + (FLAC__int32)(tmp[0] >> lp_quantization); \
	}
#endif
#else
#define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (FLAC__int32)(_mm_cvtsi128_si64(xmmN) >> lp_quantization);
#define DATA_RESULT(xmmN) data[i] = residual[i] + (FLAC__int32)(_mm_cvtsi128_si64(xmmN) >> lp_quantization);
#endif

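/*
 * Both routines below compute, for each sample, the wide prediction
 *     sum = SUM(j = 0 .. order-1) qlp_coeff[j] * (FLAC__int64)data[i-j-1]
 * using 64-bit products. Coefficients are loaded two at a time and spread to
 * the even 32-bit lanes so that _mm_mul_epi32() (which multiplies the low
 * 32 bits of each 64-bit lane) produces two 64-bit products per instruction;
 * the matching pair of history samples is loaded and swapped with
 * _mm_shuffle_epi32() so the lanes line up. The two partial sums are folded
 * together with a shift-and-add before the quantization shift is applied.
 */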
FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0 0 q[1] q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0 0 q[3] q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0 0 q[5] q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0 0 q[7] q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0 0 q[9] q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0 q[5] 0 q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0 q[7] 0 q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0 q[9] 0 q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0 q[11] 0 q[10]

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
						//sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12)); // 0 0 d[i-11] d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
						xmm7 = _mm_mul_epi32(xmm7, xmm5);

						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm4);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
				else { /* order == 11 */
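					/* For odd orders the leftover coefficient and history sample are
					 * brought in with _mm_cvtsi32_si128(); _mm_mul_epi32() multiplies
					 * the low 32 bits of each 64-bit lane and the upper lane is zero
					 * here, so this yields just the one remaining 64-bit product. */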
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[10] * (FLAC__int64)data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epi32(xmm7, xmm5);

						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm4);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm4);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epi32(xmm7, xmm4);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm3);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epi32(xmm7, xmm3);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm2);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epi32(xmm7, xmm2);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm1);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epi32(xmm7, xmm1);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm0);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					for(i = 0; i < (int)data_len; i++)
						residual[i] = data[i] - (FLAC__int32)((qlp_coeff[0] * (FLAC__int64)data[i-1]) >> lp_quantization);
				}
			}
		}
	}
	else { /* order > 12 */
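		/* Orders above 12 fall back to a scalar loop; the switch relies on
		 * case fall-through so only the taps present for the given order are
		 * accumulated. */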
		FLAC__int64 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
}

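/*
 * Decoder-side counterpart of the routine above: it rebuilds data[] by adding
 * the (shifted) prediction back onto the residual, using the same per-order
 * SSE4.1 code paths.
 */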
FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
	int i;

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0 0 q[1] q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0 0 q[3] q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0 0 q[5] q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0 0 q[7] q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0 0 q[9] q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0 q[5] 0 q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0 q[7] 0 q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0 q[9] 0 q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0 q[11] 0 q[10]

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
						//sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12)); // 0 0 d[i-11] d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
						xmm7 = _mm_mul_epi32(xmm7, xmm5);

						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm4);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[10] * (FLAC__int64)data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epi32(xmm7, xmm5);

						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm4);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm4);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epi32(xmm7, xmm4);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm3);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epi32(xmm7, xmm3);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm2);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epi32(xmm7, xmm2);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm1);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epi32(xmm7, xmm1);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm0);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					for(i = 0; i < (int)data_len; i++)
						data[i] = residual[i] + (FLAC__int32)((qlp_coeff[0] * (FLAC__int64)data[i-1]) >> lp_quantization);
				}
			}
		}
	}
	else { /* order > 12 */
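		/* As in the encoder-side routine, orders above 12 use a scalar loop
		 * whose switch falls through so only the taps for the given order are
		 * accumulated. */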
		FLAC__int64 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
		}
	}
}

#endif /* FLAC__SSE4_1_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */