blob: 6785f92a90d4333539bd0228c3c23a480d6c5fed [file] [log] [blame]
Erik de Castro Lopo619d8212014-04-12 07:13:08 +10001/* libFLAC - Free Lossless Audio Codec library
2 * Copyright (C) 2000-2009 Josh Coalson
3 * Copyright (C) 2011-2014 Xiph.Org Foundation
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * - Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * - Neither the name of the Xiph.org Foundation nor the names of its
17 * contributors may be used to endorse or promote products derived from
18 * this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
24 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33#ifdef HAVE_CONFIG_H
34# include <config.h>
35#endif
36
37#ifndef FLAC__INTEGER_ONLY_LIBRARY
38#ifndef FLAC__NO_ASM
Erik de Castro Lopob8d58e32014-06-15 20:29:34 +100039#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
Erik de Castro Lopo619d8212014-04-12 07:13:08 +100040#include "private/fixed.h"
41#ifdef FLAC__SSE2_SUPPORTED
42
43#include <emmintrin.h> /* SSE2 */
44#include <math.h>
45#include "private/macros.h"
46#include "share/compat.h"
47#include "FLAC/assert.h"
48
/*
 * m128i_to_i64(dest, src): copy the low 64 bits of the __m128i value 'src'
 * into the 64-bit integer lvalue 'dest'.
 *
 * The IA32 build writes the value through memory with _mm_storel_epi64();
 * other builds move it with _mm_cvtsi128_si64() (presumably because that
 * intrinsic is only provided for 64-bit targets -- see the intrinsics
 * reference).  The IA32 form has type void, so use the macro only as a
 * stand-alone statement.
 *
 * Both arguments and the assignment are parenthesized so the macro still
 * expands correctly when given compound expressions or when it ends up
 * next to another operator.
 */
#ifdef FLAC__CPU_IA32
#define m128i_to_i64(dest, src) _mm_storel_epi64((__m128i*)&(dest), (src))
#else
#define m128i_to_i64(dest, src) ((dest) = _mm_cvtsi128_si64(src))
#endif
54
55FLAC__SSE_TARGET("sse2")
56unsigned FLAC__fixed_compute_best_predictor_intrin_sse2(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1])
57{
58 FLAC__uint32 total_error_0, total_error_1, total_error_2, total_error_3, total_error_4;
59 unsigned i, order;
60
61 __m128i total_err0, total_err1, total_err2;
62
63 {
64 FLAC__int32 itmp;
65 __m128i last_error;
66
67 last_error = _mm_cvtsi32_si128(data[-1]); // 0 0 0 le0
68 itmp = data[-2];
69 last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
70 last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp)); // 0 0 le0 le1
71 itmp -= data[-3];
72 last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
73 last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp)); // 0 le0 le1 le2
74 itmp -= data[-3] - data[-4];
75 last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
76 last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp)); // le0 le1 le2 le3
77
78 total_err0 = total_err1 = _mm_setzero_si128();
79 for(i = 0; i < data_len; i++) {
80 __m128i err0, err1, tmp;
81 err0 = _mm_cvtsi32_si128(data[i]); // 0 0 0 e0
82 err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0)); // e0 e0 e0 e0
Erik de Castro Lopo71875b02014-09-21 09:28:36 +100083#if 1 /* OPT_SSE */
Erik de Castro Lopo619d8212014-04-12 07:13:08 +100084 err1 = _mm_sub_epi32(err1, last_error);
85 last_error = _mm_srli_si128(last_error, 4); // 0 le0 le1 le2
86 err1 = _mm_sub_epi32(err1, last_error);
87 last_error = _mm_srli_si128(last_error, 4); // 0 0 le0 le1
88 err1 = _mm_sub_epi32(err1, last_error);
89 last_error = _mm_srli_si128(last_error, 4); // 0 0 0 le0
90 err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
Erik de Castro Lopo71875b02014-09-21 09:28:36 +100091#else
92 last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8)); // le0 le1 le2+le0 le3+le1
93 last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4)); // le0 le1+le0 le2+le0+le1 le3+le1+le2+le0
94 err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
95#endif
Erik de Castro Lopo619d8212014-04-12 07:13:08 +100096 tmp = _mm_slli_si128(err0, 12); // e0 0 0 0
Erik de Castro Lopof0a17e92014-10-07 06:34:18 +110097 last_error = _mm_srli_si128(err1, 4); // 0 e1 e2 e3
Erik de Castro Lopo619d8212014-04-12 07:13:08 +100098 last_error = _mm_or_si128(last_error, tmp); // e0 e1 e2 e3
99
100 tmp = _mm_srai_epi32(err0, 31);
101 err0 = _mm_xor_si128(err0, tmp);
102 err0 = _mm_sub_epi32(err0, tmp);
103 tmp = _mm_srai_epi32(err1, 31);
104 err1 = _mm_xor_si128(err1, tmp);
105 err1 = _mm_sub_epi32(err1, tmp);
106
107 total_err0 = _mm_add_epi32(total_err0, err0); // 0 0 0 te0
108 total_err1 = _mm_add_epi32(total_err1, err1); // te1 te2 te3 te4
109 }
110 }
111
112 total_error_0 = _mm_cvtsi128_si32(total_err0);
113 total_err2 = total_err1; // te1 te2 te3 te4
114 total_err1 = _mm_srli_si128(total_err1, 8); // 0 0 te1 te2
115 total_error_4 = _mm_cvtsi128_si32(total_err2);
116 total_error_2 = _mm_cvtsi128_si32(total_err1);
117 total_err2 = _mm_srli_si128(total_err2, 4); // 0 te1 te2 te3
118 total_err1 = _mm_srli_si128(total_err1, 4); // 0 0 0 te1
119 total_error_3 = _mm_cvtsi128_si32(total_err2);
120 total_error_1 = _mm_cvtsi128_si32(total_err1);
121
122 /* prefer higher order */
123 if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4))
124 order = 0;
125 else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4))
126 order = 1;
127 else if(total_error_2 < flac_min(total_error_3, total_error_4))
128 order = 2;
129 else if(total_error_3 < total_error_4)
130 order = 3;
131 else
132 order = 4;
133
134 /* Estimate the expected number of bits per residual signal sample. */
135 /* 'total_error*' is linearly related to the variance of the residual */
136 /* signal, so we use it directly to compute E(|x|) */
137 FLAC__ASSERT(data_len > 0 || total_error_0 == 0);
138 FLAC__ASSERT(data_len > 0 || total_error_1 == 0);
139 FLAC__ASSERT(data_len > 0 || total_error_2 == 0);
140 FLAC__ASSERT(data_len > 0 || total_error_3 == 0);
141 FLAC__ASSERT(data_len > 0 || total_error_4 == 0);
142
143 residual_bits_per_sample[0] = (FLAC__float)((total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
144 residual_bits_per_sample[1] = (FLAC__float)((total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
145 residual_bits_per_sample[2] = (FLAC__float)((total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
146 residual_bits_per_sample[3] = (FLAC__float)((total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
147 residual_bits_per_sample[4] = (FLAC__float)((total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
148
149 return order;
150}
151
152FLAC__SSE_TARGET("sse2")
153unsigned FLAC__fixed_compute_best_predictor_wide_intrin_sse2(const FLAC__int32 data[], unsigned data_len, FLAC__float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1])
154{
155 FLAC__uint64 total_error_0, total_error_1, total_error_2, total_error_3, total_error_4;
156 unsigned i, order;
157
158 __m128i total_err0, total_err1, total_err3;
159
160 {
161 FLAC__int32 itmp;
162 __m128i last_error, zero = _mm_setzero_si128();
163
164 last_error = _mm_cvtsi32_si128(data[-1]); // 0 0 0 le0
165 itmp = data[-2];
166 last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
167 last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp)); // 0 0 le0 le1
168 itmp -= data[-3];
169 last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
170 last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp)); // 0 le0 le1 le2
171 itmp -= data[-3] - data[-4];
172 last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0));
173 last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp)); // le0 le1 le2 le3
174
175 total_err0 = total_err1 = total_err3 = _mm_setzero_si128();
176 for(i = 0; i < data_len; i++) {
177 __m128i err0, err1, tmp;
178 err0 = _mm_cvtsi32_si128(data[i]); // 0 0 0 e0
179 err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0)); // e0 e0 e0 e0
Erik de Castro Lopo71875b02014-09-21 09:28:36 +1000180#if 1 /* OPT_SSE */
Erik de Castro Lopo619d8212014-04-12 07:13:08 +1000181 err1 = _mm_sub_epi32(err1, last_error);
182 last_error = _mm_srli_si128(last_error, 4); // 0 le0 le1 le2
183 err1 = _mm_sub_epi32(err1, last_error);
184 last_error = _mm_srli_si128(last_error, 4); // 0 0 le0 le1
185 err1 = _mm_sub_epi32(err1, last_error);
186 last_error = _mm_srli_si128(last_error, 4); // 0 0 0 le0
187 err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
Erik de Castro Lopo71875b02014-09-21 09:28:36 +1000188#else
189 last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8)); // le0 le1 le2+le0 le3+le1
190 last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4)); // le0 le1+le0 le2+le0+le1 le3+le1+le2+le0
191 err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4
192#endif
Erik de Castro Lopo619d8212014-04-12 07:13:08 +1000193 tmp = _mm_slli_si128(err0, 12); // e0 0 0 0
Erik de Castro Lopof0a17e92014-10-07 06:34:18 +1100194 last_error = _mm_srli_si128(err1, 4); // 0 e1 e2 e3
Erik de Castro Lopo619d8212014-04-12 07:13:08 +1000195 last_error = _mm_or_si128(last_error, tmp); // e0 e1 e2 e3
196
197 tmp = _mm_srai_epi32(err0, 31);
198 err0 = _mm_xor_si128(err0, tmp);
199 err0 = _mm_sub_epi32(err0, tmp);
200 tmp = _mm_srai_epi32(err1, 31);
201 err1 = _mm_xor_si128(err1, tmp);
202 err1 = _mm_sub_epi32(err1, tmp);
203
204 total_err0 = _mm_add_epi64(total_err0, err0); // 0 te0
205 err0 = _mm_unpacklo_epi32(err1, zero); // 0 |e3| 0 |e4|
206 err1 = _mm_unpackhi_epi32(err1, zero); // 0 |e1| 0 |e2|
207 total_err3 = _mm_add_epi64(total_err3, err0); // te3 te4
208 total_err1 = _mm_add_epi64(total_err1, err1); // te1 te2
209 }
210 }
211
Erik de Castro Lopo4fa58802014-09-18 21:39:36 +1000212 m128i_to_i64(total_error_0, total_err0);
213 m128i_to_i64(total_error_4, total_err3);
214 m128i_to_i64(total_error_2, total_err1);
Erik de Castro Lopo619d8212014-04-12 07:13:08 +1000215 total_err3 = _mm_srli_si128(total_err3, 8); // 0 te3
216 total_err1 = _mm_srli_si128(total_err1, 8); // 0 te1
Erik de Castro Lopo4fa58802014-09-18 21:39:36 +1000217 m128i_to_i64(total_error_3, total_err3);
218 m128i_to_i64(total_error_1, total_err1);
Erik de Castro Lopo619d8212014-04-12 07:13:08 +1000219
220 /* prefer higher order */
221 if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4))
222 order = 0;
223 else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4))
224 order = 1;
225 else if(total_error_2 < flac_min(total_error_3, total_error_4))
226 order = 2;
227 else if(total_error_3 < total_error_4)
228 order = 3;
229 else
230 order = 4;
231
232 /* Estimate the expected number of bits per residual signal sample. */
233 /* 'total_error*' is linearly related to the variance of the residual */
234 /* signal, so we use it directly to compute E(|x|) */
235 FLAC__ASSERT(data_len > 0 || total_error_0 == 0);
236 FLAC__ASSERT(data_len > 0 || total_error_1 == 0);
237 FLAC__ASSERT(data_len > 0 || total_error_2 == 0);
238 FLAC__ASSERT(data_len > 0 || total_error_3 == 0);
239 FLAC__ASSERT(data_len > 0 || total_error_4 == 0);
240
241 residual_bits_per_sample[0] = (FLAC__float)((total_error_0 > 0) ? log(M_LN2 * (FLAC__double)total_error_0 / (FLAC__double)data_len) / M_LN2 : 0.0);
242 residual_bits_per_sample[1] = (FLAC__float)((total_error_1 > 0) ? log(M_LN2 * (FLAC__double)total_error_1 / (FLAC__double)data_len) / M_LN2 : 0.0);
243 residual_bits_per_sample[2] = (FLAC__float)((total_error_2 > 0) ? log(M_LN2 * (FLAC__double)total_error_2 / (FLAC__double)data_len) / M_LN2 : 0.0);
244 residual_bits_per_sample[3] = (FLAC__float)((total_error_3 > 0) ? log(M_LN2 * (FLAC__double)total_error_3 / (FLAC__double)data_len) / M_LN2 : 0.0);
245 residual_bits_per_sample[4] = (FLAC__float)((total_error_4 > 0) ? log(M_LN2 * (FLAC__double)total_error_4 / (FLAC__double)data_len) / M_LN2 : 0.0);
246
247 return order;
248}
249
250#endif /* FLAC__SSE2_SUPPORTED */
Erik de Castro Lopob8d58e32014-06-15 20:29:34 +1000251#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
Erik de Castro Lopo619d8212014-04-12 07:13:08 +1000252#endif /* FLAC__NO_ASM */
253#endif /* FLAC__INTEGER_ONLY_LIBRARY */