/* Copyright (c) 2014, Cisco Systems, INC
   Written by XiangMingZhu WeiZhou MinPeng YanWang

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <xmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include "main.h"
#include "celt/x86/x86cpu.h"
#include "stack_alloc.h"

static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
    const silk_encoder_state *psEncC,               /* I    Encoder State                   */
    silk_nsq_state *NSQ,                            /* I/O  NSQ state                       */
    const opus_int32 x_Q3[],                        /* I    input in Q3                     */
    opus_int32 x_sc_Q10[],                          /* O    input scaled with 1/Gain        */
    const opus_int16 sLTP[],                        /* I    re-whitened LTP state in Q0     */
    opus_int32 sLTP_Q15[],                          /* O    LTP state matching scaled input */
    opus_int subfr,                                 /* I    subframe number                 */
    const opus_int LTP_scale_Q14,                   /* I                                    */
    const opus_int32 Gains_Q16[ MAX_NB_SUBFR ],     /* I                                    */
    const opus_int pitchL[ MAX_NB_SUBFR ],          /* I    Pitch lag                       */
    const opus_int signal_type                      /* I    Signal type                     */
);

static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
    silk_nsq_state *NSQ,                            /* I/O  NSQ state                       */
    opus_int signalType,                            /* I    Signal type                     */
    const opus_int32 x_sc_Q10[],                    /* I                                    */
    opus_int8 pulses[],                             /* O                                    */
    opus_int16 xq[],                                /* O                                    */
    opus_int32 sLTP_Q15[],                          /* I/O  LTP state                       */
    const opus_int16 a_Q12[],                       /* I    Short term prediction coefs     */
    const opus_int16 b_Q14[],                       /* I    Long term prediction coefs      */
    const opus_int16 AR_shp_Q13[],                  /* I    Noise shaping AR coefs          */
    opus_int lag,                                   /* I    Pitch lag                       */
    opus_int32 HarmShapeFIRPacked_Q14,              /* I                                    */
    opus_int Tilt_Q14,                              /* I    Spectral tilt                   */
    opus_int32 LF_shp_Q14,                          /* I                                    */
    opus_int32 Gain_Q16,                            /* I                                    */
    opus_int offset_Q10,                            /* I                                    */
    opus_int length,                                /* I    Input length                    */
    opus_int32 table[][4]                           /* I                                    */
);

void silk_NSQ_sse4_1(
    const silk_encoder_state *psEncC,                                 /* I/O  Encoder State               */
    silk_nsq_state *NSQ,                                              /* I/O  NSQ state                   */
    SideInfoIndices *psIndices,                                       /* I/O  Quantization Indices        */
    const opus_int32 x_Q3[],                                          /* I    Prefiltered input signal    */
    opus_int8 pulses[],                                               /* O    Quantized pulse signal      */
    const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ],               /* I    Short term prediction coefs */
    const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ],         /* I    Long term prediction coefs  */
    const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ],   /* I    Noise shaping coefs         */
    const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ],                 /* I    Long term shaping coefs     */
    const opus_int Tilt_Q14[ MAX_NB_SUBFR ],                          /* I    Spectral tilt               */
    const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ],                      /* I    Low frequency shaping coefs */
    const opus_int32 Gains_Q16[ MAX_NB_SUBFR ],                       /* I    Quantization step sizes     */
    const opus_int pitchL[ MAX_NB_SUBFR ],                            /* I    Pitch lags                  */
    const opus_int Lambda_Q10,                                        /* I    Rate/distortion tradeoff    */
    const opus_int LTP_scale_Q14                                      /* I    LTP state scaling           */
)
{
    opus_int k, lag, start_idx, LSF_interpolation_flag;
    const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13;
    opus_int16 *pxq;
    VARDECL( opus_int32, sLTP_Q15 );
    VARDECL( opus_int16, sLTP );
    opus_int32 HarmShapeFIRPacked_Q14;
    opus_int offset_Q10;
    VARDECL( opus_int32, x_sc_Q10 );

    opus_int32 table[ 64 ][ 4 ];
    opus_int32 tmp1;
    opus_int32 q1_Q10, q2_Q10, rd1_Q20, rd2_Q20;

    SAVE_STACK;

    NSQ->rand_seed = psIndices->Seed;

    /* Set unvoiced lag to the previous one, overwrite later for voiced */
    lag = NSQ->lagPrev;

    silk_assert( NSQ->prev_gain_Q16 != 0 );

    offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][ psIndices->quantOffsetType ];

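    /* Precompute a 64-entry rate-distortion table, indexed by q1_Q0 + 32.  Each row stores the two
       candidate quantization levels { q1_Q10, q2_Q10 } together with 2 * (q1_Q10 - q2_Q10) and
       (rd1_Q20 - rd2_Q20) + (q1_Q10^2 - q2_Q10^2).  Because
       (r - q1)^2 - (r - q2)^2 = (q1^2 - q2^2) - 2 * r * (q1 - q2), the quantizer's inner loop can
       then decide between the two candidates with a single multiply and compare. */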
    /* 0 */
    q1_Q10  = offset_Q10;
    q2_Q10  = offset_Q10 + ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
    rd1_Q20 = q1_Q10 * Lambda_Q10;
    rd2_Q20 = q2_Q10 * Lambda_Q10;

    table[ 32 ][ 0 ] = q1_Q10;
    table[ 32 ][ 1 ] = q2_Q10;
    table[ 32 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
    table[ 32 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);

    /* -1 */
    q1_Q10  = offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 );
    q2_Q10  = offset_Q10;
    rd1_Q20 = - q1_Q10 * Lambda_Q10;
    rd2_Q20 = q2_Q10 * Lambda_Q10;

    table[ 31 ][ 0 ] = q1_Q10;
    table[ 31 ][ 1 ] = q2_Q10;
    table[ 31 ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
    table[ 31 ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);

    /* > 0 */
    for (k = 1; k <= 31; k++)
    {
        tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );

        q1_Q10  = tmp1 - QUANT_LEVEL_ADJUST_Q10;
        q2_Q10  = tmp1 - QUANT_LEVEL_ADJUST_Q10 + 1024;
        rd1_Q20 = q1_Q10 * Lambda_Q10;
        rd2_Q20 = q2_Q10 * Lambda_Q10;

        table[ 32 + k ][ 0 ] = q1_Q10;
        table[ 32 + k ][ 1 ] = q2_Q10;
        table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
        table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
    }

    /* < -1 */
    for (k = -32; k <= -2; k++)
    {
        tmp1 = offset_Q10 + silk_LSHIFT( k, 10 );

        q1_Q10  = tmp1 + QUANT_LEVEL_ADJUST_Q10;
        q2_Q10  = tmp1 + QUANT_LEVEL_ADJUST_Q10 + 1024;
        rd1_Q20 = - q1_Q10 * Lambda_Q10;
        rd2_Q20 = - q2_Q10 * Lambda_Q10;

        table[ 32 + k ][ 0 ] = q1_Q10;
        table[ 32 + k ][ 1 ] = q2_Q10;
        table[ 32 + k ][ 2 ] = 2 * (q1_Q10 - q2_Q10);
        table[ 32 + k ][ 3 ] = (rd1_Q20 - rd2_Q20) + (q1_Q10 * q1_Q10 - q2_Q10 * q2_Q10);
    }

    if( psIndices->NLSFInterpCoef_Q2 == 4 ) {
        LSF_interpolation_flag = 0;
    } else {
        LSF_interpolation_flag = 1;
    }

    ALLOC( sLTP_Q15, psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 );
    ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 );
    ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 );
    /* Set up pointers to start of sub frame */
    NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length;
    NSQ->sLTP_buf_idx     = psEncC->ltp_mem_length;
    pxq                   = &NSQ->xq[ psEncC->ltp_mem_length ];
    for( k = 0; k < psEncC->nb_subfr; k++ ) {
        A_Q12      = &PredCoef_Q12[ (( k >> 1 ) | ( 1 - LSF_interpolation_flag )) * MAX_LPC_ORDER ];
        B_Q14      = &LTPCoef_Q14[ k * LTP_ORDER ];
        AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ];

        /* Noise shape parameters */
        silk_assert( HarmShapeGain_Q14[ k ] >= 0 );
        HarmShapeFIRPacked_Q14  = silk_RSHIFT( HarmShapeGain_Q14[ k ], 2 );
        HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShapeGain_Q14[ k ], 1 ), 16 );

        NSQ->rewhite_flag = 0;
        if( psIndices->signalType == TYPE_VOICED ) {
            /* Voiced */
            lag = pitchL[ k ];

            /* Re-whitening */
            if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 ) {
                /* Rewhiten with new A coefs */
                start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrder - LTP_ORDER / 2;
                silk_assert( start_idx > 0 );

                silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_idx + k * psEncC->subfr_length ],
                    A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLPCOrder, psEncC->arch );

                NSQ->rewhite_flag = 1;
                NSQ->sLTP_buf_idx = psEncC->ltp_mem_length;
            }
        }

        silk_nsq_scale_states_sse4_1( psEncC, NSQ, x_Q3, x_sc_Q10, sLTP, sLTP_Q15, k, LTP_scale_Q14, Gains_Q16, pitchL, psIndices->signalType );

        if ( opus_likely( ( 10 == psEncC->shapingLPCOrder ) && ( 16 == psEncC->predictLPCOrder ) ) )
        {
            silk_noise_shape_quantizer_10_16_sse4_1( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ],
                offset_Q10, psEncC->subfr_length, &(table[32]) );
        }
        else
        {
            silk_noise_shape_quantizer( NSQ, psIndices->signalType, x_sc_Q10, pulses, pxq, sLTP_Q15, A_Q12, B_Q14,
                AR_shp_Q13, lag, HarmShapeFIRPacked_Q14, Tilt_Q14[ k ], LF_shp_Q14[ k ], Gains_Q16[ k ], Lambda_Q10,
                offset_Q10, psEncC->subfr_length, psEncC->shapingLPCOrder, psEncC->predictLPCOrder );
        }

        x_Q3   += psEncC->subfr_length;
        pulses += psEncC->subfr_length;
        pxq    += psEncC->subfr_length;
    }

    /* Update lagPrev for next frame */
    NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ];

    /* Save quantized speech and noise shaping signals */
    /* DEBUG_STORE_DATA( enc.pcm, &NSQ->xq[ psEncC->ltp_mem_length ], psEncC->frame_length * sizeof( opus_int16 ) ) */
    silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int16 ) );
    silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ], psEncC->ltp_mem_length * sizeof( opus_int32 ) );
    RESTORE_STACK;
}

/************************************/
/* silk_noise_shape_quantizer_10_16 */
/************************************/
static OPUS_INLINE void silk_noise_shape_quantizer_10_16_sse4_1(
    silk_nsq_state *NSQ,                            /* I/O  NSQ state                       */
    opus_int signalType,                            /* I    Signal type                     */
    const opus_int32 x_sc_Q10[],                    /* I                                    */
    opus_int8 pulses[],                             /* O                                    */
    opus_int16 xq[],                                /* O                                    */
    opus_int32 sLTP_Q15[],                          /* I/O  LTP state                       */
    const opus_int16 a_Q12[],                       /* I    Short term prediction coefs     */
    const opus_int16 b_Q14[],                       /* I    Long term prediction coefs      */
    const opus_int16 AR_shp_Q13[],                  /* I    Noise shaping AR coefs          */
    opus_int lag,                                   /* I    Pitch lag                       */
    opus_int32 HarmShapeFIRPacked_Q14,              /* I                                    */
    opus_int Tilt_Q14,                              /* I    Spectral tilt                   */
    opus_int32 LF_shp_Q14,                          /* I                                    */
    opus_int32 Gain_Q16,                            /* I                                    */
    opus_int offset_Q10,                            /* I                                    */
    opus_int length,                                /* I    Input length                    */
    opus_int32 table[][4]                           /* I                                    */
)
{
    opus_int   i;
    opus_int32 LTP_pred_Q13, LPC_pred_Q10, n_AR_Q12, n_LTP_Q13;
    opus_int32 n_LF_Q12, r_Q10, q1_Q0, q1_Q10, q2_Q10;
    opus_int32 exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10;
    opus_int32 tmp1, tmp2, sLF_AR_shp_Q14;
    opus_int32 *psLPC_Q14, *shp_lag_ptr, *pred_lag_ptr;

    __m128i xmm_tempa, xmm_tempb;

    __m128i xmm_one;

    __m128i psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF;
    __m128i psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF;
    __m128i a_Q12_01234567,        a_Q12_89ABCDEF;

    __m128i sAR2_Q14_hi_76543210, sAR2_Q14_lo_76543210;
    __m128i AR_shp_Q13_76543210;

    shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
    pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
    Gain_Q10     = silk_RSHIFT( Gain_Q16, 6 );

    /* Set up short term AR state */
    psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 ];

    sLF_AR_shp_Q14 = NSQ->sLF_AR_shp_Q14;
    xq_Q14         = psLPC_Q14[ 0 ];
    LTP_pred_Q13   = 0;

    /* load a_Q12 */
    xmm_one = _mm_set_epi8( 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 );
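    /* This shuffle mask reverses the order of the eight 16-bit words in a register; the a_Q12
       coefficients are mirrored so that, in the lane-by-lane MAC below, a_Q12[ 0 ] meets the
       newest state sample and a_Q12[ 15 ] the oldest. */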

    /* load a_Q12[0] - a_Q12[7] */
    a_Q12_01234567 = _mm_loadu_si128( (__m128i *)(&a_Q12[ 0 ] ) );
    /* load a_Q12[ 8 ] - a_Q12[ 15 ] */
    a_Q12_89ABCDEF = _mm_loadu_si128( (__m128i *)(&a_Q12[ 8 ] ) );

    a_Q12_01234567 = _mm_shuffle_epi8( a_Q12_01234567, xmm_one );
    a_Q12_89ABCDEF = _mm_shuffle_epi8( a_Q12_89ABCDEF, xmm_one );

    /* load AR_shp_Q13 */
    AR_shp_Q13_76543210 = _mm_loadu_si128( (__m128i *)(&AR_shp_Q13[0] ) );

    /* load psLPC_Q14 */
    xmm_one = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0 );
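    /* This mask rearranges four 32-bit Q14 samples so their low 16-bit halves collect in the
       lower 64 bits and their high halves in the upper 64 bits; the unpacklo/unpackhi_epi64
       pairs below then build separate 8-wide "hi" and "lo" vectors for 16-bit arithmetic. */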

    xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-16]) );
    xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[-12]) );

    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );

    psLPC_Q14_hi_89ABCDEF = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
    psLPC_Q14_lo_89ABCDEF = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );

    xmm_tempa = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -8 ]) );
    xmm_tempb = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -4 ]) );

    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );

    psLPC_Q14_hi_01234567 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
    psLPC_Q14_lo_01234567 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );

    /* load sAR2_Q14 */
    xmm_tempa = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 0 ]) ) );
    xmm_tempb = _mm_loadu_si128( (__m128i *)(&(NSQ->sAR2_Q14[ 4 ]) ) );

    xmm_tempa = _mm_shuffle_epi8( xmm_tempa, xmm_one );
    xmm_tempb = _mm_shuffle_epi8( xmm_tempb, xmm_one );

    sAR2_Q14_hi_76543210 = _mm_unpackhi_epi64( xmm_tempa, xmm_tempb );
    sAR2_Q14_lo_76543210 = _mm_unpacklo_epi64( xmm_tempa, xmm_tempb );

    /* prepare the constant 1 in each of the eight 16-bit lanes */
    xmm_one = _mm_set1_epi16(1);

    for( i = 0; i < length; i++ )
    {
        /* Short-term prediction */
        __m128i xmm_hi_07, xmm_hi_8F, xmm_lo_07, xmm_lo_8F;

        /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
        LPC_pred_Q10 = 8; /* silk_RSHIFT( predictLPCOrder, 1 ); */

        /* shift psLPC_Q14 */
        psLPC_Q14_hi_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_hi_01234567, psLPC_Q14_hi_89ABCDEF, 2 );
        psLPC_Q14_lo_89ABCDEF = _mm_alignr_epi8( psLPC_Q14_lo_01234567, psLPC_Q14_lo_89ABCDEF, 2 );

        psLPC_Q14_hi_01234567 = _mm_srli_si128( psLPC_Q14_hi_01234567, 2 );
        psLPC_Q14_lo_01234567 = _mm_srli_si128( psLPC_Q14_lo_01234567, 2 );

        psLPC_Q14_hi_01234567 = _mm_insert_epi16( psLPC_Q14_hi_01234567, (xq_Q14 >> 16), 7 );
        psLPC_Q14_lo_01234567 = _mm_insert_epi16( psLPC_Q14_lo_01234567, (xq_Q14), 7 );

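        /* Each 32-bit Q14 state value s was split as s = (hi << 16) + lo, with lo an unsigned
           16-bit half.  The Q12 MAC is computed as sum( hi * a ) via pmaddwd plus
           sum( ( lo * a ) >> 16 ) via pmulhw.  pmulhw treats lo as signed, so whenever the top
           bit of lo is set its product comes out short by exactly a; the cmpgt/and/add sequence
           below adds that correction term back to the affected lanes. */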
        /* high part, use pmaddwd, results in 4 32-bit */
        xmm_hi_07 = _mm_madd_epi16( psLPC_Q14_hi_01234567, a_Q12_01234567 );
        xmm_hi_8F = _mm_madd_epi16( psLPC_Q14_hi_89ABCDEF, a_Q12_89ABCDEF );

        /* low part, use pmulhw, results in 8 16-bit; note we need to simulate unsigned * signed, _mm_srai_epi16( psLPC_Q14_lo_01234567, 15 ) */
        xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_01234567 );
        xmm_tempb = _mm_cmpgt_epi16( _mm_setzero_si128(), psLPC_Q14_lo_89ABCDEF );

        xmm_tempa = _mm_and_si128( xmm_tempa, a_Q12_01234567 );
        xmm_tempb = _mm_and_si128( xmm_tempb, a_Q12_89ABCDEF );

        xmm_lo_07 = _mm_mulhi_epi16( psLPC_Q14_lo_01234567, a_Q12_01234567 );
        xmm_lo_8F = _mm_mulhi_epi16( psLPC_Q14_lo_89ABCDEF, a_Q12_89ABCDEF );

        xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );
        xmm_lo_8F = _mm_add_epi16( xmm_lo_8F, xmm_tempb );

        xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );
        xmm_lo_8F = _mm_madd_epi16( xmm_lo_8F, xmm_one );

        /* accumulate */
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_hi_8F );
        xmm_lo_07 = _mm_add_epi32( xmm_lo_07, xmm_lo_8F );

        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );

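        /* Horizontal sum of the four 32-bit lanes: unpackhi_epi64 folds the upper two lanes onto
           the lower two, then shufflelo with control 0x0E folds lane 1 onto lane 0. */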
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64( xmm_hi_07, xmm_hi_07 ) );
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16( xmm_hi_07, 0x0E ) );

        LPC_pred_Q10 += _mm_cvtsi128_si32( xmm_hi_07 );

        /* Long-term prediction */
        if ( opus_likely( signalType == TYPE_VOICED ) ) {
            /* Unrolled loop */
            /* Avoids introducing a bias because silk_SMLAWB() always rounds to -inf */
            LTP_pred_Q13 = 2;
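            /* Five-tap LTP filter: the taps b_Q14[ 0 ] ... b_Q14[ 3 ] are computed with SIMD in
               the block below; the fifth tap is added afterwards with a scalar silk_SMLAWB(). */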
            {
                __m128i b_Q14_3210, b_Q14_0123, pred_lag_ptr_0123;

                b_Q14_3210 = OP_CVTEPI16_EPI32_M64( b_Q14 );
                b_Q14_0123 = _mm_shuffle_epi32( b_Q14_3210, 0x1B );

                /* loaded: [0] [-1] [-2] [-3] */
                pred_lag_ptr_0123 = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr[ -3 ] ) );
                /* shuffle to [-3] [-2] [-1] [0] and to new xmm */
                xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, 0x1B );
                /* 64-bit multiply, a[2] * b[-2], a[0] * b[0] */
                xmm_tempa = _mm_mul_epi32( xmm_tempa, b_Q14_3210 );
                /* right shift 2 bytes (16 bits), zero extended */
                xmm_tempa = _mm_srli_si128( xmm_tempa, 2 );

                /* a[1] * b[-1], a[3] * b[-3] */
                pred_lag_ptr_0123 = _mm_mul_epi32( pred_lag_ptr_0123, b_Q14_0123 );
                pred_lag_ptr_0123 = _mm_srli_si128( pred_lag_ptr_0123, 2 );

                pred_lag_ptr_0123 = _mm_add_epi32( pred_lag_ptr_0123, xmm_tempa );
                /* equal shift right 8 bytes */
                xmm_tempa = _mm_shuffle_epi32( pred_lag_ptr_0123, _MM_SHUFFLE( 0, 0, 3, 2 ) );
                xmm_tempa = _mm_add_epi32( xmm_tempa, pred_lag_ptr_0123 );

                LTP_pred_Q13 += _mm_cvtsi128_si32( xmm_tempa );

                LTP_pred_Q13 = silk_SMLAWB( LTP_pred_Q13, pred_lag_ptr[ -4 ], b_Q14[ 4 ] );
                pred_lag_ptr++;
            }
        }

        /* Noise shape feedback */
        NSQ->sAR2_Q14[ 9 ] = NSQ->sAR2_Q14[ 8 ];
        NSQ->sAR2_Q14[ 8 ] = _mm_cvtsi128_si32( _mm_srli_si128( _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 ), 12 ) );

        sAR2_Q14_hi_76543210 = _mm_slli_si128( sAR2_Q14_hi_76543210, 2 );
        sAR2_Q14_lo_76543210 = _mm_slli_si128( sAR2_Q14_lo_76543210, 2 );

        sAR2_Q14_hi_76543210 = _mm_insert_epi16( sAR2_Q14_hi_76543210, (xq_Q14 >> 16), 0 );
        sAR2_Q14_lo_76543210 = _mm_insert_epi16( sAR2_Q14_lo_76543210, (xq_Q14), 0 );

        /* high part, use pmaddwd, results in 4 32-bit */
        xmm_hi_07 = _mm_madd_epi16( sAR2_Q14_hi_76543210, AR_shp_Q13_76543210 );

        /* low part, use pmulhw, results in 8 16-bit; note we need to simulate unsigned * signed, _mm_srai_epi16( sAR2_Q14_lo_76543210, 15 ) */
        xmm_tempa = _mm_cmpgt_epi16( _mm_setzero_si128(), sAR2_Q14_lo_76543210 );
        xmm_tempa = _mm_and_si128( xmm_tempa, AR_shp_Q13_76543210 );

        xmm_lo_07 = _mm_mulhi_epi16( sAR2_Q14_lo_76543210, AR_shp_Q13_76543210 );
        xmm_lo_07 = _mm_add_epi16( xmm_lo_07, xmm_tempa );

        xmm_lo_07 = _mm_madd_epi16( xmm_lo_07, xmm_one );

        /* accumulate */
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, xmm_lo_07 );

        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_unpackhi_epi64( xmm_hi_07, xmm_hi_07 ) );
        xmm_hi_07 = _mm_add_epi32( xmm_hi_07, _mm_shufflelo_epi16( xmm_hi_07, 0x0E ) );

        n_AR_Q12 = 5 + _mm_cvtsi128_si32( xmm_hi_07 );

        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 8 ], AR_shp_Q13[ 8 ] );
        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, NSQ->sAR2_Q14[ 9 ], AR_shp_Q13[ 9 ] );

        n_AR_Q12 = silk_LSHIFT32( n_AR_Q12, 1 );                                /* Q11 -> Q12 */
        n_AR_Q12 = silk_SMLAWB( n_AR_Q12, sLF_AR_shp_Q14, Tilt_Q14 );

        n_LF_Q12 = silk_SMULWB( NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - 1 ], LF_shp_Q14 );
        n_LF_Q12 = silk_SMLAWT( n_LF_Q12, sLF_AR_shp_Q14, LF_shp_Q14 );

        silk_assert( lag > 0 || signalType != TYPE_VOICED );

        /* Combine prediction and noise shaping signals */
        tmp1 = silk_SUB32( silk_LSHIFT32( LPC_pred_Q10, 2 ), n_AR_Q12 );        /* Q12 */
        tmp1 = silk_SUB32( tmp1, n_LF_Q12 );                                    /* Q12 */
        if( lag > 0 ) {
            /* Symmetric, packed FIR coefficients */
            n_LTP_Q13 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[ -2 ] ), HarmShapeFIRPacked_Q14 );
            n_LTP_Q13 = silk_SMLAWT( n_LTP_Q13, shp_lag_ptr[ -1 ], HarmShapeFIRPacked_Q14 );
            n_LTP_Q13 = silk_LSHIFT( n_LTP_Q13, 1 );
            shp_lag_ptr++;

            tmp2 = silk_SUB32( LTP_pred_Q13, n_LTP_Q13 );                       /* Q13 */
            tmp1 = silk_ADD_LSHIFT32( tmp2, tmp1, 1 );                          /* Q13 */
            tmp1 = silk_RSHIFT_ROUND( tmp1, 3 );                                /* Q10 */
        } else {
            tmp1 = silk_RSHIFT_ROUND( tmp1, 2 );                                /* Q10 */
        }

        r_Q10 = silk_SUB32( x_sc_Q10[ i ], tmp1 );                              /* residual error Q10 */

        /* Generate dither */
        NSQ->rand_seed = silk_RAND( NSQ->rand_seed );

        /* Flip sign depending on dither */
        tmp2 = -r_Q10;
        if ( NSQ->rand_seed < 0 ) r_Q10 = tmp2;

        r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 );

        /* Find two quantization level candidates and measure their rate-distortion */
        q1_Q10 = silk_SUB32( r_Q10, offset_Q10 );
        q1_Q0  = silk_RSHIFT( q1_Q10, 10 );

        q1_Q10 = table[q1_Q0][0];
        q2_Q10 = table[q1_Q0][1];

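        /* Pick the cheaper of the two precomputed candidates (see the table construction above):
           q2 has the lower rate-distortion cost exactly when
           r_Q10 * 2 * (q1 - q2) < (rd1 - rd2) + (q1^2 - q2^2). */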
        if (r_Q10 * table[q1_Q0][2] - table[q1_Q0][3] < 0)
        {
            q1_Q10 = q2_Q10;
        }

        pulses[ i ] = (opus_int8)silk_RSHIFT_ROUND( q1_Q10, 10 );

        /* Excitation */
        exc_Q14 = silk_LSHIFT( q1_Q10, 4 );

        tmp2 = -exc_Q14;
        if ( NSQ->rand_seed < 0 ) exc_Q14 = tmp2;

        /* Add predictions */
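        /* LTP_pred_Q13 << 1 and LPC_pred_Q10 << 4 align both predictions to Q14 before adding */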
        LPC_exc_Q14 = silk_ADD_LSHIFT32( exc_Q14, LTP_pred_Q13, 1 );
        xq_Q14      = silk_ADD_LSHIFT32( LPC_exc_Q14, LPC_pred_Q10, 4 );

        /* Update states */
        psLPC_Q14++;
        *psLPC_Q14 = xq_Q14;
        sLF_AR_shp_Q14 = silk_SUB_LSHIFT32( xq_Q14, n_AR_Q12, 2 );

        NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx ] = silk_SUB_LSHIFT32( sLF_AR_shp_Q14, n_LF_Q12, 2 );
        sLTP_Q15[ NSQ->sLTP_buf_idx ] = silk_LSHIFT( LPC_exc_Q14, 1 );
        NSQ->sLTP_shp_buf_idx++;
        NSQ->sLTP_buf_idx++;

        /* Make dither dependent on quantized signal */
        NSQ->rand_seed = silk_ADD32_ovflw( NSQ->rand_seed, pulses[ i ] );
    }

    NSQ->sLF_AR_shp_Q14 = sLF_AR_shp_Q14;

    /* Scale XQ back to normal level before saving */
    psLPC_Q14 = &NSQ->sLPC_Q14[ NSQ_LPC_BUF_LENGTH ];

    /* write back sAR2_Q14 */
    xmm_tempa = _mm_unpackhi_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
    xmm_tempb = _mm_unpacklo_epi16( sAR2_Q14_lo_76543210, sAR2_Q14_hi_76543210 );
    _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 4 ]), xmm_tempa );
    _mm_storeu_si128( (__m128i *)(&NSQ->sAR2_Q14[ 0 ]), xmm_tempb );

    /* xq[ i ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) ); */
    {
        __m128i xmm_Gain_Q10;
        __m128i xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, xmm_xq_Q14_7654, xmm_xq_Q14_x7x5;

        /* prepare (1 << 7) in packed 4 32-bits */
        xmm_tempa = _mm_set1_epi32( (1 << 7) );

        /* prepare Gain_Q10 in packed 4 32-bits */
        xmm_Gain_Q10 = _mm_set1_epi32( Gain_Q10 );

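        /* Vectorized form of the scalar expression quoted above, four samples per iteration.
           _mm_mul_epi32 only multiplies the even 32-bit lanes, so the odd lanes are rotated down
           with a shuffle and multiplied separately; the two sets of (product >> 16) results are
           then re-interleaved with a blend before rounding and saturating to 16 bits. */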
        /* process xq */
        for (i = 0; i < length - 7; i += 8)
        {
            xmm_xq_Q14_3210 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 0 ] ) ) );
            xmm_xq_Q14_7654 = _mm_loadu_si128( (__m128i *)(&(psLPC_Q14[ i + 4 ] ) ) );

            /* equal shift right 4 bytes */
            xmm_xq_Q14_x3x1 = _mm_shuffle_epi32( xmm_xq_Q14_3210, _MM_SHUFFLE( 0, 3, 2, 1 ) );
            /* equal shift right 4 bytes */
            xmm_xq_Q14_x7x5 = _mm_shuffle_epi32( xmm_xq_Q14_7654, _MM_SHUFFLE( 0, 3, 2, 1 ) );

            xmm_xq_Q14_3210 = _mm_mul_epi32( xmm_xq_Q14_3210, xmm_Gain_Q10 );
            xmm_xq_Q14_x3x1 = _mm_mul_epi32( xmm_xq_Q14_x3x1, xmm_Gain_Q10 );
            xmm_xq_Q14_7654 = _mm_mul_epi32( xmm_xq_Q14_7654, xmm_Gain_Q10 );
            xmm_xq_Q14_x7x5 = _mm_mul_epi32( xmm_xq_Q14_x7x5, xmm_Gain_Q10 );

            xmm_xq_Q14_3210 = _mm_srli_epi64( xmm_xq_Q14_3210, 16 );
            xmm_xq_Q14_x3x1 = _mm_slli_epi64( xmm_xq_Q14_x3x1, 16 );
            xmm_xq_Q14_7654 = _mm_srli_epi64( xmm_xq_Q14_7654, 16 );
            xmm_xq_Q14_x7x5 = _mm_slli_epi64( xmm_xq_Q14_x7x5, 16 );

            xmm_xq_Q14_3210 = _mm_blend_epi16( xmm_xq_Q14_3210, xmm_xq_Q14_x3x1, 0xCC );
            xmm_xq_Q14_7654 = _mm_blend_epi16( xmm_xq_Q14_7654, xmm_xq_Q14_x7x5, 0xCC );

            /* silk_RSHIFT_ROUND(xq, 8) */
            xmm_xq_Q14_3210 = _mm_add_epi32( xmm_xq_Q14_3210, xmm_tempa );
            xmm_xq_Q14_7654 = _mm_add_epi32( xmm_xq_Q14_7654, xmm_tempa );

            xmm_xq_Q14_3210 = _mm_srai_epi32( xmm_xq_Q14_3210, 8 );
            xmm_xq_Q14_7654 = _mm_srai_epi32( xmm_xq_Q14_7654, 8 );

            /* silk_SAT16 */
            xmm_xq_Q14_3210 = _mm_packs_epi32( xmm_xq_Q14_3210, xmm_xq_Q14_7654 );

            /* save to xq */
            _mm_storeu_si128( (__m128i *)(&xq[ i ] ), xmm_xq_Q14_3210 );
        }
    }
    for ( ; i < length; i++)
    {
        xq[i] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( silk_SMULWW( psLPC_Q14[ i ], Gain_Q10 ), 8 ) );
    }

    /* Update LPC synth buffer */
    silk_memcpy( NSQ->sLPC_Q14, &NSQ->sLPC_Q14[ length ], NSQ_LPC_BUF_LENGTH * sizeof( opus_int32 ) );
}

static OPUS_INLINE void silk_nsq_scale_states_sse4_1(
    const silk_encoder_state *psEncC,               /* I    Encoder State                   */
    silk_nsq_state *NSQ,                            /* I/O  NSQ state                       */
    const opus_int32 x_Q3[],                        /* I    input in Q3                     */
    opus_int32 x_sc_Q10[],                          /* O    input scaled with 1/Gain        */
    const opus_int16 sLTP[],                        /* I    re-whitened LTP state in Q0     */
    opus_int32 sLTP_Q15[],                          /* O    LTP state matching scaled input */
    opus_int subfr,                                 /* I    subframe number                 */
    const opus_int LTP_scale_Q14,                   /* I                                    */
    const opus_int32 Gains_Q16[ MAX_NB_SUBFR ],     /* I                                    */
    const opus_int pitchL[ MAX_NB_SUBFR ],          /* I    Pitch lag                       */
    const opus_int signal_type                      /* I    Signal type                     */
)
{
    opus_int   i, lag;
    opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23;
    __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1;

    lag          = pitchL[ subfr ];
    inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 );
    silk_assert( inv_gain_Q31 != 0 );

    /* Calculate gain adjustment factor */
    if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) {
        gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ], 16 );
    } else {
        gain_adj_Q16 = (opus_int32)1 << 16;
    }

    /* Scale input */
    inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 );

    /* prepare inv_gain_Q23 in packed 4 32-bits */
    xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23);

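    /* Vectorized silk_SMULWW( x_Q3[ i ], inv_gain_Q23 ), i.e.
       (opus_int32)( ( (opus_int64)x_Q3[ i ] * inv_gain_Q23 ) >> 16 ), computed four samples at a
       time with the same even/odd _mm_mul_epi32 split used elsewhere in this file. */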
    for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) {
        xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) );

        /* equal shift right 4 bytes */
        xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

        xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 );
        xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 );

        xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 );
        xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 );

        xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC );

        _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ] ) ), xmm_x_Q3_x2x0 );
    }

    for( ; i < psEncC->subfr_length; i++ ) {
        x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 );
    }

    /* Save inverse gain */
    NSQ->prev_gain_Q16 = Gains_Q16[ subfr ];

    /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16 */
    if( NSQ->rewhite_flag ) {
        if( subfr == 0 ) {
            /* Do LTP downscaling */
            inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14 ), 2 );
        }
        for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
            silk_assert( i < MAX_FRAME_LENGTH );
            sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] );
        }
    }

    /* Adjust for changing gain */
    if( gain_adj_Q16 != (opus_int32)1 << 16 ) {
        /* Scale long-term shaping state */
        __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1;

        /* prepare gain_adj_Q16 in packed 4 32-bits */
        xmm_gain_adj_Q16 = _mm_set1_epi32(gain_adj_Q16);

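        /* Same even/odd _mm_mul_epi32 pattern again: four silk_SMULWW( gain_adj_Q16,
           sLTP_shp_Q14[ i ] ) products per iteration, with the remainder handled in the
           scalar loop that follows. */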
        for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sLTP_shp_buf_idx - 3; i += 4 )
        {
            xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ) );
            /* equal shift right 4 bytes */
            xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) );

            xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xmm_gain_adj_Q16 );
            xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xmm_gain_adj_Q16 );

            xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 16 );
            xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 16 );

            xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3x1, 0xCC );

            _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_sLTP_shp_Q14_x2x0 );
        }

        for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) {
            NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_shp_Q14[ i ] );
        }

        /* Scale long-term prediction state */
        if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) {
            for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx; i++ ) {
                sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] );
            }
        }

        NSQ->sLF_AR_shp_Q14 = silk_SMULWW( gain_adj_Q16, NSQ->sLF_AR_shp_Q14 );

        /* Scale short-term prediction and shaping states */
        for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) {
            NSQ->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLPC_Q14[ i ] );
        }
        for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) {
            NSQ->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sAR2_Q14[ i ] );
        }
    }
}
720}