/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009 Josh Coalson
 * Copyright (C) 2011-2016 Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__AVX2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <immintrin.h> /* AVX2 */

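/*
 * 16-bit variant: each quantized coefficient is broadcast into the low 16
 * bits of every 32-bit lane (high half zeroed by the 0xffff mask), so
 * _mm256_madd_epi16() forms coeff * (low 16 bits of sample) in each lane.
 * That equals the full product exactly when the samples fit in 16 bits,
 * which is the case this variant is selected for.  Eight residuals are
 * computed per loop iteration; a scalar loop finishes the remainder.
 */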
FLAC__SSE_TARGET("avx2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
					q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
					q9 = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
					q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);
					q11 = _mm256_set1_epi32(0xffff & qlp_coeff[11]);

					for(i = 0; i < (int)data_len-7; i+=8) {
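						/* each 256-bit load covers 8 consecutive samples, so madd
						   yields the 8 lane-wise 32-bit products qlp_coeff[k]*data[i-1-k];
						   the lanes are accumulated across all taps, scaled down by
						   lp_quantization and subtracted from the signal */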
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
						mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
					q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
					q9 = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
					q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
						mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
					q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
					q9 = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10)));
						mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
					q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 )));
						mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
					q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 )));
						mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m256i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
					q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 )));
						mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m256i q0, q1, q2, q3, q4, q5;
					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
					q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 )));
						mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m256i q0, q1, q2, q3, q4;
					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
					q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 )));
						mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m256i q0, q1, q2, q3;
					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
					q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 )));
						mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m256i q0, q1, q2;
					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 )));
						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m256i q0, q1;
					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 )));
						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m256i q0;
					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ;
						summ = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 )));
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
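		/* scalar tail for the last data_len % 8 samples; the switch cases
		   deliberately fall through (no break) so the sum cascades from the
		   highest tap down to qlp_coeff[0]; the order > 12 path below uses
		   the same cascaded switch for every sample */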
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12];
				case 11: sum += qlp_coeff[10] * data[i-11];
				case 10: sum += qlp_coeff[ 9] * data[i-10];
				case 9: sum += qlp_coeff[ 8] * data[i- 9];
				case 8: sum += qlp_coeff[ 7] * data[i- 8];
				case 7: sum += qlp_coeff[ 6] * data[i- 7];
				case 6: sum += qlp_coeff[ 5] * data[i- 6];
				case 5: sum += qlp_coeff[ 4] * data[i- 5];
				case 4: sum += qlp_coeff[ 3] * data[i- 4];
				case 3: sum += qlp_coeff[ 2] * data[i- 3];
				case 2: sum += qlp_coeff[ 1] * data[i- 2];
				case 1: sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	_mm256_zeroupper();
}

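/*
 * General 32-bit variant: coefficients and/or samples may be too wide for the
 * 16-bit trick above, so each product is formed with _mm256_mullo_epi32()
 * (the low 32 bits of each 32x32 product); this variant is intended for the
 * case where the accumulated sum still fits in 32 bits.  Again 8 residuals
 * are emitted per iteration.
 */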
FLAC__SSE_TARGET("avx2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6 = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7 = _mm256_set1_epi32(qlp_coeff[7 ]);
					q8 = _mm256_set1_epi32(qlp_coeff[8 ]);
					q9 = _mm256_set1_epi32(qlp_coeff[9 ]);
					q10 = _mm256_set1_epi32(qlp_coeff[10]);
					q11 = _mm256_set1_epi32(qlp_coeff[11]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
						mull = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6 = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7 = _mm256_set1_epi32(qlp_coeff[7 ]);
					q8 = _mm256_set1_epi32(qlp_coeff[8 ]);
					q9 = _mm256_set1_epi32(qlp_coeff[9 ]);
					q10 = _mm256_set1_epi32(qlp_coeff[10]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
						mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6 = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7 = _mm256_set1_epi32(qlp_coeff[7 ]);
					q8 = _mm256_set1_epi32(qlp_coeff[8 ]);
					q9 = _mm256_set1_epi32(qlp_coeff[9 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10)));
						mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6 = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7 = _mm256_set1_epi32(qlp_coeff[7 ]);
					q8 = _mm256_set1_epi32(qlp_coeff[8 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9)));
						mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
					q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6 = _mm256_set1_epi32(qlp_coeff[6 ]);
					q7 = _mm256_set1_epi32(qlp_coeff[7 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8)));
						mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m256i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
					q6 = _mm256_set1_epi32(qlp_coeff[6 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7)));
						mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m256i q0, q1, q2, q3, q4, q5;
					q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
					q5 = _mm256_set1_epi32(qlp_coeff[5 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6)));
						mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m256i q0, q1, q2, q3, q4;
					q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
					q4 = _mm256_set1_epi32(qlp_coeff[4 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5)));
						mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m256i q0, q1, q2, q3;
					q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
					q3 = _mm256_set1_epi32(qlp_coeff[3 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4)));
						mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m256i q0, q1, q2;
					q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
					q2 = _mm256_set1_epi32(qlp_coeff[2 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3)));
						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m256i q0, q1;
					q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
					q1 = _mm256_set1_epi32(qlp_coeff[1 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ, mull;
						summ = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2)));
						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m256i q0;
					q0 = _mm256_set1_epi32(qlp_coeff[0 ]);

					for(i = 0; i < (int)data_len-7; i+=8) {
						__m256i summ;
						summ = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1)));
						summ = _mm256_sra_epi32(summ, cnt);
						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
					}
				}
			}
		}
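		/* scalar tail for the remaining samples, using the same intentional
		   case fall-through as the 16-bit variant above */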
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12];
				case 11: sum += qlp_coeff[10] * data[i-11];
				case 10: sum += qlp_coeff[ 9] * data[i-10];
				case 9: sum += qlp_coeff[ 8] * data[i- 9];
				case 8: sum += qlp_coeff[ 7] * data[i- 8];
				case 7: sum += qlp_coeff[ 6] * data[i- 7];
				case 6: sum += qlp_coeff[ 5] * data[i- 6];
				case 5: sum += qlp_coeff[ 4] * data[i- 5];
				case 4: sum += qlp_coeff[ 3] * data[i- 4];
				case 3: sum += qlp_coeff[ 2] * data[i- 3];
				case 2: sum += qlp_coeff[ 1] * data[i- 2];
				case 1: sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	_mm256_zeroupper();
}

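/* index pattern for _mm256_permutevar8x32_epi32(): moves dwords 0,2,4,6 (the
   low halves of the four 64-bit lanes) into the lower 128 bits */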
static FLAC__int32 pack_arr[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };

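/*
 * Wide variant: the prediction sum may overflow 32 bits, so four samples at a
 * time are widened into 64-bit lanes, multiplied with _mm256_mul_epi32() and
 * accumulated in 64 bits; after the shift, the pack permutation collects the
 * four 32-bit results so a single 128-bit store writes the residuals.
 */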
FLAC__SSE_TARGET("avx2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int64 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
	__m256i pack = _mm256_loadu_si256((const __m256i *)pack_arr);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm256_sra_epi64() so we have to use _mm256_srl_epi64(); for shift counts <= 32 the low 32 bits of the result (all that is kept after packing) match an arithmetic shift */

	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
					q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
					q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
					q9 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
					q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
					q11 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[11]));

					for(i = 0; i < (int)data_len-3; i+=4) {
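						/* _mm256_cvtepu32_epi64() widens 4 samples into the 64-bit lanes;
						   _mm256_mul_epi32() multiplies only the (signed) low 32 bits of
						   each lane, so the zero-extension is harmless and the products
						   and accumulator are exact 64-bit values */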
| 793 | __m256i summ, mull; |
| 794 | summ = _mm256_mul_epi32(q11, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-12)))); |
| 795 | mull = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); summ = _mm256_add_epi64(summ, mull); |
| 796 | mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull); |
| 797 | mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull); |
| 798 | mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull); |
| 799 | mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); |
| 800 | mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); |
| 801 | mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); |
| 802 | mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); |
| 803 | mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); |
| 804 | mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); |
| 805 | mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); |
| 806 | summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); |
| 807 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); |
| 808 | } |
| 809 | } |
| 810 | else { /* order == 11 */ |
| 811 | __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10; |
| 812 | q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); |
| 813 | q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); |
| 814 | q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); |
| 815 | q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); |
| 816 | q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); |
| 817 | q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); |
| 818 | q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); |
| 819 | q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ])); |
| 820 | q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ])); |
| 821 | q9 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ])); |
| 822 | q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10])); |
| 823 | |
| 824 | for(i = 0; i < (int)data_len-3; i+=4) { |
| 825 | __m256i summ, mull; |
| 826 | summ = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); |
| 827 | mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull); |
| 828 | mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull); |
| 829 | mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull); |
| 830 | mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); |
| 831 | mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); |
| 832 | mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); |
| 833 | mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); |
| 834 | mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); |
| 835 | mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); |
| 836 | mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); |
| 837 | summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); |
| 838 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); |
| 839 | } |
| 840 | } |
| 841 | } |
| 842 | else { |
| 843 | if(order == 10) { |
| 844 | __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9; |
| 845 | q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); |
| 846 | q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); |
| 847 | q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); |
| 848 | q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); |
| 849 | q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); |
| 850 | q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); |
| 851 | q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); |
| 852 | q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ])); |
| 853 | q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ])); |
| 854 | q9 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ])); |
| 855 | |
| 856 | for(i = 0; i < (int)data_len-3; i+=4) { |
| 857 | __m256i summ, mull; |
| 858 | summ = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); |
| 859 | mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull); |
| 860 | mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull); |
| 861 | mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); |
| 862 | mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); |
| 863 | mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); |
| 864 | mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); |
| 865 | mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); |
| 866 | mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); |
| 867 | mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); |
| 868 | summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); |
| 869 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); |
| 870 | } |
| 871 | } |
| 872 | else { /* order == 9 */ |
| 873 | __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8; |
| 874 | q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ])); |
| 875 | q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ])); |
| 876 | q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ])); |
| 877 | q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ])); |
| 878 | q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ])); |
| 879 | q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ])); |
| 880 | q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ])); |
| 881 | q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ])); |
| 882 | q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ])); |
| 883 | |
| 884 | for(i = 0; i < (int)data_len-3; i+=4) { |
| 885 | __m256i summ, mull; |
| 886 | summ = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); |
| 887 | mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull); |
| 888 | mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull); |
| 889 | mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull); |
| 890 | mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull); |
| 891 | mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull); |
| 892 | mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull); |
| 893 | mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull); |
| 894 | mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull); |
| 895 | summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack); |
| 896 | _mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ))); |
| 897 | } |
| 898 | } |
| 899 | } |
| 900 | } |
| 901 | else if(order > 4) { |
			if(order > 6) {
				if(order == 8) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
					q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 ))));
						mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 7 */
					__m256i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 ))));
						mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
			else {
				if(order == 6) {
					__m256i q0, q1, q2, q3, q4, q5;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 ))));
						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 5 */
					__m256i q0, q1, q2, q3, q4;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 ))));
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m256i q0, q1, q2, q3;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 ))));
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 3 */
					__m256i q0, q1, q2;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 ))));
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
			else {
				if(order == 2) {
					__m256i q0, q1;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 ))));
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 1 */
					__m256i q0;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ;
						summ = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 ))));
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
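		/* Scalar tail: the vector loops above advance i in steps of 4, so up
		 * to three samples (all of them when data_len < 4) remain.  The switch
		 * cases fall through intentionally, accumulating exactly `order`
		 * coefficient taps per sample. */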
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				case 11: sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				case 10: sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				case 9:  sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
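		/* Orders 13..32 are not vectorized in this function; a plain scalar
		 * loop with the same intentional fall-through accumulation covers
		 * every sample. */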
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
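	/* Clear the upper halves of the YMM registers to avoid AVX/SSE
	 * transition stalls in any legacy-SSE code that runs next. */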
	_mm256_zeroupper();
}

#endif /* FLAC__AVX2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */