blob: a36447a2c68270bf7400c393ef70ccf6cde2b08b [file] [log] [blame]
Hamsalekha S8d3d3032015-03-13 21:24:58 +05301/******************************************************************************
2 *
3 * Copyright (C) 2015 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*/
20/*****************************************************************************/
21/* */
22/* File Name : ih264_deblk_chroma_ssse3.c */
23/* */
24/* Description : Contains function definitions for deblocking */
25/* */
26/* List of Functions : ih264_deblk_chroma_vert_bs4_ssse3() */
27/* ih264_deblk_chroma_horz_bs4_ssse3() */
28/* ih264_deblk_chroma_vert_bslt4_ssse3() */
29/* ih264_deblk_chroma_horz_bslt4_ssse3() */
30/* ih264_deblk_chroma_vert_bs4_mbaff_ssse3() */
31/* ih264_deblk_chroma_vert_bslt4_mbaff_ssse3() */
32/* */
33/* Issues / Problems : None */
34/* */
35/* Revision History : */
36/* */
37/* DD MM YYYY Author(s) Changes (Describe the changes made) */
38/* 12 02 2015 Naveen Kumar P Added chrom deblocking ssse3 */
39/* intrinsics */
40/* */
41/*****************************************************************************/
42
43/*****************************************************************************/
44/* File Includes */
45/*****************************************************************************/
46
47/* System include files */
48#include <stdio.h>
49
50/* User include files */
51#include "ih264_typedefs.h"
52#include "ih264_platform_macros.h"
53#include "ih264_deblk_edge_filters.h"
54#include "ih264_macros.h"
55
56/*****************************************************************************/
57/* Function Definitions */
58/*****************************************************************************/
59
60/*****************************************************************************/
61/* */
62/* Function Name : ih264_deblk_chroma_vert_bs4_ssse3() */
63/* */
64/* Description : This function performs filtering of a chroma block */
65/* vertical edge when the boundary strength is set to 4 in */
66/* high profile. */
67/* */
68/* Inputs : pu1_src - pointer to the src sample q0 of U */
69/* src_strd - source stride */
70/* alpha_cb - alpha value for the boundary in U */
71/* beta_cb - beta value for the boundary in U */
72/* alpha_cr - alpha value for the boundary in V */
73/* beta_cr - beta value for the boundary in V */
74/* */
75/* Globals : None */
76/* */
77/* Processing : This operation is described in Sec. 8.7.2.4 under the */
78/* title "Filtering process for edges for bS equal to 4" in */
79/* ITU T Rec H.264 with alpha and beta values different in */
80/* U and V. */
81/* */
82/* Outputs : None */
83/* */
84/* Returns : None */
85/* */
86/* Issues : None */
87/* */
88/* Revision History: */
89/* */
90/* DD MM YYYY Author(s) Changes (Describe the changes made) */
91/* 12 02 2015 Naveen Kumar P Initial version */
92/* */
93/*****************************************************************************/
94void ih264_deblk_chroma_vert_bs4_ssse3(UWORD8 *pu1_src,
95 WORD32 src_strd,
96 WORD32 alpha_cb,
97 WORD32 beta_cb,
98 WORD32 alpha_cr,
99 WORD32 beta_cr)
100{
101 UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
102 WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
103 WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
104 __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
105 __m128i temp1, temp2, temp3, temp4;
106
107 __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
108 __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
109 __m128i flag1, flag2;
110 __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
111 __m128i zero = _mm_setzero_si128();
112 __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
113
114 /* Load and transpose the pixel values */
115 linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
116 lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
117 linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
118 lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
119 linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
120 linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
121 lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
122 lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));
123
124 temp1 = _mm_unpacklo_epi16(linea, lineb);
125 temp2 = _mm_unpacklo_epi16(linec, lined);
126 temp3 = _mm_unpacklo_epi16(linee, linef);
127 temp4 = _mm_unpacklo_epi16(lineg, lineh);
128
129 p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
130 p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
131 q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
132 q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);
133
134 p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
135 p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
136 q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
137 q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
138 /* End of transpose */
139
140 q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
141 q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
142 p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
143 p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
144
145 diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
146 diff = _mm_abs_epi16(diff);
147 alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
148 flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
149
150 diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
151 diff = _mm_abs_epi16(diff);
152 beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
153 flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
154
155 diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
156 diff = _mm_abs_epi16(diff);
157 flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
158
159 temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
160 temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
161 temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
162 temp1 = _mm_add_epi16(temp1, temp2);
163 p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
164
165 temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
166 temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
167 temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
168 temp1 = _mm_add_epi16(temp1, temp2);
169 q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
170
171 q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
172 q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
173 p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
174 p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
175
176 diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
177 diff = _mm_abs_epi16(diff);
178 alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
179 flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
180
181 diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
182 diff = _mm_abs_epi16(diff);
183 beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
184 flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
185
186 diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
187 diff = _mm_abs_epi16(diff);
188 flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
189
190 temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
191 temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
192 temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
193 temp1 = _mm_add_epi16(temp1, temp2);
194 p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
195
196 temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
197 temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
198 temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
199 temp1 = _mm_add_epi16(temp1, temp2);
200 q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
201
202 p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
203 q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
204
205 flag1 = _mm_packs_epi16(flag1, flag2);
206
207 p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
208 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
209 p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
210 p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
211
212 q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
213 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
214 q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
215 q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
216
217 /* Inverse-transpose and store back */
218 temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
219 temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
220 temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
221 temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
222
223 linea = _mm_unpacklo_epi32(temp1, temp3);
224 lineb = _mm_srli_si128(linea, 8);
225 linec = _mm_unpackhi_epi32(temp1, temp3);
226 lined = _mm_srli_si128(linec, 8);
227 linee = _mm_unpacklo_epi32(temp2, temp4);
228 linef = _mm_srli_si128(linee, 8);
229 lineg = _mm_unpackhi_epi32(temp2, temp4);
230 lineh = _mm_srli_si128(lineg, 8);
231
232 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
233 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
234 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
235 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
236 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
237 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
238 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
239 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
240
241}
242
243/*****************************************************************************/
244/* */
245/* Function Name : ih264_deblk_chroma_horz_bs4_ssse3() */
246/* */
247/* Description : This function performs filtering of a chroma block */
248/* horizontal edge when the boundary strength is set to 4 */
249/* in high profile. */
250/* */
251/* Inputs : pu1_src - pointer to the src sample q0 of U */
252/* src_strd - source stride */
253/* alpha_cb - alpha value for the boundary in U */
254/* beta_cb - beta value for the boundary in U */
255/* alpha_cr - alpha value for the boundary in V */
256/* beta_cr - beta value for the boundary in V */
257/* */
258/* Globals : None */
259/* */
260/* Processing : This operation is described in Sec. 8.7.2.4 under the */
261/* title "Filtering process for edges for bS equal to 4" in */
262/* ITU T Rec H.264 with alpha and beta values different in */
263/* U and V. */
264/* */
265/* Outputs : None */
266/* */
267/* Returns : None */
268/* */
269/* Issues : None */
270/* */
271/* Revision History: */
272/* */
273/* DD MM YYYY Author(s) Changes (Describe the changes made) */
274/* 12 02 2015 Naveen Kumar P Initial version */
275/* */
276/*****************************************************************************/
277void ih264_deblk_chroma_horz_bs4_ssse3(UWORD8 *pu1_src,
278 WORD32 src_strd,
279 WORD32 alpha_cb,
280 WORD32 beta_cb,
281 WORD32 alpha_cr,
282 WORD32 beta_cr)
283{
284 UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
285 WORD16 i16_posP1, i16_posP0, i16_posQ1;
286
287 UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
288 WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
289 WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
290 __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
291 __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
292 __m128i flag1, flag2;
293 __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
294 __m128i zero = _mm_setzero_si128();
295 __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
296 __m128i temp1, temp2;
297
298 pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
299
300 i16_posQ1 = src_strd;
301 i16_posP0 = src_strd;
302 i16_posP1 = 0;
303
304 q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
305 q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
306 p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
307 p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));
308
309 q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
310 q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
311 p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
312 p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
313
314 diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
315 diff = _mm_abs_epi16(diff);
316 alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
317 flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
318
319 diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
320 diff = _mm_abs_epi16(diff);
321 beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
322 flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
323
324 diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
325 diff = _mm_abs_epi16(diff);
326 flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
327
328 temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
329 temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
330 temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
331 temp1 = _mm_add_epi16(temp1, temp2);
332 p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
333
334 temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
335 temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
336 temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
337 temp1 = _mm_add_epi16(temp1, temp2);
338 q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
339
340 q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
341 q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
342 p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
343 p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
344
345 diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
346 diff = _mm_abs_epi16(diff);
347 alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
348 flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
349
350 diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
351 diff = _mm_abs_epi16(diff);
352 beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
353 flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
354
355 diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
356 diff = _mm_abs_epi16(diff);
357 flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
358
359 temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
360 temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
361 temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
362 temp1 = _mm_add_epi16(temp1, temp2);
363 p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
364
365 temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
366 temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
367 temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
368 temp1 = _mm_add_epi16(temp1, temp2);
369 q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
370
371 p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
372 q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
373
374 flag1 = _mm_packs_epi16(flag1, flag2);
375
376 p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
377 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
378 p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
379 p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
380 _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);
381
382 q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
383 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
384 q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
385 q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
386 _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);
387
388}
389
390/*****************************************************************************/
391/* */
392/* Function Name : ih264_deblk_chroma_vert_bslt4_ssse3() */
393/* */
394/* Description : This function performs filtering of a chroma block */
395/* vertical edge when the boundary strength is less than 4 */
396/* in high profile. */
397/* */
398/* Inputs : pu1_src - pointer to the src sample q0 of U */
399/* src_strd - source stride */
400/* alpha_cb - alpha value for the boundary in U */
401/* beta_cb - beta value for the boundary in U */
402/* alpha_cr - alpha value for the boundary in V */
403/* beta_cr - beta value for the boundary in V */
404/* u4_bs - packed Boundary strength array */
405/* pu1_cliptab_cb - tc0_table for U */
406/* pu1_cliptab_cr - tc0_table for V */
407/* */
408/* Globals : None */
409/* */
410/* Processing : This operation is described in Sec. 8.7.2.3 under the */
411/* title "Filtering process for edges for bS less than 4" */
412/* in ITU T Rec H.264 with alpha and beta values different */
413/* in U and V. */
414/* */
415/* Outputs : None */
416/* */
417/* Returns : None */
418/* */
419/* Issues : None */
420/* */
421/* Revision History: */
422/* */
423/* DD MM YYYY Author(s) Changes (Describe the changes made) */
424/* 12 02 2015 Naveen Kumar P Initial version */
425/* */
426/*****************************************************************************/
427void ih264_deblk_chroma_vert_bslt4_ssse3(UWORD8 *pu1_src,
428 WORD32 src_strd,
429 WORD32 alpha_cb,
430 WORD32 beta_cb,
431 WORD32 alpha_cr,
432 WORD32 beta_cr,
433 UWORD32 u4_bs,
434 const UWORD8 *pu1_cliptab_cb,
435 const UWORD8 *pu1_cliptab_cr)
436{
437 UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
438 UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
439 WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
440 WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
441 __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
442 __m128i temp1, temp2, temp3, temp4;
443
444 __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
445 __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
446 __m128i flag_bs, flag1, flag2;
447 __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
448 __m128i zero = _mm_setzero_si128();
449 __m128i C0_uv_8x16;
450 __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
451
452 u1_Bs0 = (u4_bs >> 24) & 0xff;
453 u1_Bs1 = (u4_bs >> 16) & 0xff;
454 u1_Bs2 = (u4_bs >> 8) & 0xff;
455 u1_Bs3 = (u4_bs >> 0) & 0xff;
456
457 flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
458 u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
459 u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
460 flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
461 flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
462
463 /* Load and transpose the pixel values */
464 linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
465 lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
466 linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
467 lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
468 linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
469 linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
470 lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
471 lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));
472
473 temp1 = _mm_unpacklo_epi16(linea, lineb);
474 temp2 = _mm_unpacklo_epi16(linec, lined);
475 temp3 = _mm_unpacklo_epi16(linee, linef);
476 temp4 = _mm_unpacklo_epi16(lineg, lineh);
477
478 p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
479 p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
480 q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
481 q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);
482
483 p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
484 p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
485 q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
486 q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
487 /* End of transpose */
488
489 q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
490 q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
491 p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
492 p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
493
494 diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
495 diff = _mm_abs_epi16(diff);
496 alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
497 flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
498
499 diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
500 diff = _mm_abs_epi16(diff);
501 beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
502 flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
503
504 diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
505 diff = _mm_abs_epi16(diff);
506 flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
507
508 diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
509 diff = _mm_slli_epi16(diff, 2);
510 diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
511 diff = _mm_add_epi16(diff, diff1);
512 diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
513 in_macro = _mm_srai_epi16(diff, 3);
514
515 C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
516 pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
517 pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
518 pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
519
520 C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
521
522 in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
523 C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
524 in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
525
526 p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
527 q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
528
529 q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
530 q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
531 p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
532 p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
533
534 diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
535 diff = _mm_abs_epi16(diff);
536 alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
537 flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
538
539 diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
540 diff = _mm_abs_epi16(diff);
541 beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
542 flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
543
544 diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
545 diff = _mm_abs_epi16(diff);
546 flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
547
548 diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
549 diff = _mm_slli_epi16(diff, 2);
550 diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
551 diff = _mm_add_epi16(diff, diff1);
552 diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
553 in_macro = _mm_srai_epi16(diff, 3);
554
555 C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
556 pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
557 pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
558 pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
559
560 C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
561
562 in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
563 C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
564 in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
565
566 p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
567 q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);
568
569 p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
570 q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
571
572 flag1 = _mm_packs_epi16(flag1, flag2);
573 flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
574
575 p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
576 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
577 p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
578 p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
579
580 q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
581 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
582 q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
583 q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
584
585 /* Inverse-transpose and store back */
586 temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
587 temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
588 temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
589 temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
590
591 linea = _mm_unpacklo_epi32(temp1, temp3);
592 lineb = _mm_srli_si128(linea, 8);
593 linec = _mm_unpackhi_epi32(temp1, temp3);
594 lined = _mm_srli_si128(linec, 8);
595 linee = _mm_unpacklo_epi32(temp2, temp4);
596 linef = _mm_srli_si128(linee, 8);
597 lineg = _mm_unpackhi_epi32(temp2, temp4);
598 lineh = _mm_srli_si128(lineg, 8);
599
600 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
601 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
602 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
603 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
604 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
605 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
606 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
607 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
608
609}
610
611/*****************************************************************************/
612/* */
613/* Function Name : ih264_deblk_chroma_horz_bslt4_ssse3() */
614/* */
615/* Description : This function performs filtering of a chroma block */
616/* horizontal edge when the boundary strength is less than */
617/* 4 in high profile. */
618/* */
619/* Inputs : pu1_src - pointer to the src sample q0 of U */
620/* src_strd - source stride */
621/* alpha_cb - alpha value for the boundary in U */
622/* beta_cb - beta value for the boundary in U */
623/* alpha_cr - alpha value for the boundary in V */
624/* beta_cr - beta value for the boundary in V */
625/* u4_bs - packed Boundary strength array */
626/* pu1_cliptab_cb - tc0_table for U */
627/* pu1_cliptab_cr - tc0_table for V */
628/* */
629/* Globals : None */
630/* */
631/* Processing : This operation is described in Sec. 8.7.2.3 under the */
632/* title "Filtering process for edges for bS less than 4" */
633/* in ITU T Rec H.264 with alpha and beta values different */
634/* in U and V. */
635/* */
636/* Outputs : None */
637/* */
638/* Returns : None */
639/* */
640/* Issues : None */
641/* */
642/* Revision History: */
643/* */
644/* DD MM YYYY Author(s) Changes (Describe the changes made) */
645/* 12 02 2015 Naveen Kumar P Initial version */
646/* */
647/*****************************************************************************/
648void ih264_deblk_chroma_horz_bslt4_ssse3(UWORD8 *pu1_src,
649 WORD32 src_strd,
650 WORD32 alpha_cb,
651 WORD32 beta_cb,
652 WORD32 alpha_cr,
653 WORD32 beta_cr,
654 UWORD32 u4_bs,
655 const UWORD8 *pu1_cliptab_cb,
656 const UWORD8 *pu1_cliptab_cr)
657{
658 UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
659 WORD16 i16_posP1, i16_posP0, i16_posQ1;
660 UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
661
662 UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
663 WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
664 WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
665 __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
666 __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
667 __m128i flag_bs, flag1, flag2;
668 __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
669 __m128i zero = _mm_setzero_si128();
670 __m128i C0_uv_8x16;
671 __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
672
673 pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
674
675 i16_posQ1 = src_strd;
676 i16_posP0 = src_strd;
677 i16_posP1 = 0;
678
679 u1_Bs0 = (u4_bs >> 24) & 0xff;
680 u1_Bs1 = (u4_bs >> 16) & 0xff;
681 u1_Bs2 = (u4_bs >> 8) & 0xff;
682 u1_Bs3 = (u4_bs >> 0) & 0xff;
683
684 flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
685 u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
686 u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
687 flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
688 flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
689
690 q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
691 q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
692 p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
693 p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));
694
695 q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
696 q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
697 p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
698 p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
699
700 diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
701 diff = _mm_abs_epi16(diff);
702 alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
703 flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
704
705 diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
706 diff = _mm_abs_epi16(diff);
707 beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
708 flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
709
710 diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
711 diff = _mm_abs_epi16(diff);
712 flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
713
714 diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
715 diff = _mm_slli_epi16(diff, 2);
716 diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
717 diff = _mm_add_epi16(diff, diff1);
718 diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
719 in_macro = _mm_srai_epi16(diff, 3);
720
721 C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
722 pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
723 pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
724 pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
725
726 C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
727
728 in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
729 C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
730 in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
731
732 p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
733 q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
734
735 q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
736 q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
737 p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
738 p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
739
740 diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
741 diff = _mm_abs_epi16(diff);
742 alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
743 flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
744
745 diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
746 diff = _mm_abs_epi16(diff);
747 beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
748 flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
749
750 diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
751 diff = _mm_abs_epi16(diff);
752 flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
753
754 diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
755 diff = _mm_slli_epi16(diff, 2);
756 diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
757 diff = _mm_add_epi16(diff, diff1);
758 diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
759 in_macro = _mm_srai_epi16(diff, 3);
760
761 C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
762 pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
763 pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
764 pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
765
766 C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
767
768 in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
769 C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
770 in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
771
772 p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
773 q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);
774
775 p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
776 q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
777
778 flag1 = _mm_packs_epi16(flag1, flag2);
779 flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
780
781 p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
782 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
783 p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
784 p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
785 _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);
786
787 q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
788 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
789 q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
790 q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
791 _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);
792
793}
794
795/*****************************************************************************/
796/* */
797/* Function Name : ih264_deblk_chroma_vert_bs4_mbaff_ssse3() */
798/* */
799/* Description : This function performs filtering of a chroma block */
800/* vertical edge when boundary strength is set to 4 in high */
801/* profile. */
802/* */
803/* Inputs : pu1_src - pointer to the src sample q0 of U */
804/* src_strd - source stride */
805/* alpha_cb - alpha value for the boundary in U */
806/* beta_cb - beta value for the boundary in U */
807/* alpha_cr - alpha value for the boundary in V */
808/* beta_cr - beta value for the boundary in V */
809/* u4_bs - packed Boundary strength array */
810/* pu1_cliptab_cb - tc0_table for U */
811/* pu1_cliptab_cr - tc0_table for V */
812/* */
813/* Globals : None */
814/* */
815/* Processing : When the function is called twice, this operation is as */
816/* described in Sec. 8.7.2.4 under the title "Filtering */
817/* process for edges for bS equal to 4" in ITU T Rec H.264 */
818/* with alpha and beta values different in U and V. */
819/* */
820/* Outputs : None */
821/* */
822/* Returns : None */
823/* */
824/* Issues : None */
825/* */
826/* Revision History: */
827/* */
828/* DD MM YYYY Author(s) Changes (Describe the changes made) */
829/* 12 02 2015 Naveen Kumar P Initial version */
830/* */
831/*****************************************************************************/
832void ih264_deblk_chroma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
833 WORD32 src_strd,
834 WORD32 alpha_cb,
835 WORD32 beta_cb,
836 WORD32 alpha_cr,
837 WORD32 beta_cr)
838{
839 UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
840 WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
841 WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
842 __m128i linea, lineb, linec, lined;
843 __m128i temp1, temp2;
844
845 __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
846 __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
847 __m128i flag1;
848 __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
849 __m128i zero = _mm_setzero_si128();
850 __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
851
852 /* Load and transpose the pixel values */
853 linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
854 lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
855 linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
856 lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
857
858 temp1 = _mm_unpacklo_epi16(linea, lineb);
859 temp2 = _mm_unpacklo_epi16(linec, lined);
860
861 p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
862 p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
863 q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
864 q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
865 /* End of transpose */
866
867 q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
868 q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
869 p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
870 p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
871
872 diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
873 diff = _mm_abs_epi16(diff);
874 alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
875 flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
876
877 diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
878 diff = _mm_abs_epi16(diff);
879 beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
880 flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
881
882 diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
883 diff = _mm_abs_epi16(diff);
884 flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
885
886 temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
887 temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
888 temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
889 temp1 = _mm_add_epi16(temp1, temp2);
890 p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
891
892 temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
893 temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
894 temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
895 temp1 = _mm_add_epi16(temp1, temp2);
896 q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
897
898 p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
899 q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);
900
901 flag1 = _mm_packs_epi16(flag1, flag1);
902
903 p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
904 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
905 p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
906 p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
907
908 q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
909 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
910 q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
911 q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
912
913 /* Inverse-transpose and store back */
914 temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
915 temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
916
917 linea = _mm_unpacklo_epi32(temp1, temp2);
918 lineb = _mm_srli_si128(linea, 8);
919 linec = _mm_unpackhi_epi32(temp1, temp2);
920 lined = _mm_srli_si128(linec, 8);
921
922 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
923 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
924 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
925 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
926
927}
928
929/*****************************************************************************/
930/* */
931/* Function Name : ih264_deblk_chroma_vert_bslt4_mbaff_ssse3() */
932/* */
933/* Description : This function performs filtering of a chroma block */
934/* vertical edge when boundary strength is less than 4 in */
935/* high profile. */
936/* */
937/* Inputs : pu1_src - pointer to the src sample q0 of U */
938/* src_strd - source stride */
939/* alpha_cb - alpha value for the boundary in U */
940/* beta_cb - beta value for the boundary in U */
941/* alpha_cr - alpha value for the boundary in V */
942/* beta_cr - beta value for the boundary in V */
943/* u4_bs - packed Boundary strength array */
944/* pu1_cliptab_cb - tc0_table for U */
945/* pu1_cliptab_cr - tc0_table for V */
946/* */
947/* Globals : None */
948/* */
949/* Processing : When the function is called twice, this operation is as */
950/* described in Sec. 8.7.2.4 under the title "Filtering */
951/* process for edges for bS less than 4" in ITU T Rec H.264 */
952/* with alpha and beta values different in U and V. */
953/* */
954/* Outputs : None */
955/* */
956/* Returns : None */
957/* */
958/* Issues : None */
959/* */
960/* Revision History: */
961/* */
962/* DD MM YYYY Author(s) Changes (Describe the changes made) */
963/* 12 02 2015 Naveen Kumar P Initial version */
964/* */
965/*****************************************************************************/
966void ih264_deblk_chroma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
967 WORD32 src_strd,
968 WORD32 alpha_cb,
969 WORD32 beta_cb,
970 WORD32 alpha_cr,
971 WORD32 beta_cr,
972 UWORD32 u4_bs,
973 const UWORD8 *pu1_cliptab_cb,
974 const UWORD8 *pu1_cliptab_cr)
975{
976 UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
977 UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
978 WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
979 WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
980 __m128i linea, lineb, linec, lined;
981 __m128i temp1, temp2;
982
983 __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
984 __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
985 __m128i flag_bs, flag1;
986 __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
987 __m128i zero = _mm_setzero_si128();
988 __m128i C0_uv_8x16;
989 __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
990
991 u1_Bs0 = (u4_bs >> 24) & 0xff;
992 u1_Bs1 = (u4_bs >> 16) & 0xff;
993 u1_Bs2 = (u4_bs >> 8) & 0xff;
994 u1_Bs3 = (u4_bs >> 0) & 0xff;
995
996 flag_bs = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
997 u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);
998 flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
999 flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
1000
1001 /* Load and transpose the pixel values */
1002 linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
1003 lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
1004 linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
1005 lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
1006
1007 temp1 = _mm_unpacklo_epi16(linea, lineb);
1008 temp2 = _mm_unpacklo_epi16(linec, lined);
1009
1010 p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
1011 p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
1012 q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
1013 q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
1014 /* End of transpose */
1015
1016 q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
1017 q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
1018 p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
1019 p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
1020
1021 diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
1022 diff = _mm_abs_epi16(diff);
1023 alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
1024 flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
1025
1026 diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
1027 diff = _mm_abs_epi16(diff);
1028 beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
1029 flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
1030
1031 diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
1032 diff = _mm_abs_epi16(diff);
1033 flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
1034
1035 diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
1036 diff = _mm_slli_epi16(diff, 2);
1037 diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
1038 diff = _mm_add_epi16(diff, diff1);
1039 diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
1040 in_macro = _mm_srai_epi16(diff, 3);
1041
1042 C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
1043 pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
1044 pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
1045 pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
1046
1047 C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
1048
1049 in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
1050 C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
1051 in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
1052
1053 p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
1054 q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
1055
1056 p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
1057 q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);
1058
1059 flag1 = _mm_packs_epi16(flag1, flag1);
1060 flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
1061
1062 p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
1063 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
1064 p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
1065 p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
1066
1067 q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
1068 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
1069 q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
1070 q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
1071
1072 /* Inverse-transpose and store back */
1073 temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
1074 temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
1075
1076 linea = _mm_unpacklo_epi32(temp1, temp2);
1077 lineb = _mm_srli_si128(linea, 8);
1078 linec = _mm_unpackhi_epi32(temp1, temp2);
1079 lined = _mm_srli_si128(linec, 8);
1080
1081 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
1082 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
1083 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
1084 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
1085
1086}
1087