blob: 6a3883ea771d00e00c6ada0a550f8d3e760b8cf5 [file] [log] [blame]
Harish Mahendrakar0d8951c2014-05-16 10:31:13 -07001/******************************************************************************
2*
3* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4*
5* Licensed under the Apache License, Version 2.0 (the "License");
6* you may not use this file except in compliance with the License.
7* You may obtain a copy of the License at:
8*
9* http://www.apache.org/licenses/LICENSE-2.0
10*
11* Unless required by applicable law or agreed to in writing, software
12* distributed under the License is distributed on an "AS IS" BASIS,
13* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14* See the License for the specific language governing permissions and
15* limitations under the License.
16*
17******************************************************************************/
18/**
19*******************************************************************************
20* @file
21* ihevc_chroma_intra_pred_filters_atom_intr.c
22*
23* @brief
24* Contains function Definition for intra prediction interpolation filters
25*
26*
27* @author
28* Ittiam
29*
30* @par List of Functions:
31* ihevc_intra_pred_chroma_planar_ssse3()
32*
33* ihevc_intra_pred_chroma_dc_ssse3()
34*
35* ihevc_intra_pred_chroma_horz_ssse3()
36*
37* ihevc_intra_pred_chroma_ver_ssse3()
38*
39* ihevc_intra_pred_chroma_mode2_ssse3()
40*
41* ihevc_intra_pred_chroma_mode_18_34_ssse3()
42*
43* ihevc_intra_pred_chroma_mode_3_to_9_ssse3()
44*
45* ihevc_intra_pred_chroma_mode_11_to_17_ssse3()
46*
47* ihevc_intra_pred_chroma_mode_19_to_25_ssse3()
48*
49* ihevc_intra_pred_chroma_mode_27_to_33_ssse3()
50*
51*
52*
53* @remarks
54* None
55*
56*******************************************************************************
57*/
58
59
60/*****************************************************************************/
61/* File Includes */
62/*****************************************************************************/
63
64#include "ihevc_typedefs.h"
65#include "ihevc_platform_macros.h"
66#include "ihevc_macros.h"
67#include "ihevc_func_selector.h"
68#include "ihevc_intra_pred.h"
69
70#include "ihevc_chroma_intra_pred.h"
71#include "ihevc_common_tables.h"
72#include "ihevc_tables_x86_intr.h"
73
74#include <mmintrin.h>
75#include <xmmintrin.h>
76#include <emmintrin.h>
77
78#include <immintrin.h>
79
80
81/****************************************************************************/
82/* Constant Macros */
83/****************************************************************************/
84#define MAX_CU_SIZE 64
85#define BIT_DEPTH 8
86#define T32_4NT 128
87#define T16_4NT 64
88#define T16C_4NT 64
89#define T8C_4NT 32
90/****************************************************************************/
91/* Function Macros */
92/****************************************************************************/
93
/* GET_BIT(y, x): extract bit x of y as 0 or 1.
 * Fully parenthesised so the macro is safe inside larger expressions.
 * The previous expansion '((y) & (1 << x)) && (1 << x)' produced the same
 * 0/1 value but had no outer parentheses (so e.g. 'GET_BIT(a, b) | c'
 * parsed incorrectly) and left 'x' unparenthesised in the shifts. */
#define GET_BIT(y,x) (((y) >> (x)) & 1)
95
96/* tables to shuffle 8-bit values */
97
98/*****************************************************************************/
99/* Function Definition */
100/*****************************************************************************/
101
102
103
104/**
105*******************************************************************************
106*
107* @brief
108* Planar Intraprediction with reference neighboring samples location
109* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer
110* to section 8.4.4.2.4 in the standard
111*
112* @par Description:
113*
114*
115* @param[in] pu1_src
116* UWORD8 pointer to the source
117*
118* @param[in] pu1_dst
119* UWORD8 pointer to the destination
120*
121* @param[in] src_strd
122* integer source stride
123*
124* @param[in] dst_strd
125* integer destination stride
126*
127* @param[in] nt
128* integer Transform Block size
129*
130* @param[in] mode
131* integer intraprediction mode
132*
133* @returns
134*
135* @remarks
136* None
137*
138*******************************************************************************
139*/
140
/**
 * Planar intra prediction (HEVC section 8.4.4.2.4) for interleaved-UV chroma.
 *
 * Each predicted sample is a weighted blend of four reference samples:
 *   pred(x,y) = ( (nt-1-x)*left(y) + (x+1)*top_right
 *               + (nt-1-y)*top(x)  + (y+1)*bottom_left + nt ) >> (log2nt + 1)
 * computed here on interleaved U/V byte pairs (2 bytes per chroma pixel,
 * so every pixel index into pu1_ref/pu1_dst is doubled).
 *
 * @param pu1_ref   neighbour reference samples, UV interleaved
 * @param src_strd  unused for this mode
 * @param pu1_dst   output block, 2*nt bytes per row
 * @param dst_strd  destination stride in bytes
 * @param nt        transform block size (4, 8 or 16 for chroma)
 * @param mode      unused for this mode
 */
void ihevc_intra_pred_chroma_planar_ssse3(UWORD8 *pu1_ref,
                                          WORD32 src_strd,
                                          UWORD8 *pu1_dst,
                                          WORD32 dst_strd,
                                          WORD32 nt,
                                          WORD32 mode)
{

    WORD32 row, col;
    WORD32 log2nt = 5;
    WORD32 two_nt, three_nt;

    __m128i const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
    __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b;
    UNUSED(src_strd);
    UNUSED(mode);
    /* log2nt = log2(nt); feeds the final (log2nt + 1) normalising shift */
    switch(nt)
    {
        case 16:
            log2nt = 4;
            break;
        case 8:
            log2nt = 3;
            break;
        case 4:
            log2nt = 2;
            break;
        default:
            break;
    }
    two_nt = 2 * nt;
    three_nt = 3 * nt;

    /* Planar filtering */

/* Reference samples used below (pixel-pair indices, doubled for UV bytes): */

// pu1_ref[2*(two_nt - 1 - row)]   : left column sample for this row
// pu1_ref[2 * (three_nt + 1)]     : top-right sample
// pu1_ref[2 * (two_nt + 1) + col] : top row samples
// pu1_ref[2 * (nt - 1)]           : bottom-left sample

    /* Broadcast the top-right U/V pair into all eight 16-bit lanes
     * (lanes alternate U,V,U,V,... to match the interleaved layout). */
    const_temp_4x32b = _mm_set_epi16(pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1],
                                     pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)],
                                     pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)]);

    /* Broadcast the bottom-left U/V pair the same way. */
    const_temp1_4x32b = _mm_set_epi16(pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)],
                                      pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)]);

    const_temp4_4x32b = _mm_set1_epi16(nt - 1);  /* (nt - 1), used to form (nt - 1 - col) */
    const_temp6_4x32b = _mm_set1_epi16(nt);      /* rounding offset 'nt' */
    const_temp7_4x32b = _mm_set1_epi16(4);       /* per-iteration column step (4 pixels) */

    zero_8x16b = _mm_set1_epi32(0);


    /* Always true for the chroma sizes handled here (4/8/16). */
    if(nt % 4 == 0)
    {
        const_temp7_4x32b = _mm_set1_epi16(4);

        for(row = 0; row < nt; row++)
        {
            __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b;
            __m128i res_temp3_8x16b;

            /* Broadcast this row's left-column U/V pair. */
            const_temp2_4x32b = _mm_set_epi16(pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1],
                                              pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)],
                                              pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)]);

            const_temp3_4x32b = _mm_set1_epi16((row + 1));       /* (row + 1) weight */
            row_8x16b = _mm_set1_epi16((nt - 1 - row));          /* (nt - 1 - row) weight */

            /* Column indices for the first 4 pixels, one U/V lane pair each. */
            const_temp5_4x32b = _mm_set_epi16(3, 3, 2, 2, 1, 1, 0, 0);
            col_8x16b = _mm_set_epi16(4, 4, 3, 3, 2, 2, 1, 1);   /* (col + 1) */

            const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b);  /* (nt - 1 - col) */

            /*(row + 1) * pu1_ref[nt - 1]*/
            res_temp_8x16b = _mm_mullo_epi16(const_temp3_4x32b, const_temp1_4x32b);

            /*(row + 1) * pu1_ref[nt - 1] + nt)*/
            res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);

            /* Process 4 chroma pixels (8 interleaved UV bytes) per iteration. */
            for(col = 0; col < 2 * nt; col += 8)
            {
                __m128i src_temp_8x16b;

                /* loading 8-bit 16 pixels */
                src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (two_nt + 1) + col));

                //src_temp_8x16b = _mm_cvtepu8_epi16 (src_temp_8x16b); /* row=0*/
                /* widen the low 8 top-row bytes to 16 bits */
                src_temp_8x16b = _mm_unpacklo_epi8(src_temp_8x16b, zero_8x16b);

                /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */
                res_temp1_8x16b = _mm_mullo_epi16(src_temp_8x16b, row_8x16b);

                /*(col + 1) * pu1_ref[three_nt + 1]*/
                res_temp2_8x16b = _mm_mullo_epi16(const_temp_4x32b, col_8x16b);

                /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/
                res_temp3_8x16b = _mm_mullo_epi16(const_temp2_4x32b, const_temp5_4x32b);

                /* sum all four weighted terms plus the rounding offset */
                res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b);
                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
                res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b);

                /* normalise and saturate back to 8 bits */
                res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, (log2nt + 1));
                res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);

                _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd) + col), res_temp1_8x16b);

                /* advance column weights by 4 pixels */
                const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b);
                col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b);
            } /* inner loop ends here */
        }
    }
}
258
259
260/**
261*******************************************************************************
262*
263* @brief
264* Intraprediction for DC mode with reference neighboring samples location
265* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer
266* to section 8.4.4.2.5 in the standard
267*
268* @par Description:
269*
270*
271* @param[in] pu1_src
272* UWORD8 pointer to the source
273*
274* @param[in] pu1_dst
275* UWORD8 pointer to the destination
276*
277* @param[in] src_strd
278* integer source stride
279*
280* @param[in] dst_strd
281* integer destination stride
282*
283* @param[in] nt
284* integer Transform Block size (Chroma)
285*
286* @param[in] mode
287* integer intraprediction mode
288*
289* @returns
290*
291* @remarks
292* None
293*
294*******************************************************************************
295*/
296
/**
 * DC intra prediction (HEVC section 8.4.4.2.5) for interleaved-UV chroma.
 *
 * Computes separate DC values for U and V by averaging the nt top and nt
 * left neighbour samples of each plane, then fills the whole nt x nt block
 * (2*nt bytes per row) with the interleaved pair.
 *
 * @param pu1_ref   neighbour reference samples, UV interleaved
 * @param src_strd  unused for this mode
 * @param pu1_dst   output block, 2*nt bytes per row
 * @param dst_strd  destination stride in bytes
 * @param nt        transform block size (4, 8 or 16 for chroma)
 * @param mode      unused for this mode
 */
void ihevc_intra_pred_chroma_dc_ssse3(UWORD8 *pu1_ref,
                                      WORD32 src_strd,
                                      UWORD8 *pu1_dst,
                                      WORD32 dst_strd,
                                      WORD32 nt,
                                      WORD32 mode)
{

    WORD32 acc_dc_u, acc_dc_v;
    WORD32 dc_val_u, dc_val_v;
    WORD32 row;
    WORD32 log2nt = 5;
    __m128i src_temp1, src_temp3, src_temp4, src_temp5, src_temp6, m_mask;
    __m128i src_temp7, src_temp8, src_temp9, src_temp10;
    __m128i m_zero = _mm_set1_epi32(0);
    UNUSED(src_strd);
    UNUSED(mode);

    /* log2nt = log2(nt); feeds the final (log2nt + 1) averaging shift */
    switch(nt)
    {
        case 32:
            log2nt = 5;
            break;
        case 16:
            log2nt = 4;
            break;
        case 8:
            log2nt = 3;
            break;
        case 4:
            log2nt = 2;
            break;
        default:
            break;
    }

    acc_dc_u = 0;
    acc_dc_v = 0;

    /* Calculate DC value for the transform block */

    /* Byte shuffle that regroups the interleaved U/V 16-bit partial sums so
     * the two horizontal adds below leave the U total and the V total in the
     * low two 16-bit lanes (mask defined in ihevc_tables_x86_intr). */
    m_mask = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY9[0]);

    /* Each branch sums the 4*nt reference bytes starting at byte offset
     * 2*nt (i.e. the left column followed by the top row, UV interleaved):
     * bytes are widened to 16 bits and accumulated pairwise. */
    if(nt == 16)
    {
        __m128i temp_sad, sign_8x16b;

        src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
        src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
        src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 32));
        src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 48));

        /* widen low halves to 16-bit */
        src_temp5 = _mm_unpacklo_epi8(src_temp3, m_zero);
        src_temp6 = _mm_unpacklo_epi8(src_temp4, m_zero);
        src_temp9 = _mm_unpacklo_epi8(src_temp7, m_zero);
        src_temp10 = _mm_unpacklo_epi8(src_temp8, m_zero);

        /* bring high halves down, then widen them too */
        src_temp3 = _mm_srli_si128(src_temp3, 8);
        src_temp4 = _mm_srli_si128(src_temp4, 8);
        src_temp7 = _mm_srli_si128(src_temp7, 8);
        src_temp8 = _mm_srli_si128(src_temp8, 8);

        src_temp3 = _mm_unpacklo_epi8(src_temp3, m_zero);
        src_temp4 = _mm_unpacklo_epi8(src_temp4, m_zero);
        src_temp7 = _mm_unpacklo_epi8(src_temp7, m_zero);
        src_temp8 = _mm_unpacklo_epi8(src_temp8, m_zero);

        /* tree-reduce the eight 8x16-bit partial vectors into one */
        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
        src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
        src_temp10 = _mm_add_epi16(src_temp9, src_temp10);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp8 = _mm_add_epi16(src_temp8, src_temp10);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
        /* separate U lanes from V lanes, then horizontally add each group */
        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

        /* sign-extend the two 16-bit totals to 32 bits */
        sign_8x16b = _mm_cmpgt_epi16(m_zero, src_temp4);
        src_temp4 = _mm_unpacklo_epi16(src_temp4, sign_8x16b);

        temp_sad = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
        acc_dc_u = _mm_cvtsi128_si32(src_temp4);
        acc_dc_v = _mm_cvtsi128_si32(temp_sad);
    }

    else if(nt == 8)
    {
        __m128i temp_sad, sign_8x16b;
        src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
        src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));

        src_temp5 = _mm_unpacklo_epi8(src_temp3, m_zero);
        src_temp6 = _mm_unpacklo_epi8(src_temp4, m_zero);

        src_temp3 = _mm_srli_si128(src_temp3, 8);
        src_temp4 = _mm_srli_si128(src_temp4, 8);

        src_temp3 = _mm_unpacklo_epi8(src_temp3, m_zero);
        src_temp4 = _mm_unpacklo_epi8(src_temp4, m_zero);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

        sign_8x16b = _mm_cmpgt_epi16(m_zero, src_temp4);
        src_temp4 = _mm_unpacklo_epi16(src_temp4, sign_8x16b);

        temp_sad = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
        acc_dc_u = _mm_cvtsi128_si32(src_temp4);
        acc_dc_v = _mm_cvtsi128_si32(temp_sad);
    }

    else if(nt == 4)
    {
        __m128i temp_sad, sign_8x16b;
        src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));

        src_temp5 = _mm_unpacklo_epi8(src_temp3, m_zero);
        src_temp4 = _mm_srli_si128(src_temp3, 8);

        src_temp4 = _mm_unpacklo_epi8(src_temp4, m_zero);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp5);

        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

        sign_8x16b = _mm_cmpgt_epi16(m_zero, src_temp4);
        src_temp4 = _mm_unpacklo_epi16(src_temp4, sign_8x16b);

        temp_sad = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
        acc_dc_u = _mm_cvtsi128_si32(src_temp4);
        acc_dc_v = _mm_cvtsi128_si32(temp_sad);
    }


    /* The vector sums above cover byte range [2*nt, 6*nt): they include the
     * top-left corner pair (bytes 4*nt, 4*nt+1), which does not belong in
     * the DC average, and miss the last top-row pair (bytes 6*nt, 6*nt+1).
     * Fix both up with scalar adds/subtracts. */
    acc_dc_u += pu1_ref[6 * nt];
    acc_dc_v += pu1_ref[6 * nt + 1];

    acc_dc_u -= pu1_ref[4 * nt];
    acc_dc_v -= pu1_ref[4 * nt + 1];

    /* average 2*nt samples per plane, with rounding */
    dc_val_u = (acc_dc_u + nt) >> (log2nt + 1);
    dc_val_v = (acc_dc_v + nt) >> (log2nt + 1);

    /* pack U into the low byte and V into the high byte of a 16-bit value
     * so _mm_set1_epi16 reproduces the interleaved UV pattern */
    dc_val_u = dc_val_u | (dc_val_v << 8);

    /* Fill the remaining rows with DC value*/

    if(nt == 4)
    {
        src_temp1 = _mm_set1_epi16(dc_val_u);

        /* pu1_dst[(row * dst_strd) + col] = dc_val;*/
        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);

    }
    else if(nt == 8)
    {
        src_temp1 = _mm_set1_epi16(dc_val_u);

        /* pu1_dst[(row * dst_strd) + col] = dc_val;*/
        _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);

        _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);

    }

    else /* nt == 16 */
    {
        src_temp1 = _mm_set1_epi16(dc_val_u);

        /* two passes of 8 rows, 32 bytes (two stores) per row */
        for(row = 0; row < nt; row += 8)
        {
            /* pu1_dst[(row * dst_strd) + col] = dc_val;*/
            _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp1);

            _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp1);

            pu1_dst += 8 * dst_strd;
        }
    }

}
512
513
514/**
515*******************************************************************************
516*
517* @brief
518* Horizontal intraprediction(mode 10) with reference samples location
519* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer
520* to section 8.4.4.2.6 in the standard (Special case)
521*
522* @par Description:
523*
524*
525* @param[in] pu1_src
526* UWORD8 pointer to the source
527*
528* @param[in] pu1_dst
529* UWORD8 pointer to the destination
530*
531* @param[in] src_strd
532* integer source stride
533*
534* @param[in] dst_strd
535* integer destination stride
536*
537* @param[in] nt
538* integer Transform Block size
539*
540* @param[in] mode
541* integer intraprediction mode
542*
543* @returns
544*
545* @remarks
546* None
547*
548*******************************************************************************
549*/
550
551void ihevc_intra_pred_chroma_horz_ssse3(UWORD8 *pu1_ref,
552 WORD32 src_strd,
553 UWORD8 *pu1_dst,
554 WORD32 dst_strd,
555 WORD32 nt,
556 WORD32 mode)
557{
558
559 WORD32 row;
560 __m128i temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
561 UNUSED(src_strd);
562 UNUSED(mode);
563
564 /* Replication to next rows*/
565
566 if(nt == 8)
567 {
568 for(row = 0; row < nt; row += 4)
569 {
570 temp1 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 0)]);
571 temp2 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 0)]);
572 temp3 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 1)]);
573 temp4 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 1)]);
574 temp5 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 2)]);
575 temp6 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 2)]);
576 temp7 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 3)]);
577 temp8 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 3)]);
578
579 temp2 = _mm_unpacklo_epi8(temp1, temp2);
580 temp4 = _mm_unpacklo_epi8(temp3, temp4);
581 temp6 = _mm_unpacklo_epi8(temp5, temp6);
582 temp8 = _mm_unpacklo_epi8(temp7, temp8);
583
584 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), temp2);
585 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), temp4);
586 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), temp6);
587 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), temp8);
588
589 }
590 }
591 else if(nt == 16)
592 {
593 for(row = 0; row < nt; row += 4)
594 {
595 temp1 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 0)]);
596 temp2 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 0)]);
597
598 temp3 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 1)]);
599 temp4 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 1)]);
600
601 temp5 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 2)]);
602 temp6 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 2)]);
603
604 temp7 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 3)]);
605 temp8 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 3)]);
606
607 temp2 = _mm_unpacklo_epi8(temp1, temp2);
608 temp4 = _mm_unpacklo_epi8(temp3, temp4);
609 temp6 = _mm_unpacklo_epi8(temp5, temp6);
610 temp8 = _mm_unpacklo_epi8(temp7, temp8);
611
612 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd) + 0), temp2);
613 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd) + 16), temp2);
614
615 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd) + 0), temp4);
616 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd) + 16), temp4);
617
618 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd) + 0), temp6);
619 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd) + 16), temp6);
620
621 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd) + 0), temp8);
622 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd) + 16), temp8);
623
624
625 }
626 }
627 else
628 {
629 temp1 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 0]);
630 temp2 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 0]);
631
632 temp3 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 1]);
633 temp4 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 1]);
634
635 temp5 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 2]);
636 temp6 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 2]);
637
638 temp7 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 3]);
639 temp8 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 3]);
640
641 temp2 = _mm_unpacklo_epi8(temp1, temp2);
642 temp4 = _mm_unpacklo_epi8(temp3, temp4);
643 temp6 = _mm_unpacklo_epi8(temp5, temp6);
644 temp8 = _mm_unpacklo_epi8(temp7, temp8);
645
646 _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), temp2);
647 _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), temp4);
648 _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), temp6);
649 _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), temp8);
650 }
651}
652
653
654/**
655*******************************************************************************
656*
657* @brief
658* Horizontal intraprediction with reference neighboring samples location
659* pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer
660* to section 8.4.4.2.6 in the standard (Special case)
661*
662* @par Description:
663*
664*
665* @param[in] pu1_src
666* UWORD8 pointer to the source
667*
668* @param[in] pu1_dst
669* UWORD8 pointer to the destination
670*
671* @param[in] src_strd
672* integer source stride
673*
674* @param[in] dst_strd
675* integer destination stride
676*
677* @param[in] nt
678* integer Transform Block size
679*
680* @param[in] mode
681* integer intraprediction mode
682*
683* @returns
684*
685* @remarks
686* None
687*
688*******************************************************************************
689*/
690
691void ihevc_intra_pred_chroma_ver_ssse3(UWORD8 *pu1_ref,
692 WORD32 src_strd,
693 UWORD8 *pu1_dst,
694 WORD32 dst_strd,
695 WORD32 nt,
696 WORD32 mode)
697{
698 __m128i src_temp1;
699 UNUSED(src_strd);
700 UNUSED(mode);
701
702 /* Replication to next columns*/
703 if(nt == 8)
704 {
705 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));
706
707 _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), src_temp1);
708 _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
709 _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp1);
710 _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp1);
711
712 _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp1);
713 _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp1);
714 _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp1);
715 _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp1);
716
717 }
718 if(nt == 16)
719 {
720 __m128i temp1, temp2;
721
722 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));
723 temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 16));
724
725 /* pu1_dst[(row * dst_strd) + col] = dc_val;*/
726 _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp1);
727 _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp1);
728 _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp1);
729 _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp1);
730 _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp1);
731 _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp1);
732 _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp1);
733 _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp1);
734
735 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp2);
736 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp2);
737 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp2);
738 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp2);
739 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp2);
740 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp2);
741 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp2);
742 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp2);
743
744 _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp1);
745 _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp1);
746 _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp1);
747 _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp1);
748 _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp1);
749 _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp1);
750 _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp1);
751 _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp1);
752
753 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp2);
754 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp2);
755 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp2);
756 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp2);
757 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp2);
758 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp2);
759 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp2);
760 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp2);
761
762 }
763 else
764 {
765 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));
766
767 _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
768 _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
769 _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
770 _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
771
772
773 }
774
775}
776
777/**
778*******************************************************************************
779*
780* @brief
781* Intraprediction for mode 2 (sw angle) with reference neighboring samples
782* location pointed by 'pu1_ref' to the TU block location pointed by
783* 'pu1_dst' Refer to section 8.4.4.2.6 in the standard
784*
785* @par Description:
786*
787*
788* @param[in] pu1_src
789* UWORD8 pointer to the source
790*
791* @param[in] pu1_dst
792* UWORD8 pointer to the destination
793*
794* @param[in] src_strd
795* integer source stride
796*
797* @param[in] dst_strd
798* integer destination stride
799*
800* @param[in] nt
801* integer Transform Block size
802*
803* @param[in] mode
804* integer intraprediction mode
805*
806* @returns
807*
808* @remarks
809* None
810*
811*******************************************************************************
812*/
813
/**
 * Intra prediction for mode 2 (the steepest bottom-left diagonal,
 * intra_pred_ang = 32) for interleaved-UV chroma.
 *
 * Each output row is a straight copy of reference pixels taken at
 * decreasing addresses (pu1_ref[two_nt - row - col - 2] per pixel pair);
 * the loads fetch the span in memory order and the shuffle masks reverse
 * the UV pixel pairs so the row comes out in the required order.
 * sm2/sm3 (IHEVCE_SHUFFLEMASKY7/8, defined in ihevc_tables_x86_intr)
 * appear to be the 8-byte and 16-byte pair-reversal masks respectively
 * — confirm against the table definitions.
 *
 * @param pu1_ref   neighbour reference samples, UV interleaved
 * @param src_strd  unused for this mode
 * @param pu1_dst   output block, 2*nt bytes per row
 * @param dst_strd  destination stride in bytes
 * @param nt        transform block size (4, 8 or 16 for chroma)
 * @param mode      unused (always mode 2 here)
 */
void ihevc_intra_pred_chroma_mode2_ssse3(UWORD8 *pu1_ref,
                                         WORD32 src_strd,
                                         UWORD8 *pu1_dst,
                                         WORD32 dst_strd,
                                         WORD32 nt,
                                         WORD32 mode)
{
    WORD32 row, col;


    __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8, sm2, sm3;
    UNUSED(src_strd);
    UNUSED(mode);

    sm2 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY7[0]);
    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY8[0]);

    /* For the angle 45, replication is done from the corresponding angle */
    /* intra_pred_ang = tan(angle) in q5 format */

    if(nt == 4)
    {
        /*pu1_ref[two_nt - row - (col+1) - 1]*/
        src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 0 - 8 - 2));
        src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 1 - 8 - 2));
        src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 2 - 8 - 2));
        src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 3 - 8 - 2));

        /* reverse the four UV pairs and store one 8-byte row each */
        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), _mm_shuffle_epi8(src_temp1, sm2));
        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), _mm_shuffle_epi8(src_temp2, sm2));
        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), _mm_shuffle_epi8(src_temp3, sm2));
        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), _mm_shuffle_epi8(src_temp4, sm2));

    }
    else if(nt == 8)
    {
        /*pu1_ref[two_nt - row - (col+1) - 1]*/
        src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 0 - 16 - 2));
        src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 1 - 16 - 2));
        src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 2 - 16 - 2));
        src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 3 - 16 - 2));
        src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 4 - 16 - 2));
        src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 5 - 16 - 2));
        src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 6 - 16 - 2));
        src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 7 - 16 - 2));

        /* reverse the eight UV pairs and store one 16-byte row each */
        _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), _mm_shuffle_epi8(src_temp1, sm3));
        _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), _mm_shuffle_epi8(src_temp2, sm3));
        _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), _mm_shuffle_epi8(src_temp3, sm3));
        _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), _mm_shuffle_epi8(src_temp4, sm3));
        _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), _mm_shuffle_epi8(src_temp5, sm3));
        _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), _mm_shuffle_epi8(src_temp6, sm3));
        _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), _mm_shuffle_epi8(src_temp7, sm3));
        _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), _mm_shuffle_epi8(src_temp8, sm3));


    }
    else
    {
        /* nt == 16: tile the block in 8-row x 8-pixel (16-byte) chunks */
        for(row = 0; row < nt; row += 8)
        {
            for(col = 0; col < 2 * nt; col += 16)
            { /*pu1_ref[two_nt - row - (col+1) - 1]*/
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 0) - (col + 16) - 2));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 1) - (col + 16) - 2));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 2) - (col + 16) - 2));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 3) - (col + 16) - 2));
                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 4) - (col + 16) - 2));
                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 5) - (col + 16) - 2));
                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 6) - (col + 16) - 2));
                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 7) - (col + 16) - 2));

                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 0) * dst_strd)), _mm_shuffle_epi8(src_temp1, sm3));
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 1) * dst_strd)), _mm_shuffle_epi8(src_temp2, sm3));
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 2) * dst_strd)), _mm_shuffle_epi8(src_temp3, sm3));
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 3) * dst_strd)), _mm_shuffle_epi8(src_temp4, sm3));
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 4) * dst_strd)), _mm_shuffle_epi8(src_temp5, sm3));
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 5) * dst_strd)), _mm_shuffle_epi8(src_temp6, sm3));
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 6) * dst_strd)), _mm_shuffle_epi8(src_temp7, sm3));
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 7) * dst_strd)), _mm_shuffle_epi8(src_temp8, sm3));
            }
        }
    }
}
898
899/**
900*******************************************************************************
901*
902* @brief
903* Intraprediction for mode 34 (ne angle) and mode 18 (nw angle) with
904* reference neighboring samples location pointed by 'pu1_ref' to the TU
905* block location pointed by 'pu1_dst'
906*
907* @par Description:
908*
909*
* @param[in] pu1_ref
* UWORD8 pointer to the reference (neighboring) samples
912*
913* @param[in] pu1_dst
914* UWORD8 pointer to the destination
915*
916* @param[in] src_strd
917* integer source stride
918*
919* @param[in] dst_strd
920* integer destination stride
921*
922* @param[in] nt
923* integer Transform Block size
924*
925* @param[in] mode
926* integer intraprediction mode
927*
928* @returns
929*
930* @remarks
931* None
932*
933*******************************************************************************
934*/
935
936void ihevc_intra_pred_chroma_mode_18_34_ssse3(UWORD8 *pu1_ref,
937 WORD32 src_strd,
938 UWORD8 *pu1_dst,
939 WORD32 dst_strd,
940 WORD32 nt,
941 WORD32 mode)
942{
943 WORD32 row;
944 WORD32 idx = 0;
945
946 __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
947 UNUSED(src_strd);
948
949 if(mode == 34)
950 {
951 if(nt == 4)
952 {
953 /*pu1_ref[two_nt + col + idx + 1]*/
954 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
955 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
956 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
957 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
958
959 _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
960 _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
961 _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
962 _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
963
964 }
965 else if(nt == 8)
966 {
967 /*pu1_ref[two_nt + col + idx + 1]*/
968 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
969 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
970 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
971 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
972 src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) + (4 * nt) + 2 * idx + 2));
973 src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) + (4 * nt) + 2 * idx + 2));
974 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) + (4 * nt) + 2 * idx + 2));
975 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) + (4 * nt) + 2 * idx + 2));
976
977 _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
978 _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
979 _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
980 _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
981 _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5);
982 _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6);
983 _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7);
984 _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8);
985
986
987 }
988 else
989 {
990 __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
991 for(row = 0; row < nt; row += 8)
992 {
993 /*pu1_ref[two_nt + col + idx + 1]*/
994 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + 0 + (4 * nt) + 2 * idx + 2));
995 src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + 16 + (4 * nt) + 2 * idx + 2));
996 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + 0 + (4 * nt) + 2 * idx + 2));
997 src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + 16 + (4 * nt) + 2 * idx + 2));
998 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + 0 + (4 * nt) + 2 * idx + 2));
999 src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1000 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + 0 + (4 * nt) + 2 * idx + 2));
1001 src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1002
1003 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (0 * dst_strd)), src_temp1);
1004 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
1005 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (1 * dst_strd)), src_temp2);
1006 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
1007 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (2 * dst_strd)), src_temp3);
1008 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
1009 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (3 * dst_strd)), src_temp4);
1010 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);
1011
1012 src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) + 0 + (4 * nt) + 2 * idx + 2));
1013 src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1014 src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) + 0 + (4 * nt) + 2 * idx + 2));
1015 src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1016 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) + 0 + (4 * nt) + 2 * idx + 2));
1017 src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1018 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) + 0 + (4 * nt) + 2 * idx + 2));
1019 src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1020
1021 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (4 * dst_strd)), src_temp5);
1022 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
1023 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (5 * dst_strd)), src_temp6);
1024 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
1025 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (6 * dst_strd)), src_temp7);
1026 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
1027 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (7 * dst_strd)), src_temp8);
1028 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);
1029
1030 pu1_ref += 2 * 8;
1031 pu1_dst += 8 * dst_strd;
1032 }
1033 }
1034 }
1035 else
1036 {
1037 if(nt == 4)
1038 {
1039 /*pu1_ref[two_nt + col + idx + 1]*/
1040 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
1041 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
1042 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
1043 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
1044
1045 _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
1046 _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
1047 _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
1048 _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
1049
1050
1051 }
1052 else if(nt == 8)
1053 {
1054 /*pu1_ref[two_nt + col + idx + 1]*/
1055 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
1056 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
1057 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
1058 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
1059 src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) + (4 * nt) + 2 * idx + 2));
1060 src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) + (4 * nt) + 2 * idx + 2));
1061 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) + (4 * nt) + 2 * idx + 2));
1062 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) + (4 * nt) + 2 * idx + 2));
1063
1064 _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
1065 _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
1066 _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
1067 _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
1068 _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5);
1069 _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6);
1070 _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7);
1071 _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8);
1072
1073
1074 }
1075 else
1076 {
1077 __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
1078 for(row = 0; row < nt; row += 8)
1079 {
1080 /*pu1_ref[two_nt + col + idx + 1]*/
1081 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + 0 + (4 * nt) + 2 * idx + 2));
1082 src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1083 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + 0 + (4 * nt) + 2 * idx + 2));
1084 src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1085 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + 0 + (4 * nt) + 2 * idx + 2));
1086 src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1087 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + 0 + (4 * nt) + 2 * idx + 2));
1088 src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1089
1090 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (0 * dst_strd)), src_temp1);
1091 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
1092 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (1 * dst_strd)), src_temp2);
1093 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
1094 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (2 * dst_strd)), src_temp3);
1095 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
1096 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (3 * dst_strd)), src_temp4);
1097 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);
1098
1099 src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) + 0 + (4 * nt) + 2 * idx + 2));
1100 src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1101 src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) + 0 + (4 * nt) + 2 * idx + 2));
1102 src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1103 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) + 0 + (4 * nt) + 2 * idx + 2));
1104 src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1105 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) + 0 + (4 * nt) + 2 * idx + 2));
1106 src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) + 16 + (4 * nt) + 2 * idx + 2));
1107
1108 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (4 * dst_strd)), src_temp5);
1109 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
1110 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (5 * dst_strd)), src_temp6);
1111 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
1112 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (6 * dst_strd)), src_temp7);
1113 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
1114 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (7 * dst_strd)), src_temp8);
1115 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);
1116
1117 pu1_ref -= 2 * 8;
1118 pu1_dst += 8 * dst_strd;
1119 }
1120 }
1121 }
1122
1123}
1124
1125/**
1126*******************************************************************************
1127*
1128* @brief
1129* Intraprediction for mode 3 to 9 (positive angle, horizontal mode ) with
1130* reference neighboring samples location pointed by 'pu1_ref' to the TU
1131* block location pointed by 'pu1_dst'
1132*
1133* @par Description:
1134*
1135*
* @param[in] pu1_ref
* UWORD8 pointer to the reference (neighboring) samples
1138*
1139* @param[in] pu1_dst
1140* UWORD8 pointer to the destination
1141*
1142* @param[in] src_strd
1143* integer source stride
1144*
1145* @param[in] dst_strd
1146* integer destination stride
1147*
1148* @param[in] nt
1149* integer Transform Block size
1150*
1151* @param[in] mode
1152* integer intraprediction mode
1153*
1154* @returns
1155*
1156* @remarks
1157* None
1158*
1159*******************************************************************************
1160*/
1161
void ihevc_intra_pred_chroma_mode_3_to_9_ssse3(UWORD8 *pu1_ref,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_dst,
                                               WORD32 dst_strd,
                                               WORD32 nt,
                                               WORD32 mode)
{
    /* Horizontal modes 3..9 have a positive prediction angle: each        */
    /* predicted sample is a 5-bit fixed-point blend of two adjacent       */
    /* left-neighbour chroma pairs.  Columns are predicted in parallel     */
    /* and the results are transposed into row order with unpack/shuffle   */
    /* steps before storing.                                               */
    WORD32 row, col;

    WORD32 intra_pred_ang;

    __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
    __m128i fract_4x32b, zero_8x16b, intra_pred_ang_4x32b;
    __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm1;
    UNUSED(src_strd);

    /* Intra Pred Angle according to the mode */
    intra_pred_ang = gai4_ihevc_ang_table[mode];

    /* For the angles other then 45 degree, interpolation btw 2 neighboring */
    /* samples dependent on distance to obtain destination sample */

    /* Byte shuffle mask used after packing; presumably reverses the order  */
    /* of the Cb/Cr pairs within the low 8 bytes (contents defined by       */
    /* IHEVCE_SHUFFLEMASKY7 elsewhere) -- TODO confirm against its table.   */
    sm1 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY7[0]);
    const_temp_4x32b = _mm_set1_epi16(16);  /* rounding term added before the >>5 */
    const_temp2_4x32b = _mm_set1_epi32(31); /* mask for fract = pos & 31 */
    const_temp3_4x32b = _mm_set1_epi16(32); /* used to form (32 - fract) */
    const_temp4_4x32b = _mm_set1_epi32(4);

    two_nt_4x32b = _mm_set1_epi32(1);

    zero_8x16b = _mm_set1_epi16(0);


    /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
    intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);

    row_4x32b = _mm_set_epi32(4, 3, 2, 1);

    if(nt == 4)
    {
        /* 4x4 path: all four columns are produced in a single pass.       */
        /* The 32-bit constants above are re-set as packed 16-bit lanes    */
        /* because the per-column math below runs on 8 x 16-bit lanes.     */
        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
        const_temp2_4x32b = _mm_set1_epi16(31);
        const_temp4_4x32b = _mm_set1_epi16(4);
        /* Byte offset of the left-neighbour base pair in the interleaved  */
        /* reference array; pair indices are subtracted from it below.     */
        two_nt_4x32b = _mm_set1_epi16((4 * nt) - 2);

        {
            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
            WORD8 ai1_fract_temp_val[16], ai1_src_temp_val[16];

            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
            __m128i src_values10;

            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;

            /* pos = ((row + 1) * intra_pred_ang); */
            res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);

            /* fract = pos & (31); */
            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);

            /* integer part of pos: pair index into the reference samples */
            ref_main_idx_4x32b = _mm_srai_epi16(res_temp5_4x32b, 5);

            /* doubled: 2 bytes per interleaved Cb/Cr pair */
            ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);

            /* subtract from the base offset: left refs run backwards in memory */
            ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, ref_main_idx_4x32b);

            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);

            /*(32 - fract) */
            src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);

            /* spill the per-column weights so they can be broadcast per column */
            _mm_storel_epi64((__m128i *)(ai1_fract_temp_val), fract_4x32b);
            _mm_storel_epi64((__m128i *)(ai1_src_temp_val), src_values10);

            fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]); /* col=0*/
            fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]); /* col=1*/
            fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]); /* col=2*/
            fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]); /* col=3*/

            temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]); /* col=0*/
            temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]); /* col=1*/
            temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]); /* col=2*/
            temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]); /* col=3*/

            /* interleave (32-fract, fract) byte pairs for maddubs below */
            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);

            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/

            {
                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;

                /* loading 8-bit 16 pixels */
                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 8)); /* col=0*/
                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 8)); /* col=1*/
                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 8)); /* col=2*/
                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 8)); /* col=3*/

                /* shift by one chroma pair to pair ref[i] with ref[i+1] */
                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/

                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp5_8x16b); /* col=0*/
                src_temp2_8x16b = _mm_unpacklo_epi8(src_temp2_8x16b, src_temp6_8x16b); /* col=1*/
                src_temp3_8x16b = _mm_unpacklo_epi8(src_temp3_8x16b, src_temp7_8x16b); /* col=2*/
                src_temp4_8x16b = _mm_unpacklo_epi8(src_temp4_8x16b, src_temp8_8x16b); /* col=3*/

                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/

                /* converting 16 bit to 8 bit */
                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
                src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
                src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/

                /* reorder the packed pairs (see sm1 note at the top) */
                src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm1);
                src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm1);
                src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm1);
                src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm1);

                /* 4x4 (of 16-bit chroma pairs) transpose: columns -> rows */
                src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
                src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);

                src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
                src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);

                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp8_8x16b); /* row=0*/

                src_temp2_8x16b = _mm_shuffle_epi32(src_temp8_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp2_8x16b); /* row=1*/

                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp7_8x16b); /* row=2*/

                src_temp4_8x16b = _mm_shuffle_epi32(src_temp7_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp4_8x16b); /* row=3*/

            }
        }
    }
    else
    {
        /* nt == 8 / 16: process 8 destination columns (16 interleaved     */
        /* bytes) per outer iteration, 4 destination rows per inner one.   */
        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
        const_temp2_4x32b = _mm_set1_epi16(31);
        const_temp4_4x32b = _mm_set1_epi16(8); /* column-counter increment per outer pass */
        two_nt_4x32b = _mm_set1_epi16((4 * nt) - 2);

        for(col = 0; col < 2 * nt; col += 16)
        {
            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
            WORD8 ai1_fract_temp_val[16], ai1_src_temp_val[16];

            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
            __m128i fract5_8x16b, fract6_8x16b, fract7_8x16b, fract8_8x16b, src_values10;

            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;

            /* pos = ((row + 1) * intra_pred_ang); */
            res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);

            /* fract = pos & (31); */
            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);

            /* integer part of pos, doubled (2 bytes per Cb/Cr pair), then  */
            /* subtracted from the base offset (left refs run backwards).   */
            ref_main_idx_4x32b = _mm_srai_epi16(res_temp5_4x32b, 5);

            ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);

            ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, ref_main_idx_4x32b);

            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);

            /*(32 - fract) */
            src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);

            /* spill the per-column weights so they can be broadcast per column */
            _mm_storeu_si128((__m128i *)(ai1_fract_temp_val), fract_4x32b);
            _mm_storeu_si128((__m128i *)(ai1_src_temp_val), src_values10);

            fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]); /* col=0*/
            fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]); /* col=1*/
            fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]); /* col=2*/
            fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]); /* col=3*/

            temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]); /* col=0*/
            temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]); /* col=1*/
            temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]); /* col=2*/
            temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]); /* col=3*/

            /* interleave (32-fract, fract) byte pairs for maddubs below */
            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);

            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/
            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/
            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/
            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/

            fract5_8x16b = _mm_set1_epi8(ai1_fract_temp_val[8]); /* col=5*/
            fract6_8x16b = _mm_set1_epi8(ai1_fract_temp_val[10]); /* col=6*/
            fract7_8x16b = _mm_set1_epi8(ai1_fract_temp_val[12]); /* col=7*/
            fract8_8x16b = _mm_set1_epi8(ai1_fract_temp_val[14]); /* col=8*/

            temp11_8x16b = _mm_set1_epi8(ai1_src_temp_val[8]); /* col=0*/
            temp12_8x16b = _mm_set1_epi8(ai1_src_temp_val[10]); /* col=1*/
            temp13_8x16b = _mm_set1_epi8(ai1_src_temp_val[12]); /* col=2*/
            temp14_8x16b = _mm_set1_epi8(ai1_src_temp_val[14]); /* col=3*/

            temp11_8x16b = _mm_unpacklo_epi8(temp11_8x16b, fract5_8x16b);
            temp12_8x16b = _mm_unpacklo_epi8(temp12_8x16b, fract6_8x16b);
            temp13_8x16b = _mm_unpacklo_epi8(temp13_8x16b, fract7_8x16b);
            temp14_8x16b = _mm_unpacklo_epi8(temp14_8x16b, fract8_8x16b);

            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/
            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/
            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/
            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/

            for(row = 0; row < nt; row += 4)
            {
                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;

                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;

                /* loading 8-bit 16 pixels; -2*row walks the reference      */
                /* backwards as the destination row advances                */
                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - row - (8 + row))); /* col=0*/
                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - row - (8 + row))); /* col=1*/
                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - row - (8 + row))); /* col=2*/
                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - row - (8 + row))); /* col=3*/

                /* shift by one chroma pair to pair ref[i] with ref[i+1] */
                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/

                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp5_8x16b); /* col=0*/
                src_temp2_8x16b = _mm_unpacklo_epi8(src_temp2_8x16b, src_temp6_8x16b); /* col=1*/
                src_temp3_8x16b = _mm_unpacklo_epi8(src_temp3_8x16b, src_temp7_8x16b); /* col=2*/
                src_temp4_8x16b = _mm_unpacklo_epi8(src_temp4_8x16b, src_temp8_8x16b); /* col=3*/

                /* loading 8-bit 16 pixels */
                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - row - row - 8)); /* col=5*/
                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - row - row - 8)); /* col=6*/
                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - row - row - 8)); /* col=7*/
                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - row - row - 8)); /* col=8*/

                src_temp11_8x16b = _mm_srli_si128(src_temp15_8x16b, 2); /* col=5*/
                src_temp12_8x16b = _mm_srli_si128(src_temp16_8x16b, 2); /* col=6*/
                src_temp13_8x16b = _mm_srli_si128(src_temp17_8x16b, 2); /* col=7*/
                src_temp14_8x16b = _mm_srli_si128(src_temp18_8x16b, 2); /* col=8*/

                src_temp11_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp15_8x16b); /* col=0*/
                src_temp12_8x16b = _mm_unpacklo_epi8(src_temp12_8x16b, src_temp16_8x16b); /* col=1*/
                src_temp13_8x16b = _mm_unpacklo_epi8(src_temp13_8x16b, src_temp17_8x16b); /* col=2*/
                src_temp14_8x16b = _mm_unpacklo_epi8(src_temp14_8x16b, src_temp18_8x16b); /* col=3*/

                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);

                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/
                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/
                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/
                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/
                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/
                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/
                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/

                /* converting 16 bit to 8 bit */
                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
                src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
                src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/

                /* reorder the packed pairs (see sm1 note at the top) */
                src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm1);
                src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm1);
                src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm1);
                src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm1);

                /* converting 16 bit to 8 bit */
                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, zero_8x16b); /* col=5*/
                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, zero_8x16b); /* col=6*/
                src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, zero_8x16b); /* col=7*/
                src_temp14_8x16b = _mm_packus_epi16(src_temp14_8x16b, zero_8x16b); /* col=8*/

                src_temp11_8x16b = _mm_shuffle_epi8(src_temp11_8x16b, sm1);
                src_temp12_8x16b = _mm_shuffle_epi8(src_temp12_8x16b, sm1);
                src_temp13_8x16b = _mm_shuffle_epi8(src_temp13_8x16b, sm1);
                src_temp14_8x16b = _mm_shuffle_epi8(src_temp14_8x16b, sm1);

                /* 8x4 transpose of the two column groups into row order */
                src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
                src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);

                src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
                src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);

                src_temp15_8x16b = _mm_unpacklo_epi16(src_temp11_8x16b, src_temp12_8x16b);
                src_temp16_8x16b = _mm_unpacklo_epi16(src_temp13_8x16b, src_temp14_8x16b);

                src_temp18_8x16b = _mm_unpacklo_epi32(src_temp15_8x16b, src_temp16_8x16b);
                src_temp17_8x16b = _mm_unpackhi_epi32(src_temp15_8x16b, src_temp16_8x16b);

                src_temp11_8x16b = _mm_unpacklo_epi64(src_temp8_8x16b, src_temp18_8x16b);
                src_temp12_8x16b = _mm_unpackhi_epi64(src_temp8_8x16b, src_temp18_8x16b);
                src_temp13_8x16b = _mm_unpacklo_epi64(src_temp7_8x16b, src_temp17_8x16b);
                src_temp14_8x16b = _mm_unpackhi_epi64(src_temp7_8x16b, src_temp17_8x16b);

                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp11_8x16b); /* row=0*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp12_8x16b); /* row=1*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp13_8x16b); /* row=2*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp14_8x16b); /* row=3*/

            }
        }
    }
}
1529
1530/**
1531*******************************************************************************
1532*
1533* @brief
1534* Intraprediction for mode 11 to 17 (negative angle, horizontal mode )
1535* with reference neighboring samples location pointed by 'pu1_ref' to the
1536* TU block location pointed by 'pu1_dst'
1537*
1538* @par Description:
1539*
1540*
* @param[in] pu1_ref
* UWORD8 pointer to the reference (neighboring) samples
1543*
1544* @param[in] pu1_dst
1545* UWORD8 pointer to the destination
1546*
1547* @param[in] src_strd
1548* integer source stride
1549*
1550* @param[in] dst_strd
1551* integer destination stride
1552*
1553* @param[in] nt
1554* integer Transform Block size
1555*
1556* @param[in] mode
1557* integer intraprediction mode
1558*
1559* @returns
1560*
1561* @remarks
1562* None
1563*
1564*******************************************************************************
1565*/
1566
1567
void ihevc_intra_pred_chroma_mode_11_to_17_ssse3(UWORD8 *pu1_ref,
                                                 WORD32 src_strd,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 dst_strd,
                                                 WORD32 nt,
                                                 WORD32 mode)
{
    /* This function and ihevc_intra_pred_CHROMA_mode_19_to_25 are same except*/
    /* for ref main & side samples assignment,can be combined for */
    /* optimization*/

    WORD32 row, col, k;
    WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
    WORD32 ref_idx;


    __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
    __m128i fract_4x32b, zero_8x16b, intra_pred_ang_4x32b;
    __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b;

    /* Scratch buffer for the (possibly extended) main reference array.    */
    /* Chroma samples are interleaved Cb/Cr pairs, hence the 2x sizing.    */
    UWORD8 ref_temp[2 * MAX_CU_SIZE + 2];
    UWORD8 *ref_main;
    UNUSED(src_strd);

    inv_ang_sum = 128;

    /* NOTE(review): this horizontal variant indexes the luma angle tables */
    /* while ihevc_intra_pred_chroma_mode_19_to_25_ssse3 uses the *_chroma */
    /* tables -- confirm the asymmetry is intentional.                     */
    intra_pred_ang = gai4_ihevc_ang_table[mode];

    inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
    /* Intermediate reference samples for negative angle modes */
    /* This have to be removed during optimization*/

    /* For horizontal modes, (ref main = ref left) (ref side = ref above) */

    /* Copy the left reference (stored mirrored in pu1_ref) into ref_temp  */
    /* in forward order, one interleaved Cb/Cr pair per iteration.         */
    ref_main = ref_temp + 2 * nt; /* overwritten below; kept for parity with ref code */
    for(k = 0; k < (2 * (nt + 1)); k += 2)
    {
        ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k];
        ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k + 1];
    }

    ref_main = ref_temp + (2 * (nt - 1));
    ref_idx = (nt * intra_pred_ang) >> 5; /* most negative main-ref index needed */

    /* SIMD Optimization can be done using look-up table for the loop */
    /* For negative angled derive the main reference samples from side */
    /* reference samples refer to section 8.4.4.2.6 */

    /* Project side (above) samples onto the negative main-ref indices */
    /* using the inverse angle accumulator (8.8 fixed point).          */
    for(k = -2; k > (2 * ref_idx); k -= 2)
    {
        inv_ang_sum += inv_ang;
        ref_main[k] = pu1_ref[(4 * nt) + ((inv_ang_sum >> 8) << 1)];
        ref_main[k + 1] = pu1_ref[((4 * nt) + 1) + ((inv_ang_sum >> 8) << 1)];
    }

    /* For the angles other than 45 degree, interpolation btw 2 neighboring */
    /* samples dependent on distance to obtain destination sample */

    const_temp_4x32b = _mm_set1_epi16(16);  /* rounding offset for >>5 */
    const_temp2_4x32b = _mm_set1_epi32(31); /* fract mask */
    const_temp3_4x32b = _mm_set1_epi16(32); /* used to form (32 - fract) */
    const_temp4_4x32b = _mm_set1_epi32(4);  /* per-iteration column step */

    two_nt_4x32b = _mm_set1_epi32(1);

    zero_8x16b = _mm_set1_epi16(0);


    /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
    intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);

    row_4x32b = _mm_set_epi32(4, 3, 2, 1);

    if(nt == 4)
    {
        /* 4x4 TU: all four columns fit in one pass; re-init the helper   */
        /* vectors as 16-bit lanes (the epi32 versions above are unused). */
        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
        const_temp2_4x32b = _mm_set1_epi16(31);
        const_temp4_4x32b = _mm_set1_epi16(4);
        two_nt_4x32b = _mm_set1_epi16(1);

        {
            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
            WORD8 ai1_fract_temp_val[16], ai1_src_temp_val[16];

            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
            __m128i src_values10;

            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;

            /* pos = ((row + 1) * intra_pred_ang); */
            res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);

            /* fract = pos & (31); */
            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);

            /* idx = pos >> 5; doubled for interleaved chroma addressing */
            ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
            ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);

            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);

            /*(32 - fract) */
            src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);

            _mm_storel_epi64((__m128i *)(ai1_fract_temp_val), fract_4x32b);
            _mm_storel_epi64((__m128i *)(ai1_src_temp_val), src_values10);

            fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]);  /* col=0*/
            fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]);  /* col=1*/
            fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]);  /* col=2*/
            fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]);  /* col=3*/

            temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]);  /* col=0*/
            temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]);  /* col=1*/
            temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]);  /* col=2*/
            temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]);  /* col=3*/

            /* Interleave (32-fract, fract) pairs: weights for maddubs below */
            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);

            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/

            {
                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;

                /* loading 8-bit 16 pixels */
                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1)); /* col=0*/
                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2)); /* col=1*/
                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3)); /* col=2*/
                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4)); /* col=3*/

                /* Shift by one chroma pair to pair ref[idx] with ref[idx+1] */
                src_temp1_8x16b =  _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
                src_temp2_8x16b =  _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
                src_temp3_8x16b =  _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
                src_temp4_8x16b =  _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/

                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp5_8x16b, src_temp1_8x16b); /* col=0*/
                src_temp2_8x16b = _mm_unpacklo_epi8(src_temp6_8x16b, src_temp2_8x16b); /* col=1*/
                src_temp3_8x16b = _mm_unpacklo_epi8(src_temp7_8x16b, src_temp3_8x16b); /* col=2*/
                src_temp4_8x16b = _mm_unpacklo_epi8(src_temp8_8x16b, src_temp4_8x16b); /* col=3*/

                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/

                /* converting 16 bit to 8 bit */
                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
                src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
                src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/

                /* Transpose column results into row order for the stores */
                src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
                src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);

                src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
                src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);

                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp8_8x16b);       /* row=0*/

                src_temp2_8x16b  = _mm_shuffle_epi32(src_temp8_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp2_8x16b);       /* row=1*/

                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp7_8x16b);       /* row=2*/

                src_temp4_8x16b  = _mm_shuffle_epi32(src_temp7_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp4_8x16b);       /* row=3*/

            }
        }
    }
    else
    {
        /* nt = 8 or 16: process 8 destination columns (16 interleaved    */
        /* chroma bytes) per outer iteration, 4 rows per inner iteration. */
        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
        const_temp2_4x32b = _mm_set1_epi16(31);
        const_temp4_4x32b = _mm_set1_epi16(8); /* column step: 8 cols per pass */
        two_nt_4x32b = _mm_set1_epi16(1);

        for(col = 0; col < 2 * nt; col += 16)
        {
            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
            WORD8 ai1_fract_temp_val[16], ai1_src_temp_val[16];

            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
            __m128i fract5_8x16b, fract6_8x16b, fract7_8x16b, fract8_8x16b, src_values10;

            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;

            /* pos = ((row + 1) * intra_pred_ang); */
            res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);

            /* fract = pos & (31); */
            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);

            /* idx = pos >> 5; doubled for interleaved chroma addressing */
            ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5));
            ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);

            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);

            /*(32 - fract) */
            src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);

            _mm_storeu_si128((__m128i *)(ai1_fract_temp_val), fract_4x32b);
            _mm_storeu_si128((__m128i *)(ai1_src_temp_val), src_values10);

            fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]);  /* col=0*/
            fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]);  /* col=1*/
            fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]);  /* col=2*/
            fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]);  /* col=3*/

            temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]);  /* col=0*/
            temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]);  /* col=1*/
            temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]);  /* col=2*/
            temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]);  /* col=3*/

            /* Interleave (32-fract, fract) pairs: weights for maddubs below */
            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);

            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/

            fract5_8x16b = _mm_set1_epi8(ai1_fract_temp_val[8]);  /* col=4*/
            fract6_8x16b = _mm_set1_epi8(ai1_fract_temp_val[10]); /* col=5*/
            fract7_8x16b = _mm_set1_epi8(ai1_fract_temp_val[12]); /* col=6*/
            fract8_8x16b = _mm_set1_epi8(ai1_fract_temp_val[14]); /* col=7*/

            temp11_8x16b = _mm_set1_epi8(ai1_src_temp_val[8]);  /* col=4*/
            temp12_8x16b = _mm_set1_epi8(ai1_src_temp_val[10]); /* col=5*/
            temp13_8x16b = _mm_set1_epi8(ai1_src_temp_val[12]); /* col=6*/
            temp14_8x16b = _mm_set1_epi8(ai1_src_temp_val[14]); /* col=7*/

            temp11_8x16b = _mm_unpacklo_epi8(temp11_8x16b, fract5_8x16b);
            temp12_8x16b = _mm_unpacklo_epi8(temp12_8x16b, fract6_8x16b);
            temp13_8x16b = _mm_unpacklo_epi8(temp13_8x16b, fract7_8x16b);
            temp14_8x16b = _mm_unpacklo_epi8(temp14_8x16b, fract8_8x16b);

            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/

            for(row = 0; row < nt; row += 4)
            {
                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;

                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;

                /* loading 8-bit 16 pixels */
                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row + row)); /* col=0*/
                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row + row)); /* col=1*/
                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row + row)); /* col=2*/
                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row + row)); /* col=3*/

                /* Shift by one chroma pair to pair ref[idx] with ref[idx+1] */
                src_temp1_8x16b =  _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
                src_temp2_8x16b =  _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
                src_temp3_8x16b =  _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
                src_temp4_8x16b =  _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/

                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp5_8x16b, src_temp1_8x16b); /* col=0*/
                src_temp2_8x16b = _mm_unpacklo_epi8(src_temp6_8x16b, src_temp2_8x16b); /* col=1*/
                src_temp3_8x16b = _mm_unpacklo_epi8(src_temp7_8x16b, src_temp3_8x16b); /* col=2*/
                src_temp4_8x16b = _mm_unpacklo_epi8(src_temp8_8x16b, src_temp4_8x16b); /* col=3*/

                /* loading 8-bit 16 pixels */
                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row + row)); /* col=4*/
                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row + row)); /* col=5*/
                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row + row)); /* col=6*/
                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row + row)); /* col=7*/

                src_temp11_8x16b =  _mm_srli_si128(src_temp15_8x16b, 2); /* col=4*/
                src_temp12_8x16b =  _mm_srli_si128(src_temp16_8x16b, 2); /* col=5*/
                src_temp13_8x16b =  _mm_srli_si128(src_temp17_8x16b, 2); /* col=6*/
                src_temp14_8x16b =  _mm_srli_si128(src_temp18_8x16b, 2); /* col=7*/

                src_temp11_8x16b = _mm_unpacklo_epi8(src_temp15_8x16b, src_temp11_8x16b); /* col=4*/
                src_temp12_8x16b = _mm_unpacklo_epi8(src_temp16_8x16b, src_temp12_8x16b); /* col=5*/
                src_temp13_8x16b = _mm_unpacklo_epi8(src_temp17_8x16b, src_temp13_8x16b); /* col=6*/
                src_temp14_8x16b = _mm_unpacklo_epi8(src_temp18_8x16b, src_temp14_8x16b); /* col=7*/

                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);

                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/

                /* converting 16 bit to 8 bit */
                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
                src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
                src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/

                /* converting 16 bit to 8 bit */
                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, zero_8x16b); /* col=4*/
                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, zero_8x16b); /* col=5*/
                src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, zero_8x16b); /* col=6*/
                src_temp14_8x16b = _mm_packus_epi16(src_temp14_8x16b, zero_8x16b); /* col=7*/

                /* Transpose the 8 column results into 4 destination rows */
                src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
                src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);

                src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
                src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);

                src_temp15_8x16b = _mm_unpacklo_epi16(src_temp11_8x16b, src_temp12_8x16b);
                src_temp16_8x16b = _mm_unpacklo_epi16(src_temp13_8x16b, src_temp14_8x16b);

                src_temp18_8x16b = _mm_unpacklo_epi32(src_temp15_8x16b, src_temp16_8x16b);
                src_temp17_8x16b = _mm_unpackhi_epi32(src_temp15_8x16b, src_temp16_8x16b);

                src_temp11_8x16b = _mm_unpacklo_epi64(src_temp8_8x16b, src_temp18_8x16b);
                src_temp12_8x16b = _mm_unpackhi_epi64(src_temp8_8x16b, src_temp18_8x16b);
                src_temp13_8x16b = _mm_unpacklo_epi64(src_temp7_8x16b, src_temp17_8x16b);
                src_temp14_8x16b = _mm_unpackhi_epi64(src_temp7_8x16b, src_temp17_8x16b);

                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp11_8x16b);          /* row=0*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp12_8x16b);       /* row=1*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp13_8x16b);       /* row=2*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp14_8x16b);       /* row=3*/

            }
        }
    }
}
1950
1951/**
1952*******************************************************************************
1953*
1954* @brief
*  Intra prediction for modes 19 to 25 (negative angle, vertical modes),
*  using the neighboring reference samples pointed to by 'pu1_ref' to
*  predict the TU block pointed to by 'pu1_dst'
1958*
1959* @par Description:
1960*
1961*
1962* @param[in] pu1_src
1963* UWORD8 pointer to the source
1964*
1965* @param[in] pu1_dst
1966* UWORD8 pointer to the destination
1967*
1968* @param[in] src_strd
1969* integer source stride
1970*
1971* @param[in] dst_strd
1972* integer destination stride
1973*
1974* @param[in] nt
1975* integer Transform Block size
1976*
1977* @param[in] mode
1978* integer intraprediction mode
1979*
1980* @returns
1981*
1982* @remarks
1983* None
1984*
1985*******************************************************************************
1986*/
1987
1988void ihevc_intra_pred_chroma_mode_19_to_25_ssse3(UWORD8 *pu1_ref,
1989 WORD32 src_strd,
1990 UWORD8 *pu1_dst,
1991 WORD32 dst_strd,
1992 WORD32 nt,
1993 WORD32 mode)
1994{
1995 WORD32 row, k;
1996 WORD32 intra_pred_ang, idx;
1997 WORD32 inv_ang, inv_ang_sum, pos, fract;
1998 WORD32 ref_main_idx, ref_idx;
1999 UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 2];
2000 UWORD8 *ref_main;
2001
2002 __m128i zero_8x16b, fract_8x16b, const_temp_8x16b;
2003 UNUSED(src_strd);
2004
2005 intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];
2006 inv_ang = gai4_ihevc_inv_ang_table_chroma[mode - 12];
2007
2008 /* Intermediate reference samples for negative angle modes */
2009 /* This have to be removed during optimization*/
2010 /* For horizontal modes, (ref main = ref above) (ref side = ref left) */
2011 ref_main = ref_temp + 2 * nt;
2012 for(k = 0; k < (2 * (nt + 1)); k += 2)
2013 {
2014 ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k];
2015 ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k + 1];
2016 }
2017
2018 ref_idx = (nt * intra_pred_ang) >> 5;
2019 inv_ang_sum = 128;
2020 ref_main = ref_temp + (2 * (nt - 1));
2021 /* SIMD Optimization can be done using look-up table for the loop */
2022 /* For negative angled derive the main reference samples from side */
2023 /* reference samples refer to section 8.4.4.2.6 */
2024 for(k = -2; k > (2 * ref_idx); k -= 2)
2025 {
2026 inv_ang_sum += inv_ang;
2027 ref_main[k] = pu1_ref[(4 * nt) - (inv_ang_sum >> 8) * 2];
2028 ref_main[k + 1] = pu1_ref[((4 * nt) + 1) - (inv_ang_sum >> 8) * 2];
2029 }
2030
2031 const_temp_8x16b = _mm_set1_epi16(16);
2032
2033 if(nt == 4) /* if nt =4*/
2034 {
2035 __m128i const_temp2_4x32b, const_temp3_4x32b;
2036 __m128i src_values10, src_values11, zero_8x16b, intra_pred_ang_4x32b;
2037 __m128i row_4x32b, two_nt_4x32b, src_values12;
2038
2039
2040 const_temp2_4x32b = _mm_set1_epi32(31);
2041 const_temp3_4x32b = _mm_set1_epi32(32);
2042
2043 two_nt_4x32b = _mm_set1_epi32(2);
2044
2045 zero_8x16b = _mm_set1_epi16(0);
2046
2047 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
2048 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
2049
2050 row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
2051 {
2052 WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
2053 WORD8 ai1_src_temp0_val[16], ai1_src_temp1_val[16];
2054
2055 __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b, res_temp5_4x32b;
2056 __m128i src_values0, src_values1, src_values2, src_values3, src_values13;
2057 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
2058 __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, sign_8x16b;
2059
2060 /* pos = ((row + 1) * intra_pred_ang); */
2061 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
2062 sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
2063 res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
2064
2065 src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5));
2066 src_values12 = _mm_add_epi32(src_values12, _mm_srai_epi32(res_temp5_4x32b, 5));
2067
2068 ref_main_temp0 = _mm_srli_si128(src_values12, 4); /* next 32 bit values */
2069 ref_main_temp1 = _mm_srli_si128(src_values12, 8); /* next 32 bit values */
2070 ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
2071 ref_main_idx1 = _mm_cvtsi128_si32(src_values12); /* row=0*/
2072 ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* row=1*/
2073 ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* row=2*/
2074 ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* row=3*/
2075
2076 /* fract = pos & (31); */
2077 src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
2078
2079 /*(32 - fract) */
2080 src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
2081
2082 _mm_storeu_si128((__m128i *)(ai1_src_temp1_val), src_values11);
2083 _mm_storeu_si128((__m128i *)(ai1_src_temp0_val), src_values10);
2084
2085 fract1_8x16b = _mm_set1_epi8(ai1_src_temp1_val[0]); /* row=0*/
2086 fract2_8x16b = _mm_set1_epi8(ai1_src_temp1_val[4]); /* row=1*/
2087 fract3_8x16b = _mm_set1_epi8(ai1_src_temp1_val[8]); /* row=2*/
2088 fract4_8x16b = _mm_set1_epi8(ai1_src_temp1_val[12]); /* row=3*/
2089
2090 temp1_8x16b = _mm_set1_epi8(ai1_src_temp0_val[0]); /* row=0*/
2091 temp2_8x16b = _mm_set1_epi8(ai1_src_temp0_val[4]); /* row=1*/
2092 temp3_8x16b = _mm_set1_epi8(ai1_src_temp0_val[8]); /* row=2*/
2093 temp4_8x16b = _mm_set1_epi8(ai1_src_temp0_val[12]); /* row=3*/
2094
2095 temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
2096 temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
2097 temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
2098 temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
2099
2100// inner loop starts from here
2101 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* col = 0-7 */
2102 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* col = 8-15 */
2103 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* col = 16-23 */
2104 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* col = 24-31 */
2105
2106 src_values10 = _mm_srli_si128(src_values0, 2);
2107 src_values11 = _mm_srli_si128(src_values1, 2);
2108 src_values12 = _mm_srli_si128(src_values2, 2);
2109 src_values13 = _mm_srli_si128(src_values3, 2);
2110
2111 src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
2112 src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
2113 src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
2114 src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
2115
2116 src_values0 = _mm_maddubs_epi16(src_values0, temp1_8x16b);
2117 src_values1 = _mm_maddubs_epi16(src_values1, temp2_8x16b);
2118 src_values2 = _mm_maddubs_epi16(src_values2, temp3_8x16b);
2119 src_values3 = _mm_maddubs_epi16(src_values3, temp4_8x16b);
2120
2121 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
2122 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
2123 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
2124 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
2125 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
2126
2127 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
2128 src_values0 = _mm_srai_epi16(src_values0, 5);
2129 src_values1 = _mm_srai_epi16(src_values1, 5);
2130 src_values2 = _mm_srai_epi16(src_values2, 5);
2131 src_values3 = _mm_srai_epi16(src_values3, 5);
2132
2133 /* converting 16 bit to 8 bit */
2134 src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
2135 src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
2136 src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
2137 src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
2138
2139 _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_values0); /* row=0*/
2140 _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_values1); /* row=1*/
2141 _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_values2); /* row=2*/
2142 _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_values3); /* row=3*/
2143
2144 }
2145 }
2146 else if(nt == 8) /* for nt = 16 case */
2147 {
2148 WORD32 ref_main_idx1, fract1, temp, temp1;
2149 __m128i fract1_8x16b, temp_8x16b, temp1_8x16b;
2150
2151 zero_8x16b = _mm_set1_epi16(0);
2152
2153 for(row = 0; row < nt; row += 2)
2154 {
2155 __m128i src_values0, src_values1, src_values2, src_values3;
2156 __m128i src_values10, src_values11, src_values12, src_values13;
2157
2158 pos = ((row + 1) * intra_pred_ang);
2159 idx = pos >> 5;
2160 fract = pos & (31);
2161 temp = 32 - fract;
2162 ref_main_idx = 2 * idx + 2; /* col from 0-15 */
2163
2164 pos = ((row + 2) * intra_pred_ang);
2165 idx = pos >> 5;
2166 fract1 = pos & (31);
2167 temp1 = 32 - fract1;
2168 ref_main_idx1 = 2 * idx + 2; /* col from 0-15 */
2169
2170 fract_8x16b = _mm_set1_epi8(fract);
2171 fract1_8x16b = _mm_set1_epi8(fract1);
2172 temp_8x16b = _mm_set1_epi8(temp);
2173 temp1_8x16b = _mm_set1_epi8(temp1);
2174
2175 temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
2176 temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
2177
2178 /* row=0 */
2179 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx)); /* col = 0-7 */
2180 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 8)); /* col = 8-15 */
2181
2182 /* row=1 */
2183 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* col = 0-7 */
2184 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1 + 8)); /* col = 8-15 */
2185
2186 src_values10 = _mm_srli_si128(src_values0, 2);
2187 src_values11 = _mm_srli_si128(src_values1, 2);
2188 src_values12 = _mm_srli_si128(src_values2, 2);
2189 src_values13 = _mm_srli_si128(src_values3, 2);
2190
2191 src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
2192 src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
2193 src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
2194 src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
2195
2196 src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
2197 src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
2198
2199 src_values2 = _mm_maddubs_epi16(src_values2, temp1_8x16b);
2200 src_values3 = _mm_maddubs_epi16(src_values3, temp1_8x16b);
2201
2202 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
2203 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
2204 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
2205
2206 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
2207 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
2208
2209 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
2210 src_values0 = _mm_srai_epi16(src_values0, 5);
2211 src_values1 = _mm_srai_epi16(src_values1, 5);
2212
2213 src_values2 = _mm_srai_epi16(src_values2, 5);
2214 src_values3 = _mm_srai_epi16(src_values3, 5);
2215
2216 /* converting 16 bit to 8 bit */
2217 src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
2218 src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
2219
2220 src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
2221 src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
2222
2223 /* loding 8-bit 8 pixels values */
2224 _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
2225 _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
2226
2227 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_values2);
2228 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + 8), src_values3);
2229
2230 pu1_dst += 2 * dst_strd;
2231 }
2232 }
2233 else if(nt == 16)
2234 {
2235 WORD32 temp;
2236 /* unroll the col loop (inner) */
2237 zero_8x16b = _mm_set1_epi16(0);
2238
2239 for(row = 0; row < nt; row += 1)
2240 {
2241 __m128i src_values0, src_values1, src_values2, src_values3, temp_8x16b;
2242 __m128i src_values10, src_values11, src_values12, src_values13;
2243
2244 pos = ((row + 1) * intra_pred_ang);
2245 idx = pos >> 5;
2246 fract = pos & (31);
2247 temp = 32 - fract;
2248 ref_main_idx = 2 * idx + 2; /* col from 0-31 */
2249
2250 fract_8x16b = _mm_set1_epi8(fract);
2251 temp_8x16b = _mm_set1_epi8(temp);
2252
2253 temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
2254
2255 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx)); /* col = 0-7 */
2256 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 8)); /* col = 8-15 */
2257 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 16)); /* col = 16-23 */
2258 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 24)); /* col = 24-31 */
2259
2260 src_values10 = _mm_srli_si128(src_values0, 2);
2261 src_values11 = _mm_srli_si128(src_values1, 2);
2262 src_values12 = _mm_srli_si128(src_values2, 2);
2263 src_values13 = _mm_srli_si128(src_values3, 2);
2264
2265 src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
2266 src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
2267 src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
2268 src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
2269
2270 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
2271 src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
2272 src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
2273 src_values2 = _mm_maddubs_epi16(src_values2, temp_8x16b);
2274 src_values3 = _mm_maddubs_epi16(src_values3, temp_8x16b);
2275
2276 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
2277 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
2278 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
2279 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
2280 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
2281
2282 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
2283 src_values0 = _mm_srai_epi16(src_values0, 5);
2284 src_values1 = _mm_srai_epi16(src_values1, 5);
2285 src_values2 = _mm_srai_epi16(src_values2, 5);
2286 src_values3 = _mm_srai_epi16(src_values3, 5);
2287
2288 /* converting 16 bit to 8 bit */
2289 src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
2290 src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
2291 src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
2292 src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
2293
2294 /* loding 8-bit 8 pixels values */
2295 _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
2296 _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
2297 _mm_storel_epi64((__m128i *)(pu1_dst + 16), src_values2);
2298 _mm_storel_epi64((__m128i *)(pu1_dst + 24), src_values3);
2299
2300 pu1_dst += dst_strd;
2301
2302 }
2303 }
2304}
2305
2306
2307/**
2308*******************************************************************************
2309*
2310* @brief
2311* Intraprediction for mode 27 to 33 (positive angle, vertical mode ) with
2312* reference neighboring samples location pointed by 'pu1_ref' to the TU
2313* block location pointed by 'pu1_dst'
2314*
2315* @par Description:
2316*
2317*
* @param[in] pu1_ref
* UWORD8 pointer to the source (reference neighboring samples)
2320*
2321* @param[in] pu1_dst
2322* UWORD8 pointer to the destination
2323*
2324* @param[in] src_strd
2325* integer source stride
2326*
2327* @param[in] dst_strd
2328* integer destination stride
2329*
2330* @param[in] nt
2331* integer Transform Block size
2332*
2333* @param[in] mode
2334* integer intraprediction mode
2335*
2336* @returns
2337*
2338* @remarks
2339* None
2340*
2341*******************************************************************************
2342*/
2343
void ihevc_intra_pred_chroma_mode_27_to_33_ssse3(UWORD8 *pu1_ref,
                                                 WORD32 src_strd,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 dst_strd,
                                                 WORD32 nt,
                                                 WORD32 mode)
{
    WORD32 row;
    WORD32 pos, fract;
    WORD32 intra_pred_ang;
    WORD32 idx, ref_main_idx;

    __m128i zero_8x16b, fract_8x16b, const_temp_8x16b;
    UNUSED(src_strd);

    /* Per-row angle step (1/32-pel units) for this positive-angle vertical mode. */
    intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];
    /* Rounding constant added before the >>5 of the 2-tap interpolation. */
    const_temp_8x16b = _mm_set1_epi16(16);

    if(nt == 4) /* 4x4 block: all four rows are produced in a single SIMD pass */
    {
        __m128i const_temp2_4x32b, const_temp3_4x32b;
        /* NOTE(review): this zero_8x16b shadows the function-scope one; both are zero. */
        __m128i src_values10, src_values11, zero_8x16b, intra_pred_ang_4x32b;
        __m128i row_4x32b, two_nt_4x32b, src_values12;

        const_temp2_4x32b = _mm_set1_epi32(31);  /* mask for fract = pos & 31 */
        const_temp3_4x32b = _mm_set1_epi32(32);  /* for (32 - fract) */

        /* Byte offset of the first top reference sample in the UV-interleaved
           pu1_ref array: (4 * nt) + 2. */
        two_nt_4x32b = _mm_set1_epi32((4 * nt) + 2);

        zero_8x16b = _mm_set1_epi16(0);

        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
        /* (row + 1) factors for rows 0..3, duplicated across both halves. */
        row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);

        {
            WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
            WORD8 ai1_src_temp0_val[16], ai1_src_temp1_val[16];

            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b, res_temp5_4x32b;
            __m128i src_values0, src_values1, src_values2, src_values3, src_values13;
            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
            __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, sign_8x16b;

            /* pos = ((row + 1) * intra_pred_ang); computed for 4 rows at once,
               then sign-extended from 16 to 32 bits via the cmpgt mask. */
            res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
            sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
            res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);

            /* idx = pos >> 5 is added twice: chroma samples are UV-interleaved,
               so the byte offset is base + 2*idx. */
            src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5));
            src_values12 = _mm_add_epi32(src_values12, _mm_srai_epi32(res_temp5_4x32b, 5));

            /* Extract the four per-row reference offsets from the 32-bit lanes. */
            ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* next 32 bit values */
            ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* next 32 bit values */
            ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
            ref_main_idx1 = _mm_cvtsi128_si32(src_values12);   /* row=0*/
            ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* row=1*/
            ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* row=2*/
            ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* row=3*/

            /* fract = pos & (31); */
            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);

            /*(32 - fract) */
            src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);

            /* Spill to memory so the low byte of each 32-bit lane (offsets
               0/4/8/12 on little-endian x86) can be broadcast per row. */
            _mm_storeu_si128((__m128i *)(ai1_src_temp1_val), src_values11);
            _mm_storeu_si128((__m128i *)(ai1_src_temp0_val), src_values10);

            fract1_8x16b = _mm_set1_epi8(ai1_src_temp1_val[0]);  /* row=0*/
            fract2_8x16b = _mm_set1_epi8(ai1_src_temp1_val[4]);  /* row=1*/
            fract3_8x16b = _mm_set1_epi8(ai1_src_temp1_val[8]);  /* row=2*/
            fract4_8x16b = _mm_set1_epi8(ai1_src_temp1_val[12]); /* row=3*/

            temp1_8x16b = _mm_set1_epi8(ai1_src_temp0_val[0]);  /* row=0*/
            temp2_8x16b = _mm_set1_epi8(ai1_src_temp0_val[4]);  /* row=1*/
            temp3_8x16b = _mm_set1_epi8(ai1_src_temp0_val[8]);  /* row=2*/
            temp4_8x16b = _mm_set1_epi8(ai1_src_temp0_val[12]); /* row=3*/

            /* Interleave (32-fract, fract) byte pairs: the per-pixel weight
               pattern consumed by _mm_maddubs_epi16 below. */
            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);

// inner loop starts from here
            /* One reference load per output row (each row has its own offset). */
            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1)); /* row = 0 */
            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2)); /* row = 1 */
            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3)); /* row = 2 */
            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4)); /* row = 3 */

            /* Shift by 2 bytes = one interleaved U/V chroma pair, giving
               ref[idx + 1] aligned with ref[idx]. */
            src_values10 = _mm_srli_si128(src_values0, 2);
            src_values11 = _mm_srli_si128(src_values1, 2);
            src_values12 = _mm_srli_si128(src_values2, 2);
            src_values13 = _mm_srli_si128(src_values3, 2);

            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);

            /* (32 - fract) * ref[idx] + fract * ref[idx + 1] per byte pair. */
            src_values0 = _mm_maddubs_epi16(src_values0, temp1_8x16b);
            src_values1 = _mm_maddubs_epi16(src_values1, temp2_8x16b);
            src_values2 = _mm_maddubs_epi16(src_values2, temp3_8x16b);
            src_values3 = _mm_maddubs_epi16(src_values3, temp4_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
            src_values0 = _mm_srai_epi16(src_values0, 5);
            src_values1 = _mm_srai_epi16(src_values1, 5);
            src_values2 = _mm_srai_epi16(src_values2, 5);
            src_values3 = _mm_srai_epi16(src_values3, 5);

            /* converting 16 bit to 8 bit */
            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);

            /* Store one row of 4 interleaved U/V pairs (8 bytes) per store. */
            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_values0);   /* row=0*/
            _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_values1); /* row=1*/
            _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_values2); /* row=2*/
            _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_values3); /* row=3*/

        }
    }

    else if(nt == 8) /* 8x8 block: two output rows (16 bytes each) per iteration */
    {
        WORD32 ref_main_idx1, fract1, temp, temp1;
        __m128i fract1_8x16b, temp_8x16b, temp1_8x16b;

        zero_8x16b = _mm_set1_epi16(0);

        for(row = 0; row < nt; row += 2)
        {
            __m128i src_values0, src_values1, src_values2, src_values3;
            __m128i src_values10, src_values11, src_values12, src_values13;

            /* Integer/fractional split of the projected position for row. */
            pos = ((row + 1) * intra_pred_ang);
            idx = pos >> 5;
            fract = pos & (31);
            temp = 32 - fract;
            ref_main_idx = (4 * nt) + 2 * idx + 2; /* col from 0-15 */

            /* Same split for row + 1. */
            pos = ((row + 2) * intra_pred_ang);
            idx = pos >> 5;
            fract1 = pos & (31);
            temp1 = 32 - fract1;
            ref_main_idx1 = (4 * nt) + 2 * idx + 2; /* col from 0-15 */

            fract_8x16b = _mm_set1_epi8(fract);
            fract1_8x16b = _mm_set1_epi8(fract1);
            temp_8x16b = _mm_set1_epi8(temp);
            temp1_8x16b = _mm_set1_epi8(temp1);

            /* (32-fract, fract) byte pairs for _mm_maddubs_epi16. */
            temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);

            /* row=0 */
            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx));      /* col = 0-7 */
            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 8));  /* col = 8-15 */

            /* row=1 */
            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1));     /* col = 0-7 */
            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1 + 8)); /* col = 8-15 */

            /* Shift by one U/V pair to pair ref[idx] with ref[idx + 1]. */
            src_values10 = _mm_srli_si128(src_values0, 2);
            src_values11 = _mm_srli_si128(src_values1, 2);
            src_values12 = _mm_srli_si128(src_values2, 2);
            src_values13 = _mm_srli_si128(src_values3, 2);

            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);

            /* (32 - fract) * ref[idx] + fract * ref[idx + 1] */
            src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
            src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);

            src_values2 = _mm_maddubs_epi16(src_values2, temp1_8x16b);
            src_values3 = _mm_maddubs_epi16(src_values3, temp1_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);

            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
            src_values0 = _mm_srai_epi16(src_values0, 5);
            src_values1 = _mm_srai_epi16(src_values1, 5);

            src_values2 = _mm_srai_epi16(src_values2, 5);
            src_values3 = _mm_srai_epi16(src_values3, 5);

            /* converting 16 bit to 8 bit */
            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);

            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);

            /* Store each 16-byte output row as two 8-byte halves. */
            _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
            _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);

            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_values2);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + 8), src_values3);

            pu1_dst += 2 * dst_strd;
        }
    }
    else if(nt == 16) /* 16x16 block: one 32-byte output row per iteration */
    {
        WORD32 temp;
        /* unroll the col loop (inner) */
        zero_8x16b = _mm_set1_epi16(0);

        for(row = 0; row < nt; row += 1)
        {
            __m128i src_values0, src_values1, src_values2, src_values3, temp_8x16b;
            __m128i src_values10, src_values11, src_values12, src_values13;

            /* Integer/fractional split of the projected position for this row. */
            pos = ((row + 1) * intra_pred_ang);
            idx = pos >> 5;
            fract = pos & (31);
            temp = 32 - fract;
            ref_main_idx = (4 * nt) + 2 * idx + 2; /* col from 0-31 */

            fract_8x16b = _mm_set1_epi8(fract);
            temp_8x16b = _mm_set1_epi8(temp);

            /* (32-fract, fract) byte pairs for _mm_maddubs_epi16. */
            temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);

            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx));      /* col = 0-7 */
            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 8));  /* col = 8-15 */
            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 16)); /* col = 16-23 */
            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 24)); /* col = 24-31 */

            /* Shift by one U/V pair to pair ref[idx] with ref[idx + 1]. */
            src_values10 = _mm_srli_si128(src_values0, 2);
            src_values11 = _mm_srli_si128(src_values1, 2);
            src_values12 = _mm_srli_si128(src_values2, 2);
            src_values13 = _mm_srli_si128(src_values3, 2);

            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);

            /* (32 - fract) * ref[idx] + fract * ref[idx + 1] */
            src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
            src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
            src_values2 = _mm_maddubs_epi16(src_values2, temp_8x16b);
            src_values3 = _mm_maddubs_epi16(src_values3, temp_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
            src_values0 = _mm_srai_epi16(src_values0, 5);
            src_values1 = _mm_srai_epi16(src_values1, 5);
            src_values2 = _mm_srai_epi16(src_values2, 5);
            src_values3 = _mm_srai_epi16(src_values3, 5);

            /* converting 16 bit to 8 bit */
            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);

            /* Store the 32-byte output row as four 8-byte stores. */
            _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
            _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
            _mm_storel_epi64((__m128i *)(pu1_dst + 16), src_values2);
            _mm_storel_epi64((__m128i *)(pu1_dst + 24), src_values3);

            pu1_dst += dst_strd;

        }
    }
}