/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
 *******************************************************************************
 * @file
 *  ihevc_itrans_recon_16x16.c
 *
 * @brief
 *  Contains function definitions for inverse transform and reconstruction 16x16
 *
 *
 * @author
 *  100470
 *
 * @par List of Functions:
 *  - ihevc_itrans_recon_16x16()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
38#include <stdio.h>
39#include <string.h>
40#include "ihevc_typedefs.h"
41#include "ihevc_macros.h"
42#include "ihevc_platform_macros.h"
43#include "ihevc_defs.h"
44#include "ihevc_trans_tables.h"
45#include "ihevc_itrans_recon.h"
46#include "ihevc_func_selector.h"
47#include "ihevc_trans_macros.h"
48
49/**
50 *******************************************************************************
51 *
52 * @brief
53 * This function performs Inverse transform and reconstruction for 16x16
54 * input block
55 *
56 * @par Description:
57 * Performs inverse transform and adds the prediction data and clips output
58 * to 8 bit
59 *
60 * @param[in] pi2_src
61 * Input 16x16 coefficients
62 *
63 * @param[in] pi2_tmp
64 * Temporary 16x16 buffer for storing inverse
65 *
66 * transform
67 * 1st stage output
68 *
69 * @param[in] pu1_pred
70 * Prediction 16x16 block
71 *
72 * @param[out] pu1_dst
73 * Output 16x16 block
74 *
75 * @param[in] src_strd
76 * Input stride
77 *
78 * @param[in] pred_strd
79 * Prediction stride
80 *
81 * @param[in] dst_strd
82 * Output Stride
83 *
84 * @param[in] shift
85 * Output shift
86 *
87 * @param[in] zero_cols
88 * Zero columns in pi2_src
89 *
90 * @returns Void
91 *
92 * @remarks
93 * None
94 *
95 *******************************************************************************
96 */
97
98void ihevc_itrans_recon_16x16(WORD16 *pi2_src,
99 WORD16 *pi2_tmp,
100 UWORD8 *pu1_pred,
101 UWORD8 *pu1_dst,
102 WORD32 src_strd,
103 WORD32 pred_strd,
104 WORD32 dst_strd,
105 WORD32 zero_cols,
106 WORD32 zero_rows)
107{
108 WORD32 j, k;
109 WORD32 e[8], o[8];
110 WORD32 ee[4], eo[4];
111 WORD32 eee[2], eeo[2];
112 WORD32 add;
113 WORD32 shift;
114 WORD16 *pi2_tmp_orig;
115 WORD32 trans_size;
116 WORD32 zero_rows_2nd_stage = zero_cols;
117 WORD32 row_limit_2nd_stage;
118
119 if((zero_cols & 0xFFF0) == 0xFFF0)
120 row_limit_2nd_stage = 4;
121 else if((zero_cols & 0xFF00) == 0xFF00)
122 row_limit_2nd_stage = 8;
123 else
124 row_limit_2nd_stage = TRANS_SIZE_16;
125
126 trans_size = TRANS_SIZE_16;
127 pi2_tmp_orig = pi2_tmp;
128 if((zero_rows & 0xFFF0) == 0xFFF0) /* First 4 rows of input are non-zero */
129 {
130 /* Inverse Transform 1st stage */
131 /************************************************************************************************/
132 /**********************************START - IT_RECON_16x16****************************************/
133 /************************************************************************************************/
134
135 shift = IT_SHIFT_STAGE_1;
136 add = 1 << (shift - 1);
137
138 for(j = 0; j < row_limit_2nd_stage; j++)
139 {
140 /* Checking for Zero Cols */
141 if((zero_cols & 1) == 1)
142 {
143 memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
144 }
145 else
146 {
147 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
148 for(k = 0; k < 8; k++)
149 {
150 o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
151 + g_ai2_ihevc_trans_16[3][k]
152 * pi2_src[3 * src_strd];
153 }
154 for(k = 0; k < 4; k++)
155 {
156 eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd];
157 }
158 eeo[0] = 0;
159 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
160 eeo[1] = 0;
161 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
162
163 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
164 for(k = 0; k < 2; k++)
165 {
166 ee[k] = eee[k] + eeo[k];
167 ee[k + 2] = eee[1 - k] - eeo[1 - k];
168 }
169 for(k = 0; k < 4; k++)
170 {
171 e[k] = ee[k] + eo[k];
172 e[k + 4] = ee[3 - k] - eo[3 - k];
173 }
174 for(k = 0; k < 8; k++)
175 {
176 pi2_tmp[k] =
177 CLIP_S16(((e[k] + o[k] + add) >> shift));
178 pi2_tmp[k + 8] =
179 CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
180 }
181 }
182 pi2_src++;
183 pi2_tmp += trans_size;
184 zero_cols = zero_cols >> 1;
185 }
186
187 pi2_tmp = pi2_tmp_orig;
188
189 /* Inverse Transform 2nd stage */
190 shift = IT_SHIFT_STAGE_2;
191 add = 1 << (shift - 1);
192
193 if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
194 {
195 for(j = 0; j < trans_size; j++)
196 {
197 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
198 for(k = 0; k < 8; k++)
199 {
200 o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
201 + g_ai2_ihevc_trans_16[3][k]
202 * pi2_tmp[3 * trans_size];
203 }
204 for(k = 0; k < 4; k++)
205 {
206 eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
207 }
208 eeo[0] = 0;
209 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
210 eeo[1] = 0;
211 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
212
213 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
214 for(k = 0; k < 2; k++)
215 {
216 ee[k] = eee[k] + eeo[k];
217 ee[k + 2] = eee[1 - k] - eeo[1 - k];
218 }
219 for(k = 0; k < 4; k++)
220 {
221 e[k] = ee[k] + eo[k];
222 e[k + 4] = ee[3 - k] - eo[3 - k];
223 }
224 for(k = 0; k < 8; k++)
225 {
226 WORD32 itrans_out;
227 itrans_out =
228 CLIP_S16(((e[k] + o[k] + add) >> shift));
229 pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
230 itrans_out =
231 CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
232 pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
233 }
234 pi2_tmp++;
235 pu1_pred += pred_strd;
236 pu1_dst += dst_strd;
237 }
238 }
239 else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 4 rows of output of 1st stage are non-zero */
240 {
241 for(j = 0; j < trans_size; j++)
242 {
243 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
244 for(k = 0; k < 8; k++)
245 {
246 o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
247 + g_ai2_ihevc_trans_16[3][k]
248 * pi2_tmp[3 * trans_size]
249 + g_ai2_ihevc_trans_16[5][k]
250 * pi2_tmp[5 * trans_size]
251 + g_ai2_ihevc_trans_16[7][k]
252 * pi2_tmp[7 * trans_size];
253 }
254 for(k = 0; k < 4; k++)
255 {
256 eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
257 + g_ai2_ihevc_trans_16[6][k]
258 * pi2_tmp[6 * trans_size];
259 }
260 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
261 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
262 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
263 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
264
265 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
266 for(k = 0; k < 2; k++)
267 {
268 ee[k] = eee[k] + eeo[k];
269 ee[k + 2] = eee[1 - k] - eeo[1 - k];
270 }
271 for(k = 0; k < 4; k++)
272 {
273 e[k] = ee[k] + eo[k];
274 e[k + 4] = ee[3 - k] - eo[3 - k];
275 }
276 for(k = 0; k < 8; k++)
277 {
278 WORD32 itrans_out;
279 itrans_out =
280 CLIP_S16(((e[k] + o[k] + add) >> shift));
281 pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
282 itrans_out =
283 CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
284 pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
285 }
286 pi2_tmp++;
287 pu1_pred += pred_strd;
288 pu1_dst += dst_strd;
289 }
290 }
291 else /* All rows of output of 1st stage are non-zero */
292 {
293 for(j = 0; j < trans_size; j++)
294 {
295 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
296 for(k = 0; k < 8; k++)
297 {
298 o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
299 + g_ai2_ihevc_trans_16[3][k]
300 * pi2_tmp[3 * trans_size]
301 + g_ai2_ihevc_trans_16[5][k]
302 * pi2_tmp[5 * trans_size]
303 + g_ai2_ihevc_trans_16[7][k]
304 * pi2_tmp[7 * trans_size]
305 + g_ai2_ihevc_trans_16[9][k]
306 * pi2_tmp[9 * trans_size]
307 + g_ai2_ihevc_trans_16[11][k]
308 * pi2_tmp[11 * trans_size]
309 + g_ai2_ihevc_trans_16[13][k]
310 * pi2_tmp[13 * trans_size]
311 + g_ai2_ihevc_trans_16[15][k]
312 * pi2_tmp[15 * trans_size];
313 }
314 for(k = 0; k < 4; k++)
315 {
316 eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
317 + g_ai2_ihevc_trans_16[6][k]
318 * pi2_tmp[6 * trans_size]
319 + g_ai2_ihevc_trans_16[10][k]
320 * pi2_tmp[10 * trans_size]
321 + g_ai2_ihevc_trans_16[14][k]
322 * pi2_tmp[14 * trans_size];
323 }
324 eeo[0] =
325 g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
326 + g_ai2_ihevc_trans_16[12][0]
327 * pi2_tmp[12
328 * trans_size];
329 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
330 + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
331 eeo[1] =
332 g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
333 + g_ai2_ihevc_trans_16[12][1]
334 * pi2_tmp[12
335 * trans_size];
336 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
337 + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
338
339 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
340 for(k = 0; k < 2; k++)
341 {
342 ee[k] = eee[k] + eeo[k];
343 ee[k + 2] = eee[1 - k] - eeo[1 - k];
344 }
345 for(k = 0; k < 4; k++)
346 {
347 e[k] = ee[k] + eo[k];
348 e[k + 4] = ee[3 - k] - eo[3 - k];
349 }
350 for(k = 0; k < 8; k++)
351 {
352 WORD32 itrans_out;
353 itrans_out =
354 CLIP_S16(((e[k] + o[k] + add) >> shift));
355 pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
356 itrans_out =
357 CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
358 pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
359 }
360 pi2_tmp++;
361 pu1_pred += pred_strd;
362 pu1_dst += dst_strd;
363 }
364 }
365 /************************************************************************************************/
366 /************************************END - IT_RECON_16x16****************************************/
367 /************************************************************************************************/
368 }
369 else if((zero_rows & 0xFF00) == 0xFF00) /* First 8 rows of input are non-zero */
370 {
371 /* Inverse Transform 1st stage */
372 /************************************************************************************************/
373 /**********************************START - IT_RECON_16x16****************************************/
374 /************************************************************************************************/
375
376 shift = IT_SHIFT_STAGE_1;
377 add = 1 << (shift - 1);
378
379 for(j = 0; j < row_limit_2nd_stage; j++)
380 {
381 /* Checking for Zero Cols */
382 if((zero_cols & 1) == 1)
383 {
384 memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
385 }
386 else
387 {
388 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
389 for(k = 0; k < 8; k++)
390 {
391 o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
392 + g_ai2_ihevc_trans_16[3][k]
393 * pi2_src[3 * src_strd]
394 + g_ai2_ihevc_trans_16[5][k]
395 * pi2_src[5 * src_strd]
396 + g_ai2_ihevc_trans_16[7][k]
397 * pi2_src[7 * src_strd];
398 }
399 for(k = 0; k < 4; k++)
400 {
401 eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
402 + g_ai2_ihevc_trans_16[6][k]
403 * pi2_src[6 * src_strd];
404 }
405 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd];
406 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_src[0];
407 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd];
408 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_src[0];
409
410 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
411 for(k = 0; k < 2; k++)
412 {
413 ee[k] = eee[k] + eeo[k];
414 ee[k + 2] = eee[1 - k] - eeo[1 - k];
415 }
416 for(k = 0; k < 4; k++)
417 {
418 e[k] = ee[k] + eo[k];
419 e[k + 4] = ee[3 - k] - eo[3 - k];
420 }
421 for(k = 0; k < 8; k++)
422 {
423 pi2_tmp[k] =
424 CLIP_S16(((e[k] + o[k] + add) >> shift));
425 pi2_tmp[k + 8] =
426 CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
427 }
428 }
429 pi2_src++;
430 pi2_tmp += trans_size;
431 zero_cols = zero_cols >> 1;
432 }
433
434 pi2_tmp = pi2_tmp_orig;
435
436 /* Inverse Transform 2nd stage */
437 shift = IT_SHIFT_STAGE_2;
438 add = 1 << (shift - 1);
439
440 if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
441 {
442 for(j = 0; j < trans_size; j++)
443 {
444 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
445 for(k = 0; k < 8; k++)
446 {
447 o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
448 + g_ai2_ihevc_trans_16[3][k]
449 * pi2_tmp[3 * trans_size];
450 }
451 for(k = 0; k < 4; k++)
452 {
453 eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
454 }
455 eeo[0] = 0;
456 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
457 eeo[1] = 0;
458 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
459
460 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
461 for(k = 0; k < 2; k++)
462 {
463 ee[k] = eee[k] + eeo[k];
464 ee[k + 2] = eee[1 - k] - eeo[1 - k];
465 }
466 for(k = 0; k < 4; k++)
467 {
468 e[k] = ee[k] + eo[k];
469 e[k + 4] = ee[3 - k] - eo[3 - k];
470 }
471 for(k = 0; k < 8; k++)
472 {
473 WORD32 itrans_out;
474 itrans_out =
475 CLIP_S16(((e[k] + o[k] + add) >> shift));
476 pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
477 itrans_out =
478 CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
479 pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
480 }
481 pi2_tmp++;
482 pu1_pred += pred_strd;
483 pu1_dst += dst_strd;
484 }
485 }
486 else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 4 rows of output of 1st stage are non-zero */
487 {
488 for(j = 0; j < trans_size; j++)
489 {
490 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
491 for(k = 0; k < 8; k++)
492 {
493 o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
494 + g_ai2_ihevc_trans_16[3][k]
495 * pi2_tmp[3 * trans_size]
496 + g_ai2_ihevc_trans_16[5][k]
497 * pi2_tmp[5 * trans_size]
498 + g_ai2_ihevc_trans_16[7][k]
499 * pi2_tmp[7 * trans_size];
500 }
501 for(k = 0; k < 4; k++)
502 {
503 eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
504 + g_ai2_ihevc_trans_16[6][k]
505 * pi2_tmp[6 * trans_size];
506 }
507 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
508 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
509 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
510 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
511
512 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
513 for(k = 0; k < 2; k++)
514 {
515 ee[k] = eee[k] + eeo[k];
516 ee[k + 2] = eee[1 - k] - eeo[1 - k];
517 }
518 for(k = 0; k < 4; k++)
519 {
520 e[k] = ee[k] + eo[k];
521 e[k + 4] = ee[3 - k] - eo[3 - k];
522 }
523 for(k = 0; k < 8; k++)
524 {
525 WORD32 itrans_out;
526 itrans_out =
527 CLIP_S16(((e[k] + o[k] + add) >> shift));
528 pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
529 itrans_out =
530 CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
531 pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
532 }
533 pi2_tmp++;
534 pu1_pred += pred_strd;
535 pu1_dst += dst_strd;
536 }
537 }
538 else /* All rows of output of 1st stage are non-zero */
539 {
540 for(j = 0; j < trans_size; j++)
541 {
542 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
543 for(k = 0; k < 8; k++)
544 {
545 o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
546 + g_ai2_ihevc_trans_16[3][k]
547 * pi2_tmp[3 * trans_size]
548 + g_ai2_ihevc_trans_16[5][k]
549 * pi2_tmp[5 * trans_size]
550 + g_ai2_ihevc_trans_16[7][k]
551 * pi2_tmp[7 * trans_size]
552 + g_ai2_ihevc_trans_16[9][k]
553 * pi2_tmp[9 * trans_size]
554 + g_ai2_ihevc_trans_16[11][k]
555 * pi2_tmp[11 * trans_size]
556 + g_ai2_ihevc_trans_16[13][k]
557 * pi2_tmp[13 * trans_size]
558 + g_ai2_ihevc_trans_16[15][k]
559 * pi2_tmp[15 * trans_size];
560 }
561 for(k = 0; k < 4; k++)
562 {
563 eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
564 + g_ai2_ihevc_trans_16[6][k]
565 * pi2_tmp[6 * trans_size]
566 + g_ai2_ihevc_trans_16[10][k]
567 * pi2_tmp[10 * trans_size]
568 + g_ai2_ihevc_trans_16[14][k]
569 * pi2_tmp[14 * trans_size];
570 }
571 eeo[0] =
572 g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
573 + g_ai2_ihevc_trans_16[12][0]
574 * pi2_tmp[12
575 * trans_size];
576 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
577 + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
578 eeo[1] =
579 g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
580 + g_ai2_ihevc_trans_16[12][1]
581 * pi2_tmp[12
582 * trans_size];
583 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
584 + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
585
586 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
587 for(k = 0; k < 2; k++)
588 {
589 ee[k] = eee[k] + eeo[k];
590 ee[k + 2] = eee[1 - k] - eeo[1 - k];
591 }
592 for(k = 0; k < 4; k++)
593 {
594 e[k] = ee[k] + eo[k];
595 e[k + 4] = ee[3 - k] - eo[3 - k];
596 }
597 for(k = 0; k < 8; k++)
598 {
599 WORD32 itrans_out;
600 itrans_out =
601 CLIP_S16(((e[k] + o[k] + add) >> shift));
602 pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
603 itrans_out =
604 CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
605 pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
606 }
607 pi2_tmp++;
608 pu1_pred += pred_strd;
609 pu1_dst += dst_strd;
610 }
611 }
612 /************************************************************************************************/
613 /************************************END - IT_RECON_16x16****************************************/
614 /************************************************************************************************/
615 }
616 else /* All rows of input are non-zero */
617 {
618 /* Inverse Transform 1st stage */
619 /************************************************************************************************/
620 /**********************************START - IT_RECON_16x16****************************************/
621 /************************************************************************************************/
622
623 shift = IT_SHIFT_STAGE_1;
624 add = 1 << (shift - 1);
625
626 for(j = 0; j < row_limit_2nd_stage; j++)
627 {
628 /* Checking for Zero Cols */
629 if((zero_cols & 1) == 1)
630 {
631 memset(pi2_tmp, 0, trans_size * sizeof(WORD16));
632 }
633 else
634 {
635 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
636 for(k = 0; k < 8; k++)
637 {
638 o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_src[src_strd]
639 + g_ai2_ihevc_trans_16[3][k]
640 * pi2_src[3 * src_strd]
641 + g_ai2_ihevc_trans_16[5][k]
642 * pi2_src[5 * src_strd]
643 + g_ai2_ihevc_trans_16[7][k]
644 * pi2_src[7 * src_strd]
645 + g_ai2_ihevc_trans_16[9][k]
646 * pi2_src[9 * src_strd]
647 + g_ai2_ihevc_trans_16[11][k]
648 * pi2_src[11 * src_strd]
649 + g_ai2_ihevc_trans_16[13][k]
650 * pi2_src[13 * src_strd]
651 + g_ai2_ihevc_trans_16[15][k]
652 * pi2_src[15 * src_strd];
653 }
654 for(k = 0; k < 4; k++)
655 {
656 eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_src[2 * src_strd]
657 + g_ai2_ihevc_trans_16[6][k]
658 * pi2_src[6 * src_strd]
659 + g_ai2_ihevc_trans_16[10][k]
660 * pi2_src[10 * src_strd]
661 + g_ai2_ihevc_trans_16[14][k]
662 * pi2_src[14 * src_strd];
663 }
664 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_src[4 * src_strd]
665 + g_ai2_ihevc_trans_16[12][0]
666 * pi2_src[12 * src_strd];
667 eee[0] =
668 g_ai2_ihevc_trans_16[0][0] * pi2_src[0]
669 + g_ai2_ihevc_trans_16[8][0]
670 * pi2_src[8
671 * src_strd];
672 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_src[4 * src_strd]
673 + g_ai2_ihevc_trans_16[12][1]
674 * pi2_src[12 * src_strd];
675 eee[1] =
676 g_ai2_ihevc_trans_16[0][1] * pi2_src[0]
677 + g_ai2_ihevc_trans_16[8][1]
678 * pi2_src[8
679 * src_strd];
680
681 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
682 for(k = 0; k < 2; k++)
683 {
684 ee[k] = eee[k] + eeo[k];
685 ee[k + 2] = eee[1 - k] - eeo[1 - k];
686 }
687 for(k = 0; k < 4; k++)
688 {
689 e[k] = ee[k] + eo[k];
690 e[k + 4] = ee[3 - k] - eo[3 - k];
691 }
692 for(k = 0; k < 8; k++)
693 {
694 pi2_tmp[k] =
695 CLIP_S16(((e[k] + o[k] + add) >> shift));
696 pi2_tmp[k + 8] =
697 CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
698 }
699 }
700 pi2_src++;
701 pi2_tmp += trans_size;
702 zero_cols = zero_cols >> 1;
703 }
704
705 pi2_tmp = pi2_tmp_orig;
706
707 /* Inverse Transform 2nd stage */
708 shift = IT_SHIFT_STAGE_2;
709 add = 1 << (shift - 1);
710
711 if((zero_rows_2nd_stage & 0xFFF0) == 0xFFF0) /* First 4 rows of output of 1st stage are non-zero */
712 {
713 for(j = 0; j < trans_size; j++)
714 {
715 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
716 for(k = 0; k < 8; k++)
717 {
718 o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
719 + g_ai2_ihevc_trans_16[3][k]
720 * pi2_tmp[3 * trans_size];
721 }
722 for(k = 0; k < 4; k++)
723 {
724 eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size];
725 }
726 eeo[0] = 0;
727 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
728 eeo[1] = 0;
729 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
730
731 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
732 for(k = 0; k < 2; k++)
733 {
734 ee[k] = eee[k] + eeo[k];
735 ee[k + 2] = eee[1 - k] - eeo[1 - k];
736 }
737 for(k = 0; k < 4; k++)
738 {
739 e[k] = ee[k] + eo[k];
740 e[k + 4] = ee[3 - k] - eo[3 - k];
741 }
742 for(k = 0; k < 8; k++)
743 {
744 WORD32 itrans_out;
745 itrans_out =
746 CLIP_S16(((e[k] + o[k] + add) >> shift));
747 pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
748 itrans_out =
749 CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
750 pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
751 }
752 pi2_tmp++;
753 pu1_pred += pred_strd;
754 pu1_dst += dst_strd;
755 }
756 }
757 else if((zero_rows_2nd_stage & 0xFF00) == 0xFF00) /* First 4 rows of output of 1st stage are non-zero */
758 {
759 for(j = 0; j < trans_size; j++)
760 {
761 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
762 for(k = 0; k < 8; k++)
763 {
764 o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
765 + g_ai2_ihevc_trans_16[3][k]
766 * pi2_tmp[3 * trans_size]
767 + g_ai2_ihevc_trans_16[5][k]
768 * pi2_tmp[5 * trans_size]
769 + g_ai2_ihevc_trans_16[7][k]
770 * pi2_tmp[7 * trans_size];
771 }
772 for(k = 0; k < 4; k++)
773 {
774 eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
775 + g_ai2_ihevc_trans_16[6][k]
776 * pi2_tmp[6 * trans_size];
777 }
778 eeo[0] = g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size];
779 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0];
780 eeo[1] = g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size];
781 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0];
782
783 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
784 for(k = 0; k < 2; k++)
785 {
786 ee[k] = eee[k] + eeo[k];
787 ee[k + 2] = eee[1 - k] - eeo[1 - k];
788 }
789 for(k = 0; k < 4; k++)
790 {
791 e[k] = ee[k] + eo[k];
792 e[k + 4] = ee[3 - k] - eo[3 - k];
793 }
794 for(k = 0; k < 8; k++)
795 {
796 WORD32 itrans_out;
797 itrans_out =
798 CLIP_S16(((e[k] + o[k] + add) >> shift));
799 pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
800 itrans_out =
801 CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
802 pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
803 }
804 pi2_tmp++;
805 pu1_pred += pred_strd;
806 pu1_dst += dst_strd;
807 }
808 }
809 else /* All rows of output of 1st stage are non-zero */
810 {
811 for(j = 0; j < trans_size; j++)
812 {
813 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
814 for(k = 0; k < 8; k++)
815 {
816 o[k] = g_ai2_ihevc_trans_16[1][k] * pi2_tmp[trans_size]
817 + g_ai2_ihevc_trans_16[3][k]
818 * pi2_tmp[3 * trans_size]
819 + g_ai2_ihevc_trans_16[5][k]
820 * pi2_tmp[5 * trans_size]
821 + g_ai2_ihevc_trans_16[7][k]
822 * pi2_tmp[7 * trans_size]
823 + g_ai2_ihevc_trans_16[9][k]
824 * pi2_tmp[9 * trans_size]
825 + g_ai2_ihevc_trans_16[11][k]
826 * pi2_tmp[11 * trans_size]
827 + g_ai2_ihevc_trans_16[13][k]
828 * pi2_tmp[13 * trans_size]
829 + g_ai2_ihevc_trans_16[15][k]
830 * pi2_tmp[15 * trans_size];
831 }
832 for(k = 0; k < 4; k++)
833 {
834 eo[k] = g_ai2_ihevc_trans_16[2][k] * pi2_tmp[2 * trans_size]
835 + g_ai2_ihevc_trans_16[6][k]
836 * pi2_tmp[6 * trans_size]
837 + g_ai2_ihevc_trans_16[10][k]
838 * pi2_tmp[10 * trans_size]
839 + g_ai2_ihevc_trans_16[14][k]
840 * pi2_tmp[14 * trans_size];
841 }
842 eeo[0] =
843 g_ai2_ihevc_trans_16[4][0] * pi2_tmp[4 * trans_size]
844 + g_ai2_ihevc_trans_16[12][0]
845 * pi2_tmp[12
846 * trans_size];
847 eee[0] = g_ai2_ihevc_trans_16[0][0] * pi2_tmp[0]
848 + g_ai2_ihevc_trans_16[8][0] * pi2_tmp[8 * trans_size];
849 eeo[1] =
850 g_ai2_ihevc_trans_16[4][1] * pi2_tmp[4 * trans_size]
851 + g_ai2_ihevc_trans_16[12][1]
852 * pi2_tmp[12
853 * trans_size];
854 eee[1] = g_ai2_ihevc_trans_16[0][1] * pi2_tmp[0]
855 + g_ai2_ihevc_trans_16[8][1] * pi2_tmp[8 * trans_size];
856
857 /* Combining e and o terms at each hierarchy levels to calculate the final spatial domain vector */
858 for(k = 0; k < 2; k++)
859 {
860 ee[k] = eee[k] + eeo[k];
861 ee[k + 2] = eee[1 - k] - eeo[1 - k];
862 }
863 for(k = 0; k < 4; k++)
864 {
865 e[k] = ee[k] + eo[k];
866 e[k + 4] = ee[3 - k] - eo[3 - k];
867 }
868 for(k = 0; k < 8; k++)
869 {
870 WORD32 itrans_out;
871 itrans_out =
872 CLIP_S16(((e[k] + o[k] + add) >> shift));
873 pu1_dst[k] = CLIP_U8((itrans_out + pu1_pred[k]));
874 itrans_out =
875 CLIP_S16(((e[7 - k] - o[7 - k] + add) >> shift));
876 pu1_dst[k + 8] = CLIP_U8((itrans_out + pu1_pred[k + 8]));
877 }
878 pi2_tmp++;
879 pu1_pred += pred_strd;
880 pu1_dst += dst_strd;
881 }
882 }
883 /************************************************************************************************/
884 /************************************END - IT_RECON_16x16****************************************/
885 /************************************************************************************************/
886 }
887
888}
889