/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
 *******************************************************************************
 * @file
 *  ih264_resi_trans_quant.c
 *
 * @brief
 *  Contains the function definitions for the single stage forward transform
 *  for H.264. The residue is calculated, forward transformed (cf) and then
 *  quantized
 *
 * @author
 *  Ittiam
 *
 * @par List of Functions:
 *  - ih264_resi_trans_quant_4x4()
 *  - ih264_resi_trans_quant_chroma_4x4()
 *  - ih264_hadamard_quant_4x4()
 *  - ih264_hadamard_quant_2x2_uv()
 *  - ih264_resi_trans_quant_8x8()
 *
 * @remarks
 *******************************************************************************
 */
42
43/*****************************************************************************/
44/* File Includes */
45/*****************************************************************************/
46
47/* System include files */
48#include <stddef.h>
49
50/* User include files */
51#include "ih264_typedefs.h"
52#include "ih264_defs.h"
53#include "ih264_size_defs.h"
54#include "ih264_macros.h"
55#include "ih264_trans_macros.h"
56#include "ih264_trans_data.h"
57#include "ih264_structs.h"
58#include "ih264_trans_quant_itrans_iquant.h"
59
60/**
61 *******************************************************************************
62 *
63 * @brief
64 * This function performs forward transform and quantization on a 4*4 block
65 *
66 * @par Description:
67 * The function accepts source buffer and estimation buffer. From these, it
68 * computes the residue. This is residue is then transformed and quantized.
69 * The transform and quantization are in placed computed. They use the residue
70 * buffer for this.
71 *
72 * @param[in] pu1_src
73 * Pointer to source sub-block
74 *
75 * @param[in] pu1_pred
76 * Pointer to prediction sub-block
77 *
78 * @param[in] pi2_out
79 * Pointer to residual sub-block
80 *
81 * @param[in] src_strd
82 * Source stride
83 *
84 * @param[in] pred_strd
85 * Prediction stride
86 *
87 * @param[in] dst_strd
88 * Destination stride
89 *
90 * @param[in] u4_qbits
91 * QP_BITS_h264_4x4 + floor(QP/6)
92 *
93 * @param[in] pu2_threshold_matrix
94 * Pointer to Forward Quant Threshold Matrix
95 *
96 * @param[in] pu2_scale_matrix
97 * Pointer to Forward Quant Scale Matrix
98 *
99 * @param[in] u4_round_factor
100 * Quantization Round factor
101 *
102 * @param[out] pu1_nnz
103 * Total non-zero coefficients in the current sub-block
104 *
105 * @returns
106 *
107 * @remarks
108 * None
109 *
110 *******************************************************************************
111 */
112void ih264_resi_trans_quant_4x4(UWORD8 *pu1_src,
113 UWORD8 *pu1_pred,
114 WORD16 *pi2_out,
115 WORD32 src_strd,
116 WORD32 pred_strd,
117 const UWORD16 *pu2_scale_matrix,
118 const UWORD16 *pu2_threshold_matrix,
119 UWORD32 u4_qbits,
120 UWORD32 u4_round_factor,
121 UWORD8 *pu1_nnz,
122 WORD16 *pi2_alt_dc_addr)
123{
124 UWORD32 i;
125 WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
126 WORD32 i4_value, i4_sign;
127 UWORD32 u4_abs_value;
128 WORD16 *pi2_out_tmp = pi2_out;
129 UWORD32 u4_nonzero_coeff = 0;
130
131 for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
132 {
133 /* computing prediction error (residue) */
134 x4 = pu1_src[0] - pu1_pred[0];
135 x5 = pu1_src[1] - pu1_pred[1];
136 x6 = pu1_src[2] - pu1_pred[2];
137 x7 = pu1_src[3] - pu1_pred[3];
138
139 /* Horizontal transform */
140 x0 = x4 + x7;
141 x1 = x5 + x6;
142 x2 = x5 - x6;
143 x3 = x4 - x7;
144
145 pi2_out_tmp[0] = x0 + x1;
146 pi2_out_tmp[1] = (x3 <<1) + x2;
147 pi2_out_tmp[2] = x0 - x1;
148 pi2_out_tmp[3] = x3 - (x2<<1);
149
150 /* pointing to next row; */
151 pu1_src += src_strd;
152 pu1_pred += pred_strd;
153 pi2_out_tmp += 4;
154
155 }
156 pi2_out_tmp = pi2_out;
157 for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
158 {
159
160 /* Vertical transform and quantization */
161 x4 = pi2_out_tmp[0];
162 x5 = pi2_out_tmp[4];
163 x6 = pi2_out_tmp[8];
164 x7 = pi2_out_tmp[12];
165
166
167 x0 = x4 + x7;
168 x1 = x5 + x6;
169 x2 = x5 - x6;
170 x3 = x4 - x7;
171
172 /* quantization is done in place */
173
174 i4_value = x0 + x1;
175
176 if(i==0)
177 {
178 (*pi2_alt_dc_addr) = i4_value;
179 }
180
181 FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits, u4_nonzero_coeff);
182 pi2_out_tmp[0] = i4_value;
183
184
185 i4_value = (x3 << 1) + x2;
186 FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[4], pu2_scale_matrix[4], u4_round_factor, u4_qbits, u4_nonzero_coeff);
187 pi2_out_tmp[4] = i4_value;
188
189
190 i4_value = x0 - x1;
191 FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[8], pu2_scale_matrix[8], u4_round_factor, u4_qbits, u4_nonzero_coeff);
192 pi2_out_tmp[8] = i4_value;
193
194
195 i4_value = x3 - (x2 << 1);
196 FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[12], pu2_scale_matrix[12], u4_round_factor, u4_qbits, u4_nonzero_coeff);
197 pi2_out_tmp[12] = i4_value;
198
199 pi2_out_tmp ++;
200 pu2_scale_matrix++;
201 pu2_threshold_matrix++;
202 }
203
204 /* Return total nonzero coefficients in the current sub block */
205 *pu1_nnz = u4_nonzero_coeff;
206}
207/**
208 *******************************************************************************
209 *
210 * @brief
211 * This function performs forward transform and quantization on a 4*4 chroma block
212 * with interleaved values
213 *
214 * @par Description:
215 * The function accepts source buffer and estimation buffer. From these, it
216 * computes the residue. This is residue is then transformed and quantized.
217 * The transform and quantization are in placed computed. They use the residue
218 * buffer for this.
219 *
220 * @param[in] pu1_src
221 * Pointer to source sub-block
222 *
223 * @param[in] pu1_pred
224 * Pointer to prediction sub-block
225 *
226 * @param[in] pi2_out
227 * Pointer to residual sub-block
228 *
229 * @param[in] src_strd
230 * Source stride
231 *
232 * @param[in] pred_strd
233 * Prediction stride
234 *
235 * @param[in] dst_strd
236 * Destination stride
237 *
238 * @param[in] u4_qbits
239 * QP_BITS_h264_4x4 + floor(QP/6)
240 *
241 * @param[in] pu2_threshold_matrix
242 * Pointer to Forward Quant Threshold Matrix
243 *
244 * @param[in] pu2_scale_matrix
245 * Pointer to Forward Quant Scale Matrix
246 *
247 * @param[in] u4_round_factor
248 * Quantization Round factor
249 *
250 * @param[out] pu1_nnz
251 * Total non-zero coefficients in the current sub-block
252 *
253 * @returns
254 *
255 * @remarks
256 * None
257 *
258 *******************************************************************************
259 */
260void ih264_resi_trans_quant_chroma_4x4(UWORD8 *pu1_src,
261 UWORD8 *pu1_pred,
262 WORD16 *pi2_out,
263 WORD32 src_strd,
264 WORD32 pred_strd,
265 const UWORD16 *pu2_scale_matrix,
266 const UWORD16 *pu2_threshold_matrix,
267 UWORD32 u4_qbits,
268 UWORD32 u4_round_factor,
269 UWORD8 *pu1_nnz,
270 WORD16 *pu1_dc_alt_addr)
271{
272 UWORD32 i;
273 WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
274 WORD32 i4_value, i4_sign;
275 UWORD32 u4_abs_value;
276 WORD16 *pi2_out_tmp = pi2_out;
277 UWORD32 u4_nonzero_coeff = 0;
278
279 for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
280 {
281 /* computing prediction error (residue) */
282 x4 = pu1_src[0] - pu1_pred[0];
283 x5 = pu1_src[2] - pu1_pred[2];
284 x6 = pu1_src[4] - pu1_pred[4];
285 x7 = pu1_src[6] - pu1_pred[6];
286
287 /* Horizontal transform */
288 x0 = x4 + x7;
289 x1 = x5 + x6;
290 x2 = x5 - x6;
291 x3 = x4 - x7;
292
293 pi2_out_tmp[0] = x0 + x1;
294 pi2_out_tmp[1] = (x3 <<1) + x2;
295 pi2_out_tmp[2] = x0 - x1;
296 pi2_out_tmp[3] = x3 - (x2<<1);
297
298 /* pointing to next row; */
299 pu1_src += src_strd;
300 pu1_pred += pred_strd;
301 pi2_out_tmp += 4;
302
303 }
304 pi2_out_tmp = pi2_out;
305 for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
306 {
307
308 /* Vertical transform and quantization */
309 x4 = pi2_out_tmp[0];
310 x5 = pi2_out_tmp[4];
311 x6 = pi2_out_tmp[8];
312 x7 = pi2_out_tmp[12];
313
314
315 x0 = x4 + x7;
316 x1 = x5 + x6;
317 x2 = x5 - x6;
318 x3 = x4 - x7;
319
320 /* quantization is done in place */
321
322 i4_value = x0 + x1;
323
324 if(i==0)
325 {
326 *pu1_dc_alt_addr = i4_value;
327 }
328
329 FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
330 pu2_scale_matrix[0], u4_round_factor, u4_qbits,
331 u4_nonzero_coeff);
332 pi2_out_tmp[0] = i4_value;
333
334 i4_value = (x3 << 1) + x2;
335 FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[4],
336 pu2_scale_matrix[4], u4_round_factor, u4_qbits,
337 u4_nonzero_coeff);
338 pi2_out_tmp[4] = i4_value;
339
340 i4_value = x0 - x1;
341 FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[8],
342 pu2_scale_matrix[8], u4_round_factor, u4_qbits,
343 u4_nonzero_coeff);
344 pi2_out_tmp[8] = i4_value;
345
346 i4_value = x3 - (x2 << 1);
347 FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[12],
348 pu2_scale_matrix[12], u4_round_factor, u4_qbits,
349 u4_nonzero_coeff);
350 pi2_out_tmp[12] = i4_value;
351
352 pi2_out_tmp ++;
353 pu2_scale_matrix++;
354 pu2_threshold_matrix++;
355 }
356
357 /* Return total nonzero coefficients in the current sub block */
358 *pu1_nnz = u4_nonzero_coeff;
359}
360
361/**
362 *******************************************************************************
363 *
364 * @brief
365 * This function performs forward hadamard transform and quantization on a 4*4 block
366 *
367 * @par Description:
368 * The function accepts source buffer and estimation buffer. From these, it
369 * computes the residue. This is residue is then transformed and quantized.
370 * The transform and quantization are in placed computed. They use the residue
371 * buffer for this.
372 *
373 * @param[in] pu1_src
374 * Pointer to source sub-block
375 *
376 * @param[in] pu1_pred
377 * Pointer to prediction sub-block
378 *
379 * @param[in] pi2_out
380 * Pointer to residual sub-block
381 *
382 * @param[in] src_strd
383 * Source stride
384 *
385 * @param[in] pred_strd
386 * Prediction stride
387 *
388 * @param[in] dst_strd
389 * Destination stride
390 *
391 * @param[in] u4_qbits
392 * QP_BITS_h264_4x4 + floor(QP/6)
393 *
394 * @param[in] pu2_threshold_matrix
395 * Pointer to Forward Quant Threshold Matrix
396 *
397 * @param[in] pu2_scale_matrix
398 * Pointer to Forward Quant Scale Matrix
399 *
400 * @param[in] u4_round_factor
401 * Quantization Round factor
402 *
403 * @param[out] pu1_nnz
404 * Total non-zero coefficients in the current sub-block
405 *
406 * @returns
407 *
408 * @remarks
409 * None
410 *
411 */
412
413void ih264_hadamard_quant_4x4(WORD16 *pi2_src,
414 WORD16 *pi2_dst,
415 const UWORD16 *pu2_scale_matrix,
416 const UWORD16 *pu2_threshold_matrix,
417 UWORD32 u4_qbits,
418 UWORD32 u4_round_factor,
419 UWORD8 *pu1_nnz)
420{
421 WORD32 i;
422 WORD32 x0,x1,x2,x3,x4,x5,x6,x7,i4_value;
423 UWORD32 u4_abs_value;
424 WORD32 i4_sign;
425
426 *pu1_nnz = 0;
427
428 for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
429 {
430 x4 = pi2_src[0];
431 x5 = pi2_src[1];
432 x6 = pi2_src[2];
433 x7 = pi2_src[3];
434
435 x0 = x4 + x7;
436 x1 = x5 + x6;
437 x2 = x5 - x6;
438 x3 = x4 - x7;
439
440 pi2_dst[0] = x0 + x1;
441 pi2_dst[1] = x3 + x2;
442 pi2_dst[2] = x0 - x1;
443 pi2_dst[3] = x3 - x2;
444
445 pi2_src += 4;
446 pi2_dst += 4;
447 }
448
449 /* Vertical transform and quantization */
450 pi2_dst -= SUB_BLK_WIDTH_4x4<<2;
451
452 for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
453 {
454 x4 = pi2_dst[0];
455 x5 = pi2_dst[4];
456 x6 = pi2_dst[8];
457 x7 = pi2_dst[12] ;
458
459 x0 = x4 + x7;
460 x1 = x5 + x6;
461 x2 = x5 - x6;
462 x3 = x4 - x7;
463
464
465 i4_value = (x0 + x1) >> 1;
466 FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
467 pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
468 pi2_dst[0] = i4_value;
469
470 i4_value = (x3 + x2) >> 1;
471 FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
472 pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
473 pi2_dst[4] = i4_value;
474
475 i4_value = (x0 - x1) >> 1;
476 FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
477 pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
478 pi2_dst[8] = i4_value;
479
480 i4_value = (x3 - x2) >> 1;
481 FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
482 pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
483 pi2_dst[12] = i4_value;
484
485 pi2_dst ++;
486 }
487}
488
489/**
490 *******************************************************************************
491 *
492 * @brief
493 * This function performs forward hadamard transform and quantization on a 2*2 block
494 * for both U and V planes
495 *
496 * @par Description:
497 * The function accepts source buffer and estimation buffer. From these, it
498 * computes the residue. This is residue is then transformed and quantized.
499 * The transform and quantization are in placed computed. They use the residue
500 * buffer for this.
501 *
502 * @param[in] pu1_src
503 * Pointer to source sub-block
504 *
505 * @param[in] pu1_pred
506 * Pointer to prediction sub-block
507 *
508 * @param[in] pi2_out
509 * Pointer to residual sub-block
510 *
511 * @param[in] src_strd
512 * Source stride
513 *
514 * @param[in] pred_strd
515 * Prediction stride
516 *
517 * @param[in] dst_strd
518 * Destination stride
519 *
520 * @param[in] u4_qbits
521 * QP_BITS_h264_4x4 + floor(QP/6)
522 *
523 * @param[in] pu2_threshold_matrix
524 * Pointer to Forward Quant Threshold Matrix
525 *
526 * @param[in] pu2_scale_matrix
527 * Pointer to Forward Quant Scale Matrix
528 *
529 * @param[in] u4_round_factor
530 * Quantization Round factor
531 *
532 * @param[out] pu1_nnz
533 * Total non-zero coefficients in the current sub-block
534 *
535 * @returns
536 *
537 * @remarks
538 * NNZ for dc is populated at 0 and 5th position of pu1_nnz
539 *
540 */
541
542void ih264_hadamard_quant_2x2_uv(WORD16 *pi2_src,
543 WORD16 *pi2_dst,
544 const UWORD16 *pu2_scale_matrix,
545 const UWORD16 *pu2_threshold_matrix,
546 UWORD32 u4_qbits,
547 UWORD32 u4_round_factor,
548 UWORD8 *pu1_nnz)
549{
550 WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
551 WORD32 i4_value, i4_sign, plane;
552 UWORD32 u4_abs_value;
553
554 for(plane = 0; plane < 2; plane++)
555 {
556 pu1_nnz[plane] = 0;
557
558 /* Horizontal transform */
559 x4 = pi2_src[0];
560 x5 = pi2_src[1];
561 x6 = pi2_src[2];
562 x7 = pi2_src[3];
563
564 x0 = x4 + x5;
565 x1 = x4 - x5;
566 x2 = x6 + x7;
567 x3 = x6 - x7;
568
569 /* Vertical transform and quantization */
570 i4_value = (x0 + x2);
571 FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
572 pu2_scale_matrix[0], u4_round_factor, u4_qbits,
573 pu1_nnz[plane]);
574 pi2_dst[0] = i4_value;
575
576 i4_value = (x0 - x2);
577 FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
578 pu2_scale_matrix[0], u4_round_factor, u4_qbits,
579 pu1_nnz[plane]);
580 pi2_dst[2] = i4_value;
581
582 i4_value = (x1 - x3);
583 FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
584 pu2_scale_matrix[0], u4_round_factor, u4_qbits,
585 pu1_nnz[plane]);
586 pi2_dst[3] = i4_value;
587
588 i4_value = (x1 + x3);
589 FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
590 pu2_scale_matrix[0], u4_round_factor, u4_qbits,
591 pu1_nnz[plane]);
592 pi2_dst[1] = i4_value;
593
594 pi2_dst += 4;
595 pi2_src += 4;
596
597 }
598}
599
600/*
601 *******************************************************************************
602 *
603 * @brief
604 * This function performs Single stage forward transform CF8 and quantization on 8*8 blocks
605 * for h.264
606 *
607 * @par Description:
608 * Performs single stage 8x8 forward transform CF8 after calculating the residue
609 * The result is then quantized
610 *
611 * @param[in] pu1_src
612 * Input 8x8 pixels
613 *
614 * @param[in] pu1_pred
615 * Input 8x8 pixels
616 *
617 * @param[in] pi1_out
618 * Output 8x8 pixels
619 *
620 * @param[in] u4_thresh
621 * Threshold under which the coeffs are not quantized
622 *
623 * @param[in] u4_qp_div
624 * QP/6
625 *
626 * @param[in] u4_qp_rem
627 * QP%6
628 *
629 * @param[in] u2_src_stride
630 * Source stride
631 *
632 * @param[in] pred_strd
633 * stride for prediciton buffer
634 *
635 * @param[in] dst_strd
636 * stride for destination buffer
637 *
638 * @param[in] pu4_quant_mat
639 * Pointer to the 4x4 quantization matrix
640 *
641 * @returns Void
642 *
643 *
644 *******************************************************************************
645 */
646void ih264_resi_trans_quant_8x8(UWORD8 *pu1_src,
647 UWORD8 *pu1_pred,
648 WORD16 *pi2_out,
649 WORD32 src_strd,
650 WORD32 pred_strd,
651 const UWORD16 *pu2_scale_matrix,
652 const UWORD16 *pu2_threshold_matrix,
653 UWORD32 u4_qbits,
654 UWORD32 u4_round_factor,
655 UWORD8 *pu1_nnz,
656 WORD16 *pu1_dc_alt_addr)
657
658{
659 WORD16 *pi2_out_tmp = pi2_out;
660 UWORD32 i;
661 WORD32 a0, a1, a2, a3, a4, a5, a6, a7;
662 WORD32 r0, r1, r2, r3, r4, r5, r6, r7;
663 WORD32 i4_sign;
664 UWORD32 u4_abs_value;
665 UWORD32 u4_nonzero_coeff = 0;
666
667 UNUSED(pu1_dc_alt_addr);
668
669 /*Horizontal transform */
670 /* we are going to use the a's and r's in a twisted way since */
671 /*i dont want to declare more variables */
672 for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
673 {
674 r0 = pu1_src[0];
675 r0 -= pu1_pred[0];
676 r1 = pu1_src[1];
677 r1 -= pu1_pred[1];
678 r2 = pu1_src[2];r2 -= pu1_pred[2];
679 r3 = pu1_src[3];r3 -= pu1_pred[3];
680 r4 = pu1_src[4];r4 -= pu1_pred[4];
681 r5 = pu1_src[5];r5 -= pu1_pred[5];
682 r6 = pu1_src[6];r6 -= pu1_pred[6];
683 r7 = pu1_src[7];r7 -= pu1_pred[7];
684
685
686 a0 = r0 + r7;
687 a1 = r1 + r6;
688 a2 = r2 + r5;
689 a3 = r3 + r4;
690
691 a4 = a0 + a3;
692 a5 = a1 + a2;
693 a6 = a0 - a3;
694 a7 = a1 - a2;
695
696 pi2_out_tmp[0] = a4 + a5;
697
698 pi2_out_tmp[2] = a6 + (a7>>1);
699 pi2_out_tmp[4] = a4 - a5;
700 pi2_out_tmp[6] = (a6>>1) - a7;
701
702 a0 = r0 - r7;
703 a1 = r1 - r6;
704 a2 = r2 - r5;
705 a3 = r3 - r4;
706
707 a4 = a1 + a2 + ((a0>>1) + a0);
708 a5 = a0 - a3 - ((a2>>1) + a2);
709 a6 = a0 + a3 - ((a1>>1) + a1);
710 a7 = a1 - a2 + ((a3>>1) + a3);
711
712 pi2_out_tmp[1] = a4 + (a7>>2);
713 pi2_out_tmp[3] = a5 + (a6>>2);
714 pi2_out_tmp[5] = a6 - (a5>>2);
715 pi2_out_tmp[7] = (a4>>2) - a7;
716
717 pu1_src += src_strd;
718 pu1_pred += pred_strd;
719 pi2_out_tmp += 8;
720 }
721
722 /*vertical transform and quant */
723
724 pi2_out_tmp = pi2_out;
725
726 for (i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
727 {
728
729 r0 = pi2_out_tmp[0];
730 r1 = pi2_out_tmp[8];
731 r2 = pi2_out_tmp[16];
732 r3 = pi2_out_tmp[24];
733 r4 = pi2_out_tmp[32];
734 r5 = pi2_out_tmp[40];
735 r6 = pi2_out_tmp[48];
736 r7 = pi2_out_tmp[56];
737
738 a0 = r0 + r7;
739 a1 = r1 + r6;
740 a2 = r2 + r5;
741 a3 = r3 + r4;
742
743 a4 = a0 + a3;
744 a5 = a1 + a2;
745 a6 = a0 - a3;
746 a7 = a1 - a2;
747
748 a0 = r0 - r7;
749 a1 = r1 - r6;
750 a2 = r2 - r5;
751 a3 = r3 - r4;
752
753 r0 = a4 + a5;
754 r2 = a6 + (a7>>1);
755 r4 = a4 - a5;
756 r6 = (a6>>1) - a7;
757
758 a4 = a1 + a2 + ((a0>>1) + a0);
759 a5 = a0 - a3 - ((a2>>1) + a2);
760 a6 = a0 + a3 - ((a1>>1) + a1);
761 a7 = a1 - a2 + ((a3>>1) + a3);
762
763 r1 = a4 + (a7>>2);
764 r3 = a5 + (a6>>2);
765 r5 = a6 - (a5>>2);
766 r7 = (a4>>2) - a7;
767
768 FWD_QUANT(r0, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
769 pu2_scale_matrix[0], u4_round_factor, u4_qbits,
770 u4_nonzero_coeff);
771 pi2_out_tmp[0] = r0;
772
773 FWD_QUANT(r1, u4_abs_value, i4_sign, pu2_threshold_matrix[8],
774 pu2_scale_matrix[8], u4_round_factor, u4_qbits,
775 u4_nonzero_coeff);
776 pi2_out_tmp[8] = r1;
777
778 FWD_QUANT(r2, u4_abs_value, i4_sign, pu2_threshold_matrix[16],
779 pu2_scale_matrix[16], u4_round_factor, u4_qbits,
780 u4_nonzero_coeff);
781 pi2_out_tmp[16] = r2;
782
783 FWD_QUANT(r3, u4_abs_value, i4_sign, pu2_threshold_matrix[24],
784 pu2_scale_matrix[24], u4_round_factor, u4_qbits,
785 u4_nonzero_coeff);
786 pi2_out_tmp[24] = r3;
787
788 FWD_QUANT(r4, u4_abs_value, i4_sign, pu2_threshold_matrix[32],
789 pu2_scale_matrix[32], u4_round_factor, u4_qbits,
790 u4_nonzero_coeff);
791 pi2_out_tmp[32] = r4;
792
793 FWD_QUANT(r5, u4_abs_value, i4_sign, pu2_threshold_matrix[40],
794 pu2_scale_matrix[40], u4_round_factor, u4_qbits,
795 u4_nonzero_coeff);
796 pi2_out_tmp[40] = r5;
797
798 FWD_QUANT(r6, u4_abs_value, i4_sign, pu2_threshold_matrix[48],
799 pu2_scale_matrix[48], u4_round_factor, u4_qbits,
800 u4_nonzero_coeff);
801 pi2_out_tmp[48] = r6;
802
803 FWD_QUANT(r7, u4_abs_value, i4_sign, pu2_threshold_matrix[56],
804 pu2_scale_matrix[56], u4_round_factor, u4_qbits,
805 u4_nonzero_coeff);
806 pi2_out_tmp[56] = r7;
807
808 pi2_out_tmp++;
809 pu2_scale_matrix++;
810 pu2_threshold_matrix++;
811 }
812 /* Return total nonzero coefficients in the current sub block */
813 *pu1_nnz = u4_nonzero_coeff;
814}