Blame - common/ih264_resi_trans_quant.c - platform/external/libavc

blob: cf1d43c9b7b3af2aac1de74d336416d7d3968833 [file] [log] [blame]

Hamsalekha S	8d3d303	2015-03-13 21:24:58 +0530	[diff] [blame]	1	/******************************************************************************
				2	*
				3	* Copyright (C) 2015 The Android Open Source Project
				4	*
				5	* Licensed under the Apache License, Version 2.0 (the "License");
				6	* you may not use this file except in compliance with the License.
				7	* You may obtain a copy of the License at:
				8	*
				9	* http://www.apache.org/licenses/LICENSE-2.0
				10	*
				11	* Unless required by applicable law or agreed to in writing, software
				12	* distributed under the License is distributed on an "AS IS" BASIS,
				13	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	* See the License for the specific language governing permissions and
				15	* limitations under the License.
				16	*
				17	*****************************************************************************
				18	* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
				19	*/
				20	/**
				21	*******************************************************************************
				22	* @file
				23	* ih264_resi_trans_quant.c
				24	*
				25	* @brief
				26	* Contains function definitions for the single stage forward transform for H.264.
				27	* The functions calculate the residue, apply the forward transform (cf) and then quantize the result.
				28	*
				29	* @author
				30	* Ittiam
				31	*
				32	* @par List of Functions:
				33	* - ih264_resi_trans_quant_4x4()
				34	* - ih264_resi_trans_quant_chroma_4x4
				35	* - ih264_hadamard_quant_4x4
				36	* - ih264_hadamard_quant_2x2_uv
				37	* - ih264_resi_trans_quant_8x8
				38	*
				39	* @remarks
				40	*******************************************************************************
				41	*/
				42
				43	/*****************************************************************************/
				44	/* File Includes */
				45	/*****************************************************************************/
				46
				47	/* System include files */
				48	#include <stddef.h>
				49
				50	/* User include files */
				51	#include "ih264_typedefs.h"
				52	#include "ih264_defs.h"
				53	#include "ih264_size_defs.h"
				54	#include "ih264_macros.h"
				55	#include "ih264_trans_macros.h"
				56	#include "ih264_trans_data.h"
				57	#include "ih264_structs.h"
				58	#include "ih264_trans_quant_itrans_iquant.h"
				59
				60	/**
				61	*******************************************************************************
				62	*
				63	* @brief
				64	* This function performs forward transform and quantization on a 4*4 block
				65	*
				66	* @par Description:
				67	* The function accepts source buffer and estimation buffer. From these, it
				68	* computes the residue. This is residue is then transformed and quantized.
				69	* The transform and quantization are in placed computed. They use the residue
				70	* buffer for this.
				71	*
				72	* @param[in] pu1_src
				73	* Pointer to source sub-block
				74	*
				75	* @param[in] pu1_pred
				76	* Pointer to prediction sub-block
				77	*
				78	* @param[in] pi2_out
				79	* Pointer to residual sub-block
				80	*
				81	* @param[in] src_strd
				82	* Source stride
				83	*
				84	* @param[in] pred_strd
				85	* Prediction stride
				86	*
				87	* @param[in] dst_strd
				88	* Destination stride
				89	*
				90	* @param[in] u4_qbits
				91	* QP_BITS_h264_4x4 + floor(QP/6)
				92	*
				93	* @param[in] pu2_threshold_matrix
				94	* Pointer to Forward Quant Threshold Matrix
				95	*
				96	* @param[in] pu2_scale_matrix
				97	* Pointer to Forward Quant Scale Matrix
				98	*
				99	* @param[in] u4_round_factor
				100	* Quantization Round factor
				101	*
				102	* @param[out] pu1_nnz
				103	* Total non-zero coefficients in the current sub-block
				104	*
				105	* @returns
				106	*
				107	* @remarks
				108	* None
				109	*
				110	*******************************************************************************
				111	*/
				112	void ih264_resi_trans_quant_4x4(UWORD8 *pu1_src,
				113	UWORD8 *pu1_pred,
				114	WORD16 *pi2_out,
				115	WORD32 src_strd,
				116	WORD32 pred_strd,
				117	const UWORD16 *pu2_scale_matrix,
				118	const UWORD16 *pu2_threshold_matrix,
				119	UWORD32 u4_qbits,
				120	UWORD32 u4_round_factor,
				121	UWORD8 *pu1_nnz,
				122	WORD16 *pi2_alt_dc_addr)
				123	{
				124	UWORD32 i;
				125	WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
				126	WORD32 i4_value, i4_sign;
				127	UWORD32 u4_abs_value;
				128	WORD16 *pi2_out_tmp = pi2_out;
				129	UWORD32 u4_nonzero_coeff = 0;
				130
				131	for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
				132	{
				133	/* computing prediction error (residue) */
				134	x4 = pu1_src[0] - pu1_pred[0];
				135	x5 = pu1_src[1] - pu1_pred[1];
				136	x6 = pu1_src[2] - pu1_pred[2];
				137	x7 = pu1_src[3] - pu1_pred[3];
				138
				139	/* Horizontal transform */
				140	x0 = x4 + x7;
				141	x1 = x5 + x6;
				142	x2 = x5 - x6;
				143	x3 = x4 - x7;
				144
				145	pi2_out_tmp[0] = x0 + x1;
				146	pi2_out_tmp[1] = (x3 <<1) + x2;
				147	pi2_out_tmp[2] = x0 - x1;
				148	pi2_out_tmp[3] = x3 - (x2<<1);
				149
				150	/* pointing to next row; */
				151	pu1_src += src_strd;
				152	pu1_pred += pred_strd;
				153	pi2_out_tmp += 4;
				154
				155	}
				156	pi2_out_tmp = pi2_out;
				157	for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
				158	{
				159
				160	/* Vertical transform and quantization */
				161	x4 = pi2_out_tmp[0];
				162	x5 = pi2_out_tmp[4];
				163	x6 = pi2_out_tmp[8];
				164	x7 = pi2_out_tmp[12];
				165
				166
				167	x0 = x4 + x7;
				168	x1 = x5 + x6;
				169	x2 = x5 - x6;
				170	x3 = x4 - x7;
				171
				172	/* quantization is done in place */
				173
				174	i4_value = x0 + x1;
				175
				176	if(i==0)
				177	{
				178	(*pi2_alt_dc_addr) = i4_value;
				179	}
				180
				181	FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits, u4_nonzero_coeff);
				182	pi2_out_tmp[0] = i4_value;
				183
				184
				185	i4_value = (x3 << 1) + x2;
				186	FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[4], pu2_scale_matrix[4], u4_round_factor, u4_qbits, u4_nonzero_coeff);
				187	pi2_out_tmp[4] = i4_value;
				188
				189
				190	i4_value = x0 - x1;
				191	FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[8], pu2_scale_matrix[8], u4_round_factor, u4_qbits, u4_nonzero_coeff);
				192	pi2_out_tmp[8] = i4_value;
				193
				194
				195	i4_value = x3 - (x2 << 1);
				196	FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[12], pu2_scale_matrix[12], u4_round_factor, u4_qbits, u4_nonzero_coeff);
				197	pi2_out_tmp[12] = i4_value;
				198
				199	pi2_out_tmp ++;
				200	pu2_scale_matrix++;
				201	pu2_threshold_matrix++;
				202	}
				203
				204	/* Return total nonzero coefficients in the current sub block */
				205	*pu1_nnz = u4_nonzero_coeff;
				206	}
				207	/**
				208	*******************************************************************************
				209	*
				210	* @brief
				211	* This function performs forward transform and quantization on a 4*4 chroma block
				212	* with interleaved values
				213	*
				214	* @par Description:
				215	* The function accepts source buffer and estimation buffer. From these, it
				216	* computes the residue. This is residue is then transformed and quantized.
				217	* The transform and quantization are in placed computed. They use the residue
				218	* buffer for this.
				219	*
				220	* @param[in] pu1_src
				221	* Pointer to source sub-block
				222	*
				223	* @param[in] pu1_pred
				224	* Pointer to prediction sub-block
				225	*
				226	* @param[in] pi2_out
				227	* Pointer to residual sub-block
				228	*
				229	* @param[in] src_strd
				230	* Source stride
				231	*
				232	* @param[in] pred_strd
				233	* Prediction stride
				234	*
				235	* @param[in] dst_strd
				236	* Destination stride
				237	*
				238	* @param[in] u4_qbits
				239	* QP_BITS_h264_4x4 + floor(QP/6)
				240	*
				241	* @param[in] pu2_threshold_matrix
				242	* Pointer to Forward Quant Threshold Matrix
				243	*
				244	* @param[in] pu2_scale_matrix
				245	* Pointer to Forward Quant Scale Matrix
				246	*
				247	* @param[in] u4_round_factor
				248	* Quantization Round factor
				249	*
				250	* @param[out] pu1_nnz
				251	* Total non-zero coefficients in the current sub-block
				252	*
				253	* @returns
				254	*
				255	* @remarks
				256	* None
				257	*
				258	*******************************************************************************
				259	*/
				260	void ih264_resi_trans_quant_chroma_4x4(UWORD8 *pu1_src,
				261	UWORD8 *pu1_pred,
				262	WORD16 *pi2_out,
				263	WORD32 src_strd,
				264	WORD32 pred_strd,
				265	const UWORD16 *pu2_scale_matrix,
				266	const UWORD16 *pu2_threshold_matrix,
				267	UWORD32 u4_qbits,
				268	UWORD32 u4_round_factor,
				269	UWORD8 *pu1_nnz,
				270	WORD16 *pu1_dc_alt_addr)
				271	{
				272	UWORD32 i;
				273	WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
				274	WORD32 i4_value, i4_sign;
				275	UWORD32 u4_abs_value;
				276	WORD16 *pi2_out_tmp = pi2_out;
				277	UWORD32 u4_nonzero_coeff = 0;
				278
				279	for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
				280	{
				281	/* computing prediction error (residue) */
				282	x4 = pu1_src[0] - pu1_pred[0];
				283	x5 = pu1_src[2] - pu1_pred[2];
				284	x6 = pu1_src[4] - pu1_pred[4];
				285	x7 = pu1_src[6] - pu1_pred[6];
				286
				287	/* Horizontal transform */
				288	x0 = x4 + x7;
				289	x1 = x5 + x6;
				290	x2 = x5 - x6;
				291	x3 = x4 - x7;
				292
				293	pi2_out_tmp[0] = x0 + x1;
				294	pi2_out_tmp[1] = (x3 <<1) + x2;
				295	pi2_out_tmp[2] = x0 - x1;
				296	pi2_out_tmp[3] = x3 - (x2<<1);
				297
				298	/* pointing to next row; */
				299	pu1_src += src_strd;
				300	pu1_pred += pred_strd;
				301	pi2_out_tmp += 4;
				302
				303	}
				304	pi2_out_tmp = pi2_out;
				305	for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
				306	{
				307
				308	/* Vertical transform and quantization */
				309	x4 = pi2_out_tmp[0];
				310	x5 = pi2_out_tmp[4];
				311	x6 = pi2_out_tmp[8];
				312	x7 = pi2_out_tmp[12];
				313
				314
				315	x0 = x4 + x7;
				316	x1 = x5 + x6;
				317	x2 = x5 - x6;
				318	x3 = x4 - x7;
				319
				320	/* quantization is done in place */
				321
				322	i4_value = x0 + x1;
				323
				324	if(i==0)
				325	{
				326	*pu1_dc_alt_addr = i4_value;
				327	}
				328
				329	FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
				330	pu2_scale_matrix[0], u4_round_factor, u4_qbits,
				331	u4_nonzero_coeff);
				332	pi2_out_tmp[0] = i4_value;
				333
				334	i4_value = (x3 << 1) + x2;
				335	FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[4],
				336	pu2_scale_matrix[4], u4_round_factor, u4_qbits,
				337	u4_nonzero_coeff);
				338	pi2_out_tmp[4] = i4_value;
				339
				340	i4_value = x0 - x1;
				341	FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[8],
				342	pu2_scale_matrix[8], u4_round_factor, u4_qbits,
				343	u4_nonzero_coeff);
				344	pi2_out_tmp[8] = i4_value;
				345
				346	i4_value = x3 - (x2 << 1);
				347	FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[12],
				348	pu2_scale_matrix[12], u4_round_factor, u4_qbits,
				349	u4_nonzero_coeff);
				350	pi2_out_tmp[12] = i4_value;
				351
				352	pi2_out_tmp ++;
				353	pu2_scale_matrix++;
				354	pu2_threshold_matrix++;
				355	}
				356
				357	/* Return total nonzero coefficients in the current sub block */
				358	*pu1_nnz = u4_nonzero_coeff;
				359	}
				360
				361	/**
				362	*******************************************************************************
				363	*
				364	* @brief
				365	* This function performs forward hadamard transform and quantization on a 4*4 block
				366	*
				367	* @par Description:
				368	* The function accepts source buffer and estimation buffer. From these, it
				369	* computes the residue. This is residue is then transformed and quantized.
				370	* The transform and quantization are in placed computed. They use the residue
				371	* buffer for this.
				372	*
				373	* @param[in] pu1_src
				374	* Pointer to source sub-block
				375	*
				376	* @param[in] pu1_pred
				377	* Pointer to prediction sub-block
				378	*
				379	* @param[in] pi2_out
				380	* Pointer to residual sub-block
				381	*
				382	* @param[in] src_strd
				383	* Source stride
				384	*
				385	* @param[in] pred_strd
				386	* Prediction stride
				387	*
				388	* @param[in] dst_strd
				389	* Destination stride
				390	*
				391	* @param[in] u4_qbits
				392	* QP_BITS_h264_4x4 + floor(QP/6)
				393	*
				394	* @param[in] pu2_threshold_matrix
				395	* Pointer to Forward Quant Threshold Matrix
				396	*
				397	* @param[in] pu2_scale_matrix
				398	* Pointer to Forward Quant Scale Matrix
				399	*
				400	* @param[in] u4_round_factor
				401	* Quantization Round factor
				402	*
				403	* @param[out] pu1_nnz
				404	* Total non-zero coefficients in the current sub-block
				405	*
				406	* @returns
				407	*
				408	* @remarks
				409	* None
				410	*
				411	*/
				412
				413	void ih264_hadamard_quant_4x4(WORD16 *pi2_src,
				414	WORD16 *pi2_dst,
				415	const UWORD16 *pu2_scale_matrix,
				416	const UWORD16 *pu2_threshold_matrix,
				417	UWORD32 u4_qbits,
				418	UWORD32 u4_round_factor,
				419	UWORD8 *pu1_nnz)
				420	{
				421	WORD32 i;
				422	WORD32 x0,x1,x2,x3,x4,x5,x6,x7,i4_value;
				423	UWORD32 u4_abs_value;
				424	WORD32 i4_sign;
				425
				426	*pu1_nnz = 0;
				427
				428	for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
				429	{
				430	x4 = pi2_src[0];
				431	x5 = pi2_src[1];
				432	x6 = pi2_src[2];
				433	x7 = pi2_src[3];
				434
				435	x0 = x4 + x7;
				436	x1 = x5 + x6;
				437	x2 = x5 - x6;
				438	x3 = x4 - x7;
				439
				440	pi2_dst[0] = x0 + x1;
				441	pi2_dst[1] = x3 + x2;
				442	pi2_dst[2] = x0 - x1;
				443	pi2_dst[3] = x3 - x2;
				444
				445	pi2_src += 4;
				446	pi2_dst += 4;
				447	}
				448
				449	/* Vertical transform and quantization */
				450	pi2_dst -= SUB_BLK_WIDTH_4x4<<2;
				451
				452	for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
				453	{
				454	x4 = pi2_dst[0];
				455	x5 = pi2_dst[4];
				456	x6 = pi2_dst[8];
				457	x7 = pi2_dst[12] ;
				458
				459	x0 = x4 + x7;
				460	x1 = x5 + x6;
				461	x2 = x5 - x6;
				462	x3 = x4 - x7;
				463
				464
				465	i4_value = (x0 + x1) >> 1;
				466	FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
				467	pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
				468	pi2_dst[0] = i4_value;
				469
				470	i4_value = (x3 + x2) >> 1;
				471	FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
				472	pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
				473	pi2_dst[4] = i4_value;
				474
				475	i4_value = (x0 - x1) >> 1;
				476	FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
				477	pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
				478	pi2_dst[8] = i4_value;
				479
				480	i4_value = (x3 - x2) >> 1;
				481	FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
				482	pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
				483	pi2_dst[12] = i4_value;
				484
				485	pi2_dst ++;
				486	}
				487	}
				488
				489	/**
				490	*******************************************************************************
				491	*
				492	* @brief
				493	* This function performs forward hadamard transform and quantization on a 2*2 block
				494	* for both U and V planes
				495	*
				496	* @par Description:
				497	* The function accepts source buffer and estimation buffer. From these, it
				498	* computes the residue. This is residue is then transformed and quantized.
				499	* The transform and quantization are in placed computed. They use the residue
				500	* buffer for this.
				501	*
				502	* @param[in] pu1_src
				503	* Pointer to source sub-block
				504	*
				505	* @param[in] pu1_pred
				506	* Pointer to prediction sub-block
				507	*
				508	* @param[in] pi2_out
				509	* Pointer to residual sub-block
				510	*
				511	* @param[in] src_strd
				512	* Source stride
				513	*
				514	* @param[in] pred_strd
				515	* Prediction stride
				516	*
				517	* @param[in] dst_strd
				518	* Destination stride
				519	*
				520	* @param[in] u4_qbits
				521	* QP_BITS_h264_4x4 + floor(QP/6)
				522	*
				523	* @param[in] pu2_threshold_matrix
				524	* Pointer to Forward Quant Threshold Matrix
				525	*
				526	* @param[in] pu2_scale_matrix
				527	* Pointer to Forward Quant Scale Matrix
				528	*
				529	* @param[in] u4_round_factor
				530	* Quantization Round factor
				531	*
				532	* @param[out] pu1_nnz
				533	* Total non-zero coefficients in the current sub-block
				534	*
				535	* @returns
				536	*
				537	* @remarks
				538	* NNZ for dc is populated at 0 and 5th position of pu1_nnz
				539	*
				540	*/
				541
				542	void ih264_hadamard_quant_2x2_uv(WORD16 *pi2_src,
				543	WORD16 *pi2_dst,
				544	const UWORD16 *pu2_scale_matrix,
				545	const UWORD16 *pu2_threshold_matrix,
				546	UWORD32 u4_qbits,
				547	UWORD32 u4_round_factor,
				548	UWORD8 *pu1_nnz)
				549	{
				550	WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
				551	WORD32 i4_value, i4_sign, plane;
				552	UWORD32 u4_abs_value;
				553
				554	for(plane = 0; plane < 2; plane++)
				555	{
				556	pu1_nnz[plane] = 0;
				557
				558	/* Horizontal transform */
				559	x4 = pi2_src[0];
				560	x5 = pi2_src[1];
				561	x6 = pi2_src[2];
				562	x7 = pi2_src[3];
				563
				564	x0 = x4 + x5;
				565	x1 = x4 - x5;
				566	x2 = x6 + x7;
				567	x3 = x6 - x7;
				568
				569	/* Vertical transform and quantization */
				570	i4_value = (x0 + x2);
				571	FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
				572	pu2_scale_matrix[0], u4_round_factor, u4_qbits,
				573	pu1_nnz[plane]);
				574	pi2_dst[0] = i4_value;
				575
				576	i4_value = (x0 - x2);
				577	FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
				578	pu2_scale_matrix[0], u4_round_factor, u4_qbits,
				579	pu1_nnz[plane]);
				580	pi2_dst[2] = i4_value;
				581
				582	i4_value = (x1 - x3);
				583	FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
				584	pu2_scale_matrix[0], u4_round_factor, u4_qbits,
				585	pu1_nnz[plane]);
				586	pi2_dst[3] = i4_value;
				587
				588	i4_value = (x1 + x3);
				589	FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
				590	pu2_scale_matrix[0], u4_round_factor, u4_qbits,
				591	pu1_nnz[plane]);
				592	pi2_dst[1] = i4_value;
				593
				594	pi2_dst += 4;
				595	pi2_src += 4;
				596
				597	}
				598	}
				599
				600	/*
				601	*******************************************************************************
				602	*
				603	* @brief
				604	* This function performs Single stage forward transform CF8 and quantization on 8*8 blocks
				605	* for h.264
				606	*
				607	* @par Description:
				608	* Performs single stage 8x8 forward transform CF8 after calculating the residue
				609	* The result is then quantized
				610	*
				611	* @param[in] pu1_src
				612	* Input 8x8 pixels
				613	*
				614	* @param[in] pu1_pred
				615	* Input 8x8 pixels
				616	*
				617	* @param[in] pi1_out
				618	* Output 8x8 pixels
				619	*
				620	* @param[in] u4_thresh
				621	* Threshold under which the coeffs are not quantized
				622	*
				623	* @param[in] u4_qp_div
				624	* QP/6
				625	*
				626	* @param[in] u4_qp_rem
				627	* QP%6
				628	*
				629	* @param[in] u2_src_stride
				630	* Source stride
				631	*
				632	* @param[in] pred_strd
				633	* stride for prediciton buffer
				634	*
				635	* @param[in] dst_strd
				636	* stride for destination buffer
				637	*
				638	* @param[in] pu4_quant_mat
				639	* Pointer to the 4x4 quantization matrix
				640	*
				641	* @returns Void
				642	*
				643	*
				644	*******************************************************************************
				645	*/
				646	void ih264_resi_trans_quant_8x8(UWORD8 *pu1_src,
				647	UWORD8 *pu1_pred,
				648	WORD16 *pi2_out,
				649	WORD32 src_strd,
				650	WORD32 pred_strd,
				651	const UWORD16 *pu2_scale_matrix,
				652	const UWORD16 *pu2_threshold_matrix,
				653	UWORD32 u4_qbits,
				654	UWORD32 u4_round_factor,
				655	UWORD8 *pu1_nnz,
				656	WORD16 *pu1_dc_alt_addr)
				657
				658	{
				659	WORD16 *pi2_out_tmp = pi2_out;
				660	UWORD32 i;
				661	WORD32 a0, a1, a2, a3, a4, a5, a6, a7;
				662	WORD32 r0, r1, r2, r3, r4, r5, r6, r7;
				663	WORD32 i4_sign;
				664	UWORD32 u4_abs_value;
				665	UWORD32 u4_nonzero_coeff = 0;
				666
				667	UNUSED(pu1_dc_alt_addr);
				668
				669	/Horizontal transform /
				670	/* we are going to use the a's and r's in a twisted way since */
				671	/i dont want to declare more variables /
				672	for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
				673	{
				674	r0 = pu1_src[0];
				675	r0 -= pu1_pred[0];
				676	r1 = pu1_src[1];
				677	r1 -= pu1_pred[1];
				678	r2 = pu1_src[2];r2 -= pu1_pred[2];
				679	r3 = pu1_src[3];r3 -= pu1_pred[3];
				680	r4 = pu1_src[4];r4 -= pu1_pred[4];
				681	r5 = pu1_src[5];r5 -= pu1_pred[5];
				682	r6 = pu1_src[6];r6 -= pu1_pred[6];
				683	r7 = pu1_src[7];r7 -= pu1_pred[7];
				684
				685
				686	a0 = r0 + r7;
				687	a1 = r1 + r6;
				688	a2 = r2 + r5;
				689	a3 = r3 + r4;
				690
				691	a4 = a0 + a3;
				692	a5 = a1 + a2;
				693	a6 = a0 - a3;
				694	a7 = a1 - a2;
				695
				696	pi2_out_tmp[0] = a4 + a5;
				697
				698	pi2_out_tmp[2] = a6 + (a7>>1);
				699	pi2_out_tmp[4] = a4 - a5;
				700	pi2_out_tmp[6] = (a6>>1) - a7;
				701
				702	a0 = r0 - r7;
				703	a1 = r1 - r6;
				704	a2 = r2 - r5;
				705	a3 = r3 - r4;
				706
				707	a4 = a1 + a2 + ((a0>>1) + a0);
				708	a5 = a0 - a3 - ((a2>>1) + a2);
				709	a6 = a0 + a3 - ((a1>>1) + a1);
				710	a7 = a1 - a2 + ((a3>>1) + a3);
				711
				712	pi2_out_tmp[1] = a4 + (a7>>2);
				713	pi2_out_tmp[3] = a5 + (a6>>2);
				714	pi2_out_tmp[5] = a6 - (a5>>2);
				715	pi2_out_tmp[7] = (a4>>2) - a7;
				716
				717	pu1_src += src_strd;
				718	pu1_pred += pred_strd;
				719	pi2_out_tmp += 8;
				720	}
				721
				722	/vertical transform and quant /
				723
				724	pi2_out_tmp = pi2_out;
				725
				726	for (i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
				727	{
				728
				729	r0 = pi2_out_tmp[0];
				730	r1 = pi2_out_tmp[8];
				731	r2 = pi2_out_tmp[16];
				732	r3 = pi2_out_tmp[24];
				733	r4 = pi2_out_tmp[32];
				734	r5 = pi2_out_tmp[40];
				735	r6 = pi2_out_tmp[48];
				736	r7 = pi2_out_tmp[56];
				737
				738	a0 = r0 + r7;
				739	a1 = r1 + r6;
				740	a2 = r2 + r5;
				741	a3 = r3 + r4;
				742
				743	a4 = a0 + a3;
				744	a5 = a1 + a2;
				745	a6 = a0 - a3;
				746	a7 = a1 - a2;
				747
				748	a0 = r0 - r7;
				749	a1 = r1 - r6;
				750	a2 = r2 - r5;
				751	a3 = r3 - r4;
				752
				753	r0 = a4 + a5;
				754	r2 = a6 + (a7>>1);
				755	r4 = a4 - a5;
				756	r6 = (a6>>1) - a7;
				757
				758	a4 = a1 + a2 + ((a0>>1) + a0);
				759	a5 = a0 - a3 - ((a2>>1) + a2);
				760	a6 = a0 + a3 - ((a1>>1) + a1);
				761	a7 = a1 - a2 + ((a3>>1) + a3);
				762
				763	r1 = a4 + (a7>>2);
				764	r3 = a5 + (a6>>2);
				765	r5 = a6 - (a5>>2);
				766	r7 = (a4>>2) - a7;
				767
				768	FWD_QUANT(r0, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
				769	pu2_scale_matrix[0], u4_round_factor, u4_qbits,
				770	u4_nonzero_coeff);
				771	pi2_out_tmp[0] = r0;
				772
				773	FWD_QUANT(r1, u4_abs_value, i4_sign, pu2_threshold_matrix[8],
				774	pu2_scale_matrix[8], u4_round_factor, u4_qbits,
				775	u4_nonzero_coeff);
				776	pi2_out_tmp[8] = r1;
				777
				778	FWD_QUANT(r2, u4_abs_value, i4_sign, pu2_threshold_matrix[16],
				779	pu2_scale_matrix[16], u4_round_factor, u4_qbits,
				780	u4_nonzero_coeff);
				781	pi2_out_tmp[16] = r2;
				782
				783	FWD_QUANT(r3, u4_abs_value, i4_sign, pu2_threshold_matrix[24],
				784	pu2_scale_matrix[24], u4_round_factor, u4_qbits,
				785	u4_nonzero_coeff);
				786	pi2_out_tmp[24] = r3;
				787
				788	FWD_QUANT(r4, u4_abs_value, i4_sign, pu2_threshold_matrix[32],
				789	pu2_scale_matrix[32], u4_round_factor, u4_qbits,
				790	u4_nonzero_coeff);
				791	pi2_out_tmp[32] = r4;
				792
				793	FWD_QUANT(r5, u4_abs_value, i4_sign, pu2_threshold_matrix[40],
				794	pu2_scale_matrix[40], u4_round_factor, u4_qbits,
				795	u4_nonzero_coeff);
				796	pi2_out_tmp[40] = r5;
				797
				798	FWD_QUANT(r6, u4_abs_value, i4_sign, pu2_threshold_matrix[48],
				799	pu2_scale_matrix[48], u4_round_factor, u4_qbits,
				800	u4_nonzero_coeff);
				801	pi2_out_tmp[48] = r6;
				802
				803	FWD_QUANT(r7, u4_abs_value, i4_sign, pu2_threshold_matrix[56],
				804	pu2_scale_matrix[56], u4_round_factor, u4_qbits,
				805	u4_nonzero_coeff);
				806	pi2_out_tmp[56] = r7;
				807
				808	pi2_out_tmp++;
				809	pu2_scale_matrix++;
				810	pu2_threshold_matrix++;
				811	}
				812	/* Return total nonzero coefficients in the current sub block */
				813	*pu1_nnz = u4_nonzero_coeff;
				814	}