Blame - encoder/ih264e_core_coding.c - platform/external/libavc

blob: 5ba18ded057305fe5c251689575d2e5b0b54b183 [file] [log] [blame]

Hamsalekha S	8d3d303	2015-03-13 21:24:58 +0530	[diff] [blame^]	1	/******************************************************************************
				2	*
				3	* Copyright (C) 2015 The Android Open Source Project
				4	*
				5	* Licensed under the Apache License, Version 2.0 (the "License");
				6	* you may not use this file except in compliance with the License.
				7	* You may obtain a copy of the License at:
				8	*
				9	* http://www.apache.org/licenses/LICENSE-2.0
				10	*
				11	* Unless required by applicable law or agreed to in writing, software
				12	* distributed under the License is distributed on an "AS IS" BASIS,
				13	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	* See the License for the specific language governing permissions and
				15	* limitations under the License.
				16	*
				17	*****************************************************************************
				18	* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
				19	*/
				20
				21	/**
				22	*******************************************************************************
				23	* @file
				24	* ih264e_core_coding.c
				25	*
				26	* @brief
				27	* This file contains routines that perform luma and chroma core coding for
				28	* intra macroblocks
				29	*
				30	* @author
				31	* ittiam
				32	*
				33	* @par List of Functions:
				34	* - ih264e_pack_l_mb_i16()
				35	* - ih264e_pack_c_mb_i8()
				36	* - ih264e_code_luma_intra_macroblock_16x16()
				37	* - ih264e_code_luma_intra_macroblock_4x4()
				38	* - ih264e_code_chroma_intra_macroblock_8x8()
				39	*
				40	* @remarks
				41	* None
				42	*
				43	*******************************************************************************
				44	*/
				45
				46	/*****************************************************************************/
				47	/* File Includes */
				48	/*****************************************************************************/
				49
				50	/* System include files */
				51	#include <stdio.h>
				52	#include <string.h>
				53	#include <assert.h>
				54
				55	/* User include files */
				56	#include "ih264e_config.h"
				57	#include "ih264_typedefs.h"
				58	#include "ih264_platform_macros.h"
				59	#include "iv2.h"
				60	#include "ive2.h"
				61	#include "ih264_defs.h"
				62	#include "ih264e_defs.h"
				63	#include "ih264_trans_data.h"
				64	#include "ih264e_error.h"
				65	#include "ih264e_bitstream.h"
				66	#include "ime_distortion_metrics.h"
				67	#include "ime_structs.h"
				68	#include "ih264_structs.h"
				69	#include "ih264_trans_quant_itrans_iquant.h"
				70	#include "ih264_inter_pred_filters.h"
				71	#include "ih264_mem_fns.h"
				72	#include "ih264_padding.h"
				73	#include "ih264_intra_pred_filters.h"
				74	#include "ih264_deblk_edge_filters.h"
				75	#include "irc_cntrl_param.h"
				76	#include "irc_frame_info_collector.h"
				77	#include "ih264e_rate_control.h"
				78	#include "ih264e_structs.h"
				79	#include "ih264e_globals.h"
				80	#include "ih264e_core_coding.h"
				81	#include "ih264e_mc.h"
				82
				83
				84	/*****************************************************************************/
				85	/* Function Definitions */
				86	/*****************************************************************************/
				87
				88	/**
				89	*******************************************************************************
				90	*
				91	* @brief
				92	* This function performs the DCT transform followed by the Hadamard transform
				93	* and quantization for a macroblock when the mb mode is intra 16x16 mode
				94	*
				95	* @par Description:
				96	* First cf4 is done on all 16 4x4 blocks of the 16x16 input block.
				97	* Then hadamard transform is done on the DC coefficients
				98	* Quantization is then performed on the 16x16 block, 4x4 wise
				99	*
				100	* @param[in] pu1_src
				101	* Pointer to source sub-block
				102	*
				103	* @param[in] pu1_pred
				104	* Pointer to prediction sub-block
				105	*
				106	* @param[in] pi2_out
				107	* Pointer to residual sub-block
				108	* The output will be in linear format
				109	* The first 16 continuous locations will contain the values of Dc block
				110	* After DC block and a stride 1st AC block will follow
				111	* After one more stride next AC block will follow
				112	* The blocks will be in raster scan order
				113	*
				114	* @param[in] src_strd
				115	* Source stride
				116	*
				117	* @param[in] pred_strd
				118	* Prediction stride
				119	*
				120	* @param[in] dst_strd
				121	* Destination stride
				122	*
				123	* @param[in] pu2_scale_matrix
				124	* The quantization matrix for 4x4 transform
				125	*
				126	* @param[in] pu2_threshold_matrix
				127	* Threshold matrix
				128	*
				129	* @param[in] u4_qbits
				130	* 15+QP/6
				131	*
				132	* @param[in] u4_round_factor
				133	* Round factor for quant
				134	*
				135	* @param[out] pu1_nnz
				136	* Memory to store the non-zeros after transform
				137	* The first byte will be the nnz of DC block
				138	* From the next byte the AC nnzs will be stored in raster scan order
				139	*
				140	* @param u4_dc_flag
				141	* Signals if Dc transform is to be done or not
				142	* 1 -> Dc transform will be done
				143	* 0 -> Dc transform will not be done
				144	*
				145	* @remarks
				146	*
				147	*******************************************************************************
				148	*/
				149	void ih264e_luma_16x16_resi_trans_dctrans_quant(codec_t *ps_codec,
				150	UWORD8 *pu1_src,
				151	UWORD8 *pu1_pred,
				152	WORD16 *pi2_out,
				153	WORD32 src_strd,
				154	WORD32 pred_strd,
				155	WORD32 dst_strd,
				156	const UWORD16 *pu2_scale_matrix,
				157	const UWORD16 *pu2_threshold_matrix,
				158	UWORD32 u4_qbits,
				159	UWORD32 u4_round_factor,
				160	UWORD8 *pu1_nnz,
				161	UWORD32 u4_dc_flag)
				162
				163	{
				164	WORD32 blk_cntr;
				165	WORD32 i4_offsetx, i4_offsety;
				166	UWORD8 pu1_curr_src, pu1_curr_pred;
				167
				168	WORD16 *pi2_dc_str = pi2_out;
				169
				170	/* Move to the ac addresses */
				171	pu1_nnz++;
				172	pi2_out += dst_strd;
				173
				174	for (blk_cntr = 0; blk_cntr < NUM_LUMA4x4_BLOCKS_IN_MB; blk_cntr++)
				175	{
				176	IND2SUB_LUMA_MB(blk_cntr, i4_offsetx, i4_offsety);
				177
				178	pu1_curr_src = pu1_src + i4_offsetx + i4_offsety * src_strd;
				179	pu1_curr_pred = pu1_pred + i4_offsetx + i4_offsety * pred_strd;
				180
				181	ps_codec->pf_resi_trans_quant_4x4(pu1_curr_src, pu1_curr_pred,
				182	pi2_out + blk_cntr * dst_strd,
				183	src_strd, pred_strd, pu2_scale_matrix,
				184	pu2_threshold_matrix, u4_qbits,
				185	u4_round_factor, &pu1_nnz[blk_cntr],
				186	&pi2_dc_str[blk_cntr]);
				187
				188	}
				189
				190	if (!u4_dc_flag)
				191	return;
				192
				193	/*
				194	* In case of i16x16, we need to remove the contribution of dc coeffs into
				195	* nnz of each block. We are doing that in the packing function
				196	*/
				197
				198	/* Adjust pointers to point to dc values */
				199	pi2_out -= dst_strd;
				200	pu1_nnz--;
				201
				202	u4_qbits++;
				203	u4_round_factor <<= 1;
				204
				205	ps_codec->pf_hadamard_quant_4x4(pi2_dc_str, pi2_out, pu2_scale_matrix,
				206	pu2_threshold_matrix, u4_qbits,
				207	u4_round_factor, &pu1_nnz[0]);
				208	}
				209
				210	/**
				211	*******************************************************************************
				212	*
				213	* @brief
				214	* This function performs the intra 16x16 inverse transform process for H264
				215	* it includes inverse Dc transform, inverse quant and then inverse transform
				216	*
				217	* @par Description:
				218	*
				219	* @param[in] pi2_src
				220	* Input data, 16x16 size
				221	* First 16 mem locations will have the DC coeffs in raster scan order in linear fashion
				222	* after a stride the 1st AC block will be present, again in raster scan order
				223	* Then each AC block of the 16x16 block will follow in raster scan order
				224	*
				225	* @param[in] pu1_pred
				226	* The predicted data, 16x16 size
				227	* Block by block form
				228	*
				229	* @param[in] pu1_out
				230	* Output 16x16
				231	* In block by block form
				232	*
				233	* @param[in] src_strd
				234	* Source stride
				235	*
				236	* @param[in] pred_strd
				237	* input stride for prediction buffer
				238	*
				239	* @param[in] out_strd
				240	* input stride for output buffer
				241	*
				242	* @param[in] pu2_iscale_mat
				243	* Inverse quantization matrix for 4x4 transform
				244	*
				245	* @param[in] pu2_weigh_mat
				246	* weight matrix of 4x4 transform
				247	*
				248	* @param[in] qp_div
				249	* QP/6
				250	*
				251	* @param[in] pi4_tmp
				252	* Input temporary buffer
				253	* needs to be at least 20 in size
				254	*
				255	* @param[in] pu4_cntrl
				256	* Controls the transform path
				257	* In total, the last 17 bits are used
				258	* the 16th bit will correspond to the DC block
				259	* and bits 32-17 will correspond to the AC blocks in raster scan order
				260	* A bit equaling zero indicates that the entire 4x4 block is zero for DC
				261	* For AC blocks a bit equaling zero will mean that all 15 AC coeffs of the block are zero
				262	*
				263	* @param[in] pi4_tmp
				264	* Input temporary buffer
				265	* needs to be at least COFF_CNT_SUB_BLK_4x4+COFF_CNT_SUB_BLK_4x4 size
				266	*
				267	* @returns
				268	* none
				269	*
				270	* @remarks
				271	* The all zero case must be taken care outside
				272	*
				273	*******************************************************************************
				274	*/
				275	void ih264e_luma_16x16_idctrans_iquant_itrans_recon(codec_t *ps_codec,
				276	WORD16 *pi2_src,
				277	UWORD8 *pu1_pred,
				278	UWORD8 *pu1_out,
				279	WORD32 src_strd,
				280	WORD32 pred_strd,
				281	WORD32 out_strd,
				282	const UWORD16 *pu2_iscale_mat,
				283	const UWORD16 *pu2_weigh_mat,
				284	UWORD32 qp_div,
				285	UWORD32 u4_cntrl,
				286	UWORD32 u4_dc_trans_flag,
				287	WORD32 *pi4_tmp)
				288	{
				289	/* Start index for inverse quant in a 4x4 block */
				290	WORD32 iq_start_idx = (u4_dc_trans_flag == 0) ? 0 : 1;
				291
				292	/* Cntrl bits for 4x4 transforms
				293	* u4_blk_cntrl : controls if a 4x4 block should be processed in ac path
				294	* u4_dc_cntrl : controls is a 4x4 block is to be processed in dc path
				295	* : dc block must contain only single dc coefficient
				296	* u4_empty_blk_cntrl : control fot 4x4 block with no coeffs, ie no dc and ac
				297	* : ie not (ac or dc)
				298	*/
				299	UWORD32 u4_blk_cntrl, u4_dc_cntrl, u4_empty_blk_cntrl;
				300
				301	/* tmp registers for block ids */
				302	UWORD32 u4_blk_id;
				303
				304	/* Subscrripts */
				305	WORD32 i4_offset_x, i4_offset_y;
				306
				307	UWORD8 pu1_cur_prd_blk, pu1_cur_out_blk;
				308
				309	/* Src and stride for dc coeffs */
				310	UWORD32 u4_dc_inc;
				311	WORD16 *pi2_dc_src;
				312
				313	/*
				314	* For intra blocks we need to do inverse dc transform
				315	* In case if intra blocks, its here that we populate the dc bits in cntrl
				316	* as they cannot be populated any earlier
				317	*/
				318	if (u4_dc_trans_flag)
				319	{
				320	UWORD32 cntr, u4_dc_cntrl;
				321	/* Do inv hadamard and place the results at the start of each AC block */
				322	ps_codec->pf_ihadamard_scaling_4x4(pi2_src, pi2_src, pu2_iscale_mat,
				323	pu2_weigh_mat, qp_div, pi4_tmp);
				324
				325	/* Update the cntrl flag */
				326	u4_dc_cntrl = 0;
				327	for (cntr = 0; cntr < DC_COEFF_CNT_LUMA_MB; cntr++)
				328	{
				329	u4_dc_cntrl \|= ((pi2_src[cntr] != 0) << (15 - cntr));
				330	}
				331	/* Mark dc bits as 1 if corresponding ac bit is 0 */
				332	u4_dc_cntrl = (~(u4_cntrl >> 16) & u4_dc_cntrl);
				333	/* Combine both ac and dc bits */
				334	u4_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_LUMA)
				335	\| (u4_dc_cntrl & CNTRL_FLAG_DC_MASK_LUMA);
				336	}
				337
				338	/* Source for dc coeffs
				339	* If the block is intra, we have to read dc values from first row of src
				340	* then stride for each block is 1, other wise its src stride
				341	*/
				342	pi2_dc_src = (iq_start_idx == 0) ? (pi2_src + src_strd) : pi2_src;
				343	u4_dc_inc = (iq_start_idx == 0) ? src_strd : 1;
				344
				345	/* The AC blocks starts from 2nd row */
				346	pi2_src += src_strd;
				347
				348	/* Get the block bits */
				349	u4_blk_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_LUMA);
				350	u4_dc_cntrl = (u4_cntrl & CNTRL_FLAG_DC_MASK_LUMA) << 16;
				351	u4_empty_blk_cntrl = (~(u4_dc_cntrl \| u4_blk_cntrl)) & 0xFFFF0000;
				352
				353	/* Get first block to process */
				354	DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
				355	while (u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB)
				356	{
				357	/* Compute address of src blocks */
				358	WORD32 i4_src_offset = u4_dc_inc * u4_blk_id;
				359
				360	IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
				361
				362	/* Compute address of out and pred blocks */
				363	pu1_cur_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
				364	pu1_cur_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
				365
				366	/* Do inv dc transform */
				367	ps_codec->pf_iquant_itrans_recon_4x4_dc(pi2_dc_src + i4_src_offset,
				368	pu1_cur_prd_blk,
				369	pu1_cur_out_blk, pred_strd,
				370	out_strd, pu2_iscale_mat,
				371	pu2_weigh_mat, qp_div, NULL,
				372	iq_start_idx,
				373	pi2_dc_src + i4_src_offset);
				374	/* Get next DC block to process */
				375	DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
				376	}
				377
				378	/* now process ac/mixed blocks */
				379	DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
				380	while (u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB)
				381	{
				382
				383	WORD32 i4_src_offset = src_strd * u4_blk_id;
				384
				385	IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
				386
				387	pu1_cur_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
				388	pu1_cur_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
				389
				390	ps_codec->pf_iquant_itrans_recon_4x4(pi2_src + i4_src_offset,
				391	pu1_cur_prd_blk, pu1_cur_out_blk,
				392	pred_strd, out_strd,
				393	pu2_iscale_mat, pu2_weigh_mat,
				394	qp_div, (WORD16*) pi4_tmp,
				395	iq_start_idx,
				396	pi2_dc_src + u4_blk_id);
				397
				398	DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
				399	}
				400
				401	/* Now process empty blocks */
				402	DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
				403	while (u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB)
				404	{
				405	IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
				406
				407	pu1_cur_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
				408	pu1_cur_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
				409
				410	ps_codec->pf_inter_pred_luma_copy(pu1_cur_prd_blk, pu1_cur_out_blk,
				411	pred_strd, out_strd, SIZE_4X4_BLK_HRZ,
				412	SIZE_4X4_BLK_VERT, 0, 0);
				413
				414	DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
				415	}
				416	}
				417
				418	/**
				419	*******************************************************************************
				420	*
				421	* @brief
				422	* This function performs the DCT transform followed by the Hadamard transform
				423	* and quantization for a chroma macroblock
				424	*
				425	* @par Description:
				426	* First cf4 is done on all 16 4x4 blocks of the 8x8 input block
				427	* Then hadamard transform is done on the DC coefficients
				428	* Quantization is then performed on the 8x8 block, 4x4 wise
				429	*
				430	* @param[in] pu1_src
				431	* Pointer to source sub-block
				432	* The input is in interleaved format for two chroma planes
				433	*
				434	* @param[in] pu1_pred
				435	* Pointer to prediction sub-block
				436	* Prediction is in inter leaved format
				437	*
				438	* @param[in] pi2_out
				439	* Pointer to residual sub-block
				440	* The output will be in linear format
				441	* The first 4 continuous locations will contain the values of DC block for U
				442	* and then next 4 will contain for V.
				443	* After DC block and a stride 1st AC block of U plane will follow
				444	* After one more stride next AC block of V plane will follow
				445	* The blocks will be in raster scan order
				446	*
				447	* After all the AC blocks of U plane AC blocks of V plane will follow in exact
				448	* same way
				449	*
				450	* @param[in] src_strd
				451	* Source stride
				452	*
				453	* @param[in] pred_strd
				454	* Prediction stride
				455	*
				456	* @param[in] dst_strd
				457	* Destination stride
				458	*
				459	* @param[in] pu2_scale_matrix
				460	* The quantization matrix for 4x4 transform
				461	*
				462	* @param[in] pu2_threshold_matrix
				463	* Threshold matrix
				464	*
				465	* @param[in] u4_qbits
				466	* 15+QP/6
				467	*
				468	* @param[in] u4_round_factor
				469	* Round factor for quant
				470	*
				471	* @param[out] pu1_nnz
				472	* Memory to store the non-zeros after transform
				473	* The first byte will be the nnz of DC block for U plane
				474	* From the next byte the AC nnzs will be stored in raster scan order
				475	* The fifth byte will be nnz of Dc block of V plane
				476	* Then Ac blocks will follow
				477	*
				478	* @param u4_dc_flag
				479	* Signals if Dc transform is to be done or not
				480	* 1 -> Dc transform will be done
				481	* 0 -> Dc transform will not be done
				482	*
				483	* @remarks
				484	*
				485	*******************************************************************************
				486	*/
				487	void ih264e_chroma_8x8_resi_trans_dctrans_quant(codec_t *ps_codec,
				488	UWORD8 *pu1_src,
				489	UWORD8 *pu1_pred,
				490	WORD16 *pi2_out,
				491	WORD32 src_strd,
				492	WORD32 pred_strd,
				493	WORD32 out_strd,
				494	const UWORD16 *pu2_scale_matrix,
				495	const UWORD16 *pu2_threshold_matrix,
				496	UWORD32 u4_qbits,
				497	UWORD32 u4_round_factor,
				498	UWORD8 *pu1_nnz_c)
				499	{
				500	WORD32 blk_cntr;
				501	WORD32 i4_offsetx, i4_offsety;
				502	UWORD8 pu1_curr_src, pu1_curr_pred;
				503
				504	WORD16 pi2_dc_str[8];
				505	UWORD8 au1_dcnnz[2];
				506
				507	/* Move to the ac addresses */
				508	pu1_nnz_c++;
				509	pi2_out += out_strd;
				510
				511	for (blk_cntr = 0; blk_cntr < NUM_CHROMA4x4_BLOCKS_IN_MB; blk_cntr++)
				512	{
				513	IND2SUB_CHROMA_MB(blk_cntr, i4_offsetx, i4_offsety);
				514
				515	pu1_curr_src = pu1_src + i4_offsetx + i4_offsety * src_strd;
				516	pu1_curr_pred = pu1_pred + i4_offsetx + i4_offsety * pred_strd;
				517
				518	/* For chroma, v plane nnz is populated from position 5 */
				519	ps_codec->pf_resi_trans_quant_chroma_4x4(
				520	pu1_curr_src, pu1_curr_pred,
				521	pi2_out + blk_cntr * out_strd, src_strd, pred_strd,
				522	pu2_scale_matrix, pu2_threshold_matrix, u4_qbits,
				523	u4_round_factor, &pu1_nnz_c[blk_cntr + (blk_cntr > 3)],
				524	&pi2_dc_str[blk_cntr]);
				525	}
				526
				527	/* Adjust pointers to point to dc values */
				528	pi2_out -= out_strd;
				529	pu1_nnz_c--;
				530
				531	u4_qbits++;
				532	u4_round_factor <<= 1;
				533
				534	ps_codec->pf_hadamard_quant_2x2_uv(pi2_dc_str, pi2_out, pu2_scale_matrix,
				535	pu2_threshold_matrix, u4_qbits,
				536	u4_round_factor, au1_dcnnz);
				537
				538	/* Copy the dc nnzs */
				539	pu1_nnz_c[0] = au1_dcnnz[0];
				540	pu1_nnz_c[5] = au1_dcnnz[1];
				541
				542	}
				543
				544	/**
				545	*******************************************************************************
				546	* @brief
				547	* This function performs the inverse transform with process for chroma MB of H264
				548	*
				549	* @par Description:
				550	* Does inverse DC transform ,inverse quantization inverse transform
				551	*
				552	* @param[in] pi2_src
				553	* Input data, 16x16 size
				554	* The input is in the form of, first 4 locations will contain DC coeffs of
				555	* U plane, next 4 will contain DC coeffs of V plane, then AC blocks of U plane
				556	* in raster scan order will follow, each block as linear array in raster scan order.
				557	* After a stride next AC block will follow. After all AC blocks of U plane
				558	* V plane AC blocks will follow in exact same order.
				559	*
				560	* @param[in] pu1_pred
				561	* The predicted data, 8x16 size, U and V interleaved
				562	*
				563	* @param[in] pu1_out
				564	* Output 8x16, U and V interleaved
				565	*
				566	* @param[in] src_strd
				567	* Source stride
				568	*
				569	* @param[in] pred_strd
				570	* input stride for prediction buffer
				571	*
				572	* @param[in] out_strd
				573	* input stride for output buffer
				574	*
				575	* @param[in] pu2_iscale_mat
				576	* Inverse quantization matrix for 4x4 transform
				577	*
				578	* @param[in] pu2_weigh_mat
				579	* weight matrix of 4x4 transform
				580	*
				581	* @param[in] qp_div
				582	* QP/6
				583	*
				584	* @param[in] pi4_tmp
				585	* Input temporary buffer
				586	* needs to be at least COFF_CNT_SUB_BLK_4x4 + Number of DC coeffs for chroma * number of planes
				587	* in size
				588	*
				589	* @param[in] pu4_cntrl
				590	* Controls the transform path
				591	* the 15 th bit will correspond to DC block of U plane , 14th will indicate the V plane Dc block
				592	* 32-28 bits will indicate AC blocks of U plane in raster scan order
				593	* 27-23 bits will indicate AC blocks of V plane in raster scan order
				594	* The bit 1 implies that there is at least one non zero coeff in a block
				595	*
				596	* @returns
				597	* none
				598	*
				599	* @remarks
				600	*******************************************************************************
				601	*/
				602	void ih264e_chroma_8x8_idctrans_iquant_itrans_recon(codec_t *ps_codec,
				603	WORD16 *pi2_src,
				604	UWORD8 *pu1_pred,
				605	UWORD8 *pu1_out,
				606	WORD32 src_strd,
				607	WORD32 pred_strd,
				608	WORD32 out_strd,
				609	const UWORD16 *pu2_iscale_mat,
				610	const UWORD16 *pu2_weigh_mat,
				611	UWORD32 qp_div,
				612	UWORD32 u4_cntrl,
				613	WORD32 *pi4_tmp)
				614	{
				615	/* Cntrl bits for 4x4 transforms
				616	* u4_blk_cntrl : controls if a 4x4 block should be processed in ac path
				617	* u4_dc_cntrl : controls is a 4x4 block is to be processed in dc path
				618	* : dc block must contain only single dc coefficient
				619	* u4_empty_blk_cntrl : control fot 4x4 block with no coeffs, ie no dc and ac
				620	* : ie not (ac or dc)
				621	*/
				622
				623	UWORD32 u4_blk_cntrl, u4_dc_cntrl, u4_empty_blk_cntrl;
				624
				625	/* tmp registers for block ids */
				626	WORD32 u4_blk_id;
				627
				628	/* Offsets for pointers */
				629	WORD32 i4_offset_x, i4_offset_y;
				630
				631	/* Pointer to 4x4 blocks */
				632	UWORD8 pu1_cur_4x4_prd_blk, pu1_cur_4x4_out_blk;
				633
				634	/* Tmp register for pointer to dc coffs */
				635	WORD16 *pi2_dc_src;
				636
				637	WORD16 i2_zero = 0;
				638
				639	/* Increment for dc block */
				640	WORD32 i4_dc_inc;
				641
				642	/*
				643	* Lets do the inverse transform for dc coeffs in chroma
				644	*/
				645	if (u4_cntrl & CNTRL_FLAG_DCBLK_MASK_CHROMA)
				646	{
				647	UWORD32 cntr, u4_dc_cntrl;
				648	/* Do inv hadamard for u an v block */
				649
				650	ps_codec->pf_ihadamard_scaling_2x2_uv(pi2_src, pi2_src, pu2_iscale_mat,
				651	pu2_weigh_mat, qp_div, NULL);
				652	/*
				653	* Update the cntrl flag
				654	* Flag is updated as follows bits 15-11 -> u block dc bits
				655	*/
				656	u4_dc_cntrl = 0;
				657	for (cntr = 0; cntr < 8; cntr++)
				658	{
				659	u4_dc_cntrl \|= ((pi2_src[cntr] != 0) << (15 - cntr));
				660	}
				661
				662	/* Mark dc bits as 1 if corresponding ac bit is 0 */
				663	u4_dc_cntrl = (~(u4_cntrl >> 16) & u4_dc_cntrl);
				664	/* Combine both ac and dc bits */
				665	u4_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA)
				666	\| (u4_dc_cntrl & CNTRL_FLAG_DC_MASK_CHROMA);
				667
				668	/* Since we populated the dc coffs, we have to read them from there */
				669	pi2_dc_src = pi2_src;
				670	i4_dc_inc = 1;
				671	}
				672	else
				673	{
				674	u4_cntrl = u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA;
				675	pi2_dc_src = &i2_zero;
				676	i4_dc_inc = 0;
				677	}
				678
				679	/* Get the block bits */
				680	u4_blk_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA);
				681	u4_dc_cntrl = (u4_cntrl & CNTRL_FLAG_DC_MASK_CHROMA) << 16;
				682	u4_empty_blk_cntrl = (~(u4_dc_cntrl \| u4_blk_cntrl)) & 0xFF000000;
				683
				684	/* The AC blocks starts from 2nd row */
				685	pi2_src += src_strd;
				686
				687	DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
				688	while (u4_blk_id < 8)
				689	{
				690	WORD32 dc_src_offset = u4_blk_id * i4_dc_inc;
				691
				692	IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
				693
				694	pu1_cur_4x4_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
				695	pu1_cur_4x4_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
				696
				697	ps_codec->pf_iquant_itrans_recon_chroma_4x4_dc(
				698	pi2_dc_src + dc_src_offset, pu1_cur_4x4_prd_blk,
				699	pu1_cur_4x4_out_blk, pred_strd, out_strd, NULL, NULL, 0,
				700	NULL, pi2_dc_src + dc_src_offset);
				701	/* Get next DC block to process */
				702	DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
				703	}
				704
				705	/* now process ac/mixed blocks */
				706	DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
				707	while (u4_blk_id < 8)
				708	{
				709	WORD32 i4_src_offset = src_strd * u4_blk_id;
				710	WORD32 dc_src_offset = i4_dc_inc * u4_blk_id;
				711
				712	IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
				713
				714	pu1_cur_4x4_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
				715	pu1_cur_4x4_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
				716
				717	ps_codec->pf_iquant_itrans_recon_chroma_4x4(pi2_src + i4_src_offset,
				718	pu1_cur_4x4_prd_blk,
				719	pu1_cur_4x4_out_blk,
				720	pred_strd, out_strd,
				721	pu2_iscale_mat,
				722	pu2_weigh_mat, qp_div,
				723	(WORD16 *) pi4_tmp,
				724	pi2_dc_src + dc_src_offset);
				725
				726	DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
				727	}
				728
				729	/* Now process empty blocks */
				730	DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
				731	while (u4_blk_id < 8)
				732	{
				733	IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
				734
				735	pu1_cur_4x4_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
				736	pu1_cur_4x4_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
				737
				738	ps_codec->pf_interleave_copy(pu1_cur_4x4_prd_blk, pu1_cur_4x4_out_blk,
				739	pred_strd, out_strd, SIZE_4X4_BLK_VERT,
				740	SIZE_4X4_BLK_HRZ);
				741
				742	DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
				743	}
				744	}
				745
				746	/**
				747	******************************************************************************
				748	*
				749	* @brief This function packs residue of an i16x16 luma mb for entropy coding
				750	*
				751	* @par Description
				752	* An i16 macro block contains two classes of units, dc 4x4 block and
				753	* 4x4 ac blocks. while packing the mb, the dc block is sent first, and
				754	* the 16 ac blocks are sent next in scan order. Each and every block is
				755	* represented by 3 parameters (nnz, significant coefficient map and the
				756	* residue coefficients itself). If a 4x4 unit does not have any coefficients
				757	* then only nnz is sent. Inside a 4x4 block the individual coefficients are
				758	* sent in scan order.
				759	*
				760	* The first byte of each block will be nnz of the block, if it is non zero,
				761	* a 2 byte significance map is sent. This is followed by nonzero coefficients.
				762	* This is repeated for 1 dc + 16 ac blocks.
				763	*
				764	* @param[in] pi2_res_mb
				765	* pointer to residue mb
				766	*
				767	* @param[in, out] pv_mb_coeff_data
				768	* buffer pointing to packed residue coefficients
				769	*
				770	* @param[in] u4_res_strd
				771	* residual block stride
				772	*
				773	* @param[out] u1_cbp_l
				774	* coded block pattern luma
				775	*
				776	* @param[in] pu1_nnz
				777	* number of non zero coefficients in each 4x4 unit
				778	*
				779	* @param[out] pu4_cntrl
				780	* Control signal for inverse transform of 16x16 blocks
				781	*
				782	* @return none
				783	*
				784	* @ remarks
				785	*
				786	******************************************************************************
				787	*/
				788	void ih264e_pack_l_mb_i16(WORD16 *pi2_res_mb,
				789	void **pv_mb_coeff_data,
				790	WORD32 i4_res_strd,
				791	UWORD8 *u1_cbp_l,
				792	UWORD8 *pu1_nnz,
				793	UWORD32 *pu4_cntrl)
				794	{
				795	/* pointer to packed sub block buffer space */
				796	tu_sblk_coeff_data_t ps_mb_coeff_data = (pv_mb_coeff_data), *ps_mb_coeff_data_ac;
				797
				798	/* no of non zero coefficients in the current sub block */
				799	UWORD32 u4_nnz_cnt;
				800
				801	/* significant coefficient map */
				802	UWORD32 u4_s_map;
				803
				804	/* pointer to scanning matrix */
				805	const UWORD8 *pu1_scan_order;
				806
				807	/* number of non zeros in sub block */
				808	UWORD32 u4_nnz;
				809
				810	/* coeff scan order */
				811	const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
				812
				813	/* temp var */
				814	UWORD32 coeff_cnt, mask, b4,u4_cntrl=0;
				815
				816	/DC and AC coeff pointers/
				817	WORD16 pi2_res_mb_ac,pi2_res_mb_dc;
				818
				819	/********************************************************/
				820	/* pack dc coeff data for entropy coding */
				821	/********************************************************/
				822
				823	pi2_res_mb_dc = pi2_res_mb;
				824	pu1_scan_order = gu1_luma_scan_order_dc;
				825
				826	u4_nnz = *pu1_nnz;
				827	u4_cntrl = 0;
				828
				829	/* write number of non zero coefficients */
				830	ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
				831
				832	if (u4_nnz)
				833	{
				834	for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
				835	{
				836	if (pi2_res_mb_dc[pu1_scan_order[coeff_cnt]])
				837	{
				838	/* write residue */
				839	ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb_dc[pu1_scan_order[coeff_cnt]];
				840	u4_s_map \|= mask;
				841	}
				842	mask <<= 1;
				843	}
				844	/* write significant coeff map */
				845	ps_mb_coeff_data->i4_sig_map_nnz \|= (u4_s_map << 16);
				846	(*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
				847
				848	u4_cntrl = 0x00008000;// Set DC bit in ctrl code
				849	}
				850	else
				851	{
				852	(*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
				853	}
				854
				855	/********************************************************/
				856	/* pack ac coeff data for entropy coding */
				857	/********************************************************/
				858
				859	pu1_nnz ++;
				860	pu1_scan_order = gu1_luma_scan_order;
				861	pi2_res_mb += i4_res_strd; /Move to AC block/
				862
				863	ps_mb_coeff_data_ac = (*pv_mb_coeff_data);
				864
				865	for (b4 = 0; b4 < 16; b4++)
				866	{
				867	ps_mb_coeff_data = (*pv_mb_coeff_data);
				868
				869	u4_nnz = pu1_nnz[u1_scan_order[b4]];
				870
				871	/* Jump according to the scan order */
				872	pi2_res_mb_ac = pi2_res_mb + (i4_res_strd * u1_scan_order[b4]);
				873
				874	/*
				875	* Since this is a i16x16 block, we should not count dc coeff on indi
				876	* vidual 4x4 blocks to nnz. But due to the implementation of 16x16
				877	* trans function, we add dc's nnz to u4_nnz too. Hence we adjust that
				878	* here
				879	*/
				880	u4_nnz -= (pi2_res_mb_ac[0] != 0);
				881
				882	/* write number of non zero coefficients */
				883	ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
				884
				885	if (u4_nnz)
				886	{
				887	for (u4_nnz_cnt = 0, coeff_cnt = 1, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
				888	{
				889	if (pi2_res_mb_ac[pu1_scan_order[coeff_cnt]])
				890	{
				891	/* write residue */
				892	ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb_ac[pu1_scan_order[coeff_cnt]];
				893	u4_s_map \|= mask;
				894	}
				895	mask <<= 1;
				896	}
				897	/* write significant coeff map */
				898	ps_mb_coeff_data->i4_sig_map_nnz \|= (u4_s_map << 16);
				899	(*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
				900	*u1_cbp_l = 15;
				901
				902	u4_cntrl \|= (1 << (31 - u1_scan_order[b4]));
				903	}
				904	else
				905	{
				906	(*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
				907	}
				908
				909	}
				910
				911	if (!(*u1_cbp_l))
				912	{
				913	(*pv_mb_coeff_data) = ps_mb_coeff_data_ac;
				914	}
				915
				916	/* Store the cntrl signal */
				917	(*pu4_cntrl) = u4_cntrl;
				918	return;
				919	}
				920
				921	/**
				922	******************************************************************************
				923	*
				924	* @brief This function packs residue of an p16x16 luma mb for entropy coding
				925	*
				926	* @par Description
				927	* A p16x16 macro block contains two classes of units 16 4x4 ac blocks.
				928	* while packing the mb, the dc block is sent first, and
				929	* the 16 ac blocks are sent next in scan order. Each and every block is
				930	* represented by 3 parameters (nnz, significant coefficient map and the
				931	* residue coefficients itself). If a 4x4 unit does not have any coefficients
				932	* then only nnz is sent. Inside a 4x4 block the individual coefficients are
				933	* sent in scan order.
				934	*
				935	* The first byte of each block will be nnz of the block, if it is non zero,
				936	* a 2 byte significance map is sent. This is followed by nonzero coefficients.
				937	* This is repeated for 1 dc + 16 ac blocks.
				938	*
				939	* @param[in] pi2_res_mb
				940	* pointer to residue mb
				941	*
				942	* @param[in, out] pv_mb_coeff_data
				943	* buffer pointing to packed residue coefficients
				944	*
				945	* @param[in] i4_res_strd
				946	* residual block stride
				947	*
				948	* @param[out] u1_cbp_l
				949	* coded block pattern luma
				950	*
				951	* @param[in] pu1_nnz
				952	* number of non zero coefficients in each 4x4 unit
				953	*
				954	* @param[out] pu4_cntrl
				955	* Control signal for inverse transform
				956	*
				957	* @return none
				958	*
				959	* @remarks Killing coffs not yet coded
				960	*
				961	******************************************************************************
				962	*/
				963	void ih264e_pack_l_mb(WORD16 *pi2_res_mb,
				964	void **pv_mb_coeff_data,
				965	WORD32 i4_res_strd,
				966	UWORD8 *u1_cbp_l,
				967	UWORD8 *pu1_nnz,
				968	UWORD32 u4_thres_resi,
				969	UWORD32 *pu4_cntrl)
				970	{
				971	/* pointer to packed sub block buffer space */
				972	tu_sblk_coeff_data_t ps_mb_coeff_data, ps_mb_coeff_data_b8, *ps_mb_coeff_data_mb;
				973
				974	/* no of non zero coefficients in the current sub block */
				975	UWORD32 u4_nnz_cnt;
				976
				977	/* significant coefficient map */
				978	UWORD32 u4_s_map;
				979
				980	/* pointer to scanning matrix */
				981	const UWORD8 *pu1_scan_order = gu1_luma_scan_order;
				982
				983	/* number of non zeros in sub block */
				984	UWORD32 u4_nnz;
				985
				986	/* pointer to residual sub block */
				987	WORD16 *pi2_res_sb;
				988
				989	/* coeff scan order */
				990	const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
				991
				992	/* coeff cost */
				993	const UWORD8 *pu1_coeff_cost = gu1_coeff_cost;
				994
				995	/* temp var */
				996	UWORD32 u4_mb_coeff_cost = 0, u4_b8_coeff_cost = 0, coeff_cnt, mask, u4_cntrl = 0, b4, b8;
				997
				998	/* temp var */
				999	WORD32 i4_res_val, i4_run = -1, dcac_block;
				1000
				1001	/* When Hadamard transform is disabled, first row values are dont care, ignore them */
				1002	pi2_res_mb += i4_res_strd;
				1003
				1004	/* When Hadamard transform is disabled, first unit value is dont care, ignore this */
				1005	pu1_nnz ++;
				1006
				1007	ps_mb_coeff_data_mb = ps_mb_coeff_data_b8 = (*pv_mb_coeff_data);
				1008
				1009	/********************************************************/
				1010	/* pack coeff data for entropy coding */
				1011	/********************************************************/
				1012
				1013	for (b4 = 0; b4 < 16; b4++)
				1014	{
				1015	ps_mb_coeff_data = (*pv_mb_coeff_data);
				1016
				1017	b8 = b4 >> 2;
				1018
				1019	u4_nnz = pu1_nnz[u1_scan_order[b4]];
				1020
				1021	/* Jump according to the scan order */
				1022	pi2_res_sb = pi2_res_mb + (i4_res_strd * u1_scan_order[b4]);
				1023
				1024	/* write number of non zero coefficients */
				1025	ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
				1026
				1027	if (u4_nnz)
				1028	{
				1029	for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
				1030	{
				1031	/* number of runs of zero before, this is used to compute coeff cost */
				1032	i4_run++;
				1033
				1034	i4_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]];
				1035
				1036	if (i4_res_val)
				1037	{
				1038	/* write residue */
				1039	ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i4_res_val;
				1040	u4_s_map \|= mask;
				1041
				1042	if (u4_thres_resi)
				1043	{
				1044	/* compute coeff cost */
				1045	if (i4_res_val == 1 \|\| i4_res_val == -1)
				1046	{
				1047	if (i4_run < 6)
				1048	u4_b8_coeff_cost += pu1_coeff_cost[i4_run];
				1049	}
				1050	else
				1051	u4_b8_coeff_cost += 9;
				1052
				1053	i4_run = -1;
				1054	}
				1055	}
				1056
				1057	mask <<= 1;
				1058	}
				1059
				1060	/* write significant coeff map */
				1061	ps_mb_coeff_data->i4_sig_map_nnz \|= (u4_s_map << 16);
				1062	(*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
				1063
				1064	/* cbp */
				1065	*u1_cbp_l \|= (1 << b8);
				1066
				1067	/* Cntrl map for inverse transform computation
				1068	*
				1069	* If coeff_cnt is zero, it means that only nonzero was a dc coeff
				1070	* Hence we have to set the 16 - u1_scan_order[b4]) position instead
				1071	* of 31 - u1_scan_order[b4]
				1072	*/
				1073	dcac_block = (coeff_cnt == 0)?16:31;
				1074	u4_cntrl \|= (1 << (dcac_block - u1_scan_order[b4]));
				1075	}
				1076	else
				1077	{
				1078	(*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
				1079	}
				1080
				1081	/* Decide if the 8x8 unit has to be sent for entropy coding? */
				1082	if ((b4+1) % 4 == 0)
				1083	{
				1084	if ( u4_thres_resi && (u4_b8_coeff_cost <= LUMA_SUB_BLOCK_SKIP_THRESHOLD) &&
				1085	(*u1_cbp_l & (1 << b8)) )
				1086	{
				1087
				1088
				1089	/*
				1090	* When we want to reset the full 8x8 block, we have to reset
				1091	* both the dc and ac coeff bits hence we have the symmetric
				1092	* arrangement of bits
				1093	*/
				1094	const UWORD32 cntrl_mask_map[4] = {0xcc00cc00, 0x33003300, 0x00cc00cc, 0x00330033};
				1095
				1096	/* restore cbp */
				1097	u1_cbp_l = (u1_cbp_l & (~(1 << b8)));
				1098
				1099	/* correct cntrl flag */
				1100	u4_cntrl = u4_cntrl & (~cntrl_mask_map[(b4 >> 2)]);
				1101
				1102	/* correct nnz */
				1103	pu1_nnz[u1_scan_order[b4 - 3]] = 0;
				1104	pu1_nnz[u1_scan_order[b4 - 2]] = 0;
				1105	pu1_nnz[u1_scan_order[b4 - 1]] = 0;
				1106	pu1_nnz[u1_scan_order[b4]] = 0;
				1107
				1108	/* reset blk cost */
				1109	u4_b8_coeff_cost = 0;
				1110	}
				1111
				1112	if (!(*u1_cbp_l & (1 << b8)))
				1113	{
				1114	(*pv_mb_coeff_data) = ps_mb_coeff_data_b8;
				1115	}
				1116
				1117	u4_mb_coeff_cost += u4_b8_coeff_cost;
				1118
				1119	u4_b8_coeff_cost = 0;
				1120	i4_run = -1;
				1121	ps_mb_coeff_data_b8 = (*pv_mb_coeff_data);
				1122	}
				1123	}
				1124
				1125	if (u4_thres_resi && (u4_mb_coeff_cost <= LUMA_BLOCK_SKIP_THRESHOLD)
				1126	&& (*u1_cbp_l))
				1127	{
				1128	(*pv_mb_coeff_data) = ps_mb_coeff_data_mb;
				1129	*u1_cbp_l = 0;
				1130	u4_cntrl = 0;
				1131	memset(pu1_nnz, 0, 16);
				1132	}
				1133
				1134	(*pu4_cntrl) = u4_cntrl;
				1135
				1136	return;
				1137	}
				1138
				1139	/**
				1140	******************************************************************************
				1141	*
				1142	* @brief This function packs residue of an i8x8 chroma mb for entropy coding
				1143	*
				1144	* @par Description
				1145	* An i8 chroma macro block contains two classes of units, dc 2x2 block and
				1146	* 4x4 ac blocks. while packing the mb, the dc block is sent first, and
				1147	* the 4 ac blocks are sent next in scan order. Each and every block is
				1148	* represented by 3 parameters (nnz, significant coefficient map and the
				1149	* residue coefficients itself). If a 4x4 unit does not have any coefficients
				1150	* then only nnz is sent. Inside a 4x4 block the individual coefficients are
				1151	* sent in scan order.
				1152	*
				1153	* The first byte of each block will be nnz of the block, if it is non zero,
				1154	* a 2 byte significance map is sent. This is followed by nonzero coefficients.
				1155	* This is repeated for 1 dc + 4 ac blocks.
				1156	*
				1157	* @param[in] pi2_res_mb
				1158	* pointer to residue mb
				1159	*
				1160	* @param[in, out] pv_mb_coeff_data
				1161	* buffer pointing to packed residue coefficients
				1162	*
				1163	* @param[in] u4_res_strd
				1164	* residual block stride
				1165	*
				1166	* @param[out] u1_cbp_c
				1167	* coded block pattern chroma
				1168	*
				1169	* @param[in] pu1_nnz
				1170	* number of non zero coefficients in each 4x4 unit
				1171	*
				1172	* @param[out] pu1_nnz
				1173	* Control signal for inverse transform
				1174	*
				1175	* @param[in] u4_swap_uv
				1176	* Swaps the order of U and V planes in entropy bitstream
				1177	*
				1178	* @return none
				1179	*
				1180	* @ remarks
				1181	*
				1182	******************************************************************************
				1183	*/
				1184	void ih264e_pack_c_mb(WORD16 *pi2_res_mb,
				1185	void **pv_mb_coeff_data,
				1186	WORD32 i4_res_strd,
				1187	UWORD8 *u1_cbp_c,
				1188	UWORD8 *pu1_nnz,
				1189	UWORD32 u4_thres_resi,
				1190	UWORD32 *pu4_cntrl,
				1191	UWORD32 u4_swap_uv)
				1192	{
				1193	/* pointer to packed sub block buffer space */
				1194	tu_sblk_coeff_data_t ps_mb_coeff_data = (pv_mb_coeff_data);
				1195	tu_sblk_coeff_data_t ps_mb_coeff_data_dc, ps_mb_coeff_data_ac;
				1196
				1197	/* nnz pointer */
				1198	UWORD8 pu1_nnz_ac, pu1_nnz_dc;
				1199
				1200	/* nnz counter */
				1201	UWORD32 u4_nnz_cnt;
				1202
				1203	/* significant coefficient map */
				1204	UWORD32 u4_s_map;
				1205
				1206	/* pointer to scanning matrix */
				1207	const UWORD8 *pu1_scan_order;
				1208
				1209	/* no of non zero coefficients in the current sub block */
				1210	UWORD32 u4_nnz;
				1211
				1212	/* pointer to residual sub block, res val */
				1213	WORD16 *pi2_res_sb, i2_res_val;
				1214
				1215	/* temp var */
				1216	UWORD32 coeff_cnt, mask, b4,plane;
				1217
				1218	/* temp var */
				1219	UWORD32 u4_coeff_cost;
				1220	WORD32 i4_run;
				1221
				1222	/* coeff cost */
				1223	const UWORD8 *pu1_coeff_cost = gu1_coeff_cost;
				1224
				1225	/* pointer to packed buffer space */
				1226	UWORD32 *pu4_mb_coeff_data = NULL;
				1227
				1228	/* ac coded block pattern */
				1229	UWORD8 u1_cbp_ac;
				1230
				1231	/* Variable to store the current bit pos in cntrl variable*/
				1232	UWORD32 cntrl_pos = 0;
				1233
				1234	/********************************************************/
				1235	/* pack dc coeff data for entropy coding */
				1236	/********************************************************/
				1237	pu1_scan_order = gu1_chroma_scan_order_dc;
				1238	pi2_res_sb = pi2_res_mb;
				1239	pu1_nnz_dc = pu1_nnz;
				1240	(*pu4_cntrl) = 0;
				1241	cntrl_pos = 15;
				1242	ps_mb_coeff_data_dc = (*pv_mb_coeff_data);
				1243
				1244	/* Color space conversion between SP_UV and SP_VU
				1245	* We always assume SP_UV for all the processing
				1246	* Hence to get proper stream output we need to swap U and V channels here
				1247	*
				1248	* For that there are two paths we need to look for
				1249	* One is the path to bitstream , these variables should have the proper input
				1250	* configured UV or VU
				1251	* For the other path the inverse transform variables should have ehat ever 0ordering the
				1252	* input had
				1253	*/
				1254
				1255	if (u4_swap_uv)
				1256	{
				1257	pu1_nnz_dc += 5;/* Move to NNZ of V planve */
				1258	pi2_res_sb += 4;/* Move to DC coff of V plane */
				1259
				1260	cntrl_pos = 14; /* Control bit for V plane */
				1261	}
				1262
				1263	for (plane = 0; plane < 2; plane++)
				1264	{
				1265	ps_mb_coeff_data = (*pv_mb_coeff_data);
				1266
				1267	u4_nnz = *pu1_nnz_dc;
				1268	/* write number of non zero coefficients U/V */
				1269	ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
				1270
				1271	if (u4_nnz)
				1272	{
				1273	for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
				1274	{
				1275	i2_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]];
				1276	if (i2_res_val)
				1277	{
				1278	/* write residue U/V */
				1279	ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i2_res_val;
				1280	u4_s_map \|= mask;
				1281	}
				1282	mask <<= 1;
				1283	}
				1284	/* write significant coeff map U/V */
				1285	ps_mb_coeff_data->i4_sig_map_nnz \|= (u4_s_map << 16);
				1286	(*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
				1287	*u1_cbp_c = 1;
				1288
				1289	(*pu4_cntrl) \|= (1 << cntrl_pos);
				1290	}
				1291	else
				1292	{
				1293	(*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
				1294	}
				1295
				1296	if (u4_swap_uv)
				1297	{
				1298	cntrl_pos++; /* Control bit for U plane */
				1299	pu1_nnz_dc -= 5; /* Move to NNZ of U plane */
				1300	pi2_res_sb -= 4; /* Move to DC coff of U plane */
				1301
				1302	}
				1303	else
				1304	{
				1305	cntrl_pos--; /* Control bit for U plane */
				1306	pu1_nnz_dc += 5; /* 4 for AC NNZ and 1 for DC */
				1307	pi2_res_sb += 4; /* Move to DC coff of V plane */
				1308	}
				1309	}
				1310
				1311	/********************************************************/
				1312	/* pack ac coeff data for entropy coding */
				1313	/********************************************************/
				1314
				1315	pu1_scan_order = gu1_chroma_scan_order;
				1316	ps_mb_coeff_data_ac = (*pv_mb_coeff_data);
				1317
				1318	if (u4_swap_uv)
				1319	{
				1320	pi2_res_sb = pi2_res_mb + i4_res_strd * 5; /* Move to V plane ,ie 1dc row+ 4 ac row */
				1321	cntrl_pos = 27; /* The control bits are to be added for V bloc ie 31-4 th bit */
				1322	pu1_nnz_ac = pu1_nnz + 6;/Move the nnz to V block NNZ 1 dc + 1dc + 4 ac /
				1323	}
				1324	else
				1325	{
				1326	pi2_res_sb = pi2_res_mb + i4_res_strd; /* Move to U plane ,ie 1dc row */
				1327	cntrl_pos = 31;
				1328	pu1_nnz_ac = pu1_nnz + 1; /* Move the nnz to V block NNZ 1 dc */
				1329	}
				1330
				1331	for (plane = 0; plane < 2; plane++)
				1332	{
				1333	pu4_mb_coeff_data = (*pv_mb_coeff_data);
				1334
				1335	u4_coeff_cost = 0;
				1336	i4_run = -1;
				1337
				1338	/* get the current cbp, so that it automatically
				1339	* gets reverted in case of zero ac values */
				1340	u1_cbp_ac = *u1_cbp_c;
				1341
				1342	for (b4 = 0; b4 < 4; b4++)
				1343	{
				1344	ps_mb_coeff_data = (*pv_mb_coeff_data);
				1345
				1346	u4_nnz = *pu1_nnz_ac;
				1347
				1348	/*
				1349	* We are scanning only ac coeffs, but the nnz is for the
				1350	* complete 4x4 block. Hence we have to discount the nnz contributed
				1351	* by the dc coefficient
				1352	*/
				1353	u4_nnz -= (pi2_res_sb[0]!=0);
				1354
				1355	/* write number of non zero coefficients U/V */
				1356	ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
				1357
				1358	if (u4_nnz)
				1359	{
				1360	for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
				1361	{
				1362	i2_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]];
				1363
				1364	i4_run++;
				1365
				1366	if (i2_res_val)
				1367	{
				1368	/* write residue U/V */
				1369	ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i2_res_val;
				1370	u4_s_map \|= mask;
				1371
				1372	if ( u4_thres_resi && (u4_coeff_cost < CHROMA_BLOCK_SKIP_THRESHOLD) )
				1373	{
				1374	/* compute coeff cost */
				1375	if (i2_res_val == 1 \|\| i2_res_val == -1)
				1376	{
				1377	if (i4_run < 6)
				1378	u4_coeff_cost += pu1_coeff_cost[i4_run];
				1379	}
				1380	else
				1381	u4_coeff_cost += 9;
				1382
				1383	i4_run = -1;
				1384	}
				1385	}
				1386	mask <<= 1;
				1387	}
				1388
				1389	/* write significant coeff map U/V */
				1390	ps_mb_coeff_data->i4_sig_map_nnz \|= (u4_s_map << 16);
				1391	(*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
				1392	u1_cbp_ac = 2;
				1393
				1394	(*pu4_cntrl) \|= 1 << cntrl_pos;
				1395	}
				1396	else
				1397	{
				1398	(*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
				1399	}
				1400
				1401	pu1_nnz_ac++;
				1402	pi2_res_sb += i4_res_strd;
				1403	cntrl_pos--;
				1404	}
				1405
				1406	/* reset block */
				1407	if (u4_thres_resi && (u4_coeff_cost < CHROMA_BLOCK_SKIP_THRESHOLD))
				1408	{
				1409	pu4_mb_coeff_data[0] = 0;
				1410	pu4_mb_coeff_data[1] = 0;
				1411	pu4_mb_coeff_data[2] = 0;
				1412	pu4_mb_coeff_data[3] = 0;
				1413	(*pv_mb_coeff_data) = pu4_mb_coeff_data + 4;
				1414
				1415	/* Generate the control signal */
				1416	/* Zero out the current plane's AC coefficients */
				1417	(*pu4_cntrl) &= ((plane == u4_swap_uv) ? 0x0FFFFFFF : 0xF0FFFFFF);
				1418
				1419	/* Similarly do for the NNZ also */
				1420	*(pu1_nnz_ac - 4) = 0;
				1421	*(pu1_nnz_ac - 3) = 0;
				1422	*(pu1_nnz_ac - 2) = 0;
				1423	*(pu1_nnz_ac - 1) = 0;
				1424	}
				1425	else
				1426	{
				1427	*u1_cbp_c = u1_cbp_ac;
				1428	}
				1429
				1430	if (u4_swap_uv)
				1431	{
				1432	pi2_res_sb = pi2_res_mb + i4_res_strd; /* Move to V plane ,ie 1dc row+ 4 ac row + 1 dc row */
				1433	cntrl_pos = 31; /* The control bits are to be added for V bloc ie 31-4 th bit */
				1434	pu1_nnz_ac = pu1_nnz + 1; /* Move the nnz to V block NNZ 1 dc + 1dc + 4 ac */
				1435
				1436	pu1_nnz_ac = pu1_nnz + 1;
				1437	}
				1438	else
				1439	pu1_nnz_ac = pu1_nnz + 6; /* Go to nnz of V plane */
				1440	}
				1441
				1442	/* restore the ptr basing on cbp */
				1443	if (*u1_cbp_c == 0)
				1444	{
				1445	(*pv_mb_coeff_data) = ps_mb_coeff_data_dc;
				1446	}
				1447	else if (*u1_cbp_c == 1)
				1448	{
				1449	(*pv_mb_coeff_data) = ps_mb_coeff_data_ac;
				1450	}
				1451
				1452	return ;
				1453	}
				1454
				1455	/**
				1456	*******************************************************************************
				1457	*
				1458	* @brief performs luma core coding when intra mode is i16x16
				1459	*
				1460	* @par Description:
				1461	* If the current mb is to be coded as intra of mb type i16x16, the mb is first
				1462	* predicted using one of i16x16 prediction filters, basing on the intra mode
				1463	* chosen. Then, error is computed between the input blk and the estimated blk.
				1464	* This error is transformed (hierarchical transform i.e., dct followed by hada-
				1465	* -mard), quantized. The quantized coefficients are packed in scan order for
				1466	* entropy coding.
				1467	*
				1468	* @param[in] ps_proc_ctxt
				1469	* pointer to the current macro block context
				1470	*
				1471	* @returns u1_cbp_l
				1472	* coded block pattern luma
				1473	*
				1474	* @remarks none
				1475	*
				1476	*******************************************************************************
				1477	*/
				1478
				1479	UWORD8 ih264e_code_luma_intra_macroblock_16x16(process_ctxt_t *ps_proc)
				1480	{
				1481	/* Codec Context */
				1482	codec_t *ps_codec = ps_proc->ps_codec;
				1483
				1484	/* pointer to ref macro block */
				1485	UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma;
				1486
				1487	/* pointer to src macro block */
				1488	UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
				1489
				1490	/* pointer to prediction macro block */
				1491	UWORD8 *pu1_pred_mb = NULL;
				1492
				1493	/* pointer to residual macro block */
				1494	WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
				1495
				1496	/* strides */
				1497	WORD32 i4_src_strd = ps_proc->i4_src_strd;
				1498	WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
				1499	WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
				1500	WORD32 i4_res_strd = ps_proc->i4_res_strd;
				1501
				1502	/* intra mode */
				1503	UWORD8 u1_intra_mode = ps_proc->u1_l_i16_mode;
				1504
				1505	/* coded block pattern */
				1506	UWORD8 u1_cbp_l = 0;
				1507
				1508	/* number of non zero coeffs*/
				1509	UWORD32 au4_nnz[5];
				1510	UWORD8 pu1_nnz = (UWORD8 )au4_nnz;
				1511
				1512	/Cntrol signal for itrans/
				1513	UWORD32 u4_cntrl;
				1514
				1515	/* quantization parameters */
				1516	quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
				1517
				1518	/* pointer to packed mb coeff data */
				1519	void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
				1520
				1521	/* init nnz */
				1522	au4_nnz[0] = 0;
				1523	au4_nnz[1] = 0;
				1524	au4_nnz[2] = 0;
				1525	au4_nnz[3] = 0;
				1526	au4_nnz[4] = 0;
				1527
				1528	if (u1_intra_mode == PLANE_I16x16)
				1529	{
				1530	pu1_pred_mb = ps_proc->pu1_pred_mb_intra_16x16_plane;
				1531	}
				1532	else
				1533	{
				1534	pu1_pred_mb = ps_proc->pu1_pred_mb_intra_16x16;
				1535	}
				1536
				1537	/********************************************************/
				1538	/* error estimation, */
				1539	/* transform */
				1540	/* quantization */
				1541	/********************************************************/
				1542	ih264e_luma_16x16_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
				1543	pu1_pred_mb, pi2_res_mb,
				1544	i4_src_strd, i4_pred_strd,
				1545	i4_res_strd,
				1546	ps_qp_params->pu2_scale_mat,
				1547	ps_qp_params->pu2_thres_mat,
				1548	ps_qp_params->u1_qbits,
				1549	ps_qp_params->u4_dead_zone,
				1550	pu1_nnz, ENABLE_DC_TRANSFORM);
				1551
				1552	/********************************************************/
				1553	/* pack coeff data for entropy coding */
				1554	/********************************************************/
				1555	ih264e_pack_l_mb_i16(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_l,
				1556	pu1_nnz, &u4_cntrl);
				1557
				1558	/********************************************************/
				1559	/* ierror estimation, */
				1560	/* itransform */
				1561	/* iquantization */
				1562	/********************************************************/
				1563	/*
				1564	*if refernce frame is not to be computed
				1565	*we only need the right and bottom border 4x4 blocks to predict next intra
				1566	*blocks, hence only compute them
				1567	*/
				1568	if (!ps_proc->u4_compute_recon)
				1569	{
				1570	u4_cntrl &= 0x111F8000;
				1571	}
				1572
				1573	if (u4_cntrl)
				1574	{
				1575	ih264e_luma_16x16_idctrans_iquant_itrans_recon(
				1576	ps_codec, pi2_res_mb, pu1_pred_mb, pu1_ref_mb,
				1577	i4_res_strd, i4_pred_strd, i4_rec_strd,
				1578	ps_qp_params->pu2_iscale_mat,
				1579	ps_qp_params->pu2_weigh_mat, ps_qp_params->u1_qp_div,
				1580	u4_cntrl, ENABLE_DC_TRANSFORM,
				1581	ps_proc->pv_scratch_buff);
				1582	}
				1583	else
				1584	{
				1585	ps_codec->pf_inter_pred_luma_copy(pu1_pred_mb, pu1_ref_mb, i4_pred_strd,
				1586	i4_rec_strd, MB_SIZE, MB_SIZE, NULL,
				1587	0);
				1588	}
				1589
				1590	return (u1_cbp_l);
				1591	}
				1592
				1593
				1594	/**
				1595	*******************************************************************************
				1596	*
				1597	* @brief performs luma core coding when intra mode is i4x4
				1598	*
				1599	* @par Description:
				1600	* If the current mb is to be coded as intra of mb type i4x4, the mb is first
				1601	* predicted using one of i4x4 prediction filters, basing on the intra mode
				1602	* chosen. Then, error is computed between the input blk and the estimated blk.
				1603	* This error is dct transformed and quantized. The quantized coefficients are
				1604	* packed in scan order for entropy coding.
				1605	*
				1606	* @param[in] ps_proc_ctxt
				1607	* pointer to the current macro block context
				1608	*
				1609	* @returns u1_cbp_l
				1610	* coded block pattern luma
				1611	*
				1612	* @remarks
				1613	* The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan order
				1614	* mentioned in h.264 specification
				1615	*
				1616	*******************************************************************************
				1617	*/
				1618	UWORD8 ih264e_code_luma_intra_macroblock_4x4(process_ctxt_t *ps_proc)
				1619	{
				1620	/* Codec Context */
				1621	codec_t *ps_codec = ps_proc->ps_codec;
				1622
				1623	/* pointer to ref macro block */
				1624	UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma;
				1625
				1626	/* pointer to src macro block */
				1627	UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
				1628
				1629	/* pointer to prediction macro block */
				1630	UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
				1631
				1632	/* pointer to residual macro block */
				1633	WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
				1634
				1635	/* strides */
				1636	WORD32 i4_src_strd = ps_proc->i4_src_strd;
				1637	WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
				1638	WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
				1639
				1640	/* pointer to neighbors: left, top, top-left */
				1641	UWORD8 *pu1_mb_a;
				1642	UWORD8 *pu1_mb_b;
				1643	UWORD8 *pu1_mb_c;
				1644	UWORD8 *pu1_mb_d;
				1645
				1646	/* intra mode */
				1647	UWORD8 u1_intra_mode = ps_proc->u1_l_i16_mode;
				1648
				1649	/* neighbor availability */
				1650	WORD32 i4_ngbr_avbl;
				1651
				1652	/* neighbor pels for intra prediction */
				1653	UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
				1654
				1655	/* coded block pattern */
				1656	UWORD8 u1_cbp_l = 0;
				1657
				1658	/* number of non zero coeffs*/
				1659	UWORD8 u1_nnz;
				1660
				1661	/* quantization parameters */
				1662	quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
				1663
				1664	/* pointer to packed mb coeff data */
				1665	void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
				1666
				1667	/* pointer to packed mb coeff data */
				1668	tu_sblk_coeff_data_t ps_mb_coeff_data, ps_mb_coeff_data_b8;
				1669
				1670	/* no of non zero coefficients in the current sub block */
				1671	UWORD32 u4_nnz_cnt;
				1672
				1673	/* significant coefficient map */
				1674	UWORD32 u4_s_map;
				1675
				1676	/* pointer to scanning matrix */
				1677	const UWORD8 *pu1_scan_order = gu1_luma_scan_order;
				1678
				1679	/Dummy variable for 4x4 trans fucntion/
				1680	WORD16 i2_dc_dummy;
				1681
				1682	/* temp var */
				1683	UWORD32 i, b8, b4, u1_blk_x, u1_blk_y, u1_pix_x, u1_pix_y, coeff_cnt, mask;
				1684
				1685	/* Process 16 4x4 lum sub-blocks of the MB in scan order */
				1686	for (b8 = 0; b8 < 4; b8++)
				1687	{
				1688	u1_blk_x = GET_BLK_RASTER_POS_X(b8) << 3;
				1689	u1_blk_y = GET_BLK_RASTER_POS_Y(b8) << 3;
				1690
				1691	/* if in case cbp for the 8x8 block is zero, send no residue */
				1692	ps_mb_coeff_data_b8 = *pv_mb_coeff_data;
				1693
				1694	for (b4 = 0; b4 < 4; b4++)
				1695	{
				1696	/* index of pel in MB */
				1697	u1_pix_x = u1_blk_x + (GET_SUB_BLK_RASTER_POS_X(b4) << 2);
				1698	u1_pix_y = u1_blk_y + (GET_SUB_BLK_RASTER_POS_Y(b4) << 2);
				1699
				1700	/* Initialize source and reference pointers */
				1701	pu1_curr_mb = ps_proc->pu1_src_buf_luma + u1_pix_x + (u1_pix_y * i4_src_strd);
				1702	pu1_ref_mb = ps_proc->pu1_rec_buf_luma + u1_pix_x + (u1_pix_y * i4_rec_strd);
				1703
				1704	/* pointer to left of ref macro block */
				1705	pu1_mb_a = pu1_ref_mb - 1;
				1706	/* pointer to top of ref macro block */
				1707	pu1_mb_b = pu1_ref_mb - i4_rec_strd;
				1708	/* pointer to topright of ref macro block */
				1709	pu1_mb_c = pu1_mb_b + 4;
				1710	/* pointer to topleft macro block */
				1711	pu1_mb_d = pu1_mb_b - 1;
				1712
				1713	/* compute neighbor availability */
				1714	i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
				1715
				1716	/* sub block intra mode */
				1717	u1_intra_mode = ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4];
				1718
				1719	/********************************************************/
				1720	/* gather prediction pels from neighbors for prediction */
				1721	/********************************************************/
				1722	/* left pels */
				1723	if (i4_ngbr_avbl & LEFT_MB_AVAILABLE_MASK)
				1724	{
				1725	for (i = 0; i < 4; i++)
				1726	pu1_ngbr_pels_i4[4 - 1 - i] = pu1_mb_a[i * i4_rec_strd];
				1727	}
				1728	else
				1729	{
				1730	memset(pu1_ngbr_pels_i4, 0, 4);
				1731	}
				1732
				1733	/* top pels */
				1734	if (i4_ngbr_avbl & TOP_MB_AVAILABLE_MASK)
				1735	{
				1736	memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
				1737	}
				1738	else
				1739	{
				1740	memset(pu1_ngbr_pels_i4 + 5, 0, 4);
				1741	}
				1742	/* top left pels */
				1743	if (i4_ngbr_avbl & TOP_LEFT_MB_AVAILABLE_MASK)
				1744	{
				1745	pu1_ngbr_pels_i4[4] = *pu1_mb_d;
				1746	}
				1747	else
				1748	{
				1749	pu1_ngbr_pels_i4[4] = 0;
				1750	}
				1751	/* top right pels */
				1752	if (i4_ngbr_avbl & TOP_RIGHT_MB_AVAILABLE_MASK)
				1753	{
				1754	memcpy(pu1_ngbr_pels_i4+8+1,pu1_mb_c,4);
				1755	}
				1756	else if (i4_ngbr_avbl & TOP_MB_AVAILABLE_MASK)
				1757	{
				1758	memset(pu1_ngbr_pels_i4+8+1,pu1_ngbr_pels_i4[8],4);
				1759	}
				1760
				1761	/********************************************************/
				1762	/* prediction */
				1763	/********************************************************/
				1764	(ps_codec->apf_intra_pred_4_l)[u1_intra_mode](pu1_ngbr_pels_i4,
				1765	pu1_pred_mb, 0,
				1766	i4_pred_strd,
				1767	i4_ngbr_avbl);
				1768
				1769	/********************************************************/
				1770	/* error estimation, */
				1771	/* transform */
				1772	/* quantization */
				1773	/********************************************************/
				1774	ps_codec->pf_resi_trans_quant_4x4(pu1_curr_mb, pu1_pred_mb,
				1775	pi2_res_mb, i4_src_strd,
				1776	i4_pred_strd,
				1777	ps_qp_params->pu2_scale_mat,
				1778	ps_qp_params->pu2_thres_mat,
				1779	ps_qp_params->u1_qbits,
				1780	ps_qp_params->u4_dead_zone,
				1781	&u1_nnz, &i2_dc_dummy);
				1782
				1783	/********************************************************/
				1784	/* pack coeff data for entropy coding */
				1785	/********************************************************/
				1786	ps_mb_coeff_data = *pv_mb_coeff_data;
				1787
				1788	/* write number of non zero coefficients */
				1789	ps_mb_coeff_data->i4_sig_map_nnz = u1_nnz;
				1790
				1791	if (u1_nnz)
				1792	{
				1793	for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u1_nnz; coeff_cnt++)
				1794	{
				1795	if (pi2_res_mb[pu1_scan_order[coeff_cnt]])
				1796	{
				1797	/* write residue */
				1798	ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb[pu1_scan_order[coeff_cnt]];
				1799	u4_s_map \|= mask;
				1800	}
				1801	mask <<= 1;
				1802	}
				1803	/* write significant coeff map */
				1804	ps_mb_coeff_data->i4_sig_map_nnz \|= (u4_s_map << 16);
				1805
				1806	/* update ptr to coeff data */
				1807	(*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
				1808
				1809	/* cbp */
				1810	u1_cbp_l \|= (1 << b8);
				1811	}
				1812	else
				1813	{
				1814	(*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
				1815	}
				1816
				1817	/********************************************************/
				1818	/* ierror estimation, */
				1819	/* itransform */
				1820	/* iquantization */
				1821	/********************************************************/
				1822	/* If the frame is not to be used for P frame reference or dumping recon
				1823	* we only will use the recon for only predicting intra Mbs
				1824	* This will need only right and bottom edge 4x4 blocks recon
				1825	* Hence we selectively enable them
				1826	*/
				1827	if (ps_proc->u4_compute_recon \|\| (0xF888 & (1 << ((b8 << 2) + b4))))
				1828	{
				1829	if (u1_nnz)
				1830	ps_codec->pf_iquant_itrans_recon_4x4(
				1831	pi2_res_mb, pu1_pred_mb, pu1_ref_mb,
				1832	/No input stride,/i4_pred_strd,
				1833	i4_rec_strd, ps_qp_params->pu2_iscale_mat,
				1834	ps_qp_params->pu2_weigh_mat,
				1835	ps_qp_params->u1_qp_div,
				1836	ps_proc->pv_scratch_buff, 0, 0);
				1837	else
				1838	ps_codec->pf_inter_pred_luma_copy(pu1_pred_mb, pu1_ref_mb,
				1839	i4_pred_strd, i4_rec_strd,
				1840	BLK_SIZE, BLK_SIZE, NULL,
				1841	0);
				1842	}
				1843
				1844	}
				1845
				1846	/* if the 8x8 block has no residue, nothing needs to be sent to entropy */
				1847	if (!(u1_cbp_l & (1 << b8)))
				1848	{
				1849	*pv_mb_coeff_data = ps_mb_coeff_data_b8;
				1850	}
				1851	}
				1852
				1853	return (u1_cbp_l);
				1854	}
				1855
				1856	/**
				1857	*******************************************************************************
				1858	*
				1859	* @brief performs luma core coding when intra mode is i4x4
				1860	*
				1861	* @par Description:
				1862	* If the current mb is to be coded as intra of mb type i4x4, the mb is first
				1863	* predicted using one of i4x4 prediction filters, basing on the intra mode
				1864	* chosen. Then, error is computed between the input blk and the estimated blk.
				1865	* This error is dct transformed and quantized. The quantized coefficients are
				1866	* packed in scan order for entropy coding.
				1867	*
				1868	* @param[in] ps_proc_ctxt
				1869	* pointer to the current macro block context
				1870	*
				1871	* @returns u1_cbp_l
				1872	* coded block pattern luma
				1873	*
				1874	* @remarks
				1875	* The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan order
				1876	* mentioned in h.264 specification
				1877	*
				1878	*******************************************************************************
				1879	*/
				1880	UWORD8 ih264e_code_luma_intra_macroblock_4x4_rdopt_on(process_ctxt_t *ps_proc)
				1881	{
				1882	/* Codec Context */
				1883	codec_t *ps_codec = ps_proc->ps_codec;
				1884
				1885	/* pointer to ref macro block */
				1886	UWORD8 *pu1_ref_mb_intra_4x4 = ps_proc->pu1_ref_mb_intra_4x4;
				1887
				1888	/* pointer to recon buffer */
				1889	UWORD8 *pu1_rec_mb = ps_proc->pu1_rec_buf_luma;
				1890
				1891	/* pointer to residual macro block */
				1892	WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4;
				1893
				1894	/* strides */
				1895	WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
				1896
				1897	/* number of non zero coeffs*/
				1898	UWORD8 pu1_nnz = (UWORD8 )ps_proc->au4_nnz_intra_4x4;
				1899
				1900	/* coded block pattern */
				1901	UWORD8 u1_cbp_l = 0;
				1902
				1903	/* pointer to packed mb coeff data */
				1904	void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
				1905
				1906	/* pointer to packed mb coeff data */
				1907	tu_sblk_coeff_data_t ps_mb_coeff_data, ps_mb_coeff_data_b8;
				1908
				1909	/* no of non zero coefficients in the current sub block */
				1910	UWORD32 u4_nnz_cnt;
				1911
				1912	/* significant coefficient map */
				1913	UWORD32 u4_s_map;
				1914
				1915	/* pointer to scanning matrix */
				1916	const UWORD8 *pu1_scan_order = gu1_luma_scan_order;
				1917
				1918	/* temp var */
				1919	UWORD32 b8, b4, coeff_cnt, mask;
				1920
				1921	/* Process 16 4x4 lum sub-blocks of the MB in scan order */
				1922	for (b8 = 0; b8 < 4; b8++)
				1923	{
				1924	/* if in case cbp for the 8x8 block is zero, send no residue */
				1925	ps_mb_coeff_data_b8 = *pv_mb_coeff_data;
				1926
				1927	for (b4 = 0; b4 < 4; b4++, pu1_nnz++, pi2_res_mb += MB_SIZE)
				1928	{
				1929	/********************************************************/
				1930	/* pack coeff data for entropy coding */
				1931	/********************************************************/
				1932	ps_mb_coeff_data = *pv_mb_coeff_data;
				1933
				1934	/* write number of non zero coefficients */
				1935	ps_mb_coeff_data->i4_sig_map_nnz = *pu1_nnz;
				1936
				1937	if (*pu1_nnz)
				1938	{
				1939	for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < *pu1_nnz; coeff_cnt++)
				1940	{
				1941	if (pi2_res_mb[pu1_scan_order[coeff_cnt]])
				1942	{
				1943	/* write residue */
				1944	ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb[pu1_scan_order[coeff_cnt]];
				1945	u4_s_map \|= mask;
				1946	}
				1947	mask <<= 1;
				1948	}
				1949	/* write significant coeff map */
				1950	ps_mb_coeff_data->i4_sig_map_nnz \|= (u4_s_map << 16);
				1951
				1952	/* update ptr to coeff data */
				1953	(*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + u4_nnz_cnt;
				1954
				1955	/* cbp */
				1956	u1_cbp_l \|= (1 << b8);
				1957	}
				1958	else
				1959	{
				1960	(*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
				1961	}
				1962	}
				1963
				1964	/* if the 8x8 block has no residue, nothing needs to be sent to entropy */
				1965	if (!(u1_cbp_l & (1 << b8)))
				1966	{
				1967	*pv_mb_coeff_data = ps_mb_coeff_data_b8;
				1968	}
				1969	}
				1970
				1971	/* memcpy recon */
				1972	ps_codec->pf_inter_pred_luma_copy(pu1_ref_mb_intra_4x4, pu1_rec_mb, MB_SIZE, i4_rec_strd, MB_SIZE, MB_SIZE, NULL, 0);
				1973
				1974	return (u1_cbp_l);
				1975	}
				1976
				1977
				1978	/**
				1979	*******************************************************************************
				1980	*
				1981	* @brief performs chroma core coding for intra macro blocks
				1982	*
				1983	* @par Description:
				1984	* If the current MB is to be intra coded with mb type chroma I8x8, the MB is
				1985	* first predicted using intra 8x8 prediction filters. The predicted data is
				1986	* compared with the input for error and the error is transformed. The DC
				1987	* coefficients of each transformed sub blocks are further transformed using
				1988	* Hadamard transform. The resulting coefficients are quantized, packed and sent
				1989	* for entropy coding.
				1990	*
				1991	* @param[in] ps_proc_ctxt
				1992	* pointer to the current macro block context
				1993	*
				1994	* @returns u1_cbp_c
				1995	* coded block pattern chroma
				1996	*
				1997	* @remarks
				1998	* The traversal of 4x4 subblocks in the 8x8 macroblock is as per the scan order
				1999	* mentioned in h.264 specification
				2000	*
				2001	*******************************************************************************
				2002	*/
				2003	UWORD8 ih264e_code_chroma_intra_macroblock_8x8(process_ctxt_t *ps_proc)
				2004	{
				2005	/* Codec Context */
				2006	codec_t *ps_codec = ps_proc->ps_codec;
				2007
				2008	/* pointer to ref macro block */
				2009	UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_chroma;
				2010
				2011	/* pointer to src macro block */
				2012	UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma;
				2013
				2014	/* pointer to prediction macro block */
				2015	UWORD8 *pu1_pred_mb = NULL;
				2016
				2017	/* pointer to residual macro block */
				2018	WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
				2019
				2020	/* strides */
				2021	WORD32 i4_src_strd = ps_proc->i4_src_strd;
				2022	WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
				2023	WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
				2024	WORD32 i4_res_strd = ps_proc->i4_res_strd;
				2025
				2026	/* intra mode */
				2027	UWORD8 u1_intra_mode = ps_proc->u1_c_i8_mode;
				2028
				2029	/* coded block pattern */
				2030	UWORD8 u1_cbp_c = 0;
				2031
				2032	/* number of non zero coeffs*/
				2033	UWORD8 au1_nnz[18] = {0};
				2034
				2035	/* quantization parameters */
				2036	quant_params_t *ps_qp_params = ps_proc->ps_qp_params[1];
				2037
				2038	/* Control signal for inverse transform */
				2039	UWORD32 u4_cntrl;
				2040
				2041	/* pointer to packed mb coeff data */
				2042	void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
				2043
				2044	/* See if we need to swap U and V plances for entropy */
				2045	UWORD32 u4_swap_uv = ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_420SP_VU;
				2046
				2047	if (PLANE_CH_I8x8 == u1_intra_mode)
				2048	{
				2049	pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma_plane;
				2050	}
				2051	else
				2052	{
				2053	pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma;
				2054	}
				2055
				2056	/********************************************************/
				2057	/* error estimation, */
				2058	/* transform */
				2059	/* quantization */
				2060	/********************************************************/
				2061	ih264e_chroma_8x8_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
				2062	pu1_pred_mb, pi2_res_mb,
				2063	i4_src_strd, i4_pred_strd,
				2064	i4_res_strd,
				2065	ps_qp_params->pu2_scale_mat,
				2066	ps_qp_params->pu2_thres_mat,
				2067	ps_qp_params->u1_qbits,
				2068	ps_qp_params->u4_dead_zone,
				2069	au1_nnz);
				2070
				2071	/********************************************************/
				2072	/* pack coeff data for entropy coding */
				2073	/********************************************************/
				2074	ih264e_pack_c_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_c,
				2075	au1_nnz, ps_codec->u4_thres_resi, &u4_cntrl, u4_swap_uv);
				2076
				2077	/********************************************************/
				2078	/* ierror estimation, */
				2079	/* itransform */
				2080	/* iquantization */
				2081	/********************************************************/
				2082	ih264e_chroma_8x8_idctrans_iquant_itrans_recon(ps_codec, pi2_res_mb,
				2083	pu1_pred_mb, pu1_ref_mb,
				2084	i4_res_strd, i4_pred_strd,
				2085	i4_rec_strd,
				2086	ps_qp_params->pu2_iscale_mat,
				2087	ps_qp_params->pu2_weigh_mat,
				2088	ps_qp_params->u1_qp_div,
				2089	u4_cntrl,
				2090	ps_proc->pv_scratch_buff);
				2091	return (u1_cbp_c);
				2092	}
				2093
				2094
				2095	/**
				2096	*******************************************************************************
				2097	*
				2098	* @brief performs luma core coding when mode is inter
				2099	*
				2100	* @par Description:
				2101	* If the current mb is to be coded as inter the mb is predicted based on the
				2102	* sub mb partitions and corresponding motion vectors generated by ME. Then,
				2103	* error is computed between the input blk and the estimated blk. This error is
				2104	* transformed, quantized. The quantized coefficients are packed in scan order
				2105	* for entropy coding
				2106	*
				2107	* @param[in] ps_proc_ctxt
				2108	* pointer to the current macro block context
				2109	*
				2110	* @returns u1_cbp_l
				2111	* coded block pattern luma
				2112	*
				2113	* @remarks none
				2114	*
				2115	*******************************************************************************
				2116	*/
				2117
				2118	UWORD8 ih264e_code_luma_inter_macroblock_16x16(process_ctxt_t *ps_proc)
				2119	{
				2120	/* Codec Context */
				2121	codec_t *ps_codec = ps_proc->ps_codec;
				2122
				2123	/* pointer to ref macro block */
				2124	UWORD8 *pu1_rec_mb = ps_proc->pu1_rec_buf_luma;
				2125
				2126	/* pointer to src macro block */
				2127	UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
				2128
				2129	/* pointer to prediction macro block */
				2130	UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
				2131
				2132	/* pointer to residual macro block */
				2133	WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
				2134
				2135	/* strides */
				2136	WORD32 i4_src_strd = ps_proc->i4_src_strd;
				2137	WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
				2138	WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
				2139	WORD32 i4_res_strd = ps_proc->i4_res_strd;
				2140
				2141	/* coded block pattern */
				2142	UWORD8 u1_cbp_l = 0;
				2143
				2144	/Control signal of itrans/
				2145	UWORD32 u4_cntrl;
				2146
				2147	/* number of non zero coeffs*/
				2148	UWORD8 pu1_nnz = (UWORD8 )ps_proc->au4_nnz;
				2149
				2150	/* quantization parameters */
				2151	quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
				2152
				2153	/* pointer to packed mb coeff data */
				2154	void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
				2155
				2156	/* pseudo pred buffer */
				2157	UWORD8 *pu1_pseudo_pred = pu1_pred_mb;
				2158
				2159	/* pseudo pred buffer stride */
				2160	WORD32 i4_pseudo_pred_strd = i4_pred_strd;
				2161
				2162	/* init nnz */
				2163	ps_proc->au4_nnz[0] = 0;
				2164	ps_proc->au4_nnz[1] = 0;
				2165	ps_proc->au4_nnz[2] = 0;
				2166	ps_proc->au4_nnz[3] = 0;
				2167	ps_proc->au4_nnz[4] = 0;
				2168
				2169	/********************************************************/
				2170	/* prediction */
				2171	/********************************************************/
				2172	ih264e_motion_comp_luma(ps_proc, &pu1_pseudo_pred, &i4_pseudo_pred_strd);
				2173
				2174	/********************************************************/
				2175	/* error estimation, */
				2176	/* transform */
				2177	/* quantization */
				2178	/********************************************************/
				2179	if (ps_proc->u4_min_sad_reached == 0 \|\| ps_proc->u4_min_sad != 0)
				2180	{
				2181	ih264e_luma_16x16_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
				2182	pu1_pseudo_pred, pi2_res_mb,
				2183	i4_src_strd,
				2184	i4_pseudo_pred_strd,
				2185	i4_res_strd,
				2186	ps_qp_params->pu2_scale_mat,
				2187	ps_qp_params->pu2_thres_mat,
				2188	ps_qp_params->u1_qbits,
				2189	ps_qp_params->u4_dead_zone,
				2190	pu1_nnz,
				2191	DISABLE_DC_TRANSFORM);
				2192
				2193	/********************************************************/
				2194	/* pack coeff data for entropy coding */
				2195	/********************************************************/
				2196	ih264e_pack_l_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_l,
				2197	pu1_nnz, ps_codec->u4_thres_resi, &u4_cntrl);
				2198	}
				2199	else
				2200	{
				2201	u1_cbp_l = 0;
				2202	u4_cntrl = 0;
				2203	}
				2204
				2205	/********************************************************/
				2206	/* ierror estimation, */
				2207	/* itransform */
				2208	/* iquantization */
				2209	/********************************************************/
				2210
				2211	/*If the frame is not to be used for P frame reference or dumping recon
				2212	* we only will use the reocn for only predicting intra Mbs
				2213	* THis will need only right and bottom edge 4x4 blocks recon
				2214	* Hence we selectively enable them using control signal(including DC)
				2215	*/
				2216	if (ps_proc->u4_compute_recon != 1)
				2217	{
				2218	u4_cntrl &= 0x111F0000;
				2219	}
				2220
				2221	if (u4_cntrl)
				2222	{
				2223	ih264e_luma_16x16_idctrans_iquant_itrans_recon(
				2224	ps_codec, pi2_res_mb, pu1_pseudo_pred, pu1_rec_mb,
				2225	i4_res_strd, i4_pseudo_pred_strd, i4_rec_strd,
				2226	ps_qp_params->pu2_iscale_mat,
				2227	ps_qp_params->pu2_weigh_mat, ps_qp_params->u1_qp_div,
				2228	u4_cntrl /Cntrl/, DISABLE_DC_TRANSFORM,
				2229	ps_proc->pv_scratch_buff);
				2230	}
				2231	else
				2232	{
				2233	ps_codec->pf_inter_pred_luma_copy(pu1_pseudo_pred, pu1_rec_mb,
				2234	i4_pseudo_pred_strd, i4_rec_strd,
				2235	MB_SIZE, MB_SIZE, NULL, 0);
				2236	}
				2237
				2238
				2239	return (u1_cbp_l);
				2240	}
				2241
				2242	/**
				2243	*******************************************************************************
				2244	*
				2245	* @brief performs chroma core coding for inter macro blocks
				2246	*
				2247	* @par Description:
				2248	* If the current mb is to be coded as inter predicted mb,based on the sub mb partitions
				2249	* and corresponding motion vectors generated by ME ,prediction is done.
				2250	* Then, error is computed between the input blk and the estimated blk.
				2251	* This error is transformed , quantized. The quantized coefficients
				2252	* are packed in scan order for
				2253	* entropy coding.
				2254	*
				2255	* @param[in] ps_proc_ctxt
				2256	* pointer to the current macro block context
				2257	*
				2258	* @returns u1_cbp_l
				2259	* coded block pattern chroma
				2260	*
				2261	* @remarks none
				2262	*
				2263	*******************************************************************************
				2264	*/
				2265	UWORD8 ih264e_code_chroma_inter_macroblock_8x8(process_ctxt_t *ps_proc)
				2266	{
				2267	/* Codec Context */
				2268	codec_t *ps_codec = ps_proc->ps_codec;
				2269
				2270	/* pointer to ref macro block */
				2271	UWORD8 *pu1_rec_mb = ps_proc->pu1_rec_buf_chroma;
				2272
				2273	/* pointer to src macro block */
				2274	UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma;
				2275
				2276	/* pointer to prediction macro block */
				2277	UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
				2278
				2279	/* pointer to residual macro block */
				2280	WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
				2281
				2282	/* strides */
				2283	WORD32 i4_src_strd = ps_proc->i4_src_strd;
				2284	WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
				2285	WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
				2286	WORD32 i4_res_strd = ps_proc->i4_res_strd;
				2287
				2288	/* coded block pattern */
				2289	UWORD8 u1_cbp_c = 0;
				2290
				2291	/Control signal for inverse transform/
				2292	UWORD32 u4_cntrl;
				2293
				2294	/* number of non zero coeffs*/
				2295	UWORD8 au1_nnz[10] = {0};
				2296
				2297	/* quantization parameters */
				2298	quant_params_t *ps_qp_params = ps_proc->ps_qp_params[1];
				2299
				2300	/* pointer to packed mb coeff data */
				2301	void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
				2302
				2303	/See if we need to swap U and V plances for entropy/
				2304	UWORD32 u4_swap_uv = ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_420SP_VU;
				2305
				2306	/********************************************************/
				2307	/* prediction */
				2308	/********************************************************/
				2309	ih264e_motion_comp_chroma(ps_proc);
				2310
				2311	/********************************************************/
				2312	/* error estimation, */
				2313	/* transform */
				2314	/* quantization */
				2315	/********************************************************/
				2316	ih264e_chroma_8x8_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
				2317	pu1_pred_mb, pi2_res_mb,
				2318	i4_src_strd, i4_pred_strd,
				2319	i4_res_strd,
				2320	ps_qp_params->pu2_scale_mat,
				2321	ps_qp_params->pu2_thres_mat,
				2322	ps_qp_params->u1_qbits,
				2323	ps_qp_params->u4_dead_zone,
				2324	au1_nnz);
				2325
				2326	/********************************************************/
				2327	/* pack coeff data for entropy coding */
				2328	/********************************************************/
				2329	ih264e_pack_c_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_c,
				2330	au1_nnz, ps_codec->u4_thres_resi, &u4_cntrl, u4_swap_uv);
				2331
				2332	/********************************************************/
				2333	/* ierror estimation, */
				2334	/* itransform */
				2335	/* iquantization */
				2336	/********************************************************/
				2337
				2338	/* If the frame is not to be used for P frame reference or dumping recon
				2339	* we only will use the reocn for only predicting intra Mbs
				2340	* THis will need only right and bottom edge 4x4 blocks recon
				2341	* Hence we selectively enable them using control signal(including DC)
				2342	*/
				2343	if (!ps_proc->u4_compute_recon)
				2344	{
				2345	u4_cntrl &= 0x7700C000;
				2346	}
				2347
				2348	if (u4_cntrl)
				2349	{
				2350	ih264e_chroma_8x8_idctrans_iquant_itrans_recon(
				2351	ps_codec, pi2_res_mb, pu1_pred_mb, pu1_rec_mb,
				2352	i4_res_strd, i4_pred_strd, i4_rec_strd,
				2353	ps_qp_params->pu2_iscale_mat,
				2354	ps_qp_params->pu2_weigh_mat, ps_qp_params->u1_qp_div,
				2355	u4_cntrl, ps_proc->pv_scratch_buff);
				2356	}
				2357	else
				2358	{
				2359	ps_codec->pf_inter_pred_luma_copy(pu1_pred_mb, pu1_rec_mb, i4_pred_strd,
				2360	i4_rec_strd, MB_SIZE >> 1, MB_SIZE,
				2361	NULL, 0);
				2362	}
				2363
				2364	return (u1_cbp_c);
				2365	}