common/x86/ih264_chroma_intra_pred_filters_ssse3.c - platform/external/libavc - Gitiles

 /******************************************************************************
  *
  * Copyright (C) 2015 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at:
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  *
  *****************************************************************************
  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
 /**
 *******************************************************************************
 * @file
 *  ih264_chroma_intra_pred_filters_ssse3.c
 *
 * @brief
 *  Contains function definitions for chroma intra prediction filters in x86
 *  intrinsics
 *
 * @author
 *  Ittiam
 *
 * @par List of Functions:
 *  -ih264_intra_pred_chroma_8x8_mode_horz_ssse3
 *  -ih264_intra_pred_chroma_8x8_mode_vert_ssse3
 *  -ih264_intra_pred_chroma_8x8_mode_plane_ssse3
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

 /*****************************************************************************/
 /* File Includes                                                             */
 /*****************************************************************************/

 /* System include files */
 #include <stdio.h>
 #include <stddef.h>
 #include <string.h>

 /* User include files */
 #include "ih264_defs.h"
 #include "ih264_typedefs.h"
 #include "ih264_macros.h"
 #include "ih264_platform_macros.h"
 #include "ih264_intra_pred_filters.h"


 /*****************************************************************************/
 /* Chroma Intra prediction 8x8 filters                                       */
 /*****************************************************************************/
 /**
 *******************************************************************************
 *
 * ih264_intra_pred_chroma_8x8_mode_horz_ssse3
 *
 * @brief
 *  Perform Intra prediction for chroma_8x8 mode:Horizontal
 *
 * @par Description:
 *  Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source containing alternate U and V samples
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination with alternate U and V samples
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 * availability of neighbouring pixels(Not used in this function)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 ******************************************************************************
 */
 void ih264_intra_pred_chroma_8x8_mode_horz_ssse3(UWORD8 *pu1_src,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 src_strd,
                                                  WORD32 dst_strd,
                                                  WORD32 ngbr_avail)
 {

     UWORD8 *pu1_left; /* Pointer to start of top predictors */
     WORD32 dst_strd2;

     __m128i row1_16x8b, row2_16x8b;

     UNUSED(src_strd);
     UNUSED(ngbr_avail);

     pu1_left = pu1_src + 2 * BLK8x8SIZE - 2;


     dst_strd2 = dst_strd << 1;
     row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left)));
     row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 2)));
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);

     pu1_dst += dst_strd2;
     row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 4)));
     row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 6)));
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);

     pu1_dst += dst_strd2;
     row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 8)));
     row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 10)));
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);

     pu1_dst += dst_strd2;
     row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 12)));
     row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 14)));
     _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
 }

 /**
 *******************************************************************************
 *
 * ih264_intra_pred_chroma_8x8_mode_vert_ssse3
 *
 * @brief
 *  Perform Intra prediction for  chroma_8x8 mode:vertical
 *
 * @par Description:
 *  Perform Intra prediction for  chroma_8x8 mode:vertical ,described in sec 8.3.4.3
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source containing alternate U and V samples
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination with alternate U and V samples
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 * availability of neighbouring pixels(Not used in this function)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
 void ih264_intra_pred_chroma_8x8_mode_vert_ssse3(UWORD8 *pu1_src,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 src_strd,
                                                  WORD32 dst_strd,
                                                  WORD32 ngbr_avail)
 {
     UWORD8 *pu1_top; /* Pointer to start of top predictors */
     WORD32 dst_strd2;

     __m128i top_16x8b;

     UNUSED(src_strd);
     UNUSED(ngbr_avail);

     pu1_top = pu1_src + 2 * BLK8x8SIZE + 2;

     top_16x8b = _mm_loadu_si128((__m128i *)pu1_top);

     dst_strd2 = dst_strd << 1;
     _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);

     pu1_dst += dst_strd2;
     _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);

     pu1_dst += dst_strd2;
     _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);

     pu1_dst += dst_strd2;
     _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
     _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
 }

 /**
 *******************************************************************************
 *
 * ih264_intra_pred_chroma_8x8_mode_plane_ssse3
 *
 * @brief
 *  Perform Intra prediction for chroma_8x8 mode:PLANE
 *
 * @par Description:
 *  Perform Intra prediction for chroma_8x8 mode:PLANE ,described in sec 8.3.4.4
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source containing alternate U and V samples
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination with alternate U and V samples
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 * availability of neighbouring pixels(Not used in this function)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 ******************************************************************************
 */
 void ih264_intra_pred_chroma_8x8_mode_plane_ssse3(UWORD8 *pu1_src,
                                                   UWORD8 *pu1_dst,
                                                   WORD32 src_strd,
                                                   WORD32 dst_strd,
                                                   WORD32 ngbr_avail)
 {
     UWORD8 *pu1_left, *pu1_top;
     WORD32 a_u, a_v, b_u, b_v, c_u, c_v;

     __m128i mul_8x16b, shuffle_8x16b;

     UNUSED(src_strd);
     UNUSED(ngbr_avail);

     pu1_top = pu1_src + MB_SIZE + 2;
     pu1_left = pu1_src + MB_SIZE - 2;

     mul_8x16b = _mm_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4);
     shuffle_8x16b = _mm_setr_epi16(0xff00, 0xff02, 0xff04, 0xff06,
                                    0xff01, 0xff03, 0xff05, 0xff07);

     //calculating a, b and c
     {
         WORD32 h_u, h_v, v_u, v_v;

         __m128i h_val1_16x8b, h_val2_16x8b;
         __m128i h_val1_8x16b, h_val2_8x16b, h_val_4x32b;
         __m128i v_val1_16x8b, v_val2_16x8b;
         __m128i v_val1_8x16b, v_val2_8x16b, v_val_4x32b;
         __m128i hv_val_4x32b;

         h_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top + 8));
         h_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top - 2));
         v_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 14));
         v_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 4));

         // reversing the order
         h_val2_16x8b = _mm_shufflelo_epi16(h_val2_16x8b, 0x1b);
         v_val1_16x8b = _mm_shufflelo_epi16(v_val1_16x8b, 0x1b);

         // separating u and v and 8-bit to 16-bit conversion
         h_val1_8x16b = _mm_shuffle_epi8(h_val1_16x8b, shuffle_8x16b);
         h_val2_8x16b = _mm_shuffle_epi8(h_val2_16x8b, shuffle_8x16b);
         v_val1_8x16b = _mm_shuffle_epi8(v_val1_16x8b, shuffle_8x16b);
         v_val2_8x16b = _mm_shuffle_epi8(v_val2_16x8b, shuffle_8x16b);

         h_val1_8x16b = _mm_sub_epi16(h_val1_8x16b, h_val2_8x16b);
         v_val1_8x16b = _mm_sub_epi16(v_val1_8x16b, v_val2_8x16b);

         h_val_4x32b = _mm_madd_epi16(mul_8x16b, h_val1_8x16b);
         v_val_4x32b = _mm_madd_epi16(mul_8x16b, v_val1_8x16b);

         hv_val_4x32b = _mm_hadd_epi32(h_val_4x32b, v_val_4x32b);

         a_u = (pu1_left[7 * (-2)] + pu1_top[14]) << 4;
         a_v = (pu1_left[7 * (-2) + 1] + pu1_top[15]) << 4;

         h_u = _mm_extract_epi16(hv_val_4x32b, 0);
         h_v = _mm_extract_epi16(hv_val_4x32b, 2);
         v_u = _mm_extract_epi16(hv_val_4x32b, 4);
         v_v = _mm_extract_epi16(hv_val_4x32b, 6);

         h_u = (h_u << 16) >> 15; // sign-extension and multiplication by 2
         h_v = (h_v << 16) >> 15;
         v_u = (v_u << 16) >> 15;
         v_v = (v_v << 16) >> 15;

         b_u = ((h_u << 4) + h_u + 32) >> 6;
         b_v = ((h_v << 4) + h_v + 32) >> 6;
         c_u = ((v_u << 4) + v_u + 32) >> 6;
         c_v = ((v_v << 4) + v_v + 32) >> 6;
     }
     //using a, b and c to compute the fitted plane values
     {
         __m128i const_8x16b, c2_8x16b;
         __m128i res1_l_8x16b, res1_h_8x16b;
         __m128i res2_l_8x16b, res2_h_8x16b;
         __m128i res1_sh_l_8x16b, res1_sh_h_8x16b, res1_16x8b;
         __m128i res2_sh_l_8x16b, res2_sh_h_8x16b, res2_16x8b;

         WORD32 b_u2, b_v2, b_u3, b_v3;
         WORD32 const_u, const_v;
         WORD32 dst_strd2;

         const_u = a_u - (c_u << 1) - c_u + 16;
         const_v = a_v - (c_v << 1) - c_v + 16;

         b_u2 = b_u << 1;
         b_v2 = b_v << 1;
         b_u3 = b_u + b_u2;
         b_v3 = b_v + b_v2;

         const_8x16b = _mm_setr_epi16(const_u, const_v, const_u, const_v, const_u, const_v, const_u, const_v);
         res1_l_8x16b = _mm_setr_epi16(-b_u3, -b_v3, -b_u2, -b_v2, -b_u, -b_v, 0, 0);
         //contains {-b*3, -b*2, -b*1, b*0}
         res1_h_8x16b = _mm_setr_epi16(b_u, b_v, b_u2, b_v2, b_u3, b_v3, b_u << 2, b_v << 2);
         //contains {b*1, b*2, b*3, b*4}
         c2_8x16b = _mm_setr_epi16(c_u, c_v, c_u, c_v, c_u, c_v, c_u, c_v);

         // rows 1, 2
         res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, const_8x16b);
         res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, const_8x16b);
         res2_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
         res2_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);

         res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
         res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
         res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
         res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

         dst_strd2 = dst_strd << 1;
         c2_8x16b = _mm_slli_epi16(c2_8x16b, 1);

         res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
         res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

         _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
         _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

         // rows 3, 4
         res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
         res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
         res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
         res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

         res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
         res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
         res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
         res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

         pu1_dst += dst_strd2;

         res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
         res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

         _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
         _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

         // rows 5, 6
         res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
         res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
         res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
         res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

         res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
         res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
         res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
         res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

         pu1_dst += dst_strd2;

         res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
         res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

         _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
         _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

         // rows 7, 8
         res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
         res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
         res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
         res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

         res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
         res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
         res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
         res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

         pu1_dst += dst_strd2;

         res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
         res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

         _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
         _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

     }
 }
	/******************************************************************************
	*
	* Copyright (C) 2015 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at:
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*
	*****************************************************************************
	* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
	*/
	/**
	*******************************************************************************
	* @file
	* ih264_chroma_intra_pred_filters_ssse3.c
	*
	* @brief
	* Contains function definitions for chroma intra prediction filters in x86
	* intrinsics
	*
	* @author
	* Ittiam
	*
	* @par List of Functions:
	* -ih264_intra_pred_chroma_8x8_mode_horz_ssse3
	* -ih264_intra_pred_chroma_8x8_mode_vert_ssse3
	* -ih264_intra_pred_chroma_8x8_mode_plane_ssse3
	*
	* @remarks
	* None
	*
	*******************************************************************************
	*/

	/*****************************************************************************/
	/* File Includes */
	/*****************************************************************************/

	/* System include files */
	#include <stdio.h>
	#include <stddef.h>
	#include <string.h>

	/* User include files */
	#include "ih264_defs.h"
	#include "ih264_typedefs.h"
	#include "ih264_macros.h"
	#include "ih264_platform_macros.h"
	#include "ih264_intra_pred_filters.h"


	/*****************************************************************************/
	/* Chroma Intra prediction 8x8 filters */
	/*****************************************************************************/
	/**
	*******************************************************************************
	*
	* ih264_intra_pred_chroma_8x8_mode_horz_ssse3
	*
	* @brief
	* Perform Intra prediction for chroma_8x8 mode:Horizontal
	*
	* @par Description:
	* Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2
	*
	* @param[in] pu1_src
	* UWORD8 pointer to the source containing alternate U and V samples
	*
	* @param[out] pu1_dst
	* UWORD8 pointer to the destination with alternate U and V samples
	*
	* @param[in] src_strd
	* integer source stride
	*
	* @param[in] dst_strd
	* integer destination stride
	*
	* @param[in] ngbr_avail
	* availability of neighbouring pixels(Not used in this function)
	*
	* @returns
	*
	* @remarks
	* None
	*
	******************************************************************************
	*/
	void ih264_intra_pred_chroma_8x8_mode_horz_ssse3(UWORD8 *pu1_src,
	UWORD8 *pu1_dst,
	WORD32 src_strd,
	WORD32 dst_strd,
	WORD32 ngbr_avail)
	{

	UWORD8 pu1_left; / Pointer to start of top predictors */
	WORD32 dst_strd2;

	__m128i row1_16x8b, row2_16x8b;

	UNUSED(src_strd);
	UNUSED(ngbr_avail);

	pu1_left = pu1_src + 2 * BLK8x8SIZE - 2;


	dst_strd2 = dst_strd << 1;
	row1_16x8b = _mm_set1_epi16(((WORD16 )(pu1_left)));
	row2_16x8b = _mm_set1_epi16(((WORD16 )(pu1_left - 2)));
	_mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
	_mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);

	pu1_dst += dst_strd2;
	row1_16x8b = _mm_set1_epi16(((WORD16 )(pu1_left - 4)));
	row2_16x8b = _mm_set1_epi16(((WORD16 )(pu1_left - 6)));
	_mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
	_mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);

	pu1_dst += dst_strd2;
	row1_16x8b = _mm_set1_epi16(((WORD16 )(pu1_left - 8)));
	row2_16x8b = _mm_set1_epi16(((WORD16 )(pu1_left - 10)));
	_mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
	_mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);

	pu1_dst += dst_strd2;
	row1_16x8b = _mm_set1_epi16(((WORD16 )(pu1_left - 12)));
	row2_16x8b = _mm_set1_epi16(((WORD16 )(pu1_left - 14)));
	_mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
	_mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
	}

	/**
	*******************************************************************************
	*
	* ih264_intra_pred_chroma_8x8_mode_vert_ssse3
	*
	* @brief
	* Perform Intra prediction for chroma_8x8 mode:vertical
	*
	* @par Description:
	* Perform Intra prediction for chroma_8x8 mode:vertical ,described in sec 8.3.4.3
	*
	* @param[in] pu1_src
	* UWORD8 pointer to the source containing alternate U and V samples
	*
	* @param[out] pu1_dst
	* UWORD8 pointer to the destination with alternate U and V samples
	*
	* @param[in] src_strd
	* integer source stride
	*
	* @param[in] dst_strd
	* integer destination stride
	*
	* @param[in] ngbr_avail
	* availability of neighbouring pixels(Not used in this function)
	*
	* @returns
	*
	* @remarks
	* None
	*
	*******************************************************************************
	*/
	void ih264_intra_pred_chroma_8x8_mode_vert_ssse3(UWORD8 *pu1_src,
	UWORD8 *pu1_dst,
	WORD32 src_strd,
	WORD32 dst_strd,
	WORD32 ngbr_avail)
	{
	UWORD8 pu1_top; / Pointer to start of top predictors */
	WORD32 dst_strd2;

	__m128i top_16x8b;

	UNUSED(src_strd);
	UNUSED(ngbr_avail);

	pu1_top = pu1_src + 2 * BLK8x8SIZE + 2;

	top_16x8b = _mm_loadu_si128((__m128i *)pu1_top);

	dst_strd2 = dst_strd << 1;
	_mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
	_mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);

	pu1_dst += dst_strd2;
	_mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
	_mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);

	pu1_dst += dst_strd2;
	_mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
	_mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);

	pu1_dst += dst_strd2;
	_mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
	_mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
	}

	/**
	*******************************************************************************
	*
	* ih264_intra_pred_chroma_8x8_mode_plane_ssse3
	*
	* @brief
	* Perform Intra prediction for chroma_8x8 mode:PLANE
	*
	* @par Description:
	* Perform Intra prediction for chroma_8x8 mode:PLANE ,described in sec 8.3.4.4
	*
	* @param[in] pu1_src
	* UWORD8 pointer to the source containing alternate U and V samples
	*
	* @param[out] pu1_dst
	* UWORD8 pointer to the destination with alternate U and V samples
	*
	* @param[in] src_strd
	* integer source stride
	*
	* @param[in] dst_strd
	* integer destination stride
	*
	* @param[in] ngbr_avail
	* availability of neighbouring pixels(Not used in this function)
	*
	* @returns
	*
	* @remarks
	* None
	*
	******************************************************************************
	*/
	void ih264_intra_pred_chroma_8x8_mode_plane_ssse3(UWORD8 *pu1_src,
	UWORD8 *pu1_dst,
	WORD32 src_strd,
	WORD32 dst_strd,
	WORD32 ngbr_avail)
	{
	UWORD8 pu1_left, pu1_top;
	WORD32 a_u, a_v, b_u, b_v, c_u, c_v;

	__m128i mul_8x16b, shuffle_8x16b;

	UNUSED(src_strd);
	UNUSED(ngbr_avail);

	pu1_top = pu1_src + MB_SIZE + 2;
	pu1_left = pu1_src + MB_SIZE - 2;

	mul_8x16b = _mm_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4);
	shuffle_8x16b = _mm_setr_epi16(0xff00, 0xff02, 0xff04, 0xff06,
	0xff01, 0xff03, 0xff05, 0xff07);

	//calculating a, b and c
	{
	WORD32 h_u, h_v, v_u, v_v;

	__m128i h_val1_16x8b, h_val2_16x8b;
	__m128i h_val1_8x16b, h_val2_8x16b, h_val_4x32b;
	__m128i v_val1_16x8b, v_val2_16x8b;
	__m128i v_val1_8x16b, v_val2_8x16b, v_val_4x32b;
	__m128i hv_val_4x32b;

	h_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top + 8));
	h_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top - 2));
	v_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 14));
	v_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 4));

	// reversing the order
	h_val2_16x8b = _mm_shufflelo_epi16(h_val2_16x8b, 0x1b);
	v_val1_16x8b = _mm_shufflelo_epi16(v_val1_16x8b, 0x1b);

	// separating u and v and 8-bit to 16-bit conversion
	h_val1_8x16b = _mm_shuffle_epi8(h_val1_16x8b, shuffle_8x16b);
	h_val2_8x16b = _mm_shuffle_epi8(h_val2_16x8b, shuffle_8x16b);
	v_val1_8x16b = _mm_shuffle_epi8(v_val1_16x8b, shuffle_8x16b);
	v_val2_8x16b = _mm_shuffle_epi8(v_val2_16x8b, shuffle_8x16b);

	h_val1_8x16b = _mm_sub_epi16(h_val1_8x16b, h_val2_8x16b);
	v_val1_8x16b = _mm_sub_epi16(v_val1_8x16b, v_val2_8x16b);

	h_val_4x32b = _mm_madd_epi16(mul_8x16b, h_val1_8x16b);
	v_val_4x32b = _mm_madd_epi16(mul_8x16b, v_val1_8x16b);

	hv_val_4x32b = _mm_hadd_epi32(h_val_4x32b, v_val_4x32b);

	a_u = (pu1_left[7 * (-2)] + pu1_top[14]) << 4;
	a_v = (pu1_left[7 * (-2) + 1] + pu1_top[15]) << 4;

	h_u = _mm_extract_epi16(hv_val_4x32b, 0);
	h_v = _mm_extract_epi16(hv_val_4x32b, 2);
	v_u = _mm_extract_epi16(hv_val_4x32b, 4);
	v_v = _mm_extract_epi16(hv_val_4x32b, 6);

	h_u = (h_u << 16) >> 15; // sign-extension and multiplication by 2
	h_v = (h_v << 16) >> 15;
	v_u = (v_u << 16) >> 15;
	v_v = (v_v << 16) >> 15;

	b_u = ((h_u << 4) + h_u + 32) >> 6;
	b_v = ((h_v << 4) + h_v + 32) >> 6;
	c_u = ((v_u << 4) + v_u + 32) >> 6;
	c_v = ((v_v << 4) + v_v + 32) >> 6;
	}
	//using a, b and c to compute the fitted plane values
	{
	__m128i const_8x16b, c2_8x16b;
	__m128i res1_l_8x16b, res1_h_8x16b;
	__m128i res2_l_8x16b, res2_h_8x16b;
	__m128i res1_sh_l_8x16b, res1_sh_h_8x16b, res1_16x8b;
	__m128i res2_sh_l_8x16b, res2_sh_h_8x16b, res2_16x8b;

	WORD32 b_u2, b_v2, b_u3, b_v3;
	WORD32 const_u, const_v;
	WORD32 dst_strd2;

	const_u = a_u - (c_u << 1) - c_u + 16;
	const_v = a_v - (c_v << 1) - c_v + 16;

	b_u2 = b_u << 1;
	b_v2 = b_v << 1;
	b_u3 = b_u + b_u2;
	b_v3 = b_v + b_v2;

	const_8x16b = _mm_setr_epi16(const_u, const_v, const_u, const_v, const_u, const_v, const_u, const_v);
	res1_l_8x16b = _mm_setr_epi16(-b_u3, -b_v3, -b_u2, -b_v2, -b_u, -b_v, 0, 0);
	//contains {-b3, -b2, -b1, b0}
	res1_h_8x16b = _mm_setr_epi16(b_u, b_v, b_u2, b_v2, b_u3, b_v3, b_u << 2, b_v << 2);
	//contains {b1, b2, b3, b4}
	c2_8x16b = _mm_setr_epi16(c_u, c_v, c_u, c_v, c_u, c_v, c_u, c_v);

	// rows 1, 2
	res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, const_8x16b);
	res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, const_8x16b);
	res2_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
	res2_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);

	res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
	res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
	res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
	res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

	dst_strd2 = dst_strd << 1;
	c2_8x16b = _mm_slli_epi16(c2_8x16b, 1);

	res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
	res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

	_mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
	_mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

	// rows 3, 4
	res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
	res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
	res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
	res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

	res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
	res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
	res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
	res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

	pu1_dst += dst_strd2;

	res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
	res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

	_mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
	_mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

	// rows 5, 6
	res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
	res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
	res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
	res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

	res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
	res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
	res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
	res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

	pu1_dst += dst_strd2;

	res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
	res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

	_mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
	_mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

	// rows 7, 8
	res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
	res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
	res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
	res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

	res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
	res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
	res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
	res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

	pu1_dst += dst_strd2;

	res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
	res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

	_mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
	_mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

	}
	}