| /****************************************************************************** |
| * |
| * Copyright (C) 2015 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at: |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * |
| ***************************************************************************** |
| * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| */ |
| /*****************************************************************************/ |
| /* */ |
| /* File Name : ih264_deblk_chroma_ssse3.c */ |
| /* */ |
| /* Description : Contains function definitions for deblocking */ |
| /* */ |
| /* List of Functions : ih264_deblk_chroma_vert_bs4_ssse3() */ |
| /* ih264_deblk_chroma_horz_bs4_ssse3() */ |
| /* ih264_deblk_chroma_vert_bslt4_ssse3() */ |
| /* ih264_deblk_chroma_horz_bslt4_ssse3() */ |
| /* ih264_deblk_chroma_vert_bs4_mbaff_ssse3() */ |
| /* ih264_deblk_chroma_vert_bslt4_mbaff_ssse3() */ |
| /* */ |
| /* Issues / Problems : None */ |
| /* */ |
| /* Revision History : */ |
| /* */ |
| /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
| /* 12 02 2015 Naveen Kumar P Added chrom deblocking ssse3 */ |
| /* intrinsics */ |
| /* */ |
| /*****************************************************************************/ |
| |
| /*****************************************************************************/ |
| /* File Includes */ |
| /*****************************************************************************/ |
| |
| /* System include files */ |
| #include <stdio.h> |
| |
| /* User include files */ |
| #include "ih264_typedefs.h" |
| #include "ih264_platform_macros.h" |
| #include "ih264_deblk_edge_filters.h" |
| #include "ih264_macros.h" |
| |
| /*****************************************************************************/ |
| /* Function Definitions */ |
| /*****************************************************************************/ |
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : ih264_deblk_chroma_vert_bs4_ssse3() */ |
| /* */ |
| /* Description : This function performs filtering of a chroma block */ |
| /* vertical edge when the boundary strength is set to 4 in */ |
| /* high profile. */ |
| /* */ |
| /* Inputs : pu1_src - pointer to the src sample q0 of U */ |
| /* src_strd - source stride */ |
| /* alpha_cb - alpha value for the boundary in U */ |
| /* beta_cb - beta value for the boundary in U */ |
| /* alpha_cr - alpha value for the boundary in V */ |
| /* beta_cr - beta value for the boundary in V */ |
| /* */ |
| /* Globals : None */ |
| /* */ |
| /* Processing : This operation is described in Sec. 8.7.2.4 under the */ |
| /* title "Filtering process for edges for bS equal to 4" in */ |
| /* ITU T Rec H.264 with alpha and beta values different in */ |
| /* U and V. */ |
| /* */ |
| /* Outputs : None */ |
| /* */ |
| /* Returns : None */ |
| /* */ |
| /* Issues : None */ |
| /* */ |
| /* Revision History: */ |
| /* */ |
| /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
| /* 12 02 2015 Naveen Kumar P Initial version */ |
| /* */ |
| /*****************************************************************************/ |
| void ih264_deblk_chroma_vert_bs4_ssse3(UWORD8 *pu1_src, |
| WORD32 src_strd, |
| WORD32 alpha_cb, |
| WORD32 beta_cb, |
| WORD32 alpha_cr, |
| WORD32 beta_cr) |
| { |
| UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ |
| WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; |
| WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; |
| __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh; |
| __m128i temp1, temp2, temp3, temp4; |
| |
| __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; |
| __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; |
| __m128i flag1, flag2; |
| __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8; |
| __m128i zero = _mm_setzero_si128(); |
| __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; |
| |
| /* Load and transpose the pixel values */ |
| linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4)); |
| lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd)); |
| linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd)); |
| lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd)); |
| linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd)); |
| linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd)); |
| lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd)); |
| lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd)); |
| |
| temp1 = _mm_unpacklo_epi16(linea, lineb); |
| temp2 = _mm_unpacklo_epi16(linec, lined); |
| temp3 = _mm_unpacklo_epi16(linee, linef); |
| temp4 = _mm_unpacklo_epi16(lineg, lineh); |
| |
| p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2); |
| p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4); |
| q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2); |
| q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4); |
| |
| p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16); |
| p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16); |
| q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16); |
| q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16); |
| /* End of transpose */ |
| |
| q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); |
| q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); |
| p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); |
| p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); |
| |
| diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
| diff = _mm_abs_epi16(diff); |
| alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
| flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
| |
| diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
| diff = _mm_abs_epi16(diff); |
| beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
| flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
| diff = _mm_abs_epi16(diff); |
| flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| temp1 = _mm_slli_epi16(p1_uv_8x16, 1); |
| temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16); |
| temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
| temp1 = _mm_add_epi16(temp1, temp2); |
| p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); |
| |
| temp1 = _mm_slli_epi16(q1_uv_8x16, 1); |
| temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16); |
| temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
| temp1 = _mm_add_epi16(temp1, temp2); |
| q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); |
| |
| q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero); |
| q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero); |
| p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero); |
| p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero); |
| |
| diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
| diff = _mm_abs_epi16(diff); |
| alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
| flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
| |
| diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
| diff = _mm_abs_epi16(diff); |
| beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
| flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
| diff = _mm_abs_epi16(diff); |
| flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| temp1 = _mm_slli_epi16(p1_uv_8x16, 1); |
| temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16); |
| temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
| temp1 = _mm_add_epi16(temp1, temp2); |
| p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2); |
| |
| temp1 = _mm_slli_epi16(q1_uv_8x16, 1); |
| temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16); |
| temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
| temp1 = _mm_add_epi16(temp1, temp2); |
| q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2); |
| |
| p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2); |
| q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2); |
| |
| flag1 = _mm_packs_epi16(flag1, flag2); |
| |
| p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, |
| _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
| p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); |
| p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); |
| |
| q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, |
| _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
| q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); |
| q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); |
| |
| /* Inverse-transpose and store back */ |
| temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8); |
| temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8); |
| temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8); |
| temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8); |
| |
| linea = _mm_unpacklo_epi32(temp1, temp3); |
| lineb = _mm_srli_si128(linea, 8); |
| linec = _mm_unpackhi_epi32(temp1, temp3); |
| lined = _mm_srli_si128(linec, 8); |
| linee = _mm_unpacklo_epi32(temp2, temp4); |
| linef = _mm_srli_si128(linee, 8); |
| lineg = _mm_unpackhi_epi32(temp2, temp4); |
| lineh = _mm_srli_si128(lineg, 8); |
| |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh); |
| |
| } |
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : ih264_deblk_chroma_horz_bs4_ssse3() */ |
| /* */ |
| /* Description : This function performs filtering of a chroma block */ |
| /* horizontal edge when the boundary strength is set to 4 */ |
| /* in high profile. */ |
| /* */ |
| /* Inputs : pu1_src - pointer to the src sample q0 of U */ |
| /* src_strd - source stride */ |
| /* alpha_cb - alpha value for the boundary in U */ |
| /* beta_cb - beta value for the boundary in U */ |
| /* alpha_cr - alpha value for the boundary in V */ |
| /* beta_cr - beta value for the boundary in V */ |
| /* */ |
| /* Globals : None */ |
| /* */ |
| /* Processing : This operation is described in Sec. 8.7.2.4 under the */ |
| /* title "Filtering process for edges for bS equal to 4" in */ |
| /* ITU T Rec H.264 with alpha and beta values different in */ |
| /* U and V. */ |
| /* */ |
| /* Outputs : None */ |
| /* */ |
| /* Returns : None */ |
| /* */ |
| /* Issues : None */ |
| /* */ |
| /* Revision History: */ |
| /* */ |
| /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
| /* 12 02 2015 Naveen Kumar P Initial version */ |
| /* */ |
| /*****************************************************************************/ |
| void ih264_deblk_chroma_horz_bs4_ssse3(UWORD8 *pu1_src, |
| WORD32 src_strd, |
| WORD32 alpha_cb, |
| WORD32 beta_cb, |
| WORD32 alpha_cr, |
| WORD32 beta_cr) |
| { |
| UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ |
| WORD16 i16_posP1, i16_posP0, i16_posQ1; |
| |
| UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */ |
| WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; |
| WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; |
| __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; |
| __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; |
| __m128i flag1, flag2; |
| __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8; |
| __m128i zero = _mm_setzero_si128(); |
| __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; |
| __m128i temp1, temp2; |
| |
| pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1); |
| |
| i16_posQ1 = src_strd; |
| i16_posP0 = src_strd; |
| i16_posP1 = 0; |
| |
| q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv)); |
| q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1)); |
| p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1)); |
| p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0)); |
| |
| q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); |
| q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); |
| p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); |
| p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); |
| |
| diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
| diff = _mm_abs_epi16(diff); |
| alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
| flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
| |
| diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
| diff = _mm_abs_epi16(diff); |
| beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
| flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
| diff = _mm_abs_epi16(diff); |
| flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| temp1 = _mm_slli_epi16(p1_uv_8x16, 1); |
| temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16); |
| temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
| temp1 = _mm_add_epi16(temp1, temp2); |
| p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); |
| |
| temp1 = _mm_slli_epi16(q1_uv_8x16, 1); |
| temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16); |
| temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
| temp1 = _mm_add_epi16(temp1, temp2); |
| q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); |
| |
| q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero); |
| q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero); |
| p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero); |
| p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero); |
| |
| diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
| diff = _mm_abs_epi16(diff); |
| alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
| flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
| |
| diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
| diff = _mm_abs_epi16(diff); |
| beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
| flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
| diff = _mm_abs_epi16(diff); |
| flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| temp1 = _mm_slli_epi16(p1_uv_8x16, 1); |
| temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16); |
| temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
| temp1 = _mm_add_epi16(temp1, temp2); |
| p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2); |
| |
| temp1 = _mm_slli_epi16(q1_uv_8x16, 1); |
| temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16); |
| temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
| temp1 = _mm_add_epi16(temp1, temp2); |
| q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2); |
| |
| p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2); |
| q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2); |
| |
| flag1 = _mm_packs_epi16(flag1, flag2); |
| |
| p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, |
| _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
| p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); |
| p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); |
| _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1); |
| |
| q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, |
| _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
| q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); |
| q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); |
| _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1); |
| |
| } |
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : ih264_deblk_chroma_vert_bslt4_ssse3() */ |
| /* */ |
| /* Description : This function performs filtering of a chroma block */ |
| /* vertical edge when the boundary strength is less than 4 */ |
| /* in high profile. */ |
| /* */ |
| /* Inputs : pu1_src - pointer to the src sample q0 of U */ |
| /* src_strd - source stride */ |
| /* alpha_cb - alpha value for the boundary in U */ |
| /* beta_cb - beta value for the boundary in U */ |
| /* alpha_cr - alpha value for the boundary in V */ |
| /* beta_cr - beta value for the boundary in V */ |
| /* u4_bs - packed Boundary strength array */ |
| /* pu1_cliptab_cb - tc0_table for U */ |
| /* pu1_cliptab_cr - tc0_table for V */ |
| /* */ |
| /* Globals : None */ |
| /* */ |
| /* Processing : This operation is described in Sec. 8.7.2.3 under the */ |
| /* title "Filtering process for edges for bS less than 4" */ |
| /* in ITU T Rec H.264 with alpha and beta values different */ |
| /* in U and V. */ |
| /* */ |
| /* Outputs : None */ |
| /* */ |
| /* Returns : None */ |
| /* */ |
| /* Issues : None */ |
| /* */ |
| /* Revision History: */ |
| /* */ |
| /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
| /* 12 02 2015 Naveen Kumar P Initial version */ |
| /* */ |
| /*****************************************************************************/ |
| void ih264_deblk_chroma_vert_bslt4_ssse3(UWORD8 *pu1_src, |
| WORD32 src_strd, |
| WORD32 alpha_cb, |
| WORD32 beta_cb, |
| WORD32 alpha_cr, |
| WORD32 beta_cr, |
| UWORD32 u4_bs, |
| const UWORD8 *pu1_cliptab_cb, |
| const UWORD8 *pu1_cliptab_cr) |
| { |
| UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ |
| UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; |
| WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; |
| WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; |
| __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh; |
| __m128i temp1, temp2, temp3, temp4; |
| |
| __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; |
| __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; |
| __m128i flag_bs, flag1, flag2; |
| __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro; |
| __m128i zero = _mm_setzero_si128(); |
| __m128i C0_uv_8x16; |
| __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; |
| |
| u1_Bs0 = (u4_bs >> 24) & 0xff; |
| u1_Bs1 = (u4_bs >> 16) & 0xff; |
| u1_Bs2 = (u4_bs >> 8) & 0xff; |
| u1_Bs3 = (u4_bs >> 0) & 0xff; |
| |
| flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2, |
| u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, |
| u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0); |
| flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s |
| flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask |
| |
| /* Load and transpose the pixel values */ |
| linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4)); |
| lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd)); |
| linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd)); |
| lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd)); |
| linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd)); |
| linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd)); |
| lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd)); |
| lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd)); |
| |
| temp1 = _mm_unpacklo_epi16(linea, lineb); |
| temp2 = _mm_unpacklo_epi16(linec, lined); |
| temp3 = _mm_unpacklo_epi16(linee, linef); |
| temp4 = _mm_unpacklo_epi16(lineg, lineh); |
| |
| p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2); |
| p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4); |
| q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2); |
| q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4); |
| |
| p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16); |
| p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16); |
| q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16); |
| q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16); |
| /* End of transpose */ |
| |
| q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); |
| q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); |
| p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); |
| p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); |
| |
| diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
| diff = _mm_abs_epi16(diff); |
| alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
| flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
| |
| diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
| diff = _mm_abs_epi16(diff); |
| beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
| flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
| diff = _mm_abs_epi16(diff); |
| flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16); |
| diff = _mm_slli_epi16(diff, 2); |
| diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16); |
| diff = _mm_add_epi16(diff, diff1); |
| diff = _mm_add_epi16(diff, _mm_set1_epi16(4)); |
| in_macro = _mm_srai_epi16(diff, 3); |
| |
| C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], |
| pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], |
| pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0], |
| pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]); |
| |
| C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); |
| |
| in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3 |
| C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16); |
| in_macro = _mm_max_epi16(C0_uv_8x16, in_macro); |
| |
| p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro); |
| q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro); |
| |
| q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero); |
| q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero); |
| p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero); |
| p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero); |
| |
| diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
| diff = _mm_abs_epi16(diff); |
| alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
| flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
| |
| diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
| diff = _mm_abs_epi16(diff); |
| beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
| flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
| diff = _mm_abs_epi16(diff); |
| flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16); |
| diff = _mm_slli_epi16(diff, 2); |
| diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16); |
| diff = _mm_add_epi16(diff, diff1); |
| diff = _mm_add_epi16(diff, _mm_set1_epi16(4)); |
| in_macro = _mm_srai_epi16(diff, 3); |
| |
| C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], |
| pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], |
| pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2], |
| pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]); |
| |
| C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); |
| |
| in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3 |
| C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16); |
| in_macro = _mm_max_epi16(C0_uv_8x16, in_macro); |
| |
| p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro); |
| q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro); |
| |
| p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2); |
| q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2); |
| |
| flag1 = _mm_packs_epi16(flag1, flag2); |
| flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions) |
| |
| p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, |
| _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
| p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); |
| p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); |
| |
| q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, |
| _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
| q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); |
| q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); |
| |
| /* Inverse-transpose and store back */ |
| temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8); |
| temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8); |
| temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8); |
| temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8); |
| |
| linea = _mm_unpacklo_epi32(temp1, temp3); |
| lineb = _mm_srli_si128(linea, 8); |
| linec = _mm_unpackhi_epi32(temp1, temp3); |
| lined = _mm_srli_si128(linec, 8); |
| linee = _mm_unpacklo_epi32(temp2, temp4); |
| linef = _mm_srli_si128(linee, 8); |
| lineg = _mm_unpackhi_epi32(temp2, temp4); |
| lineh = _mm_srli_si128(lineg, 8); |
| |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh); |
| |
| } |
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : ih264_deblk_chroma_horz_bslt4_ssse3() */ |
| /* */ |
| /* Description : This function performs filtering of a chroma block */ |
| /* horizontal edge when the boundary strength is less than */ |
| /* 4 in high profile. */ |
| /* */ |
| /* Inputs : pu1_src - pointer to the src sample q0 of U */ |
| /* src_strd - source stride */ |
| /* alpha_cb - alpha value for the boundary in U */ |
| /* beta_cb - beta value for the boundary in U */ |
| /* alpha_cr - alpha value for the boundary in V */ |
| /* beta_cr - beta value for the boundary in V */ |
| /* u4_bs - packed Boundary strength array */ |
| /* pu1_cliptab_cb - tc0_table for U */ |
| /* pu1_cliptab_cr - tc0_table for V */ |
| /* */ |
| /* Globals : None */ |
| /* */ |
| /* Processing : This operation is described in Sec. 8.7.2.3 under the */ |
| /* title "Filtering process for edges for bS less than 4" */ |
| /* in ITU T Rec H.264 with alpha and beta values different */ |
| /* in U and V. */ |
| /* */ |
| /* Outputs : None */ |
| /* */ |
| /* Returns : None */ |
| /* */ |
| /* Issues : None */ |
| /* */ |
| /* Revision History: */ |
| /* */ |
| /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
| /* 12 02 2015 Naveen Kumar P Initial version */ |
| /* */ |
| /*****************************************************************************/ |
| void ih264_deblk_chroma_horz_bslt4_ssse3(UWORD8 *pu1_src, |
| WORD32 src_strd, |
| WORD32 alpha_cb, |
| WORD32 beta_cb, |
| WORD32 alpha_cr, |
| WORD32 beta_cr, |
| UWORD32 u4_bs, |
| const UWORD8 *pu1_cliptab_cb, |
| const UWORD8 *pu1_cliptab_cr) |
| { |
| UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ |
| WORD16 i16_posP1, i16_posP0, i16_posQ1; |
| UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; |
| |
| UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */ |
| WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; |
| WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; |
| __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; |
| __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; |
| __m128i flag_bs, flag1, flag2; |
| __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro; |
| __m128i zero = _mm_setzero_si128(); |
| __m128i C0_uv_8x16; |
| __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; |
| |
| pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1); |
| |
| i16_posQ1 = src_strd; |
| i16_posP0 = src_strd; |
| i16_posP1 = 0; |
| |
| u1_Bs0 = (u4_bs >> 24) & 0xff; |
| u1_Bs1 = (u4_bs >> 16) & 0xff; |
| u1_Bs2 = (u4_bs >> 8) & 0xff; |
| u1_Bs3 = (u4_bs >> 0) & 0xff; |
| |
| flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2, |
| u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, |
| u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0); |
| flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s |
| flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask |
| |
| q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv)); |
| q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1)); |
| p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1)); |
| p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0)); |
| |
| q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); |
| q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); |
| p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); |
| p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); |
| |
| diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
| diff = _mm_abs_epi16(diff); |
| alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
| flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
| |
| diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
| diff = _mm_abs_epi16(diff); |
| beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
| flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
| diff = _mm_abs_epi16(diff); |
| flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16); |
| diff = _mm_slli_epi16(diff, 2); |
| diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16); |
| diff = _mm_add_epi16(diff, diff1); |
| diff = _mm_add_epi16(diff, _mm_set1_epi16(4)); |
| in_macro = _mm_srai_epi16(diff, 3); |
| |
| C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], |
| pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], |
| pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0], |
| pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]); |
| |
| C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); |
| |
| in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3 |
| C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16); |
| in_macro = _mm_max_epi16(C0_uv_8x16, in_macro); |
| |
| p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro); |
| q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro); |
| |
| q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero); |
| q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero); |
| p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero); |
| p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero); |
| |
| diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
| diff = _mm_abs_epi16(diff); |
| alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
| flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
| |
| diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
| diff = _mm_abs_epi16(diff); |
| beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
| flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
| diff = _mm_abs_epi16(diff); |
| flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16); |
| diff = _mm_slli_epi16(diff, 2); |
| diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16); |
| diff = _mm_add_epi16(diff, diff1); |
| diff = _mm_add_epi16(diff, _mm_set1_epi16(4)); |
| in_macro = _mm_srai_epi16(diff, 3); |
| |
| C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], |
| pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], |
| pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2], |
| pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]); |
| |
| C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); |
| |
| in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3 |
| C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16); |
| in_macro = _mm_max_epi16(C0_uv_8x16, in_macro); |
| |
| p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro); |
| q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro); |
| |
| p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2); |
| q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2); |
| |
| flag1 = _mm_packs_epi16(flag1, flag2); |
| flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions) |
| |
| p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, |
| _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
| p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); |
| p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); |
| _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1); |
| |
| q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, |
| _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
| q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); |
| q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); |
| _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1); |
| |
| } |
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : ih264_deblk_chroma_vert_bs4_mbaff_ssse3() */ |
| /* */ |
| /* Description : This function performs filtering of a chroma block */ |
| /* vertical edge when boundary strength is set to 4 in high */ |
| /* profile. */ |
| /* */ |
| /* Inputs : pu1_src - pointer to the src sample q0 of U */ |
| /* src_strd - source stride */ |
| /* alpha_cb - alpha value for the boundary in U */ |
| /* beta_cb - beta value for the boundary in U */ |
| /* alpha_cr - alpha value for the boundary in V */ |
| /* beta_cr - beta value for the boundary in V */ |
| /* u4_bs - packed Boundary strength array */ |
| /* pu1_cliptab_cb - tc0_table for U */ |
| /* pu1_cliptab_cr - tc0_table for V */ |
| /* */ |
| /* Globals : None */ |
| /* */ |
| /* Processing : When the function is called twice, this operation is as */ |
| /* described in Sec. 8.7.2.4 under the title "Filtering */ |
| /* process for edges for bS equal to 4" in ITU T Rec H.264 */ |
| /* with alpha and beta values different in U and V. */ |
| /* */ |
| /* Outputs : None */ |
| /* */ |
| /* Returns : None */ |
| /* */ |
| /* Issues : None */ |
| /* */ |
| /* Revision History: */ |
| /* */ |
| /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
| /* 12 02 2015 Naveen Kumar P Initial version */ |
| /* */ |
| /*****************************************************************************/ |
| void ih264_deblk_chroma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src, |
| WORD32 src_strd, |
| WORD32 alpha_cb, |
| WORD32 beta_cb, |
| WORD32 alpha_cr, |
| WORD32 beta_cr) |
| { |
| UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ |
| WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; |
| WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; |
| __m128i linea, lineb, linec, lined; |
| __m128i temp1, temp2; |
| |
| __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; |
| __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; |
| __m128i flag1; |
| __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8; |
| __m128i zero = _mm_setzero_si128(); |
| __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; |
| |
| /* Load and transpose the pixel values */ |
| linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4)); |
| lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd)); |
| linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd)); |
| lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd)); |
| |
| temp1 = _mm_unpacklo_epi16(linea, lineb); |
| temp2 = _mm_unpacklo_epi16(linec, lined); |
| |
| p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2); |
| p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8); |
| q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2); |
| q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8); |
| /* End of transpose */ |
| |
| q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); |
| q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); |
| p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); |
| p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); |
| |
| diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
| diff = _mm_abs_epi16(diff); |
| alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
| flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
| |
| diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
| diff = _mm_abs_epi16(diff); |
| beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
| flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
| diff = _mm_abs_epi16(diff); |
| flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| temp1 = _mm_slli_epi16(p1_uv_8x16, 1); |
| temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16); |
| temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
| temp1 = _mm_add_epi16(temp1, temp2); |
| p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); |
| |
| temp1 = _mm_slli_epi16(q1_uv_8x16, 1); |
| temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16); |
| temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); |
| temp1 = _mm_add_epi16(temp1, temp2); |
| q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); |
| |
| p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1); |
| q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1); |
| |
| flag1 = _mm_packs_epi16(flag1, flag1); |
| |
| p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, |
| _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
| p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); |
| p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); |
| |
| q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, |
| _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
| q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); |
| q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); |
| |
| /* Inverse-transpose and store back */ |
| temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8); |
| temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8); |
| |
| linea = _mm_unpacklo_epi32(temp1, temp2); |
| lineb = _mm_srli_si128(linea, 8); |
| linec = _mm_unpackhi_epi32(temp1, temp2); |
| lined = _mm_srli_si128(linec, 8); |
| |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined); |
| |
| } |
| |
| /*****************************************************************************/ |
| /* */ |
| /* Function Name : ih264_deblk_chroma_vert_bslt4_mbaff_ssse3() */ |
| /* */ |
| /* Description : This function performs filtering of a chroma block */ |
| /* vertical edge when boundary strength is less than 4 in */ |
| /* high profile. */ |
| /* */ |
| /* Inputs : pu1_src - pointer to the src sample q0 of U */ |
| /* src_strd - source stride */ |
| /* alpha_cb - alpha value for the boundary in U */ |
| /* beta_cb - beta value for the boundary in U */ |
| /* alpha_cr - alpha value for the boundary in V */ |
| /* beta_cr - beta value for the boundary in V */ |
| /* u4_bs - packed Boundary strength array */ |
| /* pu1_cliptab_cb - tc0_table for U */ |
| /* pu1_cliptab_cr - tc0_table for V */ |
| /* */ |
| /* Globals : None */ |
| /* */ |
| /* Processing : When the function is called twice, this operation is as */ |
| /* described in Sec. 8.7.2.4 under the title "Filtering */ |
| /* process for edges for bS less than 4" in ITU T Rec H.264 */ |
| /* with alpha and beta values different in U and V. */ |
| /* */ |
| /* Outputs : None */ |
| /* */ |
| /* Returns : None */ |
| /* */ |
| /* Issues : None */ |
| /* */ |
| /* Revision History: */ |
| /* */ |
| /* DD MM YYYY Author(s) Changes (Describe the changes made) */ |
| /* 12 02 2015 Naveen Kumar P Initial version */ |
| /* */ |
| /*****************************************************************************/ |
| void ih264_deblk_chroma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src, |
| WORD32 src_strd, |
| WORD32 alpha_cb, |
| WORD32 beta_cb, |
| WORD32 alpha_cr, |
| WORD32 beta_cr, |
| UWORD32 u4_bs, |
| const UWORD8 *pu1_cliptab_cb, |
| const UWORD8 *pu1_cliptab_cr) |
| { |
| UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ |
| UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; |
| WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; |
| WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; |
| __m128i linea, lineb, linec, lined; |
| __m128i temp1, temp2; |
| |
| __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; |
| __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; |
| __m128i flag_bs, flag1; |
| __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro; |
| __m128i zero = _mm_setzero_si128(); |
| __m128i C0_uv_8x16; |
| __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; |
| |
| u1_Bs0 = (u4_bs >> 24) & 0xff; |
| u1_Bs1 = (u4_bs >> 16) & 0xff; |
| u1_Bs2 = (u4_bs >> 8) & 0xff; |
| u1_Bs3 = (u4_bs >> 0) & 0xff; |
| |
| flag_bs = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2, |
| u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0); |
| flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s |
| flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask |
| |
| /* Load and transpose the pixel values */ |
| linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4)); |
| lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd)); |
| linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd)); |
| lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd)); |
| |
| temp1 = _mm_unpacklo_epi16(linea, lineb); |
| temp2 = _mm_unpacklo_epi16(linec, lined); |
| |
| p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2); |
| p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8); |
| q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2); |
| q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8); |
| /* End of transpose */ |
| |
| q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); |
| q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); |
| p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); |
| p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); |
| |
| diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 |
| diff = _mm_abs_epi16(diff); |
| alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); |
| flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); |
| |
| diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 |
| diff = _mm_abs_epi16(diff); |
| beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); |
| flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 |
| diff = _mm_abs_epi16(diff); |
| flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); |
| |
| diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16); |
| diff = _mm_slli_epi16(diff, 2); |
| diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16); |
| diff = _mm_add_epi16(diff, diff1); |
| diff = _mm_add_epi16(diff, _mm_set1_epi16(4)); |
| in_macro = _mm_srai_epi16(diff, 3); |
| |
| C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3], |
| pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2], |
| pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1], |
| pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]); |
| |
| C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); |
| |
| in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3 |
| C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16); |
| in_macro = _mm_max_epi16(C0_uv_8x16, in_macro); |
| |
| p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro); |
| q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro); |
| |
| p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1); |
| q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1); |
| |
| flag1 = _mm_packs_epi16(flag1, flag1); |
| flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions) |
| |
| p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, |
| _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
| p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); |
| p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); |
| |
| q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, |
| _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); |
| q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); |
| q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); |
| |
| /* Inverse-transpose and store back */ |
| temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8); |
| temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8); |
| |
| linea = _mm_unpacklo_epi32(temp1, temp2); |
| lineb = _mm_srli_si128(linea, 8); |
| linec = _mm_unpackhi_epi32(temp1, temp2); |
| lined = _mm_srli_si128(linec, 8); |
| |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec); |
| _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined); |
| |
| } |
| |