| /****************************************************************************** |
| * |
| * Copyright (C) 2015 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at: |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * |
| ***************************************************************************** |
| * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| */ |
| /** |
| ******************************************************************************* |
| * @file |
| * ih264_chroma_intra_pred_filters_ssse3.c |
| * |
| * @brief |
| * Contains function definitions for chroma intra prediction filters in x86 |
| * intrinsics |
| * |
| * @author |
| * Ittiam |
| * |
| * @par List of Functions: |
| * -ih264_intra_pred_chroma_8x8_mode_horz_ssse3 |
| * -ih264_intra_pred_chroma_8x8_mode_vert_ssse3 |
| * -ih264_intra_pred_chroma_8x8_mode_plane_ssse3 |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| |
| /*****************************************************************************/ |
| /* File Includes */ |
| /*****************************************************************************/ |
| |
| /* System include files */ |
| #include <stdio.h> |
| #include <stddef.h> |
| #include <string.h> |
| |
| /* User include files */ |
| #include "ih264_defs.h" |
| #include "ih264_typedefs.h" |
| #include "ih264_macros.h" |
| #include "ih264_platform_macros.h" |
| #include "ih264_intra_pred_filters.h" |
| |
| |
| /*****************************************************************************/ |
| /* Chroma Intra prediction 8x8 filters */ |
| /*****************************************************************************/ |
| /** |
| ******************************************************************************* |
| * |
| * ih264_intra_pred_chroma_8x8_mode_horz_ssse3 |
| * |
| * @brief |
| * Perform Intra prediction for chroma_8x8 mode:Horizontal |
| * |
| * @par Description: |
| * Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2 |
| * |
| * @param[in] pu1_src |
| * UWORD8 pointer to the source containing alternate U and V samples |
| * |
| * @param[out] pu1_dst |
| * UWORD8 pointer to the destination with alternate U and V samples |
| * |
| * @param[in] src_strd |
| * integer source stride |
| * |
| * @param[in] dst_strd |
| * integer destination stride |
| * |
| * @param[in] ngbr_avail |
| * availability of neighbouring pixels(Not used in this function) |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ****************************************************************************** |
| */ |
| void ih264_intra_pred_chroma_8x8_mode_horz_ssse3(UWORD8 *pu1_src, |
| UWORD8 *pu1_dst, |
| WORD32 src_strd, |
| WORD32 dst_strd, |
| WORD32 ngbr_avail) |
| { |
| |
| UWORD8 *pu1_left; /* Pointer to start of top predictors */ |
| WORD32 dst_strd2; |
| |
| __m128i row1_16x8b, row2_16x8b; |
| |
| UNUSED(src_strd); |
| UNUSED(ngbr_avail); |
| |
| pu1_left = pu1_src + 2 * BLK8x8SIZE - 2; |
| |
| |
| dst_strd2 = dst_strd << 1; |
| row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left))); |
| row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 2))); |
| _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b); |
| _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b); |
| |
| pu1_dst += dst_strd2; |
| row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 4))); |
| row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 6))); |
| _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b); |
| _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b); |
| |
| pu1_dst += dst_strd2; |
| row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 8))); |
| row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 10))); |
| _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b); |
| _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b); |
| |
| pu1_dst += dst_strd2; |
| row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 12))); |
| row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 14))); |
| _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b); |
| _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b); |
| } |
| |
| /** |
| ******************************************************************************* |
| * |
| * ih264_intra_pred_chroma_8x8_mode_vert_ssse3 |
| * |
| * @brief |
| * Perform Intra prediction for chroma_8x8 mode:vertical |
| * |
| * @par Description: |
| * Perform Intra prediction for chroma_8x8 mode:vertical ,described in sec 8.3.4.3 |
| * |
| * @param[in] pu1_src |
| * UWORD8 pointer to the source containing alternate U and V samples |
| * |
| * @param[out] pu1_dst |
| * UWORD8 pointer to the destination with alternate U and V samples |
| * |
| * @param[in] src_strd |
| * integer source stride |
| * |
| * @param[in] dst_strd |
| * integer destination stride |
| * |
| * @param[in] ngbr_avail |
| * availability of neighbouring pixels(Not used in this function) |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ******************************************************************************* |
| */ |
| void ih264_intra_pred_chroma_8x8_mode_vert_ssse3(UWORD8 *pu1_src, |
| UWORD8 *pu1_dst, |
| WORD32 src_strd, |
| WORD32 dst_strd, |
| WORD32 ngbr_avail) |
| { |
| UWORD8 *pu1_top; /* Pointer to start of top predictors */ |
| WORD32 dst_strd2; |
| |
| __m128i top_16x8b; |
| |
| UNUSED(src_strd); |
| UNUSED(ngbr_avail); |
| |
| pu1_top = pu1_src + 2 * BLK8x8SIZE + 2; |
| |
| top_16x8b = _mm_loadu_si128((__m128i *)pu1_top); |
| |
| dst_strd2 = dst_strd << 1; |
| _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); |
| _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b); |
| |
| pu1_dst += dst_strd2; |
| _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); |
| _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b); |
| |
| pu1_dst += dst_strd2; |
| _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); |
| _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b); |
| |
| pu1_dst += dst_strd2; |
| _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b); |
| _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b); |
| } |
| |
| /** |
| ******************************************************************************* |
| * |
| * ih264_intra_pred_chroma_8x8_mode_plane_ssse3 |
| * |
| * @brief |
| * Perform Intra prediction for chroma_8x8 mode:PLANE |
| * |
| * @par Description: |
| * Perform Intra prediction for chroma_8x8 mode:PLANE ,described in sec 8.3.4.4 |
| * |
| * @param[in] pu1_src |
| * UWORD8 pointer to the source containing alternate U and V samples |
| * |
| * @param[out] pu1_dst |
| * UWORD8 pointer to the destination with alternate U and V samples |
| * |
| * @param[in] src_strd |
| * integer source stride |
| * |
| * @param[in] dst_strd |
| * integer destination stride |
| * |
| * @param[in] ngbr_avail |
| * availability of neighbouring pixels(Not used in this function) |
| * |
| * @returns |
| * |
| * @remarks |
| * None |
| * |
| ****************************************************************************** |
| */ |
void ih264_intra_pred_chroma_8x8_mode_plane_ssse3(UWORD8 *pu1_src,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 src_strd,
                                                  WORD32 dst_strd,
                                                  WORD32 ngbr_avail)
{
    /* Plane mode (sec 8.3.4.4): pred[x,y] = clip((a + b*(x-3) + c*(y-3) + 16) >> 5),
     * computed independently for the interleaved U and V components. */
    UWORD8 *pu1_left, *pu1_top;
    WORD32 a_u, a_v, b_u, b_v, c_u, c_v; /* plane parameters, per component */

    __m128i mul_8x16b, shuffle_8x16b;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    /* pu1_src layout: left neighbours (bottom-to-top), corner, then top row */
    pu1_top = pu1_src + MB_SIZE + 2;
    pu1_left = pu1_src + MB_SIZE - 2;

    /* weights 1..4 for the H/V gradient sums consumed by pmaddwd */
    mul_8x16b = _mm_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4);
    /* pshufb mask: words 0-3 <- zero-extended even source bytes (U),
     * words 4-7 <- zero-extended odd source bytes (V); 0xff selects zero */
    shuffle_8x16b = _mm_setr_epi16(0xff00, 0xff02, 0xff04, 0xff06,
                                   0xff01, 0xff03, 0xff05, 0xff07);

    //calculating a, b and c
    {
        WORD32 h_u, h_v, v_u, v_v; /* H and V gradient accumulators */

        __m128i h_val1_16x8b, h_val2_16x8b;
        __m128i h_val1_8x16b, h_val2_8x16b, h_val_4x32b;
        __m128i v_val1_16x8b, v_val2_16x8b;
        __m128i v_val1_8x16b, v_val2_8x16b, v_val_4x32b;
        __m128i hv_val_4x32b;

        /* top pairs 4..7 and (corner, top 0..2); left pairs 4..7 (reversed
         * in memory) and (corner side, left 0..2) */
        h_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top + 8));
        h_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top - 2));
        v_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 14));
        v_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 4));

        // reversing the order (0x1b = words 3,2,1,0) so differences align
        h_val2_16x8b = _mm_shufflelo_epi16(h_val2_16x8b, 0x1b);
        v_val1_16x8b = _mm_shufflelo_epi16(v_val1_16x8b, 0x1b);

        // separating u and v and 8-bit to 16-bit conversion
        h_val1_8x16b = _mm_shuffle_epi8(h_val1_16x8b, shuffle_8x16b);
        h_val2_8x16b = _mm_shuffle_epi8(h_val2_16x8b, shuffle_8x16b);
        v_val1_8x16b = _mm_shuffle_epi8(v_val1_16x8b, shuffle_8x16b);
        v_val2_8x16b = _mm_shuffle_epi8(v_val2_16x8b, shuffle_8x16b);

        /* per-component differences top[3+i] - top[3-i] / left[3+i] - left[3-i] */
        h_val1_8x16b = _mm_sub_epi16(h_val1_8x16b, h_val2_8x16b);
        v_val1_8x16b = _mm_sub_epi16(v_val1_8x16b, v_val2_8x16b);

        /* weighted sums: H = sum_{i=1..4} i*(diff_i), likewise V; pmaddwd
         * leaves partial sums in 32-bit lanes (U in lanes 0-1, V in 2-3) */
        h_val_4x32b = _mm_madd_epi16(mul_8x16b, h_val1_8x16b);
        v_val_4x32b = _mm_madd_epi16(mul_8x16b, v_val1_8x16b);

        /* horizontal add -> lanes {H_u, H_v, V_u, V_v} */
        hv_val_4x32b = _mm_hadd_epi32(h_val_4x32b, v_val_4x32b);

        /* a = 16 * (bottom-most left sample + right-most top sample) */
        a_u = (pu1_left[7 * (-2)] + pu1_top[14]) << 4;
        a_v = (pu1_left[7 * (-2) + 1] + pu1_top[15]) << 4;

        /* extract low 16 bits of each 32-bit lane (value fits in 16 bits) */
        h_u = _mm_extract_epi16(hv_val_4x32b, 0);
        h_v = _mm_extract_epi16(hv_val_4x32b, 2);
        v_u = _mm_extract_epi16(hv_val_4x32b, 4);
        v_v = _mm_extract_epi16(hv_val_4x32b, 6);

        h_u = (h_u << 16) >> 15; // sign-extension and multiplication by 2
        h_v = (h_v << 16) >> 15;
        v_u = (v_u << 16) >> 15;
        v_v = (v_v << 16) >> 15;

        /* with h = 2H: b = (17*2H + 32) >> 6 == (34H + 32) >> 6 (spec eq.);
         * same for c from V */
        b_u = ((h_u << 4) + h_u + 32) >> 6;
        b_v = ((h_v << 4) + h_v + 32) >> 6;
        c_u = ((v_u << 4) + v_u + 32) >> 6;
        c_v = ((v_v << 4) + v_v + 32) >> 6;
    }
    //using a, b and c to compute the fitted plane values
    {
        __m128i const_8x16b, c2_8x16b;
        __m128i res1_l_8x16b, res1_h_8x16b;
        __m128i res2_l_8x16b, res2_h_8x16b;
        __m128i res1_sh_l_8x16b, res1_sh_h_8x16b, res1_16x8b;
        __m128i res2_sh_l_8x16b, res2_sh_h_8x16b, res2_16x8b;

        WORD32 b_u2, b_v2, b_u3, b_v3;
        WORD32 const_u, const_v;
        WORD32 dst_strd2;

        /* row-0 base term: a - 3c + 16 (x/y offsets of -3 folded in) */
        const_u = a_u - (c_u << 1) - c_u + 16;
        const_v = a_v - (c_v << 1) - c_v + 16;

        b_u2 = b_u << 1;
        b_v2 = b_v << 1;
        b_u3 = b_u + b_u2;
        b_v3 = b_v + b_v2;

        /* interleaved U/V vectors: base constant, b*(x-3) ramps for the low
         * (x=0..3) and high (x=4..7) halves, and the per-row c step */
        const_8x16b = _mm_setr_epi16(const_u, const_v, const_u, const_v, const_u, const_v, const_u, const_v);
        res1_l_8x16b = _mm_setr_epi16(-b_u3, -b_v3, -b_u2, -b_v2, -b_u, -b_v, 0, 0);
        //contains {-b*3, -b*2, -b*1, b*0}
        res1_h_8x16b = _mm_setr_epi16(b_u, b_v, b_u2, b_v2, b_u3, b_v3, b_u << 2, b_v << 2);
        //contains {b*1, b*2, b*3, b*4}
        c2_8x16b = _mm_setr_epi16(c_u, c_v, c_u, c_v, c_u, c_v, c_u, c_v);

        // rows 1, 2: row1 = const + b-ramp; row2 = row1 + c
        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, const_8x16b);
        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, const_8x16b);
        res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
        res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

        /* arithmetic >> 5 per spec; packus clips to [0,255] afterwards */
        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

        dst_strd2 = dst_strd << 1;
        /* from here c2 holds 2c: each unrolled step below advances two rows */
        c2_8x16b = _mm_slli_epi16(c2_8x16b, 1);

        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

        // rows 3, 4: add 2c to both pending row accumulators
        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
        res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
        res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

        pu1_dst += dst_strd2;

        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

        // rows 5, 6
        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
        res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
        res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

        pu1_dst += dst_strd2;

        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

        // rows 7, 8
        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
        res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
        res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

        pu1_dst += dst_strd2;

        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

    }
}