Hamsalekha S | 8d3d303 | 2015-03-13 21:24:58 +0530 | [diff] [blame] | 1 | /****************************************************************************** |
| 2 | * |
| 3 | * Copyright (C) 2015 The Android Open Source Project |
| 4 | * |
| 5 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | * you may not use this file except in compliance with the License. |
| 7 | * You may obtain a copy of the License at: |
| 8 | * |
| 9 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | * |
| 11 | * Unless required by applicable law or agreed to in writing, software |
| 12 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | * See the License for the specific language governing permissions and |
| 15 | * limitations under the License. |
| 16 | * |
| 17 | ***************************************************************************** |
| 18 | * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| 19 | */ |
| 20 | /** |
| 21 | ******************************************************************************* |
| 22 | * @file |
| 23 | * ih264_ihadamard_scaling_sse42.c |
| 24 | * |
| 25 | * @brief |
| 26 | * Contains definition of functions for h264 inverse hadamard 4x4 transform and scaling |
| 27 | * |
| 28 | * @author |
| 29 | * Mohit |
| 30 | * |
| 31 | * @par List of Functions: |
| 32 | * - ih264_ihadamard_scaling_4x4_sse42() |
| 33 | * - ih264_ihadamard_scaling_2x2_uv_sse42() |
| 34 | * |
| 35 | * @remarks |
| 36 | * |
| 37 | ******************************************************************************* |
| 38 | */ |
| 39 | /*****************************************************************************/ |
| 40 | /* File Includes */ |
| 41 | /*****************************************************************************/ |
| 42 | |
| 43 | /* User include files */ |
| 44 | #include "ih264_typedefs.h" |
| 45 | #include "ih264_defs.h" |
| 46 | #include "ih264_trans_macros.h" |
| 47 | #include "ih264_macros.h" |
| 48 | #include "ih264_trans_data.h" |
| 49 | #include "ih264_size_defs.h" |
| 50 | #include "ih264_structs.h" |
| 51 | #include "ih264_trans_quant_itrans_iquant.h" |
| 52 | #include <immintrin.h> |
| 53 | |
| 54 | /* |
| 55 | ******************************************************************************** |
| 56 | * |
| 57 | * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients |
| 58 | * of a 16x16 intra prediction macroblock, and then performs scaling. |
| 60 | * |
| 61 | * @par Description: |
| 62 | * The DC coefficients pass through a 2-stage inverse hadamard transform. |
| 63 | * The inverse transformed content is then scaled based on the Qp value. |
| 64 | * |
| 65 | * @param[in] pi2_src |
| 66 | * input 4x4 block of DC coefficients |
| 67 | * |
| 68 | * @param[out] pi2_out |
| 69 | * output 4x4 block |
| 70 | * |
| 71 | * @param[in] pu2_iscal_mat |
| 72 | * pointer to scaling list |
| 73 | * |
| 74 | * @param[in] pu2_weigh_mat |
| 75 | * pointer to weight matrix |
| 76 | * |
| 77 | * @param[in] u4_qp_div_6 |
| 78 | * Floor (qp/6) |
| 79 | * |
| 80 | * @param[in] pi4_tmp |
| 81 | * temporary buffer of size 1*16 (not used by this SSE4.2 implementation) |
| 82 | * |
| 83 | * @returns none |
| 84 | * |
| 85 | * @remarks none |
| 86 | * |
| 87 | ******************************************************************************* |
| 88 | */ |
Harish Mahendrakar | 7497191 | 2015-04-20 15:33:05 +0530 | [diff] [blame] | 89 | void ih264_ihadamard_scaling_4x4_sse42(WORD16* pi2_src, |
| 90 | WORD16* pi2_out, |
| 91 | const UWORD16 *pu2_iscal_mat, |
| 92 | const UWORD16 *pu2_weigh_mat, |
| 93 | UWORD32 u4_qp_div_6, |
| 94 | WORD32* pi4_tmp) |
| 95 | { |
Hamsalekha S | 8d3d303 | 2015-03-13 21:24:58 +0530 | [diff] [blame] | 96 | __m128i src_r0_r1, src_r2_r3; |
| 97 | __m128i src_r0, src_r1, src_r2, src_r3; |
| 98 | __m128i temp0, temp1, temp2, temp3; |
Ritu Baldwa | 4be8f62 | 2018-03-22 16:37:28 +0530 | [diff] [blame] | 99 | __m128i add_rshift = _mm_set1_epi32((u4_qp_div_6 < 6) ? (1 << (5 - u4_qp_div_6)) : 0); |
Hamsalekha S | 8d3d303 | 2015-03-13 21:24:58 +0530 | [diff] [blame] | 100 | __m128i mult_val = _mm_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]); |
Harish Mahendrakar | 7497191 | 2015-04-20 15:33:05 +0530 | [diff] [blame] | 101 | UNUSED (pi4_tmp); |
Hamsalekha S | 8d3d303 | 2015-03-13 21:24:58 +0530 | [diff] [blame] | 102 | |
| 103 | src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row |
| 104 | src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row |
| 105 | //sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1); |
| 106 | src_r0 = _mm_cvtepi16_epi32(src_r0_r1); |
| 107 | src_r0_r1 = _mm_srli_si128(src_r0_r1, 8); |
| 108 | src_r1 = _mm_cvtepi16_epi32(src_r0_r1); |
| 109 | |
| 110 | src_r2 = _mm_cvtepi16_epi32(src_r2_r3); |
| 111 | src_r2_r3 = _mm_srli_si128(src_r2_r3, 8); |
| 112 | src_r3 = _mm_cvtepi16_epi32(src_r2_r3); |
| 113 | |
| 114 | /* Perform Inverse transform */ |
| 115 | /*-------------------------------------------------------------*/ |
| 116 | /* IDCT [ Horizontal transformation ] */ |
| 117 | /*-------------------------------------------------------------*/ |
| 118 | // Matrix transpose |
| 119 | /* |
| 120 | * a0 a1 a2 a3 |
| 121 | * b0 b1 b2 b3 |
| 122 | * c0 c1 c2 c3 |
| 123 | * d0 d1 d2 d3 |
| 124 | */ |
| 125 | temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 b0 a1 b1 |
| 126 | temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //c0 d0 c1 d1 |
| 127 | temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //a2 b2 a3 b3 |
| 128 | temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 d2 c3 d3 |
| 129 | src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 b0 c0 d0 |
| 130 | src_r1 = _mm_unpackhi_epi64(temp0, temp2); //a1 b1 c1 d1 |
| 131 | src_r2 = _mm_unpacklo_epi64(temp1, temp3); //a2 b2 c2 d2 |
| 132 | src_r3 = _mm_unpackhi_epi64(temp1, temp3); //a3 b3 c3 d3 |
| 133 | |
| 134 | temp0 = _mm_add_epi32(src_r0, src_r3); |
| 135 | temp1 = _mm_add_epi32(src_r1, src_r2); |
| 136 | temp2 = _mm_sub_epi32(src_r1, src_r2); |
| 137 | temp3 = _mm_sub_epi32(src_r0, src_r3); |
| 138 | |
| 139 | src_r0 = _mm_add_epi32(temp0, temp1); |
| 140 | src_r1 = _mm_add_epi32(temp2, temp3); |
| 141 | src_r2 = _mm_sub_epi32(temp0, temp1); |
| 142 | src_r3 = _mm_sub_epi32(temp3, temp2); |
| 143 | |
| 144 | /*-------------------------------------------------------------*/ |
| 145 | /* IDCT [ Vertical transformation ] */ |
| 146 | /*-------------------------------------------------------------*/ |
| 147 | // Matrix transpose |
| 148 | /* |
| 149 | * a0 b0 c0 d0 |
| 150 | * a1 b1 c1 d1 |
| 151 | * a2 b2 c2 d2 |
| 152 | * a3 b3 c3 d3 |
| 153 | */ |
| 154 | temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 a1 b0 b1 |
| 155 | temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //a2 a3 b2 b3 |
| 156 | temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //c0 c1 d0 d1 |
| 157 | temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 c3 d2 d3 |
| 158 | src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 a1 a2 a3 |
| 159 | src_r1 = _mm_unpackhi_epi64(temp0, temp2); //b0 b1 b2 b3 |
| 160 | src_r2 = _mm_unpacklo_epi64(temp1, temp3); //c0 c1 c2 c3 |
| 161 | src_r3 = _mm_unpackhi_epi64(temp1, temp3); //d0 d1 d2 d3 |
| 162 | |
| 163 | temp0 = _mm_add_epi32(src_r0, src_r3); |
| 164 | temp1 = _mm_add_epi32(src_r1, src_r2); |
| 165 | temp2 = _mm_sub_epi32(src_r1, src_r2); |
| 166 | temp3 = _mm_sub_epi32(src_r0, src_r3); |
| 167 | |
| 168 | src_r0 = _mm_add_epi32(temp0, temp1); |
| 169 | src_r1 = _mm_add_epi32(temp2, temp3); |
| 170 | src_r2 = _mm_sub_epi32(temp0, temp1); |
| 171 | src_r3 = _mm_sub_epi32(temp3, temp2); |
| 172 | |
| 173 | src_r0 = _mm_mullo_epi32(src_r0, mult_val); |
| 174 | src_r1 = _mm_mullo_epi32(src_r1, mult_val); |
| 175 | src_r2 = _mm_mullo_epi32(src_r2, mult_val); |
| 176 | src_r3 = _mm_mullo_epi32(src_r3, mult_val); |
| 177 | |
| 178 | //Scaling |
Harish Mahendrakar | 7497191 | 2015-04-20 15:33:05 +0530 | [diff] [blame] | 179 | if(u4_qp_div_6 >= 6) |
| 180 | { |
Hamsalekha S | 8d3d303 | 2015-03-13 21:24:58 +0530 | [diff] [blame] | 181 | src_r0 = _mm_slli_epi32(src_r0, u4_qp_div_6 - 6); |
| 182 | src_r1 = _mm_slli_epi32(src_r1, u4_qp_div_6 - 6); |
| 183 | src_r2 = _mm_slli_epi32(src_r2, u4_qp_div_6 - 6); |
| 184 | src_r3 = _mm_slli_epi32(src_r3, u4_qp_div_6 - 6); |
Harish Mahendrakar | 7497191 | 2015-04-20 15:33:05 +0530 | [diff] [blame] | 185 | } |
| 186 | else |
| 187 | { |
Hamsalekha S | 8d3d303 | 2015-03-13 21:24:58 +0530 | [diff] [blame] | 188 | temp0 = _mm_add_epi32(src_r0, add_rshift); |
| 189 | temp1 = _mm_add_epi32(src_r1, add_rshift); |
| 190 | temp2 = _mm_add_epi32(src_r2, add_rshift); |
| 191 | temp3 = _mm_add_epi32(src_r3, add_rshift); |
| 192 | src_r0 = _mm_srai_epi32(temp0, 6 - u4_qp_div_6); |
| 193 | src_r1 = _mm_srai_epi32(temp1, 6 - u4_qp_div_6); |
| 194 | src_r2 = _mm_srai_epi32(temp2, 6 - u4_qp_div_6); |
| 195 | src_r3 = _mm_srai_epi32(temp3, 6 - u4_qp_div_6); |
| 196 | } |
| 197 | src_r0_r1 = _mm_packs_epi32(src_r0, src_r1); |
| 198 | src_r2_r3 = _mm_packs_epi32(src_r2, src_r3); |
| 199 | |
| 200 | _mm_storeu_si128((__m128i *) (&pi2_out[0]), src_r0_r1); |
| 201 | _mm_storeu_si128((__m128i *) (&pi2_out[8]), src_r2_r3); |
| 202 | } |
| 203 | |
| 204 | void ih264_ihadamard_scaling_2x2_uv_sse42(WORD16* pi2_src, |
Harish Mahendrakar | 7497191 | 2015-04-20 15:33:05 +0530 | [diff] [blame] | 205 | WORD16* pi2_out, |
| 206 | const UWORD16 *pu2_iscal_mat, |
| 207 | const UWORD16 *pu2_weigh_mat, |
| 208 | UWORD32 u4_qp_div_6, |
| 209 | WORD32* pi4_tmp) |
Hamsalekha S | 8d3d303 | 2015-03-13 21:24:58 +0530 | [diff] [blame] | 210 | { |
Hamsalekha S | 8d3d303 | 2015-03-13 21:24:58 +0530 | [diff] [blame] | 211 | __m128i src, plane_0, plane_1, temp0, temp1, sign_reg; |
| 212 | __m128i zero_8x16b = _mm_setzero_si128(); |
| 213 | __m128i scale_val = _mm_set1_epi32((WORD32)(pu2_iscal_mat[0] * pu2_weigh_mat[0])); |
Harish Mahendrakar | 7497191 | 2015-04-20 15:33:05 +0530 | [diff] [blame] | 214 | UNUSED(pi4_tmp); |
| 215 | |
Hamsalekha S | 8d3d303 | 2015-03-13 21:24:58 +0530 | [diff] [blame] | 216 | src = _mm_loadu_si128((__m128i *) pi2_src); //a0 a1 a2 a3 b0 b1 b2 b3 |
| 217 | sign_reg = _mm_cmpgt_epi16(zero_8x16b, src); |
| 218 | plane_0 = _mm_unpacklo_epi16(src, sign_reg); //a0 a1 a2 a3 -- 32 bits |
| 219 | plane_1 = _mm_unpackhi_epi16(src, sign_reg); //b0 b1 b2 b3 -- 32 bits |
| 220 | |
| 221 | temp0 = _mm_hadd_epi32(plane_0, plane_1); //a0+a1 a2+a3 b0+b1 b2+b3 |
| 222 | temp1 = _mm_hsub_epi32(plane_0, plane_1); //a0-a1 a2-a3 b0-b1 b2-b3 |
| 223 | plane_0 = _mm_hadd_epi32(temp0, temp1); //a0+a1+a2+a3 b0+b1+b2+b3 a0-a1+a2-a3 b0-b1+b2-b3 |
| 224 | plane_1 = _mm_hsub_epi32(temp0, temp1); //a0+a1-a2-a3 b0+b1-b2-b3 a0-a1-a2+a3 b0-b1-b2+b3 |
| 225 | temp0 = _mm_unpacklo_epi32(plane_0, plane_1); //a0+a1+a2+a3 a0+a1-a2-a3 b0+b1+b2+b3 b0+b1-b2-b3 |
| 226 | temp1 = _mm_unpackhi_epi32(plane_0, plane_1); //a0-a1+a2-a3 a0-a1-a2+a3 b0-b1+b2-b3 b0-b1-b2+b3 |
| 227 | |
| 228 | plane_0 = _mm_unpacklo_epi64(temp0, temp1); //a0+a1+a2+a3 a0+a1-a2-a3 a0-a1+a2-a3 a0-a1-a2+a3 |
| 229 | plane_1 = _mm_unpackhi_epi64(temp0, temp1); //b0+b1+b2+b3 b0+b1-b2-b3 b0-b1+b2-b3 b0-b1-b2+b3 |
| 230 | |
| 231 | plane_0 = _mm_shuffle_epi32(plane_0, 0xd8); //a0+a1+a2+a3 a0-a1+a2-a3 a0+a1-a2-a3 a0-a1-a2+a3 |
| 232 | plane_1 = _mm_shuffle_epi32(plane_1, 0xd8); //b0+b1+b2+b3 b0-b1+b2-b3 b0+b1-b2-b3 b0-b1-b2+b3 |
| 233 | |
| 234 | temp0 = _mm_mullo_epi32(scale_val, plane_0); //multiply by pu2_iscal_mat[0] * pu2_weigh_mat[0] |
| 235 | temp1 = _mm_mullo_epi32(scale_val, plane_1); //multiply by pu2_iscal_mat[0] * pu2_weigh_mat[0] |
| 236 | |
| 237 | temp0 = _mm_slli_epi32(temp0, u4_qp_div_6); |
| 238 | temp1 = _mm_slli_epi32(temp1, u4_qp_div_6); |
| 239 | |
| 240 | temp0 = _mm_srai_epi32(temp0, 5); |
| 241 | temp1 = _mm_srai_epi32(temp1, 5); |
| 242 | |
| 243 | temp0 = _mm_packs_epi32(temp0, temp1); //Final values are 16-bits only. |
| 244 | |
| 245 | _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0); |
| 246 | |
| 247 | } |