Hamsalekha S | 8d3d303 | 2015-03-13 21:24:58 +0530 | [diff] [blame] | 1 | @/****************************************************************************** |
| 2 | @ * |
| 3 | @ * Copyright (C) 2015 The Android Open Source Project |
| 4 | @ * |
| 5 | @ * Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | @ * you may not use this file except in compliance with the License. |
| 7 | @ * You may obtain a copy of the License at: |
| 8 | @ * |
| 9 | @ * http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | @ * |
| 11 | @ * Unless required by applicable law or agreed to in writing, software |
| 12 | @ * distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | @ * See the License for the specific language governing permissions and |
| 15 | @ * limitations under the License. |
| 16 | @ * |
| 17 | @ ***************************************************************************** |
| 18 | @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore |
| 19 | @*/ |
Harish Mahendrakar | 7497191 | 2015-04-20 15:33:05 +0530 | [diff] [blame^] | 20 | @** |
Hamsalekha S | 8d3d303 | 2015-03-13 21:24:58 +0530 | [diff] [blame] | 21 | @****************************************************************************** |
| 22 | @* @file |
| 23 | @* ih264_inter_pred_luma_horz_qpel_a9q.s |
| 24 | @* |
| 25 | @* @brief |
| 26 | @* Contains function definitions for inter prediction horizontal quarter pel interpolation. |
| 27 | @* |
| 28 | @* @author |
| 29 | @* Mohit |
| 30 | @* |
| 31 | @* @par List of Functions: |
| 32 | @* |
Harish Mahendrakar | 7497191 | 2015-04-20 15:33:05 +0530 | [diff] [blame^] | 33 | @* - ih264_inter_pred_luma_horz_qpel_a9q() |
Hamsalekha S | 8d3d303 | 2015-03-13 21:24:58 +0530 | [diff] [blame] | 34 | @* |
| 35 | @* @remarks |
| 36 | @* None |
| 37 | @* |
| 38 | @******************************************************************************* |
Harish Mahendrakar | 7497191 | 2015-04-20 15:33:05 +0530 | [diff] [blame^] | 39 | @* |
Hamsalekha S | 8d3d303 | 2015-03-13 21:24:58 +0530 | [diff] [blame] | 40 | |
Harish Mahendrakar | 7497191 | 2015-04-20 15:33:05 +0530 | [diff] [blame^] | 41 | @* All the functions here are replicated from ih264_inter_pred_filters.c |
Hamsalekha S | 8d3d303 | 2015-03-13 21:24:58 +0530 | [diff] [blame] | 42 | @ |
| 43 | |
Harish Mahendrakar | 7497191 | 2015-04-20 15:33:05 +0530 | [diff] [blame^] | 44 | @** |
| 45 | @** |
Hamsalekha S | 8d3d303 | 2015-03-13 21:24:58 +0530 | [diff] [blame] | 46 | @******************************************************************************* |
| 47 | @* |
| 48 | @* @brief |
| 49 | @* Quarter pel interprediction luma filter for horizontal input |
| 50 | @* |
| 51 | @* @par Description: |
@* Applies a 6-tap horizontal filter, rounds and clips the result to 8 bits,
@* and then averages it with the neighbouring integer-pel sample to obtain
@* the quarter-pel value.
@* See sec 8.4.2.2.1 titled "Luma sample interpolation process"
| 54 | @* |
| 55 | @* @param[in] pu1_src |
| 56 | @* UWORD8 pointer to the source |
| 57 | @* |
| 58 | @* @param[out] pu1_dst |
| 59 | @* UWORD8 pointer to the destination |
| 60 | @* |
| 61 | @* @param[in] src_strd |
| 62 | @* integer source stride |
| 63 | @* |
| 64 | @* @param[in] dst_strd |
| 65 | @* integer destination stride |
| 66 | @* |
| 67 | @* @param[in] ht |
| 68 | @* integer height of the array |
| 69 | @* |
| 70 | @* @param[in] wd |
| 71 | @* integer width of the array |
| 72 | @* |
@* @param[in] pu1_tmp
@* UWORD8 pointer to a temporary buffer; unused by this function
| 74 | @* |
@* @param[in] dydx
@* x and y reference offsets for the qpel calculation; only bits [1:0]
@* (the x offset) are read by this function
@*
@* @returns
@* None
| 77 | @* |
@* @remarks
| 79 | @* None |
| 80 | @* |
| 81 | @******************************************************************************* |
Harish Mahendrakar | 7497191 | 2015-04-20 15:33:05 +0530 | [diff] [blame^] | 82 | @* |
Hamsalekha S | 8d3d303 | 2015-03-13 21:24:58 +0530 | [diff] [blame] | 83 | |
@void ih264_inter_pred_luma_horz_qpel_a9q (
| 85 | @ UWORD8 *pu1_src, |
| 86 | @ UWORD8 *pu1_dst, |
| 87 | @ WORD32 src_strd, |
| 88 | @ WORD32 dst_strd, |
| 89 | @ WORD32 ht, |
| 90 | @ WORD32 wd, |
| 91 | @ UWORD8* pu1_tmp, |
| 92 | @ UWORD32 dydx) |
| 93 | |
@**************Variables Vs Registers*****************************************
@   r0  => pu1_src - 2  (first tap of the 6-tap window of output pixel 0)
@   r1  => pu1_dst
@   r2  => src_strd
@   r3  => dst_strd
@   r5  => ht           (rows still to be produced; two rows per loop pass)
@   r6  => wd
@   r7  => dydx when loaded, then pu1_src + ((dydx & 3) >> 1): the
@          integer-pel row that is averaged with the half-pel result
@   d0  => 8 x #5       (filter coefficient)
@   d1  => 8 x #20      (filter coefficient)
@
@ Per output pixel, with a[0..5] the six source bytes starting at pu1_src - 2:
@   half = sat_u8((a0 + a5 + 20*a2 + 20*a3 - 5*a1 - 5*a4 + 16) >> 5)
@   dst  = (half + r7[x] + 1) >> 1
@
@ Stack on entry to the body: 10 core registers (40 bytes) + d8-d15
@ (64 bytes) are pushed, so the stacked arguments start at [sp, #104]:
@   [sp, #104] ht   [sp, #108] wd   [sp, #112] pu1_tmp   [sp, #116] dydx

.text
.p2align 2


    .global ih264_inter_pred_luma_horz_qpel_a9q

ih264_inter_pred_luma_horz_qpel_a9q:

    stmfd         sp!, {r4-r12, r14}    @ save core registers (40 bytes)
    vstmdb        sp!, {d8-d15}         @ save callee-saved NEON registers (64 bytes)
    ldr           r5, [sp, #104]        @ r5 = ht
    ldr           r6, [sp, #108]        @ r6 = wd
    ldr           r7, [sp, #116]        @ r7 = dydx ([sp, #112] = pu1_tmp is not read)
    and           r7, r7, #3            @ r7 = x offset
    add           r7, r0, r7, lsr #1    @ r7 = pu1_src + (x_offset >> 1)
    sub           r0, r0, #2            @ r0 = pu1_src - 2
    vmov.i8       d0, #5                @ filter coeff
    vmov.i8       d1, #20               @ filter coeff

    cmp           r6, #8                @ wd == 8 ?
    beq           loop_8
    cmp           r6, #4                @ wd == 4 ?
    beq           loop_4
                                        @ any other wd is handled as wd == 16

@ ---------------------------------------------------------------------------
@ wd = 16: two rows per pass. "column1" = pixels 0-7, "column2" = pixels 8-15.
@ The two rows are interleaved instruction by instruction; keep the order.
@ ---------------------------------------------------------------------------
loop_16:
    vld1.8        {d2, d3, d4}, [r0], r2   @ load row0 (24 bytes)
    vext.8        d31, d2, d3, #5          @ a[5] (column1,row0)
    vld1.8        {d5, d6, d7}, [r0], r2   @ load row1 (24 bytes)
    vext.8        d30, d3, d4, #5          @ a[5] (column2,row0)
    vaddl.u8      q4, d31, d2              @ a0 + a5 (column1,row0)
    vext.8        d28, d5, d6, #5          @ a[5] (column1,row1)
    vaddl.u8      q5, d30, d3              @ a0 + a5 (column2,row0)
    vext.8        d27, d6, d7, #5          @ a[5] (column2,row1)
    vaddl.u8      q7, d28, d5              @ a0 + a5 (column1,row1)
    vext.8        d31, d2, d3, #2          @ a[2] (column1,row0)
    vaddl.u8      q8, d27, d6              @ a0 + a5 (column2,row1)
    vext.8        d30, d3, d4, #2          @ a[2] (column2,row0)
    vmlal.u8      q4, d31, d1              @ + 20a2 (column1,row0)
    vext.8        d28, d5, d6, #2          @ a[2] (column1,row1)
    vmlal.u8      q5, d30, d1              @ + 20a2 (column2,row0)
    vext.8        d27, d6, d7, #2          @ a[2] (column2,row1)
    vmlal.u8      q7, d28, d1              @ + 20a2 (column1,row1)
    vext.8        d31, d2, d3, #3          @ a[3] (column1,row0)
    vmlal.u8      q8, d27, d1              @ + 20a2 (column2,row1)
    vext.8        d30, d3, d4, #3          @ a[3] (column2,row0)
    vmlal.u8      q4, d31, d1              @ + 20a3 (column1,row0)
    vext.8        d28, d5, d6, #3          @ a[3] (column1,row1)
    vmlal.u8      q5, d30, d1              @ + 20a3 (column2,row0)
    vext.8        d27, d6, d7, #3          @ a[3] (column2,row1)
    vmlal.u8      q7, d28, d1              @ + 20a3 (column1,row1)
    vext.8        d31, d2, d3, #1          @ a[1] (column1,row0)
    vmlal.u8      q8, d27, d1              @ + 20a3 (column2,row1)
    vext.8        d30, d3, d4, #1          @ a[1] (column2,row0)
    vmlsl.u8      q4, d31, d0              @ - 5a1 (column1,row0)
    vext.8        d28, d5, d6, #1          @ a[1] (column1,row1)
    vmlsl.u8      q5, d30, d0              @ - 5a1 (column2,row0)
    vext.8        d27, d6, d7, #1          @ a[1] (column2,row1)
    vmlsl.u8      q7, d28, d0              @ - 5a1 (column1,row1)
    vext.8        d31, d2, d3, #4          @ a[4] (column1,row0)
    vmlsl.u8      q8, d27, d0              @ - 5a1 (column2,row1)
    vext.8        d30, d3, d4, #4          @ a[4] (column2,row0)
    vmlsl.u8      q4, d31, d0              @ - 5a4 (column1,row0)
    vext.8        d28, d5, d6, #4          @ a[4] (column1,row1)
    vmlsl.u8      q5, d30, d0              @ - 5a4 (column2,row0)
    vext.8        d27, d6, d7, #4          @ a[4] (column2,row1)
    vmlsl.u8      q7, d28, d0              @ - 5a4 (column1,row1)
    vmlsl.u8      q8, d27, d0              @ - 5a4 (column2,row1)
    vld1.8        {d12, d13}, [r7], r2     @ integer-pel samples to average with (row0)
    vqrshrun.s16  d20, q4, #5              @ sat_u8((sum + 16) >> 5) (column1,row0)
    vqrshrun.s16  d21, q5, #5              @ sat_u8((sum + 16) >> 5) (column2,row0)
    vrhadd.u8     q10, q6, q10             @ qpel = (half + integer + 1) >> 1 (row0)
    vqrshrun.s16  d18, q7, #5              @ sat_u8((sum + 16) >> 5) (column1,row1)
    vst1.8        {d20, d21}, [r1], r3     @ store dest row0
    vqrshrun.s16  d19, q8, #5              @ sat_u8((sum + 16) >> 5) (column2,row1)
    vld1.8        {d12, d13}, [r7], r2     @ integer-pel samples to average with (row1)
    vrhadd.u8     q9, q6, q9               @ qpel = (half + integer + 1) >> 1 (row1)
    vst1.8        {d18, d19}, [r1], r3     @ store dest row1
    subs          r5, r5, #2               @ 2 rows done
    bgt           loop_16                  @ more rows left; bgt also stops on odd/non-positive ht
    b             end_func

@ ---------------------------------------------------------------------------
@ wd = 8: two rows per pass. row0 is held in d5/d6 and accumulated in q7,
@ row1 is held in d2/d3 and accumulated in q4.
@ ---------------------------------------------------------------------------
loop_8:
    vld1.8        {d5, d6}, [r0], r2       @ load row0 (16 bytes)
    vext.8        d28, d5, d6, #5          @ a[5] (row0)
    vld1.8        {d2, d3}, [r0], r2       @ load row1 (16 bytes)
    vext.8        d25, d5, d6, #2          @ a[2] (row0)
    vext.8        d31, d2, d3, #5          @ a[5] (row1)
    vext.8        d24, d5, d6, #3          @ a[3] (row0)
    vext.8        d23, d5, d6, #1          @ a[1] (row0)
    vext.8        d22, d5, d6, #4          @ a[4] (row0)
    vaddl.u8      q7, d28, d5              @ a0 + a5 (row0)
    vext.8        d29, d2, d3, #3          @ a[3] (row1)
    vmlal.u8      q7, d25, d1              @ + 20a2 (row0)
    vmlal.u8      q7, d24, d1              @ + 20a3 (row0)
    vmlsl.u8      q7, d23, d0              @ - 5a1 (row0)
    vmlsl.u8      q7, d22, d0              @ - 5a4 (row0)
    vext.8        d30, d2, d3, #2          @ a[2] (row1)
    vaddl.u8      q4, d31, d2              @ a0 + a5 (row1)
    vext.8        d27, d2, d3, #1          @ a[1] (row1)
    vext.8        d26, d2, d3, #4          @ a[4] (row1)
    vmlal.u8      q4, d29, d1              @ + 20a3 (row1)
    vmlal.u8      q4, d30, d1              @ + 20a2 (row1)
    vmlsl.u8      q4, d27, d0              @ - 5a1 (row1)
    vmlsl.u8      q4, d26, d0              @ - 5a4 (row1)
    vqrshrun.s16  d18, q7, #5              @ sat_u8((sum + 16) >> 5) (row0)
    vld1.8        {d12}, [r7], r2          @ integer-pel samples to average with (row0)
    vld1.8        {d13}, [r7], r2          @ integer-pel samples to average with (row1)
    vqrshrun.s16  d19, q4, #5              @ sat_u8((sum + 16) >> 5) (row1)
    vrhadd.u8     q9, q6, q9               @ qpel = (half + integer + 1) >> 1, both rows
    vst1.8        {d18}, [r1], r3          @ store dest row0
    vst1.8        {d19}, [r1], r3          @ store dest row1
    subs          r5, r5, #2               @ 2 rows done
    bgt           loop_8                   @ more rows left; bgt also stops on odd/non-positive ht
    b             end_func

@ ---------------------------------------------------------------------------
@ wd = 4: same data flow as loop_8; only the low 4 bytes of each result row
@ are stored.
@ NOTE(review): each row still loads 16 source bytes and 8 averaging bytes
@ although 9 and 4 are consumed -- assumes the caller's buffers are readable
@ that far; confirm against the reference-frame padding.
@ ---------------------------------------------------------------------------
loop_4:
    vld1.8        {d5, d6}, [r0], r2       @ load row0 (16 bytes)
    vext.8        d28, d5, d6, #5          @ a[5] (row0)
    vld1.8        {d2, d3}, [r0], r2       @ load row1 (16 bytes)
    vext.8        d25, d5, d6, #2          @ a[2] (row0)
    vext.8        d31, d2, d3, #5          @ a[5] (row1)
    vaddl.u8      q7, d28, d5              @ a0 + a5 (row0)
    vext.8        d24, d5, d6, #3          @ a[3] (row0)
    vext.8        d23, d5, d6, #1          @ a[1] (row0)
    vext.8        d22, d5, d6, #4          @ a[4] (row0)
    vext.8        d29, d2, d3, #3          @ a[3] (row1)
    vmlal.u8      q7, d25, d1              @ + 20a2 (row0)
    vmlal.u8      q7, d24, d1              @ + 20a3 (row0)
    vmlsl.u8      q7, d23, d0              @ - 5a1 (row0)
    vmlsl.u8      q7, d22, d0              @ - 5a4 (row0)
    vaddl.u8      q4, d31, d2              @ a0 + a5 (row1)
    vext.8        d30, d2, d3, #2          @ a[2] (row1)
    vld1.8        {d12}, [r7], r2          @ integer-pel samples to average with (row0)
    vld1.8        {d13}, [r7], r2          @ integer-pel samples to average with (row1)
    vext.8        d27, d2, d3, #1          @ a[1] (row1)
    vext.8        d26, d2, d3, #4          @ a[4] (row1)
    vmlal.u8      q4, d29, d1              @ + 20a3 (row1)
    vmlal.u8      q4, d30, d1              @ + 20a2 (row1)
    vmlsl.u8      q4, d27, d0              @ - 5a1 (row1)
    vmlsl.u8      q4, d26, d0              @ - 5a4 (row1)
    vqrshrun.s16  d18, q7, #5              @ sat_u8((sum + 16) >> 5) (row0)
    vqrshrun.s16  d19, q4, #5              @ sat_u8((sum + 16) >> 5) (row1)
    vrhadd.u8     q9, q6, q9               @ qpel = (half + integer + 1) >> 1, both rows
    vst1.32       d18[0], [r1], r3         @ store 4 bytes of dest row0
    vst1.32       d19[0], [r1], r3         @ store 4 bytes of dest row1

    subs          r5, r5, #2               @ 2 rows done
    bgt           loop_4                   @ more rows left; otherwise fall through

end_func:
    vldmia        sp!, {d8-d15}         @ restore the NEON registers saved on entry
    ldmfd         sp!, {r4-r12, pc}     @ restore core registers and return
| 265 | |
| 266 | |